# Import Packages

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Looking at the Data

In [2]:
# Read chocolate.csv (path is relative to the notebook's working directory)
# into a DataFrame, then show it as the cell output.
df = pd.read_csv(filepath_or_buffer='../../../data/chocolate.csv')
df

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
3,797,A. Morin,France,2012,Peru,Peru,63%,"4- B,S,C,L","fruity, melon, roasty",3.75
4,797,A. Morin,France,2012,Bolivia,Bolivia,70%,"4- B,S,C,L","vegetal, nutty",3.50
...,...,...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75
2358,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75
2359,2170,Zotter,Austria,2018,Belize,Maya Mtn,72%,"3- B,S,C","muted, roasty, accessible",3.50
2360,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25


# Changing column names (lowercase, underscores instead of spaces)

In [3]:
# Normalise every column label in a single pass: lowercase it and swap
# spaces for underscores.
df = df.rename(columns=lambda label: label.lower().replace(' ', '_'))

In [4]:
# Display the frame to check the lowercased, underscore-separated column labels.
df

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,most_memorable_characteristics,rating
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
3,797,A. Morin,France,2012,Peru,Peru,63%,"4- B,S,C,L","fruity, melon, roasty",3.75
4,797,A. Morin,France,2012,Bolivia,Bolivia,70%,"4- B,S,C,L","vegetal, nutty",3.50
...,...,...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75
2358,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75
2359,2170,Zotter,Austria,2018,Belize,Maya Mtn,72%,"3- B,S,C","muted, roasty, accessible",3.50
2360,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25


# Drop nulls, drop unused columns, and convert cocoa_percent from object to float

In [5]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [6]:
# Remove the two free-text columns from the frame.
df = df.drop(
    ['specific_bean_origin_or_bar_name', 'most_memorable_characteristics'],
    axis='columns',
)

In [7]:
# Strip the percent sign from cocoa_percent only.
# A frame-wide regex replace would also rewrite '%' inside every other text
# column (e.g. manufacturer names), and substituting a space would leave
# trailing whitespace in the values. Series.replace with regex=True leaves a
# numeric column untouched, so re-running this cell is harmless.
df = df.assign(
    cocoa_percent=df['cocoa_percent'].replace(to_replace='%', value='', regex=True)
)

In [8]:
# Cast the cocoa_percent column to float; all other columns keep their dtype.
df = df.astype({'cocoa_percent': float})

In [9]:
# Display the frame after the null rows and the two text columns were dropped
# and cocoa_percent was cast to float.
df

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,cocoa_percent,ingredients,rating
0,2454,5150,U.S.A.,2019,Madagascar,76.0,"3- B,S,C",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,76.0,"3- B,S,C",3.50
2,2454,5150,U.S.A.,2019,Tanzania,76.0,"3- B,S,C",3.25
3,797,A. Morin,France,2012,Peru,63.0,"4- B,S,C,L",3.75
4,797,A. Morin,France,2012,Bolivia,70.0,"4- B,S,C,L",3.50
...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,80.0,"4- B,S*,C,Sa",2.75
2358,1996,Zotter,Austria,2017,Colombia,75.0,"3- B,S,C",3.75
2359,2170,Zotter,Austria,2018,Belize,72.0,"3- B,S,C",3.50
2360,2170,Zotter,Austria,2018,Congo,70.0,"3- B,S,C",3.25


# Filtering Data

In [10]:
# Names of the 29 most frequent company locations (value_counts sorts by
# descending frequency, so the first 29 entries are the most common ones).
location_to_keep = df['company_location'].value_counts().head(29).index.tolist()

In [11]:
# Keep only the rows whose company location is one of the retained locations.
filtered_df = df.loc[df['company_location'].isin(location_to_keep)]

In [12]:
# Display the rows that remain after filtering on company_location.
filtered_df

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,cocoa_percent,ingredients,rating
0,2454,5150,U.S.A.,2019,Madagascar,76.0,"3- B,S,C",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,76.0,"3- B,S,C",3.50
2,2454,5150,U.S.A.,2019,Tanzania,76.0,"3- B,S,C",3.25
3,797,A. Morin,France,2012,Peru,63.0,"4- B,S,C,L",3.75
4,797,A. Morin,France,2012,Bolivia,70.0,"4- B,S,C,L",3.50
...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,80.0,"4- B,S*,C,Sa",2.75
2358,1996,Zotter,Austria,2017,Colombia,75.0,"3- B,S,C",3.75
2359,2170,Zotter,Austria,2018,Belize,72.0,"3- B,S,C",3.50
2360,2170,Zotter,Austria,2018,Congo,70.0,"3- B,S,C",3.25


In [13]:
# Show the 32 most frequent bean-origin countries with their row counts.
filtered_df['country_of_bean_origin'].value_counts().iloc[:32]

Venezuela             230
Dominican Republic    199
Peru                  199
Ecuador               189
Madagascar            153
Blend                 135
Nicaragua              87
Bolivia                72
Brazil                 72
Belize                 66
Colombia               65
Tanzania               64
Vietnam                62
Guatemala              56
Papua New Guinea       46
Mexico                 38
Trinidad               37
Costa Rica             30
Ghana                  30
U.S.A.                 28
Haiti                  26
Jamaica                22
Honduras               21
India                  20
Indonesia              16
Philippines            14
Grenada                13
Cuba                   12
Uganda                 12
Congo                  11
Fiji                   11
Sao Tome               10
Name: country_of_bean_origin, dtype: int64

In [14]:
# Names of the 32 most frequent bean-origin countries in the filtered frame.
country_to_keep = (
    filtered_df['country_of_bean_origin']
    .value_counts()
    .head(32)
    .index
    .tolist()
)

In [15]:
# Keep only the rows whose bean-origin country is one of the retained countries.
filtered_df = filtered_df.loc[
    lambda frame: frame['country_of_bean_origin'].isin(country_to_keep)
]

In [16]:
# Display the rows that remain after filtering on country_of_bean_origin.
filtered_df 

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,cocoa_percent,ingredients,rating
0,2454,5150,U.S.A.,2019,Madagascar,76.0,"3- B,S,C",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,76.0,"3- B,S,C",3.50
2,2454,5150,U.S.A.,2019,Tanzania,76.0,"3- B,S,C",3.25
3,797,A. Morin,France,2012,Peru,63.0,"4- B,S,C,L",3.75
4,797,A. Morin,France,2012,Bolivia,70.0,"4- B,S,C,L",3.50
...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,80.0,"4- B,S*,C,Sa",2.75
2358,1996,Zotter,Austria,2017,Colombia,75.0,"3- B,S,C",3.75
2359,2170,Zotter,Austria,2018,Belize,72.0,"3- B,S,C",3.50
2360,2170,Zotter,Austria,2018,Congo,70.0,"3- B,S,C",3.25


In [17]:
# Show the 7 most frequent ingredient codes with their row counts.
filtered_df['ingredients'].value_counts().iloc[:7]

3- B,S,C        813
2- B,S          594
4- B,S,C,L      226
5- B,S,C,V,L    176
4- B,S,C,V      134
2- B,S*          29
4- B,S*,C,Sa     19
Name: ingredients, dtype: int64

In [18]:
# The 7 most frequent ingredient codes in the filtered frame.
ingredients_to_keep = filtered_df['ingredients'].value_counts().head(7).index.tolist()

In [19]:
# Keep only the rows whose ingredient code is one of the retained codes.
filtered_df = filtered_df.loc[
    lambda frame: frame['ingredients'].isin(ingredients_to_keep)
]

In [20]:
# Display the rows that remain after filtering on ingredients.
filtered_df 

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,cocoa_percent,ingredients,rating
0,2454,5150,U.S.A.,2019,Madagascar,76.0,"3- B,S,C",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,76.0,"3- B,S,C",3.50
2,2454,5150,U.S.A.,2019,Tanzania,76.0,"3- B,S,C",3.25
3,797,A. Morin,France,2012,Peru,63.0,"4- B,S,C,L",3.75
4,797,A. Morin,France,2012,Bolivia,70.0,"4- B,S,C,L",3.50
...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,80.0,"4- B,S*,C,Sa",2.75
2358,1996,Zotter,Austria,2017,Colombia,75.0,"3- B,S,C",3.75
2359,2170,Zotter,Austria,2018,Belize,72.0,"3- B,S,C",3.50
2360,2170,Zotter,Austria,2018,Congo,70.0,"3- B,S,C",3.25


# Looking at data types before

In [21]:
# List the dtype of every column.
# NOTE(review): this inspects `df`, not `filtered_df`. The filters above only
# select rows by boolean mask, so the dtypes should match, but confirm which
# frame the following cells are meant to work on.
df.dtypes

ref                         int64
company_(manufacturer)     object
company_location           object
review_date                 int64
country_of_bean_origin     object
cocoa_percent             float64
ingredients                object
rating                    float64
dtype: object