# Rating chocolates

---
## Importing modules

In [120]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import cohen_kappa_score, balanced_accuracy_score, make_scorer, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

---
## Preparation of data

In [121]:
data = pd.read_csv('flavors_of_cacao.csv')
data.columns

Index(['Company \n(Maker-if known)', 'Specific Bean Origin\nor Bar Name',
       'REF', 'Review\nDate', 'Cocoa\nPercent', 'Company\nLocation', 'Rating',
       'Bean\nType', 'Broad Bean\nOrigin'],
      dtype='object')

In [122]:
data.columns = data.columns.str.replace('\n', ' ').str.replace('  ', ' ')
data.columns[0]

'Company\xa0 (Maker-if known)'

In [123]:
data.columns = data.columns.str.replace('\xa0', '')
data.columns[0]

'Company (Maker-if known)'

In [124]:
temp = data[data.iloc[:, 0].str.contains('\(')].iloc[:, 0]
maker_of_chocolate = [temp.iloc[n][x+1:-1] for n, x in enumerate(temp.str.find('(').tolist())]

data['maker'] = 'Unknown'
data.iloc[temp.index, -1] = maker_of_chocolate
data.maker.unique()

array(['Unknown', 'Gianduja', 'Pralus', 'Simon Coll', 'Casa Luker',
       'Felchlin', 'A. Morin', 'Tulicorp', 'Theobroma Inversiones',
       'Ecuatoriana', 'Girard', 'Salgado', 'Kraft', 'Belcolade',
       'Michel Cluizel', 'Guittard', 'Claudio Corallo', 'Mars', 'AMMA',
       'Cortes', 'Bonnat', 'ICAM', 'Brasstown', 'Fruition', 'Manoa',
       'Millcreek', 'Mindo', 'Zokoko', 'Coppeneur', 'Cemoi', 'Valrhona',
       'Callebaut', 'Cinagra', 'Chocolaterie Robert', 'aka Cinagra',
       'Mesocacao', 'Malmo', 'aka Dead Dog', 'Wolter', 'aka Confecta',
       'aka Chocolaterie Robert', 'Grenada Chocolate Co.',
       'Compania Nacional', 'aka Bernrain', 'aka Vanillabeans',
       'Barry Callebaut'], dtype=object)

In [125]:
company = [temp.iloc[n][:x] for n, x in enumerate(temp.str.find('(').tolist())]

data['company'] = np.nan
data.iloc[temp.index, -1] = company
data.loc[data['company'].isnull(), 'company'] = data.loc[data['company'].isnull(), 'Company (Maker-if known)']
data.iloc[:,-1] = data.iloc[:,-1].str.strip()
assert len(data.company.unique()) == len(data.company.str.lower().unique())
print(len(data.company.unique()))
data.company.unique()

405


array(['A. Morin', 'Acalli', 'Adi', 'Aequare', 'Ah Cacao', "Akesson's",
       'Alain Ducasse', 'Alexandre', 'Altus aka Cao Artisan', 'Amano',
       'Amatller', 'Amazona', 'Ambrosia', 'Amedei', 'AMMA', 'Anahata',
       'Animas', 'Ara', 'Arete', 'Artisan du Chocolat', 'Askinosie',
       'Bahen & Co.', 'Bakau', 'Bar Au Chocolat', "Baravelli's", 'Batch',
       'Beau Cacao', 'Beehive', 'Belcolade', 'Bellflower', 'Belyzium',
       'Benoit Nihant', 'Bernachon', 'Beschle', 'Bisou',
       'Bittersweet Origins', 'Black Mountain', 'Black River', 'Blanxart',
       'Blue Bandana', 'Bonnat', 'Bouga Cacao', 'Bowler Man',
       "Brasstown aka It's Chocolate", 'Brazen', 'Breeze Mill', 'Bright',
       'Britarev', 'Bronx Grrl Chocolate', 'Burnt Fork Bend',
       'Cacao Arabuco', 'Cacao Atlanta', 'Cacao Barry', 'Cacao de Origen',
       'Cacao de Origin', 'Cacao Hunters', 'Cacao Market', 'Cacao Prieto',
       'Cacao Sampaka', 'Cacao Store', 'Cacaosuyo', 'Cacaoyere',
       'Callebaut', 'C-Amar

In [126]:
data.company = data.company.str.replace('Na�ve', 'Naive')
len(data.company.unique())

404

In [127]:
data = data.drop('Company (Maker-if known)', axis=1)
col = data.columns[0]
col

'Specific Bean Origin or Bar Name'

In [128]:
print(len(data[data[col].str.contains(',')]))
print(len(data[data[col].str.contains('\(')]))

583
6


In [129]:
bean_df_many_values = data[data[col].str.contains(',')]
bean_df_one_value = data[~data[col].str.contains(',')]
values = bean_df_many_values[col].str.split(',').str[0].value_counts()
values

Porcelana          21
Los Rios           17
Camino Verde P.    15
Maranon            12
Chuao              11
                   ..
Almendra Blanca     1
El Carmen           1
Asochivite          1
Epique              1
Indianer            1
Name: Specific Bean Origin or Bar Name, Length: 283, dtype: int64

In [130]:
data['specific_bean_origin'] = np.nan
data.loc[bean_df_many_values.index, 'specific_bean_origin'] = bean_df_many_values[col].str.split(',').str[0]
data.loc[bean_df_one_value.index, 'specific_bean_origin'] = data.loc[bean_df_one_value.index, col]
data

Unnamed: 0,Specific Bean Origin or Bar Name,REF,Review Date,Cocoa Percent,Company Location,Rating,Bean Type,Broad Bean Origin,maker,company,specific_bean_origin
0,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome,Unknown,A. Morin,Agua Grande
1,Kpime,1676,2015,70%,France,2.75,,Togo,Unknown,A. Morin,Kpime
2,Atsane,1676,2015,70%,France,3.00,,Togo,Unknown,A. Morin,Atsane
3,Akata,1680,2015,70%,France,3.50,,Togo,Unknown,A. Morin,Akata
4,Quilla,1704,2015,70%,France,3.50,,Peru,Unknown,A. Morin,Quilla
...,...,...,...,...,...,...,...,...,...,...,...
1790,Peru,647,2011,70%,Austria,3.75,,Peru,Unknown,Zotter,Peru
1791,Congo,749,2011,65%,Austria,3.00,Forastero,Congo,Unknown,Zotter,Congo
1792,Kerala State,749,2011,65%,Austria,3.50,Forastero,India,Unknown,Zotter,Kerala State
1793,Kerala State,781,2011,62%,Austria,3.25,,India,Unknown,Zotter,Kerala State


In [131]:
data = data.drop(col, axis=1)
del col
data.columns

Index(['REF', 'Review Date', 'Cocoa Percent', 'Company Location', 'Rating',
       'Bean Type', 'Broad Bean Origin', 'maker', 'company',
       'specific_bean_origin'],
      dtype='object')

In [132]:
data_columns = {column:column.lower().replace(' ', '_') for column in data.columns}
data = data.rename(columns=data_columns)
data

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin
0,1876,2016,63%,France,3.75,,Sao Tome,Unknown,A. Morin,Agua Grande
1,1676,2015,70%,France,2.75,,Togo,Unknown,A. Morin,Kpime
2,1676,2015,70%,France,3.00,,Togo,Unknown,A. Morin,Atsane
3,1680,2015,70%,France,3.50,,Togo,Unknown,A. Morin,Akata
4,1704,2015,70%,France,3.50,,Peru,Unknown,A. Morin,Quilla
...,...,...,...,...,...,...,...,...,...,...
1790,647,2011,70%,Austria,3.75,,Peru,Unknown,Zotter,Peru
1791,749,2011,65%,Austria,3.00,Forastero,Congo,Unknown,Zotter,Congo
1792,749,2011,65%,Austria,3.50,Forastero,India,Unknown,Zotter,Kerala State
1793,781,2011,62%,Austria,3.25,,India,Unknown,Zotter,Kerala State


In [133]:
data['cocoa_percent'] = data['cocoa_percent'].str.replace('%', '')
data

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin
0,1876,2016,63,France,3.75,,Sao Tome,Unknown,A. Morin,Agua Grande
1,1676,2015,70,France,2.75,,Togo,Unknown,A. Morin,Kpime
2,1676,2015,70,France,3.00,,Togo,Unknown,A. Morin,Atsane
3,1680,2015,70,France,3.50,,Togo,Unknown,A. Morin,Akata
4,1704,2015,70,France,3.50,,Peru,Unknown,A. Morin,Quilla
...,...,...,...,...,...,...,...,...,...,...
1790,647,2011,70,Austria,3.75,,Peru,Unknown,Zotter,Peru
1791,749,2011,65,Austria,3.00,Forastero,Congo,Unknown,Zotter,Congo
1792,749,2011,65,Austria,3.50,Forastero,India,Unknown,Zotter,Kerala State
1793,781,2011,62,Austria,3.25,,India,Unknown,Zotter,Kerala State


In [134]:
data.company_location.unique()

array(['France', 'U.S.A.', 'Fiji', 'Ecuador', 'Mexico', 'Switzerland',
       'Netherlands', 'Spain', 'Peru', 'Canada', 'Italy', 'Brazil',
       'U.K.', 'Australia', 'Wales', 'Belgium', 'Germany', 'Russia',
       'Puerto Rico', 'Venezuela', 'Colombia', 'Japan', 'New Zealand',
       'Costa Rica', 'South Korea', 'Amsterdam', 'Scotland', 'Martinique',
       'Sao Tome', 'Argentina', 'Guatemala', 'South Africa', 'Bolivia',
       'St. Lucia', 'Portugal', 'Singapore', 'Denmark', 'Vietnam',
       'Grenada', 'Israel', 'India', 'Czech Republic',
       'Domincan Republic', 'Finland', 'Madagascar', 'Philippines',
       'Sweden', 'Poland', 'Austria', 'Honduras', 'Nicaragua',
       'Lithuania', 'Niacragua', 'Chile', 'Ghana', 'Iceland', 'Eucador',
       'Hungary', 'Suriname', 'Ireland'], dtype=object)

In [135]:
data.bean_type.unique()

array(['\xa0', 'Criollo', 'Trinitario', 'Forastero (Arriba)', 'Forastero',
       'Forastero (Nacional)', 'Criollo, Trinitario',
       'Criollo (Porcelana)', 'Blend', 'Trinitario (85% Criollo)',
       'Forastero (Catongo)', 'Forastero (Parazinho)',
       'Trinitario, Criollo', 'CCN51', 'Criollo (Ocumare)', 'Nacional',
       'Criollo (Ocumare 61)', 'Criollo (Ocumare 77)',
       'Criollo (Ocumare 67)', 'Criollo (Wild)', 'Beniano', 'Amazon mix',
       'Trinitario, Forastero', 'Forastero (Arriba) ASS', 'Criollo, +',
       'Amazon', 'Amazon, ICS', 'EET', 'Blend-Forastero,Criollo',
       'Trinitario (Scavina)', 'Criollo, Forastero', 'Matina',
       'Forastero(Arriba, CCN)', 'Nacional (Arriba)',
       'Forastero (Arriba) ASSS', 'Forastero, Trinitario',
       'Forastero (Amelonado)', nan, 'Trinitario, Nacional',
       'Trinitario (Amelonado)', 'Trinitario, TCGA', 'Criollo (Amarru)'],
      dtype=object)

In [136]:
data = data.replace('\xa0', np.nan)
data

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin
0,1876,2016,63,France,3.75,,Sao Tome,Unknown,A. Morin,Agua Grande
1,1676,2015,70,France,2.75,,Togo,Unknown,A. Morin,Kpime
2,1676,2015,70,France,3.00,,Togo,Unknown,A. Morin,Atsane
3,1680,2015,70,France,3.50,,Togo,Unknown,A. Morin,Akata
4,1704,2015,70,France,3.50,,Peru,Unknown,A. Morin,Quilla
...,...,...,...,...,...,...,...,...,...,...
1790,647,2011,70,Austria,3.75,,Peru,Unknown,Zotter,Peru
1791,749,2011,65,Austria,3.00,Forastero,Congo,Unknown,Zotter,Congo
1792,749,2011,65,Austria,3.50,Forastero,India,Unknown,Zotter,Kerala State
1793,781,2011,62,Austria,3.25,,India,Unknown,Zotter,Kerala State


In [137]:
data.isnull().sum()

ref                       0
review_date               0
cocoa_percent             0
company_location          0
rating                    0
bean_type               888
broad_bean_origin        74
maker                     0
company                   0
specific_bean_origin      0
dtype: int64

In [138]:
b_b_origin_nans = data.copy()[data['broad_bean_origin'].isnull()].loc[:, 'specific_bean_origin']
b_b_origin_nans

77                 Nine
85        Toscano Black
86        Toscano Black
87        Toscano Black
144          Houseblend
             ...       
1766        House Blend
1774      Goddess Blend
1778                Raw
1780    Amazonas Frucht
1789           Indianer
Name: specific_bean_origin, Length: 74, dtype: object

In [139]:
data.loc[b_b_origin_nans.index, 'broad_bean_origin'] = b_b_origin_nans
data.isnull().sum()

ref                       0
review_date               0
cocoa_percent             0
company_location          0
rating                    0
bean_type               888
broad_bean_origin         0
maker                     0
company                   0
specific_bean_origin      0
dtype: int64

In [140]:
most_part_bean_type = data['bean_type'].str.replace('\(', ' ').str.replace(',', ' ').str.replace('-', ' ').str.split().str[0]
most_part_bean_type.value_counts()

  most_part_bean_type = data['bean_type'].str.replace('\(', ' ').str.replace(',', ' ').str.replace('-', ' ').str.split().str[0]


Trinitario    436
Criollo       213
Forastero     196
Blend          42
Nacional        5
Amazon          5
Beniano         3
EET             3
Matina          3
CCN51           1
Name: bean_type, dtype: int64

In [141]:
data['bean_type'] = most_part_bean_type
data

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin
0,1876,2016,63,France,3.75,,Sao Tome,Unknown,A. Morin,Agua Grande
1,1676,2015,70,France,2.75,,Togo,Unknown,A. Morin,Kpime
2,1676,2015,70,France,3.00,,Togo,Unknown,A. Morin,Atsane
3,1680,2015,70,France,3.50,,Togo,Unknown,A. Morin,Akata
4,1704,2015,70,France,3.50,,Peru,Unknown,A. Morin,Quilla
...,...,...,...,...,...,...,...,...,...,...
1790,647,2011,70,Austria,3.75,,Peru,Unknown,Zotter,Peru
1791,749,2011,65,Austria,3.00,Forastero,Congo,Unknown,Zotter,Congo
1792,749,2011,65,Austria,3.50,Forastero,India,Unknown,Zotter,Kerala State
1793,781,2011,62,Austria,3.25,,India,Unknown,Zotter,Kerala State


In [142]:
specific_bean_origin_df = pd.DataFrame(data.dropna().groupby('specific_bean_origin').bean_type.unique().reset_index())
specific_bean_origin_df['bean_type_'] = specific_bean_origin_df['bean_type'].str[0]
specific_bean_origin_df = specific_bean_origin_df.drop('bean_type', axis=1)

In [143]:
data = data.merge(specific_bean_origin_df.dropna(), how='outer', on=['specific_bean_origin'])
data.loc[data.bean_type.isnull(), 'bean_type'] = data.loc[data.bean_type.isnull(), 'bean_type_']
data.isnull().sum()

ref                       0
review_date               0
cocoa_percent             0
company_location          0
rating                    0
bean_type               445
broad_bean_origin         0
maker                     0
company                   0
specific_bean_origin      0
bean_type_              445
dtype: int64

In [144]:
company_bean_type_df = pd.DataFrame(data.dropna().groupby('company').bean_type.unique().reset_index())
company_bean_type_df['bean_type_'] = company_bean_type_df['bean_type'].str[0]
company_bean_type_df = company_bean_type_df.drop('bean_type', axis=1)

In [145]:
data = data.iloc[:, :-1].merge(company_bean_type_df.dropna(), how='outer', on=['company'])
data.loc[data.bean_type.isnull(), 'bean_type'] = data.loc[data.bean_type.isnull(), 'bean_type_']
data.isnull().sum()

ref                       0
review_date               0
cocoa_percent             0
company_location          0
rating                    0
bean_type               106
broad_bean_origin         0
maker                     0
company                   0
specific_bean_origin      0
bean_type_              106
dtype: int64

In [146]:
broad_bean_origin_df = pd.DataFrame(data.dropna().groupby('broad_bean_origin').bean_type.unique().reset_index())
broad_bean_origin_df['bean_type_'] = broad_bean_origin_df['bean_type'].str[0]
broad_bean_origin_df = broad_bean_origin_df.drop('bean_type', axis=1)

In [147]:
data = data.iloc[:, :-1].merge(broad_bean_origin_df.dropna(), how='outer', on=['broad_bean_origin'])
data.loc[data.bean_type.isnull(), 'bean_type'] = data.loc[data.bean_type.isnull(), 'bean_type_']
data.isnull().sum()

ref                      0
review_date              0
cocoa_percent            0
company_location         0
rating                   0
bean_type               13
broad_bean_origin        0
maker                    0
company                  0
specific_bean_origin     0
bean_type_              13
dtype: int64

In [148]:
data[data.bean_type.isnull()]

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin,bean_type_
1773,797,2012,55,France,2.75,,Nature,Unknown,Bernachon,Nature,
1775,552,2010,60,Colombia,3.0,,"Colombia, Ecuador",Unknown,Luker,Macondo,
1776,552,2010,46,Colombia,2.75,,"Colombia, Ecuador",Unknown,Luker,Selva,
1777,552,2010,58,Colombia,3.0,,"Colombia, Ecuador",Unknown,Luker,Misterio,
1778,1149,2013,42,Martinique,2.75,,Martinique,Girard,Chokolat Elot,Guadeloupe,
1779,48,2006,70,Belgium,1.0,,Sensations Intense,Kraft,Cote d' Or,Sensations Intense,
1783,701,2011,70,Australia,3.0,,"South America, Africa",Unknown,Haigh,South America and Africa,
1784,1494,2015,80,Honduras,2.75,,El Salvador,Unknown,Mesocacao,El Salvador,
1785,1494,2015,70,Honduras,3.0,,El Salvador,Unknown,Mesocacao,El Salvador,
1786,157,2007,85,Switzerland,3.0,,Excellence (US Version),Unknown,Lindt & Sprungli,Excellence (US Version),


In [149]:
data[data['maker'] == 'Girard']

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin,bean_type_
1778,1149,2013,42,Martinique,2.75,,Martinique,Girard,Chokolat Elot,Guadeloupe,


In [150]:
data[data['maker'] == 'Kraft']

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin,bean_type_
1779,48,2006,70,Belgium,1.0,,Sensations Intense,Kraft,Cote d' Or,Sensations Intense,


In [151]:
cocoa_percent_without_bean_type = data[data.bean_type.isnull()]['cocoa_percent'].unique()

bean_type_depending_on_percent_of_cocoa = []
for cocoa_per in cocoa_percent_without_bean_type:
    bean_type_depending_on_percent_of_cocoa.append((data[data['cocoa_percent'] == cocoa_per]['bean_type'].mode().values.tolist()))

In [152]:
cocoa_percent_bean_type_df = pd.DataFrame(data={'cocoa_percent': cocoa_percent_without_bean_type,
                                                'bean_type_': bean_type_depending_on_percent_of_cocoa})
cocoa_percent_bean_type_df.iloc[:, -1] = cocoa_percent_bean_type_df.iloc[:, -1].str[0]
cocoa_percent_bean_type_df

Unnamed: 0,cocoa_percent,bean_type_
0,55,Forastero
1,60,Trinitario
2,46,
3,58,Forastero
4,42,
5,70,Trinitario
6,80,Trinitario
7,85,Trinitario
8,75,Trinitario
9,100,Trinitario


In [153]:
data = data.iloc[:, :-1].merge(cocoa_percent_bean_type_df.dropna(), how='outer', on=['cocoa_percent'])
data.loc[data.bean_type.isnull(), 'bean_type'] = data.loc[data.bean_type.isnull(), 'bean_type_']
data.isnull().sum()

ref                       0
review_date               0
cocoa_percent             0
company_location          0
rating                    0
bean_type                 2
broad_bean_origin         0
maker                     0
company                   0
specific_bean_origin      0
bean_type_              706
dtype: int64

In [154]:
data[data['bean_type'].isnull()]

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin,bean_type_
1793,552,2010,46,Colombia,2.75,,"Colombia, Ecuador",Unknown,Luker,Selva,
1794,1149,2013,42,Martinique,2.75,,Martinique,Girard,Chokolat Elot,Guadeloupe,


In [155]:
print(data[data['company_location'] == 'Colombia'].bean_type.mode())
print(data[data['company_location'] == 'Martinique'].bean_type.mode())

0    Trinitario
dtype: object
Series([], dtype: object)


In [156]:
print(data.bean_type.mode())

0    Trinitario
dtype: object


In [157]:
data['bean_type'] = data['bean_type'].fillna('Trinitario')
data = data.drop('bean_type_', axis=1)
data.isnull().sum()

ref                     0
review_date             0
cocoa_percent           0
company_location        0
rating                  0
bean_type               0
broad_bean_origin       0
maker                   0
company                 0
specific_bean_origin    0
dtype: int64

## Finally our data is without nans and with more columns to get information from. We need to replace some values with "Less than 4", to have less encoded values in future

In [158]:
company_location_to_replace = data.company_location.value_counts()[data['company_location'].value_counts() < 4].index
bean_type_to_replace = data.bean_type.value_counts()[data['bean_type'].value_counts() < 4].index
broad_bean_origin_to_replace = data.broad_bean_origin.value_counts()[data['broad_bean_origin'].value_counts() < 4].index
maker_to_replace = data.maker.value_counts()[data['maker'].value_counts() < 4].index
company_to_replace = data.company.value_counts()[data['company'].value_counts() < 4].index
specific_bean_origin_to_replace = data.specific_bean_origin.value_counts()[data['specific_bean_origin'].value_counts() < 4].index

In [159]:
data = data.replace(company_location_to_replace, 'Less_than_4')
data = data.replace(bean_type_to_replace, 'Less_than_4')
data = data.replace(broad_bean_origin_to_replace, 'Less_than_4')
data = data.replace(maker_to_replace, 'Less_than_4')
data = data.replace(company_to_replace, 'Less_than_4')
data = data.replace(specific_bean_origin_to_replace, 'Less_than_4')
data

Unnamed: 0,ref,review_date,cocoa_percent,company_location,rating,bean_type,broad_bean_origin,maker,company,specific_bean_origin
0,1876,2016,63,France,3.75,Less_than_4,Sao Tome,Unknown,A. Morin,Less_than_4
1,1019,2013,63,France,4.00,Less_than_4,Peru,Unknown,A. Morin,Less_than_4
2,797,2012,63,France,3.75,Less_than_4,Peru,Unknown,A. Morin,Peru
3,1510,2015,63,France,4.00,Less_than_4,Peru,Unknown,Valrhona,Piura
4,1430,2014,63,U.S.A.,3.50,Less_than_4,Peru,Unknown,Maverick,Less_than_4
...,...,...,...,...,...,...,...,...,...,...
1790,1458,2015,57,U.S.A.,2.75,Forastero,Uganda,Unknown,Less_than_4,Uganda
1791,81,2006,99,France,2.00,Less_than_4,Less_than_4,Unknown,Less_than_4,Less_than_4
1792,915,2012,99,U.S.A.,3.25,Forastero,Less_than_4,Unknown,TCHO,Less_than_4
1793,552,2010,46,Colombia,2.75,Less_than_4,Less_than_4,Unknown,Luker,Less_than_4


---
## Let's check dtypes (if necessary change it) and prepare our data to be able to feed it into model

In [160]:
data.dtypes

ref                       int64
review_date               int64
cocoa_percent            object
company_location         object
rating                  float64
bean_type                object
broad_bean_origin        object
maker                    object
company                  object
specific_bean_origin     object
dtype: object

In [161]:
data[['ref', 'review_date']] = data[['ref', 'review_date']].astype(np.int32)
data['rating'] = data['rating'].astype(np.float32)
data['cocoa_percent'] = data['cocoa_percent'].astype(np.float32, errors='raise')
data['rating']
data.dtypes

ref                       int32
review_date               int32
cocoa_percent           float32
company_location         object
rating                  float32
bean_type                object
broad_bean_origin        object
maker                    object
company                  object
specific_bean_origin     object
dtype: object

In [162]:
X = data.drop('rating', axis=1)
y = data['rating']
y.value_counts()

3.50    392
3.00    341
3.25    303
2.75    259
3.75    210
2.50    127
4.00     98
2.00     32
2.25     14
1.50     10
1.00      4
1.75      3
5.00      2
Name: rating, dtype: int64

There is no possibility to generalize well for these ratings:


In [163]:
y.value_counts()[y.value_counts() < 15].index.to_list()

[2.25, 1.5, 1.0, 1.75, 5.0]

So what about we make more generalizable ratings?

In [164]:
y = y.replace(1, 'less_than_2')
y = y.replace(1.5, 'less_than_2')
y = y.replace(1.75, 'less_than_2')
y = y.replace(5, '4_or_more')
y = y.replace(4, '4_or_more')
y = y.replace(2.25, 2)
y.value_counts()

3.5            392
3.0            341
3.25           303
2.75           259
3.75           210
2.5            127
4_or_more      100
2               46
less_than_2     17
Name: rating, dtype: int64

Now better

In [165]:
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y.astype(np.str))
encoded_y

array([7, 8, 7, ..., 5, 3, 3])

In [166]:
encoded_X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(encoded_X, encoded_y, test_size=0.2, random_state=42, stratify=encoded_y)

In [167]:
encoded_X

Unnamed: 0,ref,review_date,cocoa_percent,company_location_Amsterdam,company_location_Argentina,company_location_Austria,company_location_Belgium,company_location_Brazil,company_location_Canada,company_location_Colombia,...,specific_bean_origin_Trinidad,specific_bean_origin_Tumaco,specific_bean_origin_Tumbes,specific_bean_origin_Uganda,specific_bean_origin_Upala,specific_bean_origin_Vanua Levu,specific_bean_origin_Vanuatu,specific_bean_origin_Venezuela,specific_bean_origin_Vietnam,specific_bean_origin_Xoconusco
0,1876,2016,63.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1019,2013,63.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,797,2012,63.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1510,2015,63.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1430,2014,63.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790,1458,2015,57.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1791,81,2006,99.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1792,915,2012,99.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1793,552,2010,46.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [168]:
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_train_scaled.iloc[:, :3] = scaler.fit_transform(X_train.iloc[:, :3])

X_test_scaled = X_test.copy()
X_test_scaled.iloc[:, :3] = scaler.transform(X_test.iloc[:, :3])

X_train_scaled

Unnamed: 0,ref,review_date,cocoa_percent,company_location_Amsterdam,company_location_Argentina,company_location_Austria,company_location_Belgium,company_location_Brazil,company_location_Canada,company_location_Colombia,...,specific_bean_origin_Trinidad,specific_bean_origin_Tumaco,specific_bean_origin_Tumbes,specific_bean_origin_Uganda,specific_bean_origin_Upala,specific_bean_origin_Vanua Levu,specific_bean_origin_Vanuatu,specific_bean_origin_Venezuela,specific_bean_origin_Vietnam,specific_bean_origin_Xoconusco
1315,-0.814685,-0.806682,-0.425241,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020,0.645342,0.559753,-0.266301,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1128,1.176096,0.901362,-0.266301,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
672,0.235955,0.218144,-0.266301,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
442,-0.340085,-0.123464,-1.060999,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,0.514918,0.559753,0.051578,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
992,0.715989,0.559753,-0.266301,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122,1.618089,1.584579,0.528396,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1366,0.391739,0.559753,0.369457,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The data is prepared.
## It's time for our baseline_model

In [172]:
metric_scorers = {'cohen_kappa': make_scorer(cohen_kappa_score),
                  'matthew_coef': make_scorer(matthews_corrcoef),
                  'accuracy': make_scorer(balanced_accuracy_score)}

In [173]:
def get_kfold_scores(model, X_train, y_train, n_splits=3):
    kfold = StratifiedKFold(n_splits=n_splits)
    c_k_scores, m_coef_scores, b_acc_scores = [], [], []
    for train_index, test_index in kfold.split(X_train, y_train):
        model.fit(X_train.iloc[train_index], pd.Series(y_train)[train_index])
        c_k_score = cohen_kappa_score(model.predict(X_train.iloc[test_index]), pd.Series(y_train)[test_index])
        m_coef_score = matthews_corrcoef(pd.Series(y_train)[test_index], model.predict(X_train.iloc[test_index]))
        b_acc_score = balanced_accuracy_score(pd.Series(y_train)[test_index], model.predict(X_train.iloc[test_index]))
        
        c_k_scores.append(c_k_score)
        m_coef_scores.append(m_coef_score)
        b_acc_scores.append(b_acc_score)
    
    return c_k_scores, m_coef_scores, b_acc_scores

def print_all_scores_and_mean(scores_):
    print(scores_)
    print(np.mean(scores_, axis=1))

In [174]:
baseline_model = LogisticRegression(random_state=42, max_iter=10e4)
baseline_scores = get_kfold_scores(baseline_model, X_train_scaled, y_train)
print_all_scores_and_mean(baseline_scores)

([0.06524327868852442, 0.07062867747854151, 0.11320544692219214], [0.0656861973073651, 0.07119817635833463, 0.11426146512872487], [0.13563866274863717, 0.15654193561482563, 0.17149387235384678])
[0.0830258  0.08371528 0.15455816]


Let's check the initial performance of SVM, decision trees and some ensemble models

In [184]:
svm = SVC(random_state=42)
svm_scores = get_kfold_scores(svm, X_train_scaled, y_train)
print_all_scores_and_mean(svm_scores)

([0.0846952087607522, 0.06275910901631832, 0.06254596564629733], [0.0924710922072807, 0.06679055892014642, 0.06716919592765043], [0.1448810850984764, 0.12695168492269943, 0.13484319424536817])
[0.07000009 0.07547695 0.13555865]


In [185]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree_scores = get_kfold_scores(decision_tree, X_train_scaled, y_train)
print_all_scores_and_mean(decision_tree_scores)

([0.0730864865427231, 0.07871756293527332, 0.06970588693228719], [0.07323475137120182, 0.07879299021609068, 0.06975922673197829], [0.15008923301762175, 0.20060247770605827, 0.17420610719204072])
[0.07383665 0.07392899 0.17496594]


In [186]:
ada = AdaBoostClassifier(random_state=42)
ada_scores = get_kfold_scores(ada, X_train_scaled, y_train)
print_all_scores_and_mean(ada_scores)

([0.039417955464287946, 0.039089080521259034, 0.03052320722126245], [0.04442285237081593, 0.04475626898654489, 0.04848020187523019], [0.14761275090985237, 0.15582348534607698, 0.11223443223443223])
[0.03634341 0.04588644 0.13855689]


In [187]:
rf = RandomForestClassifier(random_state=42)
rf_scores = get_kfold_scores(rf, X_train_scaled, y_train)
print_all_scores_and_mean(rf_scores)

([0.10981499758509905, 0.06269607277381983, 0.08299651531317509], [0.1110011634084711, 0.0629558659497414, 0.08331719099751196], [0.17663700560119996, 0.16173918595060965, 0.16640528080425782])
[0.0851692  0.08575807 0.16826049]


In [188]:
xgb = XGBClassifier(use_label_encoder=False)
xgb_scores = get_kfold_scores(xgb, X_train_scaled, y_train)
print_all_scores_and_mean(xgb_scores)

([0.07554175669722452, 0.056944191254980736, 0.05882291168949816], [0.07604532325490368, 0.05703616491017125, 0.05900625702894459], [0.16666087653939315, 0.1871826307631934, 0.13712288315549187])
[0.06376962 0.06402925 0.16365546]


We will focus on RandomForest.

In [None]:
rf = RandomForestClassifier(random_state=42)
parameters = {'n_estimators': np.linspace(10, 5000, num=2500, dtype=int),
              'max_depth': np.linspace(1, 500, num=250, dtype=int),
              'min_samples_split': [2, 3, 4, 5],
              'min_samples_leaf': [1, 2, 3, 4, 5]}
random_clf = RandomizedSearchCV(rf, parameters, cv=3, n_iter=1500, scoring=metric_scorers, refit='cohen_kappa', n_jobs=-1, verbose=1)
random_clf.fit(X_train_scaled, y_train)

In [None]:
rf_results = pd.DataFrame(random_clf.cv_results_).sort_values(['rank_test_cohen_kappa'])
rf_results

In [None]:
rf_results.loc[0, 'params']

In [None]:
rf = RandomForestClassifier(random_state=42)
parameters = {'n_estimators': np.linspace(1600, 2000, num=10, dtype=int),
              'max_depth': np.linspace(400, 1000, num=250, dtype=int),
              'min_samples_split': [2, 3, 4, 5],
              'min_samples_leaf': [1, 2, 3, 4, 5]}
random_clf = GridSearchCV(rf, parameters, cv=3, scoring=metric_scorers, refit='cohen_kappa', n_jobs=-1, verbose=1)
random_clf.fit(X_train_scaled, y_train)