This notebook is for cleaning and modeling

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("data/chocolate.csv") 

In [3]:
df

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.50
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25
3,797,A. Morin,France,2012,Peru,Peru,63%,"4- B,S,C,L","fruity, melon, roasty",3.75
4,797,A. Morin,France,2012,Bolivia,Bolivia,70%,"4- B,S,C,L","vegetal, nutty",3.50
...,...,...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,Raw,80%,"4- B,S*,C,Sa","waxy, cloying, vegetal",2.75
2358,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75%,"3- B,S,C","strong nutty, marshmallow",3.75
2359,2170,Zotter,Austria,2018,Belize,Maya Mtn,72%,"3- B,S,C","muted, roasty, accessible",3.50
2360,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70%,"3- B,S,C","fatty, mild nuts, mild fruit",3.25


In [4]:
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ','_')

In [5]:
df = df.replace('%', ' ', regex=True)
df['cocoa_percent'] = df['cocoa_percent'].astype(float)

In [6]:
# Change the Rating column to a string
df['rating'] = df['rating'].astype(float)
# Use 1's and 0's to find out what is a good rating of chocolate
df['rating'] = [1 if r >= 3.00 else 0 for r in df['rating']]
df

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,specific_bean_origin_or_bar_name,cocoa_percent,ingredients,most_memorable_characteristics,rating
0,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76.0,"3- B,S,C","cocoa, blackberry, full body",1
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76.0,"3- B,S,C","cocoa, vegetal, savory",1
2,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76.0,"3- B,S,C","rich cocoa, fatty, bready",1
3,797,A. Morin,France,2012,Peru,Peru,63.0,"4- B,S,C,L","fruity, melon, roasty",1
4,797,A. Morin,France,2012,Bolivia,Bolivia,70.0,"4- B,S,C,L","vegetal, nutty",1
...,...,...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,Raw,80.0,"4- B,S*,C,Sa","waxy, cloying, vegetal",0
2358,1996,Zotter,Austria,2017,Colombia,"APROCAFA, Acandi",75.0,"3- B,S,C","strong nutty, marshmallow",1
2359,2170,Zotter,Austria,2018,Belize,Maya Mtn,72.0,"3- B,S,C","muted, roasty, accessible",1
2360,2170,Zotter,Austria,2018,Congo,Mountains of the Moon,70.0,"3- B,S,C","fatty, mild nuts, mild fruit",1


In [7]:
df.isna().sum()

ref                                  0
company_(manufacturer)               0
company_location                     0
review_date                          0
country_of_bean_origin               0
specific_bean_origin_or_bar_name     0
cocoa_percent                        0
ingredients                         88
most_memorable_characteristics       0
rating                               0
dtype: int64

In [10]:
df_dropped = df.drop(['specific_bean_origin_or_bar_name','most_memorable_characteristics'],axis=1)

In [11]:
df_dropped

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,cocoa_percent,ingredients,rating
0,2454,5150,U.S.A.,2019,Madagascar,76.0,"3- B,S,C",1
1,2458,5150,U.S.A.,2019,Dominican Republic,76.0,"3- B,S,C",1
2,2454,5150,U.S.A.,2019,Tanzania,76.0,"3- B,S,C",1
3,797,A. Morin,France,2012,Peru,63.0,"4- B,S,C,L",1
4,797,A. Morin,France,2012,Bolivia,70.0,"4- B,S,C,L",1
...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,80.0,"4- B,S*,C,Sa",0
2358,1996,Zotter,Austria,2017,Colombia,75.0,"3- B,S,C",1
2359,2170,Zotter,Austria,2018,Belize,72.0,"3- B,S,C",1
2360,2170,Zotter,Austria,2018,Congo,70.0,"3- B,S,C",1


In [12]:
df_dropped

Unnamed: 0,ref,company_(manufacturer),company_location,review_date,country_of_bean_origin,cocoa_percent,ingredients,rating
0,2454,5150,U.S.A.,2019,Madagascar,76.0,"3- B,S,C",1
1,2458,5150,U.S.A.,2019,Dominican Republic,76.0,"3- B,S,C",1
2,2454,5150,U.S.A.,2019,Tanzania,76.0,"3- B,S,C",1
3,797,A. Morin,France,2012,Peru,63.0,"4- B,S,C,L",1
4,797,A. Morin,France,2012,Bolivia,70.0,"4- B,S,C,L",1
...,...,...,...,...,...,...,...,...
2357,1205,Zotter,Austria,2014,Blend,80.0,"4- B,S*,C,Sa",0
2358,1996,Zotter,Austria,2017,Colombia,75.0,"3- B,S,C",1
2359,2170,Zotter,Austria,2018,Belize,72.0,"3- B,S,C",1
2360,2170,Zotter,Austria,2018,Congo,70.0,"3- B,S,C",1


In [13]:
df_dropped = df_dropped.dropna()

In [14]:
char=Counter(" ".join(df["most_memorable_characteristics"]).split()).most_common(25)

In [15]:
char

[('sweet,', 213),
 ('cocoa', 199),
 ('mild', 198),
 ('creamy,', 174),
 ('nutty,', 163),
 ('cocoa,', 157),
 ('sandy,', 154),
 ('fruit,', 140),
 ('sour', 127),
 ('fatty,', 123),
 ('roasty,', 117),
 ('intense,', 112),
 ('floral,', 108),
 ('earthy,', 105),
 ('spicy,', 103),
 ('rich', 97),
 ('nutty', 97),
 ('fruit', 96),
 ('off', 94),
 ('roasty', 93),
 ('earthy', 80),
 ('sweet', 74),
 ('sticky,', 71),
 ('vanilla,', 67),
 ('dried', 67)]

In [16]:
Characters_ratings1={}
for characteristic in char:
    c = characteristic[0]
    avg_rating = df[df['most_memorable_characteristics'].str.contains(c)]['rating'].mean()
    Characters_ratings1[c] = avg_rating

In [20]:
Characters_ratings1

{'sweet,': 0.6681614349775785,
 'cocoa': 0.9130434782608695,
 'mild': 0.8743961352657005,
 'creamy,': 0.9152542372881356,
 'nutty,': 0.8466257668711656,
 'cocoa,': 0.9096385542168675,
 'sandy,': 0.7295597484276729,
 'fruit,': 0.8356164383561644,
 'sour': 0.6402116402116402,
 'fatty,': 0.7142857142857143,
 'roasty,': 0.7833333333333333,
 'intense,': 0.7652173913043478,
 'floral,': 0.7837837837837838,
 'earthy,': 0.6915887850467289,
 'spicy,': 0.9047619047619048,
 'rich': 0.958041958041958,
 'nutty': 0.8467432950191571,
 'fruit': 0.8590604026845637,
 'off': 0.5392156862745098,
 'roasty': 0.8130841121495327,
 'earthy': 0.679144385026738,
 'sweet': 0.6835016835016835,
 'sticky,': 0.7123287671232876,
 'vanilla,': 0.39705882352941174,
 'dried': 0.9253731343283582}

In [21]:
top_words = list({k: v for k, v in sorted(Characters_ratings1.items(), key=lambda item: item[1])}.keys())[-15:]
top_words

['intense,',
 'roasty,',
 'floral,',
 'roasty',
 'fruit,',
 'nutty,',
 'nutty',
 'fruit',
 'mild',
 'spicy,',
 'cocoa,',
 'cocoa',
 'creamy,',
 'dried',
 'rich']

In [22]:
top_word_present = []

# loop through memorable_characteristics_list in dataframe
for top in top_words:
    print(top)
# If any words from top_words is in that list, append 1 to top_word_present
if(top_words is df["most_memorable_characteristics_ls"] ):
    top_words.append(1)

# If not append 0
#else: top_words.append(0)
# Create new column with top_word_present
df['top_word_present'] = top_word_present 

intense,
roasty,
floral,
roasty
fruit,
nutty,
nutty
fruit
mild
spicy,
cocoa,
cocoa
creamy,
dried
rich


KeyError: 'most_memorable_characteristics_ls'

# Modeling

In [None]:
# Target
y = df_dropped['rating']
# Features
X = df_dropped.drop('rating', axis=1)

In [None]:
X_tr, X_test, y_tr, y_test = train_test_split(X, y, random_state=2021)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, random_state=2021)

In [None]:
num_cols = list(X.select_dtypes('number').columns)
num_cols

In [None]:
cat_cols = list(X.select_dtypes('object').columns)
cat_cols

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(X_train[num_cols])

In [None]:
X_train_scaled = pd.DataFrame(scaler.transform(X_train[num_cols]), columns=num_cols)
X_val_scaled = pd.DataFrame(scaler.transform(X_val[num_cols]), columns=num_cols)

In [None]:
X_train_scaled

In [None]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
ohe.fit(X_train[cat_cols])

In [None]:
X_train_scaled = pd.DataFrame(scaler.transform(X_train[num_cols]), columns=num_cols)
X_val_scaled = pd.DataFrame(scaler.transform(X_val[num_cols]), columns=num_cols)

In [None]:
ohe.fit(X_train[cat_cols])

In [None]:
X_train_encoded = ohe.transform(X_train[cat_cols])
X_val_encoded = ohe.transform(X_val[cat_cols])

In [None]:
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=ohe.get_feature_names())
X_val_encoded_df = pd.DataFrame(X_val_encoded, columns=ohe.get_feature_names())

In [None]:
X_train_encoded_df

In [None]:
X_train_df = pd.concat([X_train_scaled, X_train_encoded_df], axis=1)
X_val_df = pd.concat([X_val_scaled, X_val_encoded_df], axis=1)

# First Model

In [None]:
logreg = LogisticRegression(solver='liblinear')

In [None]:
def modeling_function(model, X_train, y_train, X_val, y_val):

    # fit model on training data
    model.fit(X_train, y_train)

    # make predictions on training and validation data
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)

    # Print accuracy score
    print('Training accuracy: ', accuracy_score(y_train, train_preds))
    print('Validation accuracy: ', accuracy_score(y_val, val_preds))

    # return fitted model

In [None]:
logreg = modeling_function(logreg, X_train_df, y_train, X_val_df, y_val)

# Second Model 

In [None]:
logreg3 = LogisticRegression(solver='liblinear', penalty='l1', C=.5)

In [None]:
logreg3 = modeling_function(logreg3, X_train_df, y_train, X_val_df, y_val)

# Third Model 

In [None]:
dt= DecisionTreeClassifier(random_state=2021, max_depth=10)

In [None]:
dt = modeling_function(dt, X_train_df, y_train, X_val_df, y_val)

# Fourth Model

In [None]:
rf = RandomForestClassifier(random_state=2021)

In [None]:
rf = modeling_function(rf, X_train_df, y_train, X_val_df, y_val)