# Libraries

In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer

# Dataset

In [57]:
df = pd.read_csv('../Data Received/cookies.csv')

In [58]:
len(df.columns)

16

In [62]:
df.duplicated().sum()

12

In [60]:
df.isna().sum()

sugar to flour ratio     0
sugar index              5
bake temp                0
chill time               0
calories                 0
density                  0
pH                       0
grams baking soda        0
bake time               10
quality                  0
butter type              0
weight                   0
diameter                 0
mixins                   2
crunch factor            0
aesthetic appeal         0
dtype: int64

# 1. Create functions for cleaning Pipeline

## 1.1 Drop NaNs

In [3]:
# Drop rows with NaNs
# Since there are very few we drop all the rows
def drop_nans(df):
    X = df.dropna()
    return X

# Convert function to transformer to include in the pipeline
NaNDropper = FunctionTransformer(drop_nans, validate=False)


## 1.2 Outlier check

In [5]:
def drop_outliers(df):
    # Drop cookies with Cal < 6 
    df = df.drop(df[df['calories'] <= 6].index)

    # Drop degrees because data makes no sense
    df = df.drop('bake temp', axis = 1)

    # Drop density above 1.1
    df = df.drop(df[df['density'] > 1.1].index)

    # Drop pH above 15
    df = df.drop(df[df['pH'] > 10].index)

    # Drop weight below 0 
    X = df.drop(df[df['weight'] < 0].index)
    
    # Drop aesthetic appeal
    
    X = df.drop('aesthetic appeal', axis = 1)
    
    return X

# Convert function to transformer to include in the pipeline
OutlierDropper = FunctionTransformer(drop_outliers, validate=False)


## 1.3 Encode categ columns

In [6]:
def categ_encoder(df):
    # Encode mixins column
    mixin_list = ["nuts","chocolate","raisins","oats","peanut butter"]
    for i in mixin_list:
        df[i]=""

    for i in mixin_list:
        df[i]=df["mixins"].str.contains(i).astype(int)

    X = df.drop(['mixins'], axis = 1)
    
    return X

# Convert function to transformer to include in the pipeline
CategEncoder = FunctionTransformer(categ_encoder, validate=False)

In [7]:
def onehot_encoder(df):
    # One hote encode 
    df['butter type'] =pd.get_dummies(df['butter type'],drop_first=True)
    X = df
    
    return X
    
# Convert function to transformer to include in the pipeline
OneHotEncoder = FunctionTransformer(onehot_encoder, validate=False)

# 2. Create the Pipeline

In [8]:
from sklearn.pipeline import Pipeline

# Create Pipeline

pipeline_one = Pipeline([
    ('nan_dropper', NaNDropper),
    ('outlier_dropper', OutlierDropper),
    ('categ_encoder', CategEncoder),
    ('one_hot_encoder', OneHotEncoder)
])

In [9]:
df = pipeline_one.fit_transform(df)

In [10]:
df

Unnamed: 0,sugar to flour ratio,sugar index,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,crunch factor,nuts,chocolate,raisins,oats,peanut butter
0,0.25,9.5,15.0,136.0,0.99367,8.10,0.44,12.1,8,1,15.2,7,1.30,0,0,1,0,0
1,0.23,3.3,34.0,113.0,0.99429,8.16,0.48,8.4,7,1,12.4,7,1.71,0,0,1,0,0
2,0.18,1.9,33.0,106.0,0.98746,8.21,0.83,14.0,9,1,9.4,7,1.78,1,1,0,0,0
3,0.18,10.5,41.0,124.0,0.99630,8.14,0.35,10.5,7,1,12.2,7,1.59,0,1,0,0,0
4,0.24,2.4,6.0,33.0,0.99740,8.09,0.57,9.4,5,0,19.8,7,1.30,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5193,0.66,2.0,15.0,88.0,0.99357,8.66,0.60,11.5,6,0,11.6,7,1.36,0,0,0,0,1
5194,0.16,1.5,10.0,97.0,0.99170,8.48,0.44,10.7,8,1,10.0,7,1.80,0,0,1,0,0
5195,0.25,14.3,79.0,149.0,0.99750,8.12,0.37,9.7,7,1,13.4,7,1.07,0,1,0,1,0
5196,0.33,1.4,35.0,136.0,0.99380,8.63,0.78,10.3,8,1,13.8,7,1.77,0,1,0,0,0


### Training a Model

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

In [12]:
X = df.drop('quality', axis=1)
y = df['quality']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [14]:
#####

In [15]:
from sklearn.model_selection import GridSearchCV

# initialize model (no parameters)
rfc = RandomForestRegressor()

# define grid search
rfc_search = GridSearchCV(estimator = rfc,
                          param_grid = {"n_estimators":[1000]},
                            scoring="r2",
                            cv=10)

# all possible metrics here:
# from sklearn.metrics import SCORERS
# sorted(SCORERS.keys())

In [16]:
rfc_search.fit(X_train,y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_j

In [17]:
rfc_pred_01 = rfc_search.predict(X_test)

In [18]:
r2 = r2_score(y_test,rfc_pred_01)
print(r2)

0.748230740278615


In [19]:
mse = MSE(y_test, rfc_pred_01)

In [20]:
rmse = mse**(0.5)
print(rmse)

0.649554930202286


In [54]:
####

In [137]:
rfc = RandomForestRegressor(n_estimators=2000)

In [138]:
rfc.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=2000, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [139]:
rfc_pred = rfc.predict(X_test)

In [140]:
r2 = r2_score(y_test,rfc_pred)
print(r2)

0.7637634928827076


In [141]:
mse = MSE(y_test, rfc_pred)

In [142]:
rmse = mse**(0.5)
print(rmse)

0.65366031713964


In [64]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
# Instantiate gb
gb = GradientBoostingRegressor(random_state=123)
param_grid_gb= {'learning_rate': [0.01,0.03],
                  'max_depth'    : [4,6,8,10] 
                 }
gb_gs = GridSearchCV(gb, param_grid_gb, cv=10, scoring="r2")
gb_gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [75]:
gb_gs.best_score_

0.7341139691149093

In [79]:
gb_pred = gb_gs.predict(df_val)

In [81]:
gb_pred = pd.DataFrame({'results':gb_pred})

In [82]:
gb_pred

Unnamed: 0,results
0,7.313393
1,8.031364
2,7.615203
3,8.257084
4,7.765867
...,...
774,5.340720
775,5.144067
776,5.562395
777,4.806857


In [83]:
gb_pred.to_csv('../Data/GradientBoostingResults.csv')

### Validation Dataset Imported

In [34]:
validation = pd.read_csv('../Data/cookies_validate.csv')

In [35]:
validation.head(1)

Unnamed: 0,id,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,mixins,crunch factor,aesthetic appeal
0,14,0.62,19.25,400.0,41.0,172.0,1.0002,7.98,0.67,9.7,0,melted,16.6,7,"nuts, chocolate",1.85,3


#### Applying Pipeline to Validation Dataset

In [36]:
df_val = pipeline_one.fit_transform(validation)

In [37]:
df_val

Unnamed: 0,id,sugar to flour ratio,sugar index,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,crunch factor,nuts,chocolate,raisins,oats,peanut butter
0,14,0.62,19.25,41.0,172.0,1.00020,7.98,0.67,9.7,0,1,16.6,7,1.85,1,1,0,0,0
1,26,0.35,1.00,35.0,146.0,0.99300,8.45,0.44,10.0,0,1,13.8,7,1.43,0,1,0,0,0
2,30,0.39,10.40,20.0,142.0,0.99740,8.20,0.53,10.0,0,1,17.0,7,1.57,0,1,0,0,0
3,52,0.33,1.10,21.0,82.0,0.99100,8.32,0.46,10.9,0,1,12.4,7,1.44,0,1,0,1,0
4,100,0.37,13.50,52.0,192.0,0.99750,8.00,0.44,9.1,0,1,14.8,7,1.51,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774,1442,0.02,1.90,18.0,30.0,0.99712,8.40,0.75,9.8,0,0,13.8,7,1.01,1,1,0,1,0
775,1445,0.19,5.20,19.0,98.0,0.99713,8.16,0.52,9.6,0,0,14.8,7,1.68,1,0,0,1,0
776,1461,0.00,2.10,6.0,13.0,0.99664,8.59,0.61,10.0,0,0,12.4,7,1.38,1,0,0,1,0
777,1467,0.08,2.30,19.0,32.0,0.99648,8.52,0.57,11.0,0,0,13.4,7,1.63,0,0,0,0,1


In [38]:
df_val=df_val.drop('quality', axis=1)

In [39]:
df_val = df_val.drop('id', axis=1)

In [40]:
df_val.head(1)

Unnamed: 0,sugar to flour ratio,sugar index,chill time,calories,density,pH,grams baking soda,bake time,butter type,weight,diameter,crunch factor,nuts,chocolate,raisins,oats,peanut butter
0,0.62,19.25,41.0,172.0,1.0002,7.98,0.67,9.7,1,16.6,7,1.85,1,1,0,0,0


### Predictions

In [44]:
rfc_pred = rfc_search.predict(df_val)

In [45]:
rfc_pred

array([7.642, 8.193, 7.628, 8.359, 7.424, 7.394, 7.863, 7.305, 7.655,
       7.894, 7.638, 7.95 , 7.268, 7.789, 8.395, 7.593, 8.185, 7.687,
       8.104, 8.091, 7.929, 8.405, 8.359, 7.424, 7.409, 6.609, 6.831,
       7.331, 6.974, 7.252, 8.11 , 7.854, 7.439, 7.721, 7.981, 8.368,
       9.187, 8.037, 6.765, 7.569, 8.001, 6.948, 8.102, 7.21 , 7.446,
       7.524, 7.469, 8.132, 7.508, 7.934, 6.952, 7.53 , 7.552, 7.62 ,
       7.908, 7.516, 7.356, 7.722, 8.213, 8.906, 7.668, 7.862, 6.931,
       8.224, 8.037, 7.845, 7.523, 7.532, 7.422, 8.226, 8.068, 8.316,
       7.116, 6.991, 8.124, 7.03 , 7.771, 7.767, 7.343, 7.827, 7.623,
       8.066, 7.428, 8.924, 7.074, 8.34 , 8.285, 7.951, 7.776, 8.928,
       7.743, 7.285, 8.19 , 7.971, 7.828, 7.713, 7.978, 8.128, 7.68 ,
       7.207, 8.809, 7.867, 9.019, 8.511, 8.686, 7.842, 7.552, 7.313,
       7.571, 8.761, 7.795, 7.037, 7.196, 7.482, 7.507, 6.951, 7.906,
       7.902, 8.778, 7.334, 7.789, 7.59 , 6.84 , 7.672, 7.942, 7.747,
       7.135, 8.013,

In [46]:
rfc_pred = pd.DataFrame({'results':rfc_pred})

In [48]:
rfc_pred.to_csv('../Data/RandonForestResults.csv')

In [50]:
rfc_pred

Unnamed: 0,results
0,7.642
1,8.193
2,7.628
3,8.359
4,7.424
...,...
774,5.148
775,5.024
776,4.829
777,4.875
