# Gradient Boosting Classifier

In [58]:
# Execute the two cleanData notebooks before anything else in this notebook.
# NOTE(review): df_train and df_test are used below but never defined in this
# notebook, so they are presumably created by these two runs — confirm.
%run cleanData.ipynb
%run cleanData_test.ipynb

In [59]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

In [60]:
# Drop rows containing missing or infinite values, then convert the data to float so it can be used by our algorithm

def clean_dataset(df):
    """Return a float64 copy of ``df`` without missing or infinite rows.

    Rows containing NaN, +inf or -inf in any column are removed, and the
    remaining values are cast to ``np.float64``. The input frame is not
    modified; callers must use the returned frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose values can all be converted to float.

    Returns
    -------
    pd.DataFrame
        The surviving rows (original index labels preserved) as float64.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace returns a new frame, so the caller's data is
    # left untouched and re-running the cell stays idempotent.
    without_missing = df.dropna()
    # axis is passed by keyword: pandas 2.x no longer accepts it positionally
    # for DataFrame.any, so the former `.any(1)` raises a TypeError there.
    has_invalid = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_invalid].astype(np.float64)

# Run both frames through the same cleaning routine and rebind each name
# to the cleaned result returned by clean_dataset.
df_train = clean_dataset(df_train)
df_test = clean_dataset(df_test)

In [61]:
# Fraction of df_train held out for validation (80-20 train/validation split).
VALID_SIZE = 0.2
# Fixed seed so the shuffled split is reproducible between runs.
RANDOM_STATE = 2018

train, valid = train_test_split(
    df_train,
    test_size=VALID_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
)

Next, we separate the columns into predictors (the input features) and the target (the column we want to predict).

In [62]:
# Columns fed to the model as input features (order is preserved).
predictors = [
    'age', 'gender', 'scentLover', 'ecoInterest',
    'MrPropre', 'Antikal', 'Ariel', 'Dash',
    'pods', 'powder', 'liquid',
    'electricToothbrush', 'likesPets', 'hasPet',
    'daysSinceActivity', 'nbChildren',
    'magasin', 'moyenneSurface',
    'superMarket', 'hyperMarket', 'drive', 'hardDiscount',
]
# Column the classifier is trained to predict.
target = 'washDishes'

In [63]:
# Feature frames: only the predictor columns of each split.
train_X, valid_X = train[predictors], valid[predictors]
# Labels: the target column of each split, taken through .values.
train_Y, valid_Y = train[target].values, valid[target].values

We're going to test the algorithm with different learning rates and different numbers of estimators, keeping the combination with the best validation accuracy:

In [64]:
# Hyper-parameter grid: every learning rate is combined with every ensemble size.
lr_list = [0.1, 0.25, 0.5, 0.75, 1]
no_estimators_list = [100, 200, 300, 400, 500, 600]

# Best validation accuracy seen so far and the (learning rate, nb of estimators)
# pair that produced it.
bestScore = 0
bestParams = (0,0)

for lr in lr_list:
    for n in no_estimators_list:
        # Seed the model with the notebook-wide RANDOM_STATE so that repeated
        # runs of this cell give the same scores.
        clf = GradientBoostingClassifier(
            n_estimators=n,
            learning_rate=lr,
            random_state=RANDOM_STATE,
        )
        clf.fit(train_X, train_Y)

        # Score each split once and reuse the values for reporting and selection.
        train_score = clf.score(train_X, train_Y)
        valid_score = clf.score(valid_X, valid_Y)

        print("Learning rate:", lr)
        print("Number of estimators:", n)
        print("Accuracy (training): {0:.3f}".format(train_score))
        print("Accuracy (validation): {0:.3f}".format(valid_score))

        # This comparison must live inside the inner loop: when it sat one level
        # out, only the last model of each learning rate (n = 600) was ever
        # considered, so the reported "best" ignored most of the grid.
        if valid_score > bestScore:
            bestScore = valid_score
            bestParams = (lr, n)

print("The best score (validation accuracy) is:", bestScore)
print("Parameters (learning rate, nb of estimators):", bestParams)

Learning rate: 0.1
Number of estimators: 600
Accuracy (training): 0.715
Accuracy (validation): 0.600
Learning rate: 0.25
Number of estimators: 600
Accuracy (training): 0.773
Accuracy (validation): 0.597


KeyboardInterrupt: 