# Extreme Gradient Boosting Classifier

## Standard imports and loading the training dataset

In [1]:
# Necessary imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#Fitting models
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from xgboost import XGBClassifier

In [2]:
# Load the training dataset by executing the cleaning notebook in this kernel.
# NOTE(review): presumably cleanData.ipynb defines `df`, which the following
# cells display and clean - confirm against that notebook.
%run cleanData.ipynb

In [3]:
# Sanity check: display the training DataFrame before any further cleaning
df

Unnamed: 0,age,gender,scentLover,ecoInterest,washDishes,MrPropre,Antikal,Ariel,Dash,pods,...,likesPets,hasPet,daysSinceActivity,nbChildren,magasin,moyenneSurface,superMarket,hyperMarket,drive,hardDiscount
0,0.077922,1,0.000000,0.387479,1,-1,-1,-1,-1,-1,...,0.9,1,0.234694,0.2,-1,-1,-1,-1,-1,-1
1,0.636364,1,0.378258,0.666667,1,1,1,1,-1,1,...,0.3,-1,0.014914,0.6,-1,-1,-1,-1,-1,-1
2,0.350649,1,0.000000,0.666667,1,-1,-1,1,1,-1,...,0.9,1,0.094976,0.2,-1,-1,-1,-1,-1,-1
3,0.220779,1,0.333333,0.333333,-1,-1,-1,1,1,1,...,0.9,1,0.259812,0.4,-1,-1,-1,-1,-1,-1
4,0.259740,1,0.378258,0.000000,-1,1,1,1,1,-1,...,0.9,1,0.116954,0.0,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.454545,1,0.378258,0.387479,1,-1,-1,-1,-1,1,...,0.9,1,0.000785,1.0,-1,-1,-1,-1,-1,-1
11996,0.324675,1,0.333333,0.333333,-1,-1,-1,-1,-1,-1,...,0.9,1,0.124019,0.0,-1,-1,-1,-1,-1,-1
11997,0.311688,1,0.378258,0.666667,-1,-1,-1,-1,-1,-1,...,0.9,1,0.018838,0.6,-1,-1,-1,-1,-1,-1
11998,0.272727,-1,0.378258,0.387479,1,-1,-1,-1,-1,-1,...,0.9,1,0.197017,0.2,-1,-1,-1,-1,-1,-1


In [4]:
# Final cleaning pass: drop rows containing NaN or infinite values and cast everything to float64
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is left untouched, so re-running the calling
        cell always starts from the same input.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` (original index labels preserved) in which no
        cell is NaN, +inf or -inf, with every column cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Flag every missing or infinite cell in one boolean frame
    invalid_cells = df.isna() | df.isin([np.inf, -np.inf])
    # `axis` must be passed by keyword: pandas >= 2.0 rejects a positional axis
    rows_to_keep = ~invalid_cells.any(axis=1)
    return df.loc[rows_to_keep].astype(np.float64)

# Rebind df to the frame returned by clean_dataset (rows with NaN/inf removed, all columns float64)
df=clean_dataset(df)

## Extreme Gradient Boosting Model

In [5]:
# Helper that reports the best score, parameters and estimator found by a fitted search
def best_model(model):
    """Print the best score, best parameters and best estimator of ``model``.

    ``model`` must expose ``best_score_``, ``best_params_`` and
    ``best_estimator_``; each is printed on its own line, in that order.
    The function only prints and returns ``None``.
    """
    for attribute in ("best_score_", "best_params_", "best_estimator_"):
        print(getattr(model, attribute))

In [6]:
# Separate the target (washDishes) from the features, then hold out 20% of the rows as a test set
y = df["washDishes"].values
x = df.drop(columns=["washDishes"])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [10]:
# Extreme Gradient Boosting classifier tuned with an exhaustive 5-fold grid search.
# Each value appears once per list: a repeated entry would make GridSearchCV
# fit identical candidates again without changing which one wins.
param_grid = {
    'max_depth': [5, 6, 7, 8],
    'gamma': [0.01, 0.001],
    'min_child_weight': [1, 5, 10],
    'learning_rate': [0.05, 0.1, 0.2, 0.3],
    'n_estimators': [5, 10, 20, 100],
}
xgb_grid = GridSearchCV(XGBClassifier(eval_metric="error"), param_grid, cv=5, refit=True, verbose=0)
xgb_grid.fit(x_train, y_train)
# With refit=True the search object scores with the best estimator it refit after the search
acc = xgb_grid.score(x_test, y_test) * 100
# best_model prints its report itself and returns None, so it must not be wrapped in print()
best_model(xgb_grid)
print("Test Accuracy {:.2f}%".format(acc))

0.6263318094105277
{'gamma': 0.01, 'learning_rate': 0.2, 'max_depth': 6, 'min_child_weight': 10, 'n_estimators': 10}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='error',
              gamma=0.01, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=6, min_child_weight=10, missing=nan,
              monotone_constraints='()', n_estimators=10, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)
None
Test Accuracy 62.76%




KeyboardInterrupt: 



KeyboardInterrupt: 

## Prediction

In [11]:
# Prepare the prediction inputs: execute the test-set cleaning notebook, then display df_test.
# NOTE(review): presumably cleanData_test.ipynb defines `df_test` - confirm against that notebook.
%run cleanData_test.ipynb
df_test

Unnamed: 0,age,gender,scentLover,ecoInterest,MrPropre,Antikal,Ariel,Dash,pods,powder,...,likesPets,hasPet,daysSinceActivity,nbChildren,magasin,moyenneSurface,superMarket,hyperMarket,drive,hardDiscount
0,0.270270,1,0.383884,0.666667,1,-1,1,1,-1,-1,...,0.5,-1,0.064902,1.0,-1,-1,-1,-1,-1,-1
1,0.364865,1,0.383884,0.666667,-1,-1,1,-1,-1,-1,...,0.9,1,0.157985,0.4,-1,-1,-1,-1,-1,-1
2,0.121622,1,0.383884,0.384828,-1,-1,-1,-1,-1,-1,...,0.9,1,0.105038,0.2,-1,-1,-1,-1,-1,-1
3,0.648649,1,0.383884,0.384828,1,-1,-1,-1,-1,-1,...,0.3,-1,0.044406,0.6,-1,-1,-1,-1,-1,-1
4,0.324324,1,0.000000,0.333333,-1,-1,1,1,1,-1,...,0.9,1,0.008540,0.4,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.432432,1,1.000000,0.666667,-1,-1,1,1,-1,-1,...,0.9,1,0.008540,0.0,-1,-1,-1,-1,-1,-1
3996,0.337838,1,0.333333,0.666667,-1,-1,-1,-1,1,-1,...,0.9,1,0.129804,0.0,-1,-1,1,-1,-1,-1
3997,0.270270,1,1.000000,1.000000,-1,-1,1,1,1,-1,...,0.9,1,0.008540,0.4,-1,1,-1,-1,-1,-1
3998,0.270270,1,0.333333,0.384828,1,1,1,1,-1,-1,...,0.3,-1,0.227156,0.4,-1,-1,-1,-1,-1,-1


In [None]:
# Read the raw test file only to recover the userId column for the submission
df3 = pd.read_csv(r"DS_CentraleSupelec_ST42021/DS_CentraleSupelec_test.csv")
# xgb_grid was already fitted above with refit=True, so predict directly;
# calling fit() again here would repeat the entire cross-validated grid search.
y_pred = xgb_grid.predict(df_test)
# Pair each prediction with its userId
sol = pd.DataFrame({'userId': df3['userId'], 'washDishes': y_pred})
# Map the encoded labels back to the original values: 1 -> 'Auto', anything else -> 'Hand'
sol["washDishes"] = sol["washDishes"].apply(lambda e: 'Auto' if e == 1 else 'Hand')
sol.to_csv('./DS_CentraleSupelec_ST42021/xgbsolution.csv', index=False)
sol

Unnamed: 0,userId,washDishes
0,use_yFJKoOUCJ7ED8c8Xeov1TzdfpRL,Hand
1,use_t1WpMt2fprbjEteXNSZ1TzdyvAi,Hand
2,use_w9jT58WcsP3tUkPCA0A1TzgYdeE,Hand
3,use_onifqa3LnOsjZOhPYfN1TzdfkfA,Hand
4,use_KgP1NlkmR24pbU9KRwK1Tzdqm8u,Hand
...,...,...
3995,use_yXWhQEP4R6vvYOeJzz61TZU7JRs,Hand
3996,use_MafnxsXODJV5oVhfi1h1TzdgYQT,Hand
3997,use_Pbui34WhFhPjNeX6TEz1TzdZdVv,Hand
3998,use_sDxhHoOEzHyrAzWSPnM1TzdYVjB,Hand
