In [2]:
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder

In [4]:
# Read the pre-split training and validation tables from the working directory.
# low_memory=False is forwarded to pandas for both files.
train, valid = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in ('train_dataset.csv', 'valid_dataset.csv')
)

In [60]:
# Separate the two target columns (G1 and G2 labels) from the feature columns,
# for the training table and for the validation table.
y_train_G1, y_train_G2 = train['y_train_g1'], train['y_train_g2']
X_train = train.drop(['y_train_g1', 'y_train_g2'], axis=1)

y_valid_G1, y_valid_G2 = valid['y_val_g1'], valid['y_val_g2']
X_valid = valid.drop(['y_val_g1', 'y_val_g2'], axis=1)

# Display the shape of every piece as a quick sanity check.
tuple(part.shape for part in (X_train, y_train_G1, y_train_G2, X_valid, y_valid_G1, y_valid_G2))

((113932, 938), (113932,), (113932,), (28483, 938), (28483,), (28483,))

In [8]:
from sklearn import metrics
def classifier_metrics (y_test=None, y_preds=None, average='weighted', model='XGBoost', zero_division=0):
    """Print and return accuracy, recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_preds : array-like
        Predicted labels, aligned with ``y_test``.
    average : str, default 'weighted'
        Averaging mode forwarded to the scikit-learn recall / precision / F1
        scorers (for example 'macro' or 'weighted').
    model : str, default 'XGBoost'
        Name shown in the header line of the printed summary.
    zero_division : default 0
        Forwarded unchanged to the scikit-learn recall / precision / F1 scorers.

    Returns
    -------
    dict
        ``{'accuracy': ..., 'recall': ..., 'precision': ..., 'f1': ...}`` with the
        raw scores as fractions (the printed summary shows them as percentages).
        Earlier versions only printed the summary and returned ``None``.
    """
    scores = {
        'accuracy': metrics.accuracy_score(y_test, y_preds),
        'recall': metrics.recall_score(y_test, y_preds, average=average, zero_division=zero_division),
        'precision': metrics.precision_score(y_test, y_preds, average=average, zero_division=zero_division),
        'f1': metrics.f1_score(y_test, y_preds, average=average, zero_division=zero_division),
    }

    # The printed layout is kept exactly as before so existing outputs stay comparable.
    print (f"{model} Classification Metrics :")
    print ("-------------------")
    print('Accuracy : {:.2f}%'.format(scores['accuracy']*100))
    print('Recall : {:.2f}%'.format(scores['recall']*100))
    print('Precision : {:.2f}%'.format(scores['precision']*100))
    print('F1-score : {:.2f}%'.format(scores['f1']*100))
    print('\n')

    return scores

In [17]:
# G1 classifier, run "m1": multi-class soft-probability objective, 'hist' tree
# method, learning_rate=0.1, max_depth=35, at most 1250 boosting rounds.
# These values match the params_m1 dict defined further down in the notebook.
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.1, colsample_bytree=1,
              eval_metric='mlogloss', gamma=2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=35,
              n_estimators=1250, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0.5,
              reg_lambda=0.2, scale_pos_weight=None, subsample=0.9,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
# mlogloss is tracked on the training split (logged as validation_0) and on the
# validation split (logged as validation_1), in that order.
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Fit with early_stopping_rounds=50; %time reports how long the fit took.
%time model.fit(X_train, y_train_G1, eval_set=eval_set, early_stopping_rounds=50)

	validation_0-mlogloss:0.26045	validation_1-mlogloss:0.39726
[952]	validation_0-mlogloss:0.26044	validation_1-mlogloss:0.39725
[953]	validation_0-mlogloss:0.26037	validation_1-mlogloss:0.39723
[954]	validation_0-mlogloss:0.26034	validation_1-mlogloss:0.39722
[955]	validation_0-mlogloss:0.26032	validation_1-mlogloss:0.39722
[956]	validation_0-mlogloss:0.26032	validation_1-mlogloss:0.39722
[957]	validation_0-mlogloss:0.26032	validation_1-mlogloss:0.39721
[958]	validation_0-mlogloss:0.26030	validation_1-mlogloss:0.39721
[959]	validation_0-mlogloss:0.26029	validation_1-mlogloss:0.39721
[960]	validation_0-mlogloss:0.26028	validation_1-mlogloss:0.39721
[961]	validation_0-mlogloss:0.26024	validation_1-mlogloss:0.39720
[962]	validation_0-mlogloss:0.26020	validation_1-mlogloss:0.39720
[963]	validation_0-mlogloss:0.26019	validation_1-mlogloss:0.39720
[964]	validation_0-mlogloss:0.26017	validation_1-mlogloss:0.39721
[965]	validation_0-mlogloss:0.26014	validation_1-mlogloss:0.39720
[966]	validatio

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=2, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=35, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1250, n_jobs=-1,
              num_class=9, num_parallel_tree=1, objective='multi:softprob',
              random_state=1, reg_alpha=0.5, reg_lambda=0.2,
              scale_pos_weight=None, subsample=0.9, tree_method='hist',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [18]:
# Hard-label predictions of the G1 model on both splits, each call timed with %time.
%time y_pred_G1_train = model.predict(X_train)
%time y_preds_G1_val = model.predict(X_valid)
# Macro-averaged summary: the first call covers the training split, the second
# the validation split (both are printed under the same 'XGBoost' header).
classifier_metrics(y_test=y_train_G1, y_preds=y_pred_G1_train, average='macro', model='XGBoost', zero_division=0)
classifier_metrics(y_test=y_valid_G1, y_preds=y_preds_G1_val, average='macro', model='XGBoost', zero_division=0)

Wall time: 26.4 s
Wall time: 6.8 s
XGBoost Classification Metrics :
-------------------
Accuracy : 92.42%
Recall : 91.58%
Precision : 92.30%
F1-score : 91.86%


XGBoost Classification Metrics :
-------------------
Accuracy : 87.48%
Recall : 86.20%
Precision : 87.04%
F1-score : 86.51%




In [21]:
# Per-class precision / recall / F1 of the G1 model on the validation split.
print(metrics.classification_report(y_valid_G1, y_preds_G1_val))

              precision    recall  f1-score   support

           0       0.78      0.94      0.85      3283
           1       0.87      0.83      0.85      2955
           2       0.81      0.79      0.80      2943
           3       0.91      0.88      0.89      1772
           4       0.90      0.89      0.89      3449
           5       0.80      0.77      0.79      2399
           6       0.94      0.91      0.92      3761
           7       0.90      0.83      0.86      1764
           8       0.93      0.93      0.93      6157

    accuracy                           0.87     28483
   macro avg       0.87      0.86      0.87     28483
weighted avg       0.88      0.87      0.87     28483



In [112]:
# G1 classifier, run "m0": no L1/L2 penalty (reg_alpha=0, reg_lambda=0), no row or
# column subsampling (all sampling ratios are 1), learning_rate=0.3, max_depth=40.
model_0 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1,
              eval_metric='mlogloss', gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=40,
              min_child_weight=1,
              n_estimators=500, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=0, scale_pos_weight=None, subsample=1,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Unlike the other fits in this notebook, no early_stopping_rounds is passed here.
%time model_0.fit(X_train, y_train_G1, eval_set=eval_set)

mlogloss:0.17149	validation_1-mlogloss:0.40814
[198]	validation_0-mlogloss:0.17149	validation_1-mlogloss:0.40814
[199]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[200]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[201]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[202]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[203]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[204]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[205]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[206]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[207]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[208]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[209]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[210]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[211]	validation_0-mlogloss:0.17148	validation_1-mlogloss:0.40814
[212]	validation_0-mlogloss:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.3, max_delta_step=0,
              max_depth=40, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=-1,
              num_class=9, num_parallel_tree=1, objective='multi:softprob',
              random_state=1, reg_alpha=0, reg_lambda=0, scale_pos_weight=None,
              subsample=1, tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [32]:
# G1 classifier experiment: gamma=5, max_depth=75, learning_rate=0.3,
# colsample_bytree=0.5, at most 300 rounds.
# NOTE(review): model_2 is re-bound by later cells, so on a top-to-bottom run this
# fit is not the one referenced by the evaluation-curve table further down.
model_2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.5,
              eval_metric='mlogloss', gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=75,
              min_child_weight=2,
              n_estimators=300, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Timed fit with early_stopping_rounds=50.
%time model_2.fit(X_train, y_train_G1, eval_set=eval_set, early_stopping_rounds=50)

[0]	validation_0-mlogloss:1.58137	validation_1-mlogloss:1.59016
[1]	validation_0-mlogloss:1.30734	validation_1-mlogloss:1.32221
[2]	validation_0-mlogloss:1.12407	validation_1-mlogloss:1.14390
[3]	validation_0-mlogloss:0.99404	validation_1-mlogloss:1.01654
[4]	validation_0-mlogloss:0.89296	validation_1-mlogloss:0.91861
[5]	validation_0-mlogloss:0.82021	validation_1-mlogloss:0.84786
[6]	validation_0-mlogloss:0.76256	validation_1-mlogloss:0.79199
[7]	validation_0-mlogloss:0.71422	validation_1-mlogloss:0.74462
[8]	validation_0-mlogloss:0.67414	validation_1-mlogloss:0.70554
[9]	validation_0-mlogloss:0.64105	validation_1-mlogloss:0.67356
[10]	validation_0-mlogloss:0.61199	validation_1-mlogloss:0.64557
[11]	validation_0-mlogloss:0.58923	validation_1-mlogloss:0.62369
[12]	validation_0-mlogloss:0.57028	validation_1-mlogloss:0.60611
[13]	validation_0-mlogloss:0.55270	validation_1-mlogloss:0.58948
[14]	validation_0-mlogloss:0.53819	validation_1-mlogloss:0.57608
[15]	validation_0-mlogloss:0.52579	

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.5,
              eval_metric='mlogloss', gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=75,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [33]:
# G1 classifier, run "m2": gamma=3, learning_rate=0.15, max_depth=40,
# colsample_bytree=0.3, at most 750 rounds (same values as the params_m2 dict
# defined further down in the notebook).
# NOTE(review): the stored output of this cell shows an estimator with
# colsample_bytree=0.5, gamma=4, learning_rate=0.25, max_depth=75 and
# min_child_weight=2, so the code appears to have been edited after its last run.
model_2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.3,
              eval_metric='mlogloss', gamma=3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=40,
              min_child_weight=1.5,
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Timed fit with early_stopping_rounds=50.
%time model_2.fit(X_train, y_train_G1, eval_set=eval_set, early_stopping_rounds=50)

mlogloss:0.33010	validation_1-mlogloss:0.41933
[448]	validation_0-mlogloss:0.33010	validation_1-mlogloss:0.41933
[449]	validation_0-mlogloss:0.33010	validation_1-mlogloss:0.41933
[450]	validation_0-mlogloss:0.33001	validation_1-mlogloss:0.41929
[451]	validation_0-mlogloss:0.32985	validation_1-mlogloss:0.41922
[452]	validation_0-mlogloss:0.32985	validation_1-mlogloss:0.41922
[453]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41918
[454]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41918
[455]	validation_0-mlogloss:0.32968	validation_1-mlogloss:0.41918
[456]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41917
[457]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41917
[458]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41918
[459]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41918
[460]	validation_0-mlogloss:0.32968	validation_1-mlogloss:0.41918
[461]	validation_0-mlogloss:0.32967	validation_1-mlogloss:0.41917
[462]	validation_0-mlogloss:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.5,
              eval_metric='mlogloss', gamma=4, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.25, max_delta_step=0, max_depth=75,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [36]:
# Keep the evaluation history of the current model_2 under its own name.
# NOTE(review): which fit this captures depends on cell execution order -
# presumably the higher-gamma run shown in the stored output above; confirm.
results_model_high_gamma = model_2.evals_result()

In [37]:
# G1 classifier, run "m2" again: the arguments are the same as in the previous
# model_2 cell (gamma=3, learning_rate=0.15, max_depth=40, colsample_bytree=0.3).
# Running this cell re-binds model_2 to a freshly fitted estimator.
model_2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.3,
              eval_metric='mlogloss', gamma=3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=40,
              min_child_weight=1.5,
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Timed fit with early_stopping_rounds=50.
%time model_2.fit(X_train, y_train_G1, eval_set=eval_set, early_stopping_rounds=50)

-mlogloss:0.34540	validation_1-mlogloss:0.42363
[448]	validation_0-mlogloss:0.34536	validation_1-mlogloss:0.42361
[449]	validation_0-mlogloss:0.34531	validation_1-mlogloss:0.42359
[450]	validation_0-mlogloss:0.34524	validation_1-mlogloss:0.42357
[451]	validation_0-mlogloss:0.34515	validation_1-mlogloss:0.42354
[452]	validation_0-mlogloss:0.34503	validation_1-mlogloss:0.42351
[453]	validation_0-mlogloss:0.34501	validation_1-mlogloss:0.42351
[454]	validation_0-mlogloss:0.34498	validation_1-mlogloss:0.42350
[455]	validation_0-mlogloss:0.34488	validation_1-mlogloss:0.42346
[456]	validation_0-mlogloss:0.34485	validation_1-mlogloss:0.42344
[457]	validation_0-mlogloss:0.34483	validation_1-mlogloss:0.42342
[458]	validation_0-mlogloss:0.34476	validation_1-mlogloss:0.42340
[459]	validation_0-mlogloss:0.34476	validation_1-mlogloss:0.42340
[460]	validation_0-mlogloss:0.34475	validation_1-mlogloss:0.42339
[461]	validation_0-mlogloss:0.34464	validation_1-mlogloss:0.42334
[462]	validation_0-mlogloss:

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.3,
              eval_metric='mlogloss', gamma=3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=40,
              min_child_weight=1.5, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [38]:
# G1 classifier experiment: same settings as the "m2" run except gamma=2.5 and
# reg_lambda=0.5.
# NOTE(review): model_3 is re-bound by the next model_3 cell, so on a
# top-to-bottom run this fit is replaced before model_3 is used again.
model_3 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.3,
              eval_metric='mlogloss', gamma=2.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=40,
              min_child_weight=1.5,
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=0.5, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)
# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Timed fit with early_stopping_rounds=50.
%time model_3.fit(X_train, y_train_G1, eval_set=eval_set, early_stopping_rounds=50)

mlogloss:0.32196	validation_1-mlogloss:0.41661
[448]	validation_0-mlogloss:0.32196	validation_1-mlogloss:0.41660
[449]	validation_0-mlogloss:0.32191	validation_1-mlogloss:0.41658
[450]	validation_0-mlogloss:0.32186	validation_1-mlogloss:0.41654
[451]	validation_0-mlogloss:0.32178	validation_1-mlogloss:0.41650
[452]	validation_0-mlogloss:0.32168	validation_1-mlogloss:0.41649
[453]	validation_0-mlogloss:0.32168	validation_1-mlogloss:0.41649
[454]	validation_0-mlogloss:0.32162	validation_1-mlogloss:0.41645
[455]	validation_0-mlogloss:0.32154	validation_1-mlogloss:0.41644
[456]	validation_0-mlogloss:0.32152	validation_1-mlogloss:0.41644
[457]	validation_0-mlogloss:0.32152	validation_1-mlogloss:0.41644
[458]	validation_0-mlogloss:0.32145	validation_1-mlogloss:0.41643
[459]	validation_0-mlogloss:0.32140	validation_1-mlogloss:0.41640
[460]	validation_0-mlogloss:0.32129	validation_1-mlogloss:0.41636
[461]	validation_0-mlogloss:0.32126	validation_1-mlogloss:0.41635
[462]	validation_0-mlogloss:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.9,
              colsample_bynode=0.9, colsample_bytree=0.3,
              eval_metric='mlogloss', gamma=2.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.15, max_delta_step=0, max_depth=40,
              min_child_weight=1.5, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=2, reg_alpha=1,
              reg_lambda=0.5, scale_pos_weight=None, subsample=0.8,
              tree_method='hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [40]:
# G1 classifier, run "m3": same settings as the first `model` cell except
# gamma=2.5, reg_alpha=1 and n_estimators=1000 (min_child_weight=1 is spelled out
# here). These values match the params_m3 dict defined further down.
model_3 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.1, colsample_bytree=1,
              eval_metric='mlogloss', gamma=2.5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=35,
              n_estimators=1000, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=1,
              reg_lambda=0.2, scale_pos_weight=None, subsample=0.9,
              tree_method='hist', use_label_encoder=False, min_child_weight=1,
              validate_parameters=1, verbosity=None)
# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train, y_train_G1), (X_valid, y_valid_G1)]
# Timed fit with early_stopping_rounds=50.
%time model_3.fit(X_train, y_train_G1, eval_set=eval_set, early_stopping_rounds=50)

mlogloss:0.30806	validation_1-mlogloss:0.41134
[698]	validation_0-mlogloss:0.30799	validation_1-mlogloss:0.41133
[699]	validation_0-mlogloss:0.30796	validation_1-mlogloss:0.41131
[700]	validation_0-mlogloss:0.30794	validation_1-mlogloss:0.41130
[701]	validation_0-mlogloss:0.30791	validation_1-mlogloss:0.41129
[702]	validation_0-mlogloss:0.30784	validation_1-mlogloss:0.41128
[703]	validation_0-mlogloss:0.30781	validation_1-mlogloss:0.41128
[704]	validation_0-mlogloss:0.30777	validation_1-mlogloss:0.41126
[705]	validation_0-mlogloss:0.30776	validation_1-mlogloss:0.41126
[706]	validation_0-mlogloss:0.30776	validation_1-mlogloss:0.41126
[707]	validation_0-mlogloss:0.30771	validation_1-mlogloss:0.41125
[708]	validation_0-mlogloss:0.30765	validation_1-mlogloss:0.41122
[709]	validation_0-mlogloss:0.30762	validation_1-mlogloss:0.41121
[710]	validation_0-mlogloss:0.30755	validation_1-mlogloss:0.41118
[711]	validation_0-mlogloss:0.30754	validation_1-mlogloss:0.41117
[712]	validation_0-mlogloss:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=2.5, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=35, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=-1,
              num_class=9, num_parallel_tree=1, objective='multi:softprob',
              random_state=1, reg_alpha=1, reg_lambda=0.2,
              scale_pos_weight=None, subsample=0.9, tree_method='hist',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [121]:
# Display the parameter dictionary of the current model_3 estimator.
model_3.get_params()

{'objective': 'multi:softprob',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 0.75,
 'colsample_bynode': 0.1,
 'colsample_bytree': 1,
 'gamma': 2.5,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': '',
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 35,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_parallel_tree': 1,
 'random_state': 1,
 'reg_alpha': 1,
 'reg_lambda': 0.2,
 'scale_pos_weight': None,
 'subsample': 0.9,
 'tree_method': 'hist',
 'validate_parameters': 1,
 'verbosity': None,
 'eval_metric': 'mlogloss',
 'num_class': 9}

In [46]:
def filter_preds (y_probas, tresholds, fallback=9):
    """Turn class probabilities into labels using per-class confidence thresholds.

    For every row of ``y_probas`` the returned label is the index of the FIRST
    class whose probability is strictly greater than its threshold. Rows in which
    no class clears its threshold receive ``fallback``.

    Parameters
    ----------
    y_probas : array-like, 2-D (rows = samples, columns = classes)
        Class probabilities, e.g. the result of ``predict_proba``.
    tresholds : scalar or array-like with one entry per class
        Minimum probability (exclusive) a class needs in order to be selected.
    fallback : int, default 9
        Label assigned to rows where no class exceeds its threshold. The default
        keeps the value that used to be hard-coded.

    Returns
    -------
    list of int
        One label per row of ``y_probas``; an empty list for empty input.

    Raises
    ------
    ValueError
        If ``y_probas`` is non-empty but not 2-D.
    """
    probas = np.asarray(y_probas)
    if probas.size == 0:
        return []
    if probas.ndim != 2:
        raise ValueError("y_probas must be 2-D (one row per sample, one column per class)")

    above = probas > np.asarray(tresholds)
    # argmax over a boolean row gives the position of the first True; rows with no
    # True at all also yield 0 there, so they are replaced by the fallback label.
    first_hit = above.argmax(axis=1)
    labels = np.where(above.any(axis=1), first_hit, fallback)
    # tolist() yields plain Python ints without converting 1-element arrays to scalars.
    return labels.tolist()

In [None]:
# Class probabilities of the G1 model for the training rows.
# NOTE(review): this cell has no execution count in the saved notebook, yet
# y_probas_G1_train is needed by the thresholding cell below - re-run it first.
y_probas_G1_train = model.predict_proba(X_train)

In [49]:
# Class probabilities of the G1 model for the validation rows.
y_probas_G1_valid = model.predict_proba(X_valid)

In [50]:
# One probability threshold per G1 class (position = class index): 0.6 for
# classes 0 and 3, 0.5 for the others.
tresh_g1 = np.array([0.6, 0.5, 0.5, 0.6, 0.5, 0.5, 0.5, 0.5, 0.5])
# Thresholded G1 labels for both splits; rows where no class clears its threshold
# get the placeholder label 9 from filter_preds.
filtered_pred_train = filter_preds(y_probas_G1_train, tresh_g1)
filtered_pred_valid = filter_preds(y_probas_G1_valid, tresh_g1)

In [52]:
# Feature matrices for the G2 model: a copy of the G1 features plus one extra
# column, 'pred_G1', holding the thresholded G1 label of each row.
# NOTE(review): the training-side labels come from the G1 model's predictions on
# the rows it was fitted on - confirm this is intended.
X_train_G2 = X_train.assign(pred_G1=filtered_pred_train)
X_valid_G2 = X_valid.assign(pred_G1=filtered_pred_valid)
X_valid_G2

Unnamed: 0,ing_sel,ing_sucre,ing_eau,ing_farine de blé,ing_émulsifiant,ing_acide citrique,ing_conservateur,ing_huile de tournesol,ing_acidifiant,ing_lait,...,water,white,yaourt,yogourt,éclats,épeautre,épices,épinards,œufs,pred_G1
0,0.000000,0.128516,0.257031,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,6
1,0.000000,96.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,8
2,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,4
3,0.000117,0.000234,0.000000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,9
4,10.937500,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28478,3.307500,6.615000,16.800000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,5
28479,0.000000,18.750000,0.000000,0.0,0.0,9.375,0.0,0.0,9.375,0.0,...,0,0,0,0,0,0,0,0,0,8
28480,0.000000,0.000000,0.000000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,6
28481,0.000000,0.000000,9.375000,0.0,0.0,0.000,0.0,0.0,0.000,0.0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# G2 classifier, first attempt: gamma=2, reg_alpha=1, learning_rate=0.2,
# max_depth=45, at most 400 rounds, trained on the G1 features plus 'pred_G1'.
# NOTE(review): num_class=9 is carried over from the G1 models, while the
# classification report for y_valid_G2 further down lists more than 9 labels -
# confirm whether the installed XGBoost wrapper overrides this value from the data.
model_G2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.2, colsample_bytree=1,
              eval_metric='mlogloss', gamma=2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=45,
              n_estimators=400, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=1,
              reg_lambda=0.2, scale_pos_weight=None, subsample=0.9,
              tree_method='hist', use_label_encoder=False, min_child_weight=1,
              validate_parameters=1, verbosity=None)

# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train_G2, y_train_G2), (X_valid_G2, y_valid_G2)]

# Timed fit with early_stopping_rounds=50.
%time model_G2.fit(X_train_G2, y_train_G2, eval_set=eval_set, early_stopping_rounds=50)

0-mlogloss:0.40936	validation_1-mlogloss:0.75426
[98]	validation_0-mlogloss:0.40896	validation_1-mlogloss:0.75408
[99]	validation_0-mlogloss:0.40851	validation_1-mlogloss:0.75387
[100]	validation_0-mlogloss:0.40810	validation_1-mlogloss:0.75375
[101]	validation_0-mlogloss:0.40785	validation_1-mlogloss:0.75361
[102]	validation_0-mlogloss:0.40754	validation_1-mlogloss:0.75358
[103]	validation_0-mlogloss:0.40716	validation_1-mlogloss:0.75344
[104]	validation_0-mlogloss:0.40688	validation_1-mlogloss:0.75345
[105]	validation_0-mlogloss:0.40659	validation_1-mlogloss:0.75327
[106]	validation_0-mlogloss:0.40602	validation_1-mlogloss:0.75310
[107]	validation_0-mlogloss:0.40563	validation_1-mlogloss:0.75289
[108]	validation_0-mlogloss:0.40540	validation_1-mlogloss:0.75280
[109]	validation_0-mlogloss:0.40512	validation_1-mlogloss:0.75273
[110]	validation_0-mlogloss:0.40479	validation_1-mlogloss:0.75261
[111]	validation_0-mlogloss:0.40447	validation_1-mlogloss:0.75252
[112]	validation_0-mlogloss:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.2, colsample_bytree=1, eval_metric='mlogloss',
              gamma=2, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=45, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=400, n_jobs=-1,
              num_class=9, num_parallel_tree=1, objective='multi:softprob',
              random_state=1, reg_alpha=1, reg_lambda=0.2,
              scale_pos_weight=None, subsample=0.9, tree_method='hist',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [56]:
# G2 classifier, second attempt: same as the previous model_G2 cell except
# gamma=3 and reg_alpha=0. Running this cell re-binds model_G2.
# NOTE(review): num_class=9 is carried over from the G1 models, while the
# classification report for y_valid_G2 further down lists more than 9 labels -
# confirm whether the installed XGBoost wrapper overrides this value from the data.
model_G2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.2, colsample_bytree=1,
              eval_metric='mlogloss', gamma=3, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=45,
              n_estimators=400, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=1, reg_alpha=0,
              reg_lambda=0.2, scale_pos_weight=None, subsample=0.9,
              tree_method='hist', use_label_encoder=False, min_child_weight=1,
              validate_parameters=1, verbosity=None)

# Training split first (validation_0), validation split second (validation_1).
eval_set = [(X_train_G2, y_train_G2), (X_valid_G2, y_valid_G2)]

# Timed fit with early_stopping_rounds=50.
%time model_G2.fit(X_train_G2, y_train_G2, eval_set=eval_set, early_stopping_rounds=50)

ion_0-mlogloss:0.37523	validation_1-mlogloss:0.75087
[98]	validation_0-mlogloss:0.37478	validation_1-mlogloss:0.75080
[99]	validation_0-mlogloss:0.37452	validation_1-mlogloss:0.75077
[100]	validation_0-mlogloss:0.37411	validation_1-mlogloss:0.75073
[101]	validation_0-mlogloss:0.37399	validation_1-mlogloss:0.75071
[102]	validation_0-mlogloss:0.37375	validation_1-mlogloss:0.75071
[103]	validation_0-mlogloss:0.37344	validation_1-mlogloss:0.75059
[104]	validation_0-mlogloss:0.37321	validation_1-mlogloss:0.75055
[105]	validation_0-mlogloss:0.37277	validation_1-mlogloss:0.75042
[106]	validation_0-mlogloss:0.37260	validation_1-mlogloss:0.75045
[107]	validation_0-mlogloss:0.37227	validation_1-mlogloss:0.75036
[108]	validation_0-mlogloss:0.37191	validation_1-mlogloss:0.75023
[109]	validation_0-mlogloss:0.37156	validation_1-mlogloss:0.75019
[110]	validation_0-mlogloss:0.37117	validation_1-mlogloss:0.75010
[111]	validation_0-mlogloss:0.37065	validation_1-mlogloss:0.75003
[112]	validation_0-mloglo

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.75,
              colsample_bynode=0.2, colsample_bytree=1, eval_metric='mlogloss',
              gamma=3, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=45, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=400, n_jobs=-1,
              num_class=9, num_parallel_tree=1, objective='multi:softprob',
              random_state=1, reg_alpha=0, reg_lambda=0.2,
              scale_pos_weight=None, subsample=0.9, tree_method='hist',
              use_label_encoder=False, validate_parameters=1, verbosity=None)

In [116]:
# Hyper-parameters of the retained runs, kept as plain dicts for reference.
params_m1 = {
    'colsample_bylevel': 0.75,
    'colsample_bynode': 0.1,
    'colsample_bytree': 1,
    'gamma': 2,
    'learning_rate': 0.1,
    'max_depth': 35,
    'min_child_weight': 1,
    'n_estimators': 1250,
    'reg_alpha': 0.5,
    'reg_lambda': 0.2,
    'subsample': 0.9,
}
params_m2 = {
    'colsample_bylevel': 0.9,
    'colsample_bynode': 0.9,
    'colsample_bytree': 0.3,
    'gamma': 3,
    'learning_rate': 0.15,
    'max_depth': 40,
    'min_child_weight': 1.5,
    'n_estimators': 750,
    'reg_alpha': 1,
    'reg_lambda': 1,
    'subsample': 0.8,
}
params_m3 = {
    'colsample_bylevel': 0.75,
    'colsample_bynode': 0.1,
    'colsample_bytree': 1,
    'gamma': 2.5,
    'learning_rate': 0.1,
    'max_depth': 35,
    'min_child_weight': 1,
    'n_estimators': 1000,
    'reg_alpha': 1,
    'reg_lambda': 0.2,
    'subsample': 0.9,
}
params_G2 = {
    'colsample_bylevel': 0.7,
    'colsample_bynode': 0.9,
    'colsample_bytree': 0.6,
    'gamma': 0.6,
    'learning_rate': 0.01,
    'max_depth': 25,
    'min_child_weight': 1,
    'n_estimators': 750,
    'reg_alpha': 0.1,
    'reg_lambda': 0.9,
    'subsample': 0.85,
}

# Per-round mlogloss curves of every fitted model. 'validation_0' is the first
# eval_set entry (training split), 'validation_1' the second (validation split).
G1_m0_train, G1_m0_val = (
    model_0.evals_result()[split]['mlogloss']
    for split in ('validation_0', 'validation_1')
)
G1_m1_train, G1_m1_val = (
    model.evals_result()[split]['mlogloss']
    for split in ('validation_0', 'validation_1')
)
G1_m2_train, G1_m2_val = (
    model_2.evals_result()[split]['mlogloss']
    for split in ('validation_0', 'validation_1')
)
G1_m3_train, G1_m3_val = (
    model_3.evals_result()[split]['mlogloss']
    for split in ('validation_0', 'validation_1')
)
G2_m1_train, G2_m1_val = (
    model_G2.evals_result()[split]['mlogloss']
    for split in ('validation_0', 'validation_1')
)

# Column name -> curve, in the order the columns should appear in the table.
eval_dic = {
    'G1_m0_train': G1_m0_train,
    'G1_m0_val': G1_m0_val,
    'G1_m1_train': G1_m1_train,
    'G1_m1_val': G1_m1_val,
    'G1_m2_train': G1_m2_train,
    'G1_m2_val': G1_m2_val,
    'G1_m3_train': G1_m3_train,
    'G1_m3_val': G1_m3_val,
    'G2_m1_train': G2_m1_train,
    'G2_m1_val': G2_m1_val,
}

# Each curve is wrapped in a Series so that runs of different lengths fit into
# one frame; the shorter ones end up padded with NaN.
eval_results = pd.DataFrame({label: pd.Series(curve) for label, curve in eval_dic.items()})
eval_results

Unnamed: 0,G1_m0_train,G1_m0_val,G1_m1_train,G1_m1_val,G1_m2_train,G1_m2_val,G1_m3_train,G1_m3_val,G2_m1_train,G2_m1_val
0,1.338668,1.393464,1.940748,1.946282,1.948323,1.952129,1.944311,1.949343,3.535336,3.539181
1,1.033854,1.123256,1.752118,1.762737,1.757101,1.763914,1.759596,1.767668,3.443887,3.450869
2,0.842647,0.956043,1.606451,1.621549,1.617025,1.626686,1.616579,1.627724,3.358879,3.369727
3,0.708687,0.839585,1.484926,1.503921,1.495653,1.507731,1.498569,1.512085,3.282269,3.296546
4,0.611934,0.756053,1.382988,1.405404,1.391524,1.405068,1.398461,1.415006,3.215210,3.231686
...,...,...,...,...,...,...,...,...,...,...
1245,,,0.254332,0.396238,,,,,,
1246,,,0.254332,0.396239,,,,,,
1247,,,0.254332,0.396239,,,,,,
1248,,,0.254300,0.396235,,,,,,


In [117]:
# Write the learning-curve table to CSV in the working directory, without the row index.
eval_results.to_csv('eval_results_xgboost.csv', index=False)

In [62]:
# Hard-label predictions of the current G2 model on the validation split.
# y_preds_valid_G2 was not assigned in any earlier cell, so on a fresh kernel the
# report below failed with a NameError; computing it here makes the cell
# self-sufficient.
y_preds_valid_G2 = model_G2.predict(X_valid_G2)
# Per-class precision / recall / F1 of the G2 model on the validation split.
print(metrics.classification_report(y_valid_G2, y_preds_valid_G2))

              precision    recall  f1-score   support

           0       0.84      0.77      0.80       672
           1       0.69      0.59      0.64       433
           2       0.86      0.88      0.87      2574
           3       0.84      0.80      0.82       671
           4       0.87      0.82      0.84       473
           5       0.82      0.79      0.80      1356
           6       0.91      0.90      0.90      1404
           7       0.85      0.89      0.87       930
           8       0.82      0.71      0.76       712
           9       0.89      0.89      0.89      1164
          10       0.81      0.61      0.70       291
          11       0.89      0.85      0.87       129
          12       0.89      0.84      0.86       608
          13       0.86      0.83      0.85       942
          14       0.77      0.87      0.82       505
          15       0.63      0.44      0.52        27
          16       0.81      0.78      0.80       784
          17       0.87    

In [68]:
model_G2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
              colsample_bynode=0.9, colsample_bytree=0.6,
              eval_metric='mlogloss', gamma=0.6, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=25,
              min_child_weight=1, monotone_constraints='()',
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0.1,
              reg_lambda=0.9, scale_pos_weight=None, subsample=0.85,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

eval_set = [(X_train_G2, y_train_G2), (X_valid_G2, y_valid_G2)]

%time model_G2.fit(X_train_G2, y_train_G2, eval_set=eval_set, early_stopping_rounds=50)

gloss:0.48857	validation_1-mlogloss:0.76270
[448]	validation_0-mlogloss:0.48799	validation_1-mlogloss:0.76236
[449]	validation_0-mlogloss:0.48744	validation_1-mlogloss:0.76204
[450]	validation_0-mlogloss:0.48685	validation_1-mlogloss:0.76172
[451]	validation_0-mlogloss:0.48629	validation_1-mlogloss:0.76140
[452]	validation_0-mlogloss:0.48573	validation_1-mlogloss:0.76108
[453]	validation_0-mlogloss:0.48518	validation_1-mlogloss:0.76079
[454]	validation_0-mlogloss:0.48458	validation_1-mlogloss:0.76048
[455]	validation_0-mlogloss:0.48403	validation_1-mlogloss:0.76018
[456]	validation_0-mlogloss:0.48349	validation_1-mlogloss:0.75986
[457]	validation_0-mlogloss:0.48296	validation_1-mlogloss:0.75956
[458]	validation_0-mlogloss:0.48242	validation_1-mlogloss:0.75926
[459]	validation_0-mlogloss:0.48191	validation_1-mlogloss:0.75895
[460]	validation_0-mlogloss:0.48139	validation_1-mlogloss:0.75866
[461]	validation_0-mlogloss:0.48085	validation_1-mlogloss:0.75837
[462]	validation_0-mlogloss:0.48

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.7,
              colsample_bynode=0.9, colsample_bytree=0.6,
              eval_metric='mlogloss', gamma=0.6, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=25,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=750, n_jobs=-1, num_class=9, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0.1,
              reg_lambda=0.9, scale_pos_weight=None, subsample=0.85,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [69]:
# Predict G2 class ids for the validation features.
# Despite its name, pred_test holds predictions for X_valid_G2 (the validation split).
pred_test = model_G2.predict(X_valid_G2)

# Per-class report on the encoded (integer) labels.
report_G2_encoded = metrics.classification_report(y_valid_G2, pred_test)
print(report_G2_encoded)

              precision    recall  f1-score   support

           0       0.86      0.76      0.81       672
           1       0.72      0.65      0.68       433
           2       0.86      0.89      0.87      2574
           3       0.84      0.80      0.82       671
           4       0.88      0.82      0.85       473
           5       0.82      0.80      0.81      1356
           6       0.91      0.89      0.90      1404
           7       0.86      0.89      0.88       930
           8       0.84      0.70      0.76       712
           9       0.91      0.89      0.90      1164
          10       0.85      0.63      0.72       291
          11       0.93      0.86      0.89       129
          12       0.89      0.85      0.87       608
          13       0.86      0.83      0.85       942
          14       0.79      0.88      0.84       505
          15       0.63      0.44      0.52        27
          16       0.81      0.78      0.79       784
          17       0.87    

In [71]:
import pickle

# NOTE(review): absolute, machine-specific path — the notebook only runs where this
# exact directory exists. Consider a path relative to a configurable data directory.
LABEL_ENCODER_G2_PATH = r'C:\Users\Antoine\Coding Bootcamp\Open Food Facts\Notebooks\Model Evaluation\label_encoder_g2.pkl'

# Load the pickled G2 label encoder (used afterwards via le_G2.inverse_transform).
# The `with` block closes the file even if unpickling raises.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(LABEL_ENCODER_G2_PATH, 'rb') as pkl_file_G2:
    le_G2 = pickle.load(pkl_file_G2)

In [73]:
# Map the encoded class ids back to their original category labels with the
# loaded encoder, so the report rows are labelled by name instead of by id.
true_names_G2 = le_G2.inverse_transform(y_valid_G2)
pred_names_G2 = le_G2.inverse_transform(pred_test)
print(metrics.classification_report(true_names_G2, pred_names_G2))

                                  precision    recall  f1-score   support

                      appetizers       0.86      0.76      0.81       672
artificially sweetened beverages       0.72      0.65      0.68       433
              biscuits and cakes       0.86      0.89      0.87      2574
                           bread       0.84      0.80      0.82       671
               breakfast cereals       0.88      0.82      0.85       473
                         cereals       0.82      0.80      0.81      1356
                          cheese       0.91      0.89      0.90      1404
              chocolate products       0.86      0.89      0.88       930
                  dairy desserts       0.84      0.70      0.76       712
            dressings and sauces       0.91      0.89      0.90      1164
                    dried fruits       0.85      0.63      0.72       291
                            eggs       0.93      0.86      0.89       129
                            fats     

In [120]:
# Output file -> classifier; each model is written to its own file, in this order.
# The relative paths resolve against the notebook's working directory.
models_to_save = {
    'xgboost_G1_m1.model': model,
    'xgboost_G1_m2.model': model_2,
    'xgboost_G1_m3.model': model_3,
    'xgboost_G2_m1.model': model_G2,
}
for model_path, classifier in models_to_save.items():
    classifier.save_model(model_path)