## Ada Boost
## best score 0.56
### 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

### 2. Data

In [2]:
train = pd.read_csv('../Data/train_with_BMI.csv')
test = pd.read_csv('../Data/test_with_BMI.csv')
val = pd.read_csv('../Data/val_with_BMI.csv')

X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

### 3. Random parameters

In [3]:
ab = AdaBoostClassifier()
ab.fit(X_train, y_train)
y_pred1 = ab.predict(X_val)
class_report1 = classification_report(y_val, y_pred1)
print("Classification Report for AdaBoost Classifier: \n", class_report1)
cm1 = confusion_matrix(y_val, y_pred1)
print("Confusion Matrix for AdaBoost Classifier: \n", cm1)



Classification Report for AdaBoost Classifier: 
               precision    recall  f1-score   support

           0       0.39      0.87      0.54       525
           1       0.50      0.21      0.29       624
           2       0.00      0.00      0.00       604
           3       0.36      0.93      0.52       714
           4       0.91      1.00      0.95       837
           5       0.67      0.07      0.13       532
           6       0.45      0.11      0.18       523

    accuracy                           0.50      4359
   macro avg       0.47      0.45      0.37      4359
weighted avg       0.49      0.50      0.41      4359

Confusion Matrix for AdaBoost Classifier: 
 [[456  66   0   3   0   0   0]
 [473 128   0  12   0   5   6]
 [ 20   0   0 532  30   0  22]
 [  1   0   1 664  45   0   3]
 [  0   1   0   0 833   0   3]
 [186  38   0 232   1  37  38]
 [ 42  24   0 380   5  13  59]]


Score: 0.50

### 4. Hyperparameter tuning

In [4]:
ab = AdaBoostClassifier()

param_grid = {'n_estimators' : [10, 50, 100, 500],
'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1.0]}


grid_search = GridSearchCV(estimator = ab, param_grid = param_grid, scoring="accuracy", n_jobs=-1, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters: ", best_params)

best_score = grid_search.best_score_
print("Best score: ", best_score)

cv_results = grid_search.cv_results_
print("Best model Cross validation results: ", cv_results['mean_test_score'][grid_search.best_index_])

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters:  {'learning_rate': 1.0, 'n_estimators': 10}
Best score:  0.6757758011128289
Best model Cross validation results:  0.6757758011128289




Best parameters:  {'learning_rate': 0.1, 'n_estimators': 50}  
Best score:  0.6710269770547462  
Best model Cross validation results:  0.6710269770547462  
No overfitting.

### 5. Best model - test

In [5]:
best = AdaBoostClassifier(n_estimators=50, learning_rate=0.1)
best.fit(X_train, y_train)
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report: \n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix: \n", conf_matrix2)



Classification report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       525
           1       0.42      0.62      0.50       624
           2       0.39      0.85      0.53       604
           3       0.97      0.10      0.18       714
           4       0.99      0.86      0.92       837
           5       0.52      0.73      0.61       532
           6       0.67      0.72      0.70       523

    accuracy                           0.56      4359
   macro avg       0.57      0.55      0.49      4359
weighted avg       0.61      0.56      0.51      4359

Confusion matrix: 
 [[  0 513   0   0   0  10   2]
 [  0 388   1   0   0 229   6]
 [  0   2 511   2   4  26  59]
 [  0   0 638  70   1   0   5]
 [  0   1 112   0 723   0   1]
 [  0  22  11   0   0 387 112]
 [  0   5  53   0   0  87 378]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Result on validation data: 0.56 => **overfitting**.