## AdaBoost
## Best score: 0.56 (validation accuracy)
### 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier



### 2. Data

In [2]:
# Load the pre-split train / test / validation sets and the full training file.
train = pd.read_csv('Data/our_train.csv')
test = pd.read_csv('Data/our_test.csv')
val = pd.read_csv('Data/our_val.csv')
df = pd.read_csv('Data/train.csv')

# Split every frame into features and the 'NObeyesdad' target column.
X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

# Ordered frequency categories are mapped onto evenly spaced values in [0, 1].
# NOTE: Series.map yields NaN for any value that is not a key of the dict.
CAEC_dict = {'no': 0, 'Sometimes': 0.33, 'Frequently': 0.66, 'Always': 1 }
CALC_dict = {'no': 0, 'Sometimes': 0.5, 'Frequently': 1}
for frame in (X_train, X_test, X_val):
    frame['CAEC'] = frame['CAEC'].map(CAEC_dict)
    frame['CALC'] = frame['CALC'].map(CALC_dict)

cat_cols2 = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']

# Fit one encoder per column on the training split only and reuse it for the
# test / validation splits. Re-fitting on every split would assign integer
# codes independently per split, so the same category could receive different
# codes whenever a split does not contain the full set of categories.
# `transform` raises ValueError on a category unseen in training instead of
# silently mis-encoding it.
feature_encoders = {}
for col in cat_cols2:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    X_val[col] = encoder.transform(X_val[col])
    feature_encoders[col] = encoder

# Same rule for the target: the class -> integer mapping is learned from the
# training labels only; `le.classes_` recovers the original class names.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_val = le.transform(y_val)

# Standardise features using statistics computed on the training split only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

### 3. Default parameters (baseline)

In [3]:
# Baseline: AdaBoost with its default hyperparameters, evaluated on the validation split.
ab = AdaBoostClassifier()
ab.fit(X_train, y_train)
y_pred1 = ab.predict(X_val)
# zero_division=0 reports precision as 0 for classes the model never predicts,
# without emitting an UndefinedMetricWarning for each affected average.
class_report1 = classification_report(y_val, y_pred1, zero_division=0)
print("Classification Report for AdaBoost Classifier: \n", class_report1)
cm1 = confusion_matrix(y_val, y_pred1)
print("Confusion Matrix for AdaBoost Classifier: \n", cm1)

Classification Report for AdaBoost Classifier: 
               precision    recall  f1-score   support

           0       0.64      0.15      0.24       525
           1       0.41      0.63      0.50       624
           2       0.50      0.00      0.00       604
           3       0.41      0.98      0.58       714
           4       0.00      0.00      0.00       837
           5       0.56      0.53      0.54       532
           6       0.42      0.86      0.57       523

    accuracy                           0.44      4359
   macro avg       0.42      0.45      0.35      4359
weighted avg       0.39      0.44      0.33      4359

Confusion Matrix for AdaBoost Classifier: 
 [[ 79 444   0   0   0   1   1]
 [ 39 392   0   0   0 176  17]
 [  0   3   1 171   0  11 418]
 [  0   0   1 701   0   0  12]
 [  0   1   0 835   0   0   1]
 [  3  85   0   0   0 281 163]
 [  2  30   0   6   0  33 452]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation accuracy: 0.44

### 4. Hyperparameter tuning

In [5]:
# Exhaustive search over ensemble size and learning rate,
# scored by accuracy with 3-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
}

ab = AdaBoostClassifier()
grid_search = GridSearchCV(
    estimator=ab,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the winning configuration and its mean cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best parameters: ", best_params)
print("Best score: ", best_score)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters:  {'learning_rate': 0.1, 'n_estimators': 500}
Best score:  0.4820459454109236


Best parameters found by the grid search: {'learning_rate': 0.1, 'n_estimators': 500}  
Best score (mean 3-fold cross-validated accuracy): 0.4820459454109236

### 5. Best model

In [6]:
# Refit AdaBoost with the hyperparameters selected by the grid search
# and evaluate it on the validation split.
best = AdaBoostClassifier(learning_rate=0.1, n_estimators=500).fit(X_train, y_train)
y_pred2 = best.predict(X_val)

class_report2 = classification_report(y_val, y_pred2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)

print("Classification report: \n", class_report2)
print("Confusion matrix: \n", conf_matrix2)

Classification report: 
               precision    recall  f1-score   support

           0       0.64      0.15      0.24       525
           1       0.33      0.83      0.47       624
           2       0.50      0.35      0.41       604
           3       0.66      0.91      0.77       714
           4       0.98      0.91      0.94       837
           5       0.32      0.16      0.21       532
           6       0.66      0.29      0.40       523

    accuracy                           0.56      4359
   macro avg       0.58      0.51      0.49      4359
weighted avg       0.61      0.56      0.53      4359

Confusion matrix: 
 [[ 79 365   1   0   0  80   0]
 [ 39 516   0   2   3  61   3]
 [  0  89 210 263   6   1  35]
 [  0   1  62 648   3   0   0]
 [  0   0  73   0 763   1   0]
 [  3 364  13  26   2  84  40]
 [  2 234  61  41   2  33 150]]
