## Naive Bayes 
## best score 0.75
### 1. Libraries

In [1]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Data

In [2]:
train = pd.read_csv('../Data/train_with_BMI.csv')
test = pd.read_csv('../Data/test_with_BMI.csv')
val = pd.read_csv('../Data/val_with_BMI.csv')

X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

### 3. Random parameters

In [3]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred1 = nb.predict(X_val)
class_report1 = classification_report(y_val, y_pred1)
print("Classification report: \n", class_report1)
conf_matrix1 = confusion_matrix(y_val, y_pred1)
print("Confusion matrix: \n", conf_matrix1)

Classification report: 
               precision    recall  f1-score   support

           0       0.83      0.93      0.87       525
           1       0.78      0.66      0.72       624
           2       0.57      0.60      0.58       604
           3       0.74      0.97      0.84       714
           4       0.95      1.00      0.97       837
           5       0.66      0.46      0.54       532
           6       0.53      0.43      0.48       523

    accuracy                           0.75      4359
   macro avg       0.72      0.72      0.72      4359
weighted avg       0.74      0.75      0.74      4359

Confusion matrix: 
 [[487  26   1   1   6   3   1]
 [ 97 412   3   1  25  61  25]
 [  2   3 361 171   8  20  39]
 [  0   0  16 691   0   0   7]
 [  1   0   1   1 833   1   0]
 [  2  61  86   3   1 246 133]
 [  0  26 167  61   1  41 227]]


Score: 0.75

### 4. Hyperparameter tuning

In [4]:
nb = GaussianNB()

param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}

grid_search = GridSearchCV(estimator = nb, param_grid = param_grid, scoring="accuracy", n_jobs=-1, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters: ", best_params)

best_score = grid_search.best_score_
print("Best score: ", best_score)

cv_results = grid_search.cv_results_
print("Best model CV results: ", cv_results['mean_test_score'][grid_search.best_index_])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters:  {'var_smoothing': 0.001873817422860383}
Best score:  0.7370261044015787
Best model CV results:  0.7370261044015787


Best parameters:  {'var_smoothing': 0.002848035868435802}  
Best score:  0.7430138441758812  
Best model CV results:  0.7430138441758812  
No overfitting.

### 5. Best model - test

In [5]:
best = GaussianNB(var_smoothing=0.002848035868435802)
best.fit(X_train, y_train)
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report: \n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix: \n", conf_matrix2)

Classification report: 
               precision    recall  f1-score   support

           0       0.82      0.94      0.87       525
           1       0.78      0.66      0.71       624
           2       0.56      0.60      0.58       604
           3       0.75      0.96      0.84       714
           4       0.96      1.00      0.98       837
           5       0.66      0.45      0.54       532
           6       0.52      0.42      0.47       523

    accuracy                           0.75      4359
   macro avg       0.72      0.72      0.71      4359
weighted avg       0.74      0.75      0.74      4359

Confusion matrix: 
 [[493  26   2   0   0   3   1]
 [105 413   4   2  16  59  25]
 [  2   3 363 168   9  20  39]
 [  0   0  19 688   0   0   7]
 [  1   1   0   1 833   1   0]
 [  2  62  90   3   4 242 129]
 [  0  27 174  59   2  40 221]]


Best score: 0.76 - no overfitting.