## Naive Bayes 
## best score 0.66
### 1. Libraries

In [2]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

### 2. Data

In [3]:
train = pd.read_csv('Data/selected_train.csv')
test = pd.read_csv('Data/selected_test.csv')
val = pd.read_csv('Data/selected_val.csv')

X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

### 3. Random parameters

In [4]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred1 = nb.predict(X_val)
class_report1 = classification_report(y_val, y_pred1)
print("Classification report: \n", class_report1)
conf_matrix1 = confusion_matrix(y_val, y_pred1)
print("Confusion matrix: \n", conf_matrix1)

Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.88      0.87       525
           1       0.76      0.70      0.73       624
           2       0.59      0.58      0.58       604
           3       0.73      0.97      0.83       714
           4       0.93      1.00      0.96       837
           5       0.71      0.52      0.60       532
           6       0.55      0.44      0.49       523

    accuracy                           0.75      4359
   macro avg       0.73      0.73      0.72      4359
weighted avg       0.74      0.75      0.74      4359

Confusion matrix: 
 [[464  42   2   0  14   2   1]
 [ 78 434   2   0  35  53  22]
 [  2   3 350 185   8  14  42]
 [  0   0  16 690   0   0   8]
 [  0   0   0   1 835   1   0]
 [  2  68  67   4   4 276 111]
 [  0  21 159  67   2  45 229]]


Score: 0.75

### 4. Hyperparameter tuning

In [5]:
nb = GaussianNB()

param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}

grid_search = GridSearchCV(estimator = nb, param_grid = param_grid, scoring="accuracy", n_jobs=-1, cv=3, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters: ", best_params)

best_score = grid_search.best_score_
print("Best score: ", best_score)

cv_results = grid_search.cv_results_
print("Best model CV results: ", cv_results['mean_test_score'][grid_search.best_index_])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters:  {'var_smoothing': 0.002848035868435802}
Best score:  0.7430138441758812
Best model CV results:  0.7430138441758812


Best parameters:  {'var_smoothing': 0.002848035868435802}  
Best score:  0.7430138441758812  
Best model CV results:  0.7430138441758812  
No overfitting.

### 5. Best model - test

In [6]:
best = GaussianNB(var_smoothing=0.002848035868435802)
best.fit(X_train, y_train)
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report: \n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix: \n", conf_matrix2)

Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.91      0.88       525
           1       0.77      0.72      0.74       624
           2       0.57      0.59      0.58       604
           3       0.73      0.96      0.83       714
           4       0.96      1.00      0.98       837
           5       0.70      0.50      0.59       532
           6       0.54      0.42      0.48       523

    accuracy                           0.76      4359
   macro avg       0.73      0.73      0.73      4359
weighted avg       0.75      0.76      0.75      4359

Confusion matrix: 
 [[477  43   2   0   0   2   1]
 [ 79 450   2   0  15  54  24]
 [  2   3 356 181   9  12  41]
 [  0   0  21 686   0   0   7]
 [  1   0   0   1 834   1   0]
 [  2  68  72   3   7 268 112]
 [  0  21 171  63   3  44 221]]


Best score: 0.76 - no overfitting.