## Naive Bayes
## Best validation accuracy: 0.66
### 1. Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, StandardScaler



### 2. Data

In [2]:
# Load the pre-made train / test / validation splits and the original training file.
train = pd.read_csv('Data/our_train.csv')
test = pd.read_csv('Data/our_test.csv')
val = pd.read_csv('Data/our_val.csv')
df = pd.read_csv('Data/train.csv')

# Split every frame into features and the target column 'NObeyesdad'.
X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

# Ordered categories -> numeric levels between 0 and 1.
# NOTE: Series.map turns any value that is not a key of the dict into NaN.
CAEC_dict = {'no': 0, 'Sometimes': 0.33, 'Frequently': 0.66, 'Always': 1 }
CALC_dict = {'no': 0, 'Sometimes': 0.5, 'Frequently': 1}
for features in (X_train, X_test, X_val):
    features['CAEC'] = features['CAEC'].map(CAEC_dict)
    features['CALC'] = features['CALC'].map(CALC_dict)

cat_cols2 = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'MTRANS']

# Fit one encoder per nominal column on the training split only and reuse it
# for test/validation, so a given category always receives the same integer
# code. Re-fitting on each split would silently assign different codes
# whenever a split is missing a category. transform() raises ValueError if a
# split contains a category that never occurs in the training data.
feature_encoders = {}
for col in cat_cols2:
    encoder = LabelEncoder()
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    X_val[col] = encoder.transform(X_val[col])
    feature_encoders[col] = encoder

# Target encoder, fitted on the training labels only. It is kept as `le` so
# predictions can be mapped back to class names with le.inverse_transform(...).
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_val = le.transform(y_val)

# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

### 3. Default parameters

In [4]:
# Baseline: Gaussian Naive Bayes built with no arguments, fitted on the
# training split and evaluated on the validation split.
nb = GaussianNB().fit(X_train, y_train)
y_pred1 = nb.predict(X_val)

# Compute both summaries first, then print them in the original order.
class_report1 = classification_report(y_val, y_pred1)
conf_matrix1 = confusion_matrix(y_val, y_pred1)
print("Classification report: \n", class_report1)
print("Confusion matrix: \n", conf_matrix1)

Classification report: 
               precision    recall  f1-score   support

           0       0.69      0.84      0.76       525
           1       0.62      0.45      0.52       624
           2       0.39      0.63      0.48       604
           3       0.72      0.93      0.81       714
           4       0.95      1.00      0.97       837
           5       0.63      0.31      0.42       532
           6       0.54      0.24      0.34       523

    accuracy                           0.66      4359
   macro avg       0.65      0.63      0.61      4359
weighted avg       0.67      0.66      0.64      4359

Confusion matrix: 
 [[442  42  30   0   6   5   0]
 [180 281  67   2  25  43  26]
 [  2   2 379 161   8  25  27]
 [  0   0  44 662   0   0   8]
 [  1   0   0   1 833   2   0]
 [ 19  79 195  26   1 166  46]
 [  1  49 258  63   1  24 127]]


Validation accuracy: 0.66

### 4. Hyperparameter tuning

In [5]:
# Candidate values for var_smoothing: 100 log-spaced points from 10**0 down to 10**-9.
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}
nb = GaussianNB()

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation on the training split, using all available cores.
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the winning setting and its score for the cells below.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best parameters: ", best_params)
print("Best score: ", best_score)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters:  {'var_smoothing': 0.006579332246575682}
Best score:  0.6577424609194437


Best parameters: `{'var_smoothing': 0.006579332246575682}`  
Best score (mean 3-fold cross-validated accuracy on the training set): 0.6577424609194437

### 5. Best model

In [6]:
# Rebuild the model from the grid-search winner instead of a hand-copied
# constant, so this cell stays in sync if the data, the preprocessing or the
# parameter grid changes (the recorded run selected var_smoothing ~= 0.00658).
best = GaussianNB(**grid_search.best_params_)
best.fit(X_train, y_train)

# Evaluate the tuned model on the validation split.
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report: \n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix: \n", conf_matrix2)

Classification report: 
               precision    recall  f1-score   support

           0       0.68      0.84      0.75       525
           1       0.62      0.44      0.52       624
           2       0.39      0.64      0.49       604
           3       0.73      0.92      0.82       714
           4       0.94      1.00      0.97       837
           5       0.63      0.31      0.41       532
           6       0.54      0.24      0.34       523

    accuracy                           0.66      4359
   macro avg       0.65      0.63      0.61      4359
weighted avg       0.67      0.66      0.64      4359

Confusion matrix: 
 [[443  41  30   0   6   5   0]
 [183 276  69   2  25  43  26]
 [  2   3 389 150   9  24  27]
 [  0   0  47 659   0   0   8]
 [  1   0   0   1 833   2   0]
 [ 21  75 192  26   8 164  46]
 [  2  49 260  60   2  23 127]]


Best validation accuracy: 0.66 (no improvement over the default model)