## Stochastic Gradient Descent
## best score
### 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

### 2. Data

In [2]:
# Read the three pre-split CSV files; each one carries the label column alongside the features.
train = pd.read_csv('Data/train_with_BMI.csv')
test = pd.read_csv('Data/test_with_BMI.csv')
val = pd.read_csv('Data/val_with_BMI.csv')

# Name of the label column that is separated from the features in every split.
target_col = 'NObeyesdad'

# For each split: X_* is every column except the label, y_* is the label column itself.
X_train, y_train = train.drop(columns=target_col), train[target_col]
X_test, y_test = test.drop(columns=target_col), test[target_col]
X_val, y_val = val.drop(columns=target_col), val[target_col]

### 3. Default parameters (baseline)

In [3]:
# Baseline: SGDClassifier with library-default hyper-parameters (only the seed is fixed),
# trained on the training split and scored on the validation split.
sgd = SGDClassifier(random_state=42).fit(X_train, y_train)
y_pred1 = sgd.predict(X_val)

# Per-class precision / recall / F1 and the confusion matrix for the validation predictions.
class_report1 = classification_report(y_val, y_pred1)
conf_matrix1 = confusion_matrix(y_val, y_pred1)

print("Classification report:\n", class_report1)
print("Confusion matrix:\n", conf_matrix1)

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.94      0.89       525
           1       0.63      0.42      0.50       624
           2       0.55      0.64      0.59       604
           3       0.86      0.97      0.91       714
           4       0.99      1.00      0.99       837
           5       0.40      0.58      0.48       532
           6       0.44      0.21      0.29       523

    accuracy                           0.71      4359
   macro avg       0.67      0.68      0.66      4359
weighted avg       0.70      0.71      0.69      4359

Confusion matrix:
 [[493  15   5   1   0   7   4]
 [ 87 260  56   0   0 159  62]
 [  2  12 384  91   5  84  26]
 [  0   0  12 694   0   5   3]
 [  1   1   0   1 834   0   0]
 [  1  85  87   2   0 309  48]
 [  0  37 152  22   1 200 111]]


Score: 0.68  
### 4. Hyperparameter tuning

In [5]:
# Exhaustive grid search over the regularisation strength (alpha) and the
# elastic-net mixing ratio (l1_ratio); loss and penalty are held fixed.
# 10 alphas x 11 l1_ratios = 110 candidates, each scored by 3-fold CV accuracy.
sgd = SGDClassifier(random_state=42)

param_grid = dict(
    loss=['log_loss'],
    penalty=['elasticnet'],
    alpha=np.logspace(-4, 4, 10),
    l1_ratio=[0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.12, 0.13, 0.14, 0.15, 0.2],
)

grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,  # use every available core
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the winning configuration and the full results table for later cells.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
cv_results = grid_search.cv_results_

print("Best parameters: ", best_params)
print("Best score: ", best_score)
# The value below is the mean CV score of the selected candidate, i.e. the same
# quantity as best_score_ looked up through cv_results_.
best_mean_cv = cv_results['mean_test_score'][grid_search.best_index_]
print("Best model cross-validation results: ", best_mean_cv)

Fitting 3 folds for each of 110 candidates, totalling 330 fits
Best parameters:  {'alpha': 0.000774263682681127, 'l1_ratio': 0.2, 'loss': 'log_loss', 'penalty': 'elasticnet'}
Best score:  0.7481065944082109
Best model cross-validation results:  0.7481065944082109


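Best parameters:  {'alpha': 0.000774263682681127, 'l1_ratio': 0.2, 'loss': 'log_loss', 'penalty': 'elasticnet'}  
Best score:  0.7481065944082109  
Best model cross-validation results:  0.7481065944082109  
No sign of overfitting: the mean cross-validated accuracy (0.748) is in line with the accuracy on the held-out validation set (0.75, see section 5). Note that "Best score" and "Best model cross-validation results" are the same quantity by construction, so their agreement alone does not rule out overfitting.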

### 5. Best model - validation

In [6]:
# Fit a fresh SGDClassifier with the hyper-parameters selected by the grid search
# (same seed as before) on the training split.
best = SGDClassifier(random_state=42, **best_params).fit(X_train, y_train)

# NOTE(review): evaluation here uses the validation split; X_test / y_test are not scored in this cell.
y_pred2 = best.predict(X_val)

class_report2 = classification_report(y_val, y_pred2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)

print("Classification report:\n", class_report2)
print("Confusion matrix:\n", conf_matrix2)

Classification report:
               precision    recall  f1-score   support

           0       0.88      0.89      0.88       525
           1       0.60      0.67      0.64       624
           2       0.64      0.65      0.64       604
           3       0.85      0.96      0.90       714
           4       0.98      1.00      0.99       837
           5       0.57      0.46      0.51       532
           6       0.55      0.45      0.50       523

    accuracy                           0.75      4359
   macro avg       0.72      0.73      0.72      4359
weighted avg       0.74      0.75      0.75      4359

Confusion matrix:
 [[468  47   1   1   0   3   5]
 [ 62 419  22   0   1  65  55]
 [  2  10 394  98  11  40  49]
 [  0   0  23 683   1   1   6]
 [  1   1   0   1 834   0   0]
 [  0 151  60   0   0 244  77]
 [  0  65 118  25   2  77 236]]


Best score: 0.74