## Stochastic Gradient Descent
## best score
### 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

### 2. Data

In [2]:
train = pd.read_csv('Data/selected_train.csv')
test = pd.read_csv('Data/selected_test.csv')
val = pd.read_csv('Data/selected_val.csv')

X_train = train.drop('NObeyesdad', axis=1)
y_train = train['NObeyesdad']
X_test = test.drop('NObeyesdad', axis=1)
y_test = test['NObeyesdad']
X_val = val.drop('NObeyesdad', axis=1)
y_val = val['NObeyesdad']

### 3. Random parameters

In [3]:
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)
y_pred1 = sgd.predict(X_val)
class_report1 = classification_report(y_val, y_pred1)
print("Classification report:\n", class_report1)

conf_matrix1 = confusion_matrix(y_val, y_pred1)
print("Confusion matrix:\n", conf_matrix1)

Classification report:
               precision    recall  f1-score   support

           0       0.83      0.95      0.88       525
           1       0.48      0.57      0.52       624
           2       0.48      0.72      0.57       604
           3       0.89      0.93      0.91       714
           4       0.99      1.00      0.99       837
           5       0.33      0.23      0.27       532
           6       0.40      0.12      0.18       523

    accuracy                           0.68      4359
   macro avg       0.63      0.64      0.62      4359
weighted avg       0.66      0.68      0.66      4359

Confusion matrix:
 [[500  17   3   1   0   3   1]
 [102 353  59   0   0  67  43]
 [  2  30 433  61   4  59  15]
 [  0   2  43 667   0   1   1]
 [  1   1   0   1 834   0   0]
 [  1 252 123   2   0 122  32]
 [  0  73 247  18   2 121  62]]


Score: 0.68  
### 4. Hyperparameter tuning

In [6]:
sgd = SGDClassifier(random_state=42)

param_grid = {
    'loss':['log'],
    'penalty':['elasticnet'],
    'alpha':np.logspace(-4, 4, 10),
    'l1_ratio':[0.05,0.06,0.07,0.08,0.09,0.1,0.12,0.13,0.14,0.15,0.2]
}

grid_search = GridSearchCV(sgd, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
cv_results = grid_search.cv_results_

print("Best parameters: ", best_params)
print("Best score: ", best_score)
print("Best model cross-validation results: ", cv_results['mean_test_score'][grid_search.best_index_])

Fitting 3 folds for each of 110 candidates, totalling 330 fits
Best parameters:  {'alpha': 0.000774263682681127, 'l1_ratio': 0.05, 'loss': 'log', 'penalty': 'elasticnet'}
Best score:  0.7346864544210931
Best model cross-validation results:  0.7346864544210931




Best parameters:  {'alpha': 0.000774263682681127, 'l1_ratio': 0.05, 'loss': 'log', 'penalty': 'elasticnet'}  
Best score:  0.7346864544210931  
Best model cross-validation results:  0.7346864544210931  
No overfitting.

### 5. Best model - test

In [8]:
best = SGDClassifier(random_state=42, **best_params)
best.fit(X_train, y_train)
y_pred2 = best.predict(X_val)
class_report2 = classification_report(y_val, y_pred2)
print("Classification report:\n", class_report2)
conf_matrix2 = confusion_matrix(y_val, y_pred2)
print("Confusion matrix:\n", conf_matrix2)



Classification report:
               precision    recall  f1-score   support

           0       0.87      0.92      0.89       525
           1       0.62      0.66      0.64       624
           2       0.60      0.62      0.61       604
           3       0.82      0.95      0.88       714
           4       0.98      1.00      0.99       837
           5       0.50      0.36      0.42       532
           6       0.52      0.45      0.49       523

    accuracy                           0.74      4359
   macro avg       0.70      0.71      0.70      4359
weighted avg       0.72      0.74      0.73      4359

Confusion matrix:
 [[484  29   2   0   0   2   8]
 [ 70 413  22   0   1  59  59]
 [  2  10 374 121  10  40  47]
 [  0   0  27 678   1   1   7]
 [  1   1   0   1 834   0   0]
 [  0 171  75   2   2 189  93]
 [  0  47 123  28   2  87 236]]


Best score: 0.74