# Stochastic Gradient Descent Classifier Evaluation
## Hyperparameter tuning with grid search and cross-validation
<br>
## Import Python Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext autoreload

## Import Utility Classes

In [2]:
# Mode 2 of the autoreload extension (loaded in the first cell) re-imports
# modules before each cell executes, so edits to the helper modules below
# take effect without restarting the kernel.
%autoreload 2
import sys
# Make the parent directory importable; the `utils` package imported below
# is presumably located there — TODO confirm against the repository layout.
sys.path.append('..')

# Project-local helpers: data loading, text featurization, data-frame access
# and model evaluation (roles inferred from how later cells use them).
from utils.data_loader import DataLoader
from utils.term_counter_helper import TermFrequency, TfIdf
from utils.data_frame_helper import DataFrameHelper
from utils.model_evaluation_helper import ModelEvaluationHelper


## Load Data

In [3]:
# Location of the training corpus, relative to this notebook.
training_data_path = "../Data/trainingdata.txt"

# Load the file contents through the project's DataLoader; `loader` is reused
# by the next cell to build the data frame.
loader = DataLoader()
content = loader.load_data(training_data_path)

Number of Sentences:  5485



In [4]:
# Eight single-letter class names, "A" through "H".
label_names = np.array(list("ABCDEFGH"))

# Turn the loaded content into a data frame, then wrap it in the helper that
# later cells read `df`, `raw_text` and `y` from.
df = loader.get_data_frame(content, label_names)
df_helper = DataFrameHelper(df, label_names)

## Input Dataset in DataFrame Format

In [5]:
# Preview the first five rows (the default row count, spelled out).
df_helper.df.head(5)

Unnamed: 0,Labels,Text
0,A,champion products ch approves stock split cham...
1,B,computer terminal systems cpml completes sale ...
2,A,cobanco inc cbco year net shr cts vs dlrs net ...
3,A,am international inc am nd qtr jan oper shr lo...
4,A,brown forman inc bfd th qtr net shr one dlr vs...


## Term Frequency Featurization

In [6]:
min_df = 2

# Keyword options passed to TermFrequency. The vectorizer repr displayed by
# this cell shows the same values on the resulting CountVectorizer.
tf_options = dict(
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.8,
    min_df=min_df,
    max_features=None,
    vocabulary=None,
)
tf = TermFrequency(label_names, **tf_options)

# Vectorize the raw text, then display the configured vectorizer.
tf.vectorize_corpus(df_helper.raw_text)
tf.vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Split Dataset into Trainset and Testset

In [7]:
# Split settings: test_size=0.3 presumably holds out 30% of the samples for
# testing, and random_state fixes the seed so the split is repeatable.
tf_split_options = {"test_size": 0.3, "random_state": 3}
tf_model_evaluation = ModelEvaluationHelper(
    tf.X, df_helper.y, **tf_split_options)

## Select Hyperparameter Range for Grid Search

In [8]:
# Search grid: 3 regularization strengths x 3 penalty types = 9 candidates.
hyperparameters = dict(
    alpha=(1e-04, 1e-05, 1e-06),
    penalty=('l2', 'l1', 'elasticnet'),
)

## Select SGD Classifier and Scoring Metric

In [9]:
# Model identifiers that can be handed to cross_val_grid_search in the cells
# below (three of them are exercised in this notebook).
models = ["linearSVM", "logisticRegression", "neuralNet", "modified_huber", "squared_hinge"]

# Scoring metric names. These follow scikit-learn's scoring-string vocabulary;
# the ROC-AUC scorer is registered there as 'roc_auc' ('auc_roc' is not a
# recognised name and would be rejected if passed through to scikit-learn).
# NOTE(review): plain 'roc_auc' targets binary problems; with the eight labels
# used here a multiclass variant may be required — confirm before using it.
scores = ['precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']

## Linear SVM: Run Grid Search with Cross Validation

In [10]:
# Register the search grid on the term-frequency evaluator, then run a 5-fold
# cross-validated grid search for the linear SVM, scored by weighted F1.
tf_model_evaluation.set_hyperparam_grid(hyperparameters)
tf_model_evaluation.cross_val_grid_search(
    "linearSVM",
    "f1_weighted",
    cv=5,
)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score: 0.963
Best parameters set:
	alpha: 0.0001
	penalty: 'elasticnet'

Grid scores on training set:

0.960 (+/-0.008) for {'penalty': 'l2', 'alpha': 0.0001}
0.960 (+/-0.018) for {'penalty': 'l1', 'alpha': 0.0001}
0.963 (+/-0.013) for {'penalty': 'elasticnet', 'alpha': 0.0001}
0.959 (+/-0.005) for {'penalty': 'l2', 'alpha': 1e-05}
0.961 (+/-0.013) for {'penalty': 'l1', 'alpha': 1e-05}
0.960 (+/-0.013) for {'penalty': 'elasticnet', 'alpha': 1e-05}
0.960 (+/-0.011) for {'penalty': 'l2', 'alpha': 1e-06}
0.962 (+/-0.012) for {'penalty': 'l1', 'alpha': 1e-06}
0.959 (+/-0.013) for {'penalty': 'elasticnet', 'alpha': 1e-06}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          0       0.98      0.99      0.99       826
          1       0.98      0.98      0.98       511
   

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    5.1s finished


## TF-IDF Featurization

In [11]:
min_df = 2

# Keyword options passed to TfIdf. The vectorizer repr displayed by this cell
# shows the same values on the resulting TfidfVectorizer.
tfidf_options = dict(
    norm='l2',
    smooth_idf=True,
    sublinear_tf=False,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.8,
    min_df=min_df,
    max_features=None,
    vocabulary=None,
)
tfidf = TfIdf(label_names, **tfidf_options)

# Vectorize the raw text, then display the configured vectorizer.
tfidf.vectorize_corpus(df_helper.raw_text)
tfidf.vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Split Dataset into Trainset and Testset

In [12]:
# Same split settings as the term-frequency evaluator (test_size=0.3,
# random_state=3), applied here to the TF-IDF features.
tfidf_split_options = {"test_size": 0.3, "random_state": 3}
tfidf_model_evaluation = ModelEvaluationHelper(
    tfidf.X, df_helper.y, **tfidf_split_options)

## Linear SVM: Run Grid Search with Cross Validation

In [13]:
# Register the search grid on the TF-IDF evaluator, then run a 5-fold
# cross-validated grid search for the linear SVM, scored by weighted F1.
tfidf_model_evaluation.set_hyperparam_grid(hyperparameters)
tfidf_model_evaluation.cross_val_grid_search(
    "linearSVM",
    "f1_weighted",
    cv=5,
)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score: 0.969
Best parameters set:
	alpha: 0.0001
	penalty: 'elasticnet'

Grid scores on training set:

0.968 (+/-0.010) for {'penalty': 'l2', 'alpha': 0.0001}
0.964 (+/-0.014) for {'penalty': 'l1', 'alpha': 0.0001}
0.969 (+/-0.010) for {'penalty': 'elasticnet', 'alpha': 0.0001}
0.966 (+/-0.014) for {'penalty': 'l2', 'alpha': 1e-05}
0.963 (+/-0.010) for {'penalty': 'l1', 'alpha': 1e-05}
0.965 (+/-0.015) for {'penalty': 'elasticnet', 'alpha': 1e-05}
0.960 (+/-0.015) for {'penalty': 'l2', 'alpha': 1e-06}
0.964 (+/-0.009) for {'penalty': 'l1', 'alpha': 1e-06}
0.964 (+/-0.011) for {'penalty': 'elasticnet', 'alpha': 1e-06}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       826
          1       0.96      0.99      0.97       511
   

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    4.9s finished


## Logistic Regression

In [14]:
# 5-fold cross-validated grid search for logistic regression on the TF-IDF
# features, scored by weighted F1. No grid is set in this cell, so it relies
# on the grid registered on tfidf_model_evaluation by an earlier cell.
tfidf_model_evaluation.cross_val_grid_search(
    "logisticRegression",
    "f1_weighted",
    cv=5,
)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score: 0.970
Best parameters set:
	alpha: 1e-05
	penalty: 'l1'

Grid scores on training set:

0.955 (+/-0.018) for {'penalty': 'l2', 'alpha': 0.0001}
0.957 (+/-0.017) for {'penalty': 'l1', 'alpha': 0.0001}
0.955 (+/-0.017) for {'penalty': 'elasticnet', 'alpha': 0.0001}
0.966 (+/-0.012) for {'penalty': 'l2', 'alpha': 1e-05}
0.970 (+/-0.009) for {'penalty': 'l1', 'alpha': 1e-05}
0.969 (+/-0.010) for {'penalty': 'elasticnet', 'alpha': 1e-05}
0.966 (+/-0.006) for {'penalty': 'l2', 'alpha': 1e-06}
0.966 (+/-0.008) for {'penalty': 'l1', 'alpha': 1e-06}
0.964 (+/-0.010) for {'penalty': 'elasticnet', 'alpha': 1e-06}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          0       0.98      0.99      0.99       826
          1       0.97      0.98      0.97       511
          2 

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    4.9s finished


## Fully Connected 1-layer Neural Network

In [15]:
# 5-fold cross-validated grid search for the "neuralNet" model on the TF-IDF
# features, scored by weighted F1. No grid is set in this cell, so it relies
# on the grid registered on tfidf_model_evaluation by an earlier cell.
tfidf_model_evaluation.cross_val_grid_search(
    "neuralNet",
    "f1_weighted",
    cv=5,
)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score: 0.967
Best parameters set:
	alpha: 1e-05
	penalty: 'l1'

Grid scores on training set:

0.957 (+/-0.013) for {'penalty': 'l2', 'alpha': 0.0001}
0.950 (+/-0.007) for {'penalty': 'l1', 'alpha': 0.0001}
0.957 (+/-0.011) for {'penalty': 'elasticnet', 'alpha': 0.0001}
0.957 (+/-0.008) for {'penalty': 'l2', 'alpha': 1e-05}
0.967 (+/-0.012) for {'penalty': 'l1', 'alpha': 1e-05}
0.961 (+/-0.005) for {'penalty': 'elasticnet', 'alpha': 1e-05}
0.961 (+/-0.010) for {'penalty': 'l2', 'alpha': 1e-06}
0.964 (+/-0.014) for {'penalty': 'l1', 'alpha': 1e-06}
0.961 (+/-0.014) for {'penalty': 'elasticnet', 'alpha': 1e-06}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          0       0.98      0.99      0.98       826
          1       0.97      0.97      0.97       511
          2 

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    5.0s finished
