# Various Classifier Evaluation
## Hyperparameter tuning with grid search and cross-validation
<br>
## Import Python Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext autoreload

## Import Utility Classes

In [2]:
%autoreload 2
import sys
sys.path.append('..')

from utils.data_loader import DataLoader
from utils.term_counter_helper import TermFrequency, TfIdf
from utils.data_frame_helper import DataFrameHelper
from utils.model_evaluation_helper import ModelEvaluationHelper


## Load Data

In [3]:
# Location of the training corpus, relative to the notebook's directory.
TRAINING_DATA_PATH = "../Data/trainingdata.txt"

# DataLoader is the project helper imported from utils.data_loader; per the
# cell output, load_data also reports how many sentences were read.
loader = DataLoader()
content = loader.load_data(TRAINING_DATA_PATH)

Number of Sentences:  5485



In [4]:
# One single-letter code per document class, "A" through "H".
label_names = np.array(list("ABCDEFGH"))

# Turn the loaded content into a labelled DataFrame, then wrap it in the
# project helper that later cells read `raw_text` and `y` from.
df = loader.get_data_frame(content, label_names)
df_helper = DataFrameHelper(df, label_names)

## Input Dataset in DataFrame Format

In [5]:
# Preview the first five rows (label + text) of the input frame.
df_helper.df.head(5)

Unnamed: 0,Labels,Text
0,A,champion products ch approves stock split cham...
1,B,computer terminal systems cpml completes sale ...
2,A,cobanco inc cbco year net shr cts vs dlrs net ...
3,A,am international inc am nd qtr jan oper shr lo...
4,A,brown forman inc bfd th qtr net shr one dlr vs...


## Select Hyperparameter Range for Grid Search

## Select SGD Classifier and Scoring Metric

In [6]:
# Candidate classifier identifiers, given as the string keys that are passed to
# ModelEvaluationHelper.cross_val_grid_search elsewhere in this notebook.
models = [
    "linearSVM",
    "logisticRegression",
    "neuralNet",
    "modified_huber",
    "squared_hinge",
]

# Candidate scoring metrics, spelled as scikit-learn scorer names.
# The ROC-AUC scorer is registered as 'roc_auc' ('auc_roc' is not a valid
# scorer string and is rejected by scikit-learn's scoring lookup).
# NOTE(review): the target here has eight classes; on scikit-learn >= 0.22 a
# multiclass variant such as 'roc_auc_ovr_weighted' may be what is wanted.
scores = [
    "precision_weighted",
    "recall_weighted",
    "f1_weighted",
    "roc_auc",
]

## Term-Frequency (TF) Featurization

In [15]:
# Minimum document frequency handed to the vectorizer (see `min_df` below).
min_df = 2

# Raw term counts are used here in place of TF-IDF weighting. The earlier
# TfIdf configuration took the same options plus norm='l2', smooth_idf=True
# and sublinear_tf=False (max_df candidates noted there: 0.5, 0.75, 1.0).
# The variable keeps the name `tfidf` because the cells below refer to it.
tf_options = dict(
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.8,
    min_df=min_df,
    max_features=None,
    vocabulary=None,
)
tfidf = TermFrequency(label_names, **tf_options)

# Fit the vectorizer on the raw corpus, then display its configuration.
tfidf.vectorize_corpus(df_helper.raw_text)
tfidf.vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Split Dataset into Trainset and Testset

In [16]:
# Split the vectorized corpus: 30% is held out as the test set, with a fixed
# seed so the split is reproducible.
tfidf_model_evaluation = ModelEvaluationHelper(
    tfidf.X,
    df_helper.y,
    test_size=0.3,
    random_state=3,
)

In [17]:
# Search grid: candidate values for the `alpha` hyperparameter, largest first.
hyperparameters = dict(alpha=(1.0, 0.1, 0.01, 0.001))

## Multinomial Naive Bayes: Run Grid Search with Cross Validation

In [18]:
# Register the alpha grid on the helper, then run a 5-fold cross-validated
# grid search for the "multinomialNB" model, scored by accuracy.
tfidf_model_evaluation.set_hyperparam_grid(hyperparameters)
tfidf_model_evaluation.cross_val_grid_search("multinomialNB", "accuracy", cv=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.945
Best parameters set:
	alpha: 1.0

Grid scores on training set:

0.945 (+/-0.019) for {'alpha': 1.0}
0.943 (+/-0.018) for {'alpha': 0.1}
0.939 (+/-0.023) for {'alpha': 0.01}
0.931 (+/-0.025) for {'alpha': 0.001}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          0       0.99      0.96      0.97       826
          1       0.93      0.97      0.95       511
          2       0.75      0.89      0.81        63
          3       1.00      0.79      0.88        42
          4       0.60      0.30      0.40        10
          5       0.95      0.96      0.96        83
          6       0.85      0.79      0.82        57
          7       0.70      0.78      0.74        54

avg / total       0.94      0.94      0.94      1646




[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.2s finished


In [19]:
# Same 5-fold, accuracy-scored grid search as above, for the "bernoulliNB"
# model; it reuses the hyperparameter grid already set on the helper.
tfidf_model_evaluation.cross_val_grid_search("bernoulliNB", "accuracy", cv=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.892
Best parameters set:
	alpha: 0.001

Grid scores on training set:

0.799 (+/-0.023) for {'alpha': 1.0}
0.878 (+/-0.017) for {'alpha': 0.1}
0.891 (+/-0.021) for {'alpha': 0.01}
0.892 (+/-0.026) for {'alpha': 0.001}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          0       0.94      0.95      0.95       826
          1       0.89      0.92      0.91       511
          2       0.62      0.71      0.67        63
          3       0.97      0.71      0.82        42
          4       1.00      0.30      0.46        10
          5       0.92      0.88      0.90        83
          6       0.74      0.70      0.72        57
          7       0.64      0.59      0.62        54

avg / total       0.90      0.90      0.90      1646




[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.2s finished


In [30]:
# Imported in this cell so it runs on a fresh kernel: neither name is imported
# by the import cells visible at the top of the notebook.
from sklearn.feature_selection import SelectKBest, chi2

# Compute chi-squared stats between each non-negative feature and class.
# The chi-square test measures dependence between stochastic variables, so
# keeping only the top-scoring features "weeds out" those most likely to be
# independent of class and therefore irrelevant for classification.

# Number of highest-scoring features to keep.
select_chi2 = 100

# scikit-learn >= 1.0 exposes get_feature_names_out(); get_feature_names() was
# removed in 1.2. Prefer the new accessor and fall back for older releases.
vectorizer = tfidf.vectorizer
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

print("Extracting %d best features by a chi-squared test" %
      select_chi2)
ch2 = SelectKBest(chi2, k=select_chi2)
# Fit the selector on the training split only, then apply the same column
# selection to the test split.
X_train = ch2.fit_transform(tfidf_model_evaluation.X_train, tfidf_model_evaluation.y_train)
X_test = ch2.transform(tfidf_model_evaluation.X_test)

# keep selected feature names
feature_names = np.asarray([feature_names[i] for i in ch2.get_support(indices=True)])

Extracting 100 best features by a chi-squared test


In [None]:
# Leftover scratch cell (never executed): a bare string literal that does
# nothing except evaluate to itself as the cell's value.
"multinomialNB"