# Various Classifier Evaluation
## Hyperparameter tuning with grid search and cross-validation
<br>
## Import Python Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext autoreload

## Import Utility Classes

In [2]:
%autoreload 2
import sys
sys.path.append('..')

from utils.data_loader import DataLoader
from utils.term_counter_helper import TermFrequency, TfIdf
from utils.data_frame_helper import DataFrameHelper
from utils.model_evaluation_helper import ModelEvaluationHelper
from utils.classifier_helper import Classifier

## Load Data

In [3]:
loader = DataLoader()
content = loader.load_data("../Data/trainingdata.txt")

Number of Sentences:  5485



In [4]:
label_names = np.array(["A", "B", "C", "D", "E", "F", "G", "H"])
df = loader.get_data_frame(content, label_names)
df_helper = DataFrameHelper(df, label_names)

## Input Dataset in DataFrame Format

In [5]:
df_helper.df.head()

Unnamed: 0,Labels,Text
0,A,champion products ch approves stock split cham...
1,B,computer terminal systems cpml completes sale ...
2,A,cobanco inc cbco year net shr cts vs dlrs net ...
3,A,am international inc am nd qtr jan oper shr lo...
4,A,brown forman inc bfd th qtr net shr one dlr vs...


## Select Hyperparameter Range for Gird Search

## Select SGD Classifier and Scoring Metric

In [6]:
models = ["linearSVM", "logisticRegression", "neuralNet", "modified_huber", "squared_hinge"]
scores = ['precision_weighted', 'recall_weighted', 'f1_weighted', 'auc_roc']

In [7]:
min_df = 2

# Change to TF!
tf = TermFrequency(
    label_names,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.8,
    min_df = min_df,
    max_features=None,
    vocabulary=None)

tf.vectorize_corpus(df_helper.raw_text)
tf.vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [8]:
tf_model_evaluation = ModelEvaluationHelper(
    tf.X, df_helper.y, test_size=0.3, random_state=3, label_names = df_helper.label_names)

In [9]:
hyperparameters = {
    'alpha': (1.0, .1, .01, .001),
}

In [10]:
clf = Classifier()
multinomialNB = clf.multinomial_NB_clf("multinomialNB")

In [11]:
tf_model_evaluation.set_hyperparam_grid(hyperparameters)
tf_model_evaluation.cross_val_grid_search(multinomialNB, "accuracy", cv = 5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.945
Best parameters set:
	alpha: 1.0

Grid scores on training set:

0.945 (+/-0.019) for {'alpha': 1.0}
0.943 (+/-0.018) for {'alpha': 0.1}
0.939 (+/-0.023) for {'alpha': 0.01}
0.931 (+/-0.025) for {'alpha': 0.001}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          A       0.99      0.96      0.97       826
          B       0.93      0.97      0.95       511
          C       0.75      0.89      0.81        63
          D       1.00      0.79      0.88        42
          E       0.60      0.30      0.40        10
          F       0.95      0.96      0.96        83
          G       0.85      0.79      0.82        57
          H       0.70      0.78      0.74        54

avg / total       0.94      0.94      0.94      1646




[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.2s finished


In [12]:
bernoulliNB = clf.multinomial_NB_clf("bernoulliNB")
tf_model_evaluation.cross_val_grid_search(bernoulliNB, "accuracy", cv = 5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best score: 0.892
Best parameters set:
	alpha: 0.001

Grid scores on training set:

0.799 (+/-0.023) for {'alpha': 1.0}
0.878 (+/-0.017) for {'alpha': 0.1}
0.891 (+/-0.021) for {'alpha': 0.01}
0.892 (+/-0.026) for {'alpha': 0.001}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          A       0.94      0.95      0.95       826
          B       0.89      0.92      0.91       511
          C       0.62      0.71      0.67        63
          D       0.97      0.71      0.82        42
          E       1.00      0.30      0.46        10
          F       0.92      0.88      0.90        83
          G       0.74      0.70      0.72        57
          H       0.64      0.59      0.62        54

avg / total       0.90      0.90      0.90      1646




[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.2s finished


## TF-IDF Featurization

In [13]:
min_df = 2

tfidf = TfIdf(
    label_names,
    norm='l2',
    smooth_idf = True,
    sublinear_tf = False,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words='english',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=0.8, # 0.5, 0.75, 1.0
    min_df = min_df,
    max_features=None,
    vocabulary=None)



tfidf.vectorize_corpus(df_helper.raw_text)
tfidf.vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Split Dataset into Trainset and Testset

In [14]:
tfidf_model_evaluation = ModelEvaluationHelper(
    tfidf.X, df_helper.y, test_size=0.3, random_state=3, label_names = df_helper.label_names)

In [15]:
hyperparameters = {
    'n_neighbors': (2, 5),
    'weights': ("uniform", "distance"),
    'metric': ("cosine", "euclidean", "minkowski")
}

In [16]:
# KNN
# weights: uniform", "distance"
# metric: “euclidean”, “manhattan”, “chebyshev”, “minkowski”, “wminkowski”, “seuclidean”, “mahalanobis”

knn = clf.multinomial_neighbors_clf("KNN")#, n_neighbors=5, weights="distance", metric="minkowski")
tfidf_model_evaluation.set_hyperparam_grid(hyperparameters)
tfidf_model_evaluation.cross_val_grid_search(knn, "f1_weighted", cv = 5)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   16.1s


Best score: 0.914
Best parameters set:
	metric: 'euclidean'
	n_neighbors: 5
	weights: 'distance'

Grid scores on training set:

0.853 (+/-0.023) for {'n_neighbors': 2, 'weights': 'uniform', 'metric': 'cosine'}
0.877 (+/-0.035) for {'n_neighbors': 2, 'weights': 'distance', 'metric': 'cosine'}
0.905 (+/-0.010) for {'n_neighbors': 5, 'weights': 'uniform', 'metric': 'cosine'}
0.903 (+/-0.018) for {'n_neighbors': 5, 'weights': 'distance', 'metric': 'cosine'}
0.853 (+/-0.023) for {'n_neighbors': 2, 'weights': 'uniform', 'metric': 'euclidean'}
0.889 (+/-0.027) for {'n_neighbors': 2, 'weights': 'distance', 'metric': 'euclidean'}
0.905 (+/-0.011) for {'n_neighbors': 5, 'weights': 'uniform', 'metric': 'euclidean'}
0.914 (+/-0.010) for {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean'}
0.853 (+/-0.023) for {'n_neighbors': 2, 'weights': 'uniform', 'metric': 'minkowski'}
0.889 (+/-0.027) for {'n_neighbors': 2, 'weights': 'distance', 'metric': 'minkowski'}
0.905 (+/-0.011) for {'n_neig

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   19.7s finished


             precision    recall  f1-score   support

          A       0.90      0.98      0.94       826
          B       0.96      0.85      0.90       511
          C       0.77      0.89      0.82        63
          D       0.94      0.74      0.83        42
          E       0.86      0.60      0.71        10
          F       0.93      0.93      0.93        83
          G       0.85      0.79      0.82        57
          H       0.77      0.67      0.71        54

avg / total       0.91      0.91      0.91      1646




In [17]:
hyperparameters = {
    'metric': ("cosine", "euclidean","l2")
}

In [18]:
# ‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’
NearestCentroid = clf.multinomial_neighbors_clf("NearestCentroid")#, n_neighbors=5, weights="distance", metric="minkowski")
tfidf_model_evaluation.set_hyperparam_grid(hyperparameters)
tfidf_model_evaluation.cross_val_grid_search(NearestCentroid, "f1_weighted", cv = 5)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




Best score: 0.861
Best parameters set:
	metric: 'cosine'

Grid scores on training set:

0.861 (+/-0.018) for {'metric': 'cosine'}
0.815 (+/-0.029) for {'metric': 'euclidean'}
0.815 (+/-0.029) for {'metric': 'l2'}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          A       1.00      0.82      0.90       826
          B       0.79      0.96      0.87       511
          C       0.88      0.97      0.92        63
          D       0.89      0.81      0.85        42
          E       0.62      1.00      0.77        10
          F       0.88      0.99      0.93        83
          G       0.68      0.93      0.79        57
          H       0.78      0.72      0.75        54

avg / total       0.90      0.88      0.88      1646




[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.2s finished


In [23]:
hyperparameters = {'gamma': [1e-1, 1e-2],
                   'C': [100, 1000, 10000]
                   }

In [24]:
gaussianSVM = clf.multinomial_SVM_clf("gaussianSVM", class_weight='balanced', degree=3, shrinking=True, probability=False)
tfidf_model_evaluation.set_hyperparam_grid(hyperparameters)
tfidf_model_evaluation.cross_val_grid_search(gaussianSVM, "accuracy", cv = 5)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.4min finished


Best score: 0.963
Best parameters set:
	C: 1000
	gamma: 0.1

Grid scores on training set:

0.962 (+/-0.014) for {'gamma': 0.1, 'C': 100}
0.957 (+/-0.013) for {'gamma': 0.01, 'C': 100}
0.963 (+/-0.013) for {'gamma': 0.1, 'C': 1000}
0.962 (+/-0.014) for {'gamma': 0.01, 'C': 1000}
0.963 (+/-0.013) for {'gamma': 0.1, 'C': 10000}
0.962 (+/-0.013) for {'gamma': 0.01, 'C': 10000}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          A       0.98      0.98      0.98       826
          B       0.94      0.99      0.96       511
          C       0.91      0.94      0.92        63
          D       1.00      0.81      0.89        42
          E       1.00      0.40      0.57        10
          F       1.00      0.99      0.99        83
          G       0.87      0.82      0.85        57
          H       0.85      0.72      0.78        54

In [25]:
polynomialSVM = clf.multinomial_SVM_clf("polynomialSVM", class_weight='balanced', degree=3, shrinking=True, probability=False)
tfidf_model_evaluation.set_hyperparam_grid(hyperparameters)
tfidf_model_evaluation.cross_val_grid_search(polynomialSVM, "accuracy", cv = 5)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  4.1min finished


Best score: 0.812
Best parameters set:
	C: 10000
	gamma: 0.1

Grid scores on training set:

0.477 (+/-0.189) for {'gamma': 0.1, 'C': 100}
0.089 (+/-0.194) for {'gamma': 0.01, 'C': 100}
0.681 (+/-0.031) for {'gamma': 0.1, 'C': 1000}
0.185 (+/-0.388) for {'gamma': 0.01, 'C': 1000}
0.812 (+/-0.028) for {'gamma': 0.1, 'C': 10000}
0.092 (+/-0.190) for {'gamma': 0.01, 'C': 10000}

Detailed classification report:

The model is trained on the full train set with cross-validation.
The scores are computed on the full test set.

             precision    recall  f1-score   support

          A       0.98      0.91      0.95       826
          B       0.66      0.99      0.79       511
          C       1.00      0.46      0.63        63
          D       1.00      0.17      0.29        42
          E       1.00      0.10      0.18        10
          F       1.00      0.42      0.59        83
          G       0.84      0.37      0.51        57
          H       0.94      0.30      0.45        5

## Linear SVM: Run Grid Search with Cross Validation

In [18]:
"""
Compute chi-squared stats between each non-negative feature and class.
Recall that the chi-square test measures dependence between stochastic variables,
so using this function “weeds out” the features that are the most likely to be independent
of class and therefore irrelevant for classification.
"""
feature_names = tfidf.vectorizer.get_feature_names()
select_chi2 = 100
print("Extracting %d best features by a chi-squared test" %
      select_chi2)
ch2 = SelectKBest(chi2, k=select_chi2)
X_train = ch2.fit_transform(tfidf_model_evaluation.X_train, tfidf_model_evaluation.y_train)
X_test = ch2.transform(tfidf_model_evaluation.X_test)

# keep selected feature names
feature_names = np.asarray([feature_names[i] for i in ch2.get_support(indices=True)])

Extracting 100 best features by a chi-squared test


NameError: name 'SelectKBest' is not defined

In [None]:
"multinomialNB"