<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Grid Search</a></span></li><li><span><a href="#Best-params-result" data-toc-modified-id="Best-params-result-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Best params result</a></span></li></ul></div>

In [5]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [1]:
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from clustering_utils import *
from eda_utils import *
from myutils_V6 import *


In [2]:
train, test = load_data()
train, upsampling_info = upsampling_train(train)

train_text, train_label = train_augmentation(train, select_comb=None)
test_text, test_label = test['text'], test['label']

# test_text = test_text.apply(lambda x: normal_string(x))
# train_text = train_text.apply(lambda x: normal_string(x))


may use cols: 
 ['global_index', 'doc_path', 'label', 'reply', 'reference_one', 'reference_two', 'tag_reply', 'tag_reference_one', 'tag_reference_two', 'Subject', 'From', 'Lines', 'Organization', 'contained_emails', 'long_string', 'text', 'error_message']


In [3]:
####################################
### label mapper
####################################
labels = sorted(train_label.unique())
label_mapper = dict(zip(labels, range(len(labels))))
train_label = train_label.map(label_mapper)
test_label = test_label.map(label_mapper)
y_train = train_label
y_test = test_label

print(train_text.shape)
print(test_text.shape)
print(train_label.shape)
print(test_label.shape)

(11813,)
(7761,)
(11813,)
(7761,)


# Grid Search

In [9]:
metric = "f1_macro"

text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])
parameters = {'tfidf__min_df': [1, 3, 5], 'tfidf__stop_words': [None, 'english'], 'tfidf__use_idf': [True, False], 'tfidf__binary': [True, False],
              'clf__alpha': [0.6, 0.8, 1]}
gs_clf = GridSearchCV(text_clf, scoring=metric, param_grid=parameters, cv=4)
gs_clf = gs_clf.fit(train_text, y_train)

for param_name in gs_clf.best_params_:
    print("{0}:\t{1}".format(param_name, gs_clf.best_params_[param_name]))

print("best f1 score: {:.3f}".format(gs_clf.best_score_))
cv_results = pd.DataFrame(gs_clf.cv_results_)
cv_results.to_excel(f"NB_cv_result_{nb_name}.xlsx")

clf__alpha:	0.6
tfidf__binary:	True
tfidf__min_df:	3
tfidf__stop_words:	english
tfidf__use_idf:	True
best f1 score: 0.889


In [10]:
metric = "f1_macro"

text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
parameters = {'tfidf__min_df': [1, 3, 5], 'tfidf__stop_words': [None, 'english'], 'tfidf__use_idf': [True, False], 'tfidf__binary': [True, False],
             'clf__penalty':['l2'], 'clf__C':[1,2,3]}
gs_clf = GridSearchCV(text_clf, scoring=metric, param_grid=parameters, cv=4)
gs_clf = gs_clf.fit(train_text, y_train)

for param_name in gs_clf.best_params_:
    print("{0}:\t{1}".format(param_name, gs_clf.best_params_[param_name]))

print("best f1 score: {:.3f}".format(gs_clf.best_score_))
cv_results = pd.DataFrame(gs_clf.cv_results_)
cv_results.to_excel(f"SVC_cv_result_{nb_name}.xlsx")

clf__C:	1
clf__penalty:	l2
tfidf__binary:	True
tfidf__min_df:	1
tfidf__stop_words:	english
tfidf__use_idf:	True
best f1 score: 0.923


# Best params result

In [5]:
X_train, X_test, word_to_idx, tfidf_vect = tfidf_vectorizer(train_text, test_text, binary=True, min_df=1)
# X_train, X_test, word_to_idx, tfidf_vect = count_vectorizer(train_text, test_text, binary=True, min_df=1)
X_train, transform_mapper = dimension_reduction(X_train, out_dim=1000)
X_test = transform_mapper.transform(X_test)

print('X_train.shape', X_train.shape)
print('X_test.shape', X_test.shape)



clf = LinearSVC(penalty="l2", multi_class='ovr', C=1.0, dual=True, max_iter=3000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(metrics.classification_report(y_true = y_test, y_pred=pred, target_names=labels))

##########################################
## CV shows the stable result
##########################################
cv_metrics = ["precision_macro","accuracy", "f1_macro", "f1_micro"]
cv = cross_validate(clf, X_train, y_train, scoring=cv_metrics, cv=4, return_train_score=True)
cv = pd.DataFrame(cv)
f1 = cv['test_f1_macro'].mean()
print("cv average f1 macro: ", f1)

cv

num of words: 127414
Dimension reduction with truncate SVD:
   input columns with  127414
   output columns with  1000
X_train.shape (11813, 1000)
X_test.shape (7761, 1000)
                          precision    recall  f1-score   support

             alt.atheism       0.81      0.73      0.77       319
           comp.graphics       0.76      0.75      0.75       389
 comp.os.ms-windows.misc       0.78      0.77      0.78       394
comp.sys.ibm.pc.hardware       0.73      0.70      0.72       392
   comp.sys.mac.hardware       0.78      0.83      0.81       385
          comp.windows.x       0.87      0.79      0.83       395
            misc.forsale       0.82      0.89      0.85       390
               rec.autos       0.92      0.89      0.91       395
         rec.motorcycles       0.95      0.96      0.96       398
      rec.sport.baseball       0.79      0.96      0.87       397
        rec.sport.hockey       0.99      0.87      0.93       827
               sci.crypt       0.9

Unnamed: 0,fit_time,score_time,test_precision_macro,train_precision_macro,test_accuracy,train_accuracy,test_f1_macro,train_f1_macro,test_f1_micro,train_f1_micro
0,7.141726,0.013002,0.898408,0.975268,0.89675,0.974941,0.897226,0.975039,0.89675,0.974941
1,7.246976,0.012002,0.919251,0.972628,0.918388,0.972348,0.918483,0.972403,0.918388,0.972348
2,7.151614,0.017002,0.921163,0.971405,0.919404,0.971106,0.919956,0.971216,0.919404,0.971106
3,7.109188,0.015998,0.90414,0.97366,0.903827,0.973363,0.903667,0.973508,0.903827,0.973363


In [13]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(metrics.classification_report(y_true = y_test, y_pred=pred, target_names=labels))


##########################################
## CV shows the stable result
##########################################
cv_metrics = ["precision_macro","accuracy", "f1_macro"]
cv = cross_validate(clf, X_train, y_train,scoring=cv_metrics, cv=4, return_train_score=True)
cv = pd.DataFrame(cv)
f1 = cv['test_f1_macro'].mean()
print("cv average f1 macro: ", f1)

cv

                          precision    recall  f1-score   support

             alt.atheism       0.83      0.74      0.78       319
           comp.graphics       0.58      0.81      0.68       389
 comp.os.ms-windows.misc       0.20      0.01      0.01       394
comp.sys.ibm.pc.hardware       0.52      0.78      0.62       392
   comp.sys.mac.hardware       0.78      0.83      0.80       385
          comp.windows.x       0.78      0.81      0.79       395
            misc.forsale       0.84      0.80      0.82       390
               rec.autos       0.85      0.91      0.88       395
         rec.motorcycles       0.92      0.95      0.93       398
      rec.sport.baseball       0.77      0.95      0.85       397
        rec.sport.hockey       0.98      0.83      0.90       827
               sci.crypt       0.85      0.93      0.89       396
         sci.electronics       0.79      0.73      0.76       393
                 sci.med       0.73      0.92      0.82       198
         

Unnamed: 0,fit_time,score_time,test_precision_macro,train_precision_macro,test_accuracy,train_accuracy,test_f1_macro,train_f1_macro
0,0.051996,0.013001,0.869581,0.940515,0.844617,0.934981,0.829927,0.920417
1,0.053,0.013,0.85481,0.938635,0.865222,0.93228,0.848576,0.916261
2,0.052002,0.012999,0.864089,0.937148,0.857094,0.9307,0.843153,0.914687
3,0.050997,0.013001,0.866366,0.967518,0.864883,0.96693,0.863328,0.966742


In [14]:

p = (clf.predict_proba(test_dtm))[:, 1] 
p_binary = clf.predict(test_dtm)

p_label = 2
fpr, tpr, thresholds = roc_curve(test_y, p, pos_label=p_label)
auc_score = auc(fpr, tpr)

pre, rec, thresholds = precision_recall_curve(test_y, p, pos_label=p_label)

# calculate auc
prc_score = auc(rec, pre)

if show_plots:

    print(classification_report(test_y, p_binary))

    plt.subplot(1, 2, 1)

    plt.plot(fpr, tpr, color='darkorange', lw=2)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('AUC')

    plt.subplot(1, 2, 2)
    plt.plot(rec, pre, color='darkorange', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PRC')
    plt.tight_layout()

    plt.show()

print("AUC: {:.2%}, PRC: {:.2%}".format(auc_score, prc_score))


NameError: name 'test_dtm' is not defined