In [2]:
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scikitplot.metrics import plot_confusion_matrix

  _nan_object_mask = _nan_object_array != _nan_object_array


In [3]:
#get all files path
posFiles = glob('review_polarity/txt_sentoken/pos/*')
negFiles = glob('review_polarity/txt_sentoken/neg/*')
#read text files
posReviews = np.array([open(f).read() for f in posFiles])
negReviews = np.array([open(f).read() for f in negFiles])
#use pandas to label and mix the data
polarity_files_df = pd.DataFrame({'pos':posReviews,'neg':negReviews})
polarity_files_df = pd.melt(polarity_files_df, value_vars=['pos','neg'],value_name="text",var_name="label")
polarity_files_df["label_num"] = polarity_files_df.label.map({"neg":0, "pos":1})
polarity_files_df.sample(5)

Unnamed: 0,label,text,label_num
729,pos,"while watching "" shallow grave , "" i found mys...",1
770,pos,"when i saw the trailer for this film , i laugh...",1
1944,neg,""" spice world "" is just one long promotional ...",0
1623,neg,i read the new yorker magazine and i enjoy som...,0
70,pos,when i first heard that kevin costner was maki...,1


In [4]:
# split X and y into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(polarity_files_df.text, polarity_files_df.label_num, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1340,)
(660,)
(1340,)
(660,)


In [5]:
# remove English stop words
# include 1-grams and 2-grams (if 3-grams : seems not better because of the length of the vector)
# ignore terms that appear in more than 70% of the documents (intuitively meaningful, it is indeed the best multiple of 10% to have a good score )
# only keep terms that appear in at least 2 documents

# TODO : remove 'not' etc. from stopwords to not remove 'not' from sentences like 'this film is not bad'
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)),
                     ('nb', MultinomialNB()),
                    ])

text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# à vérifier : pos et neg sont biens les pos et neg
print(metrics.classification_report(y_test, y_pred,
    target_names=["pos","neg"]))
metrics.accuracy_score(y_test, y_pred)

# Plot non-normalized confusion matrix
#plt.figure()
#plot_confusion_matrix(cnf_matrix, classes=['pos','neg'],
#                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
#plt.figure()
#plot_confusion_matrix(cnf_matrix, classes=['pos','neg'], normalize=True,
#                      title='Normalized confusion matrix')

#plt.show()

              precision    recall  f1-score   support

         pos       0.80      0.78      0.79       335
         neg       0.78      0.80      0.79       325

   micro avg       0.79      0.79      0.79       660
   macro avg       0.79      0.79      0.79       660
weighted avg       0.79      0.79      0.79       660



0.78939393939393943

In [17]:
#test a range of hyperparameters
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,3), max_df =0.7, min_df=2)),
                     ('nb', MultinomialNB()),
                    ])


text_clf.get_params()

{'memory': None,
 'nb': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.7, max_features=None, min_df=2,
           ngram_range=(1, 3), preprocessor=None, stop_words='english',
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.7, max_features=None, min_df=2,
         ngram_range=(1, 3), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
    

In [18]:
from sklearn.model_selection import GridSearchCV
#TO ADAPT depending on what text_clf.get_params() displays
#how did we look for a good parameter alpha : try with 1e-3 and 1e-2 : 1e-2 is the best so we tried with 1e0 and 1e-1 and 1e-2 : 1e-1 is the best one
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'nb__alpha': (1e-2, 1e-1),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

In [19]:
gs_clf = gs_clf.fit(X_train, y_train)

gs_clf.best_score_  



0.79552238805970155

In [69]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

nb__alpha: 0.1
tfidf__use_idf: False
vect__ngram_range: (1, 1)


In [62]:
gs_clf.best_estimator_

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...False,
         use_idf=False)), ('nb', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])

In [63]:
gs_clf.best_estimator_.get_params()["nb"]

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [64]:
gs_clf.cv_results_

{'mean_fit_time': array([ 0.90705363,  2.97942503,  0.81350295,  3.52606408,  0.84179568,
         3.05628093,  0.86787113,  2.99055672,  0.8450772 ,  2.92341105,
         0.79071728,  2.67280912]),
 'mean_score_time': array([ 0.36827596,  0.56174795,  0.35869686,  0.54121161,  0.4139332 ,
         0.64748732,  0.40891202,  0.57405368,  0.3593332 ,  0.57712452,
         0.34732358,  0.47334862]),
 'mean_test_score': array([ 0.80074627,  0.79328358,  0.82835821,  0.82014925,  0.77835821,
         0.77537313,  0.80149254,  0.79626866,  0.76119403,  0.76343284,
         0.78283582,  0.77835821]),
 'mean_train_score': array([ 0.99291072,  0.9981353 ,  0.98656841,  0.99515037,  0.99776203,
         1.        ,  0.99253787,  0.99925429,  0.99925429,  1.        ,
         0.99776203,  1.        ]),
 'param_nb__alpha': masked_array(data = [0.1 0.1 0.1 0.1 0.01 0.01 0.01 0.01 0.001 0.001 0.001 0.001],
              mask = [False False False False False False False False False False False False]

In [65]:
import pandas as pd
df = pd.DataFrame(gs_clf.cv_results_)
print(df)

    mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0        0.907054         0.368276         0.800746          0.992911   
1        2.979425         0.561748         0.793284          0.998135   
2        0.813503         0.358697         0.828358          0.986568   
3        3.526064         0.541212         0.820149          0.995150   
4        0.841796         0.413933         0.778358          0.997762   
5        3.056281         0.647487         0.775373          1.000000   
6        0.867871         0.408912         0.801493          0.992538   
7        2.990557         0.574054         0.796269          0.999254   
8        0.845077         0.359333         0.761194          0.999254   
9        2.923411         0.577125         0.763433          1.000000   
10       0.790717         0.347324         0.782836          0.997762   
11       2.672809         0.473349         0.778358          1.000000   

   param_nb__alpha param_tfidf__use_idf param_vect

In [82]:
##TODO : neg are neg ? pos are pos ?
nb = MultinomialNB()
# store the vocabulary of X_train
vect = CountVectorizer().fit(X_train)
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)
# examine the first 50 tokens
print(X_train_tokens[0:50])
# examine the last 50 tokens
print(X_train_tokens[-50:])
# Naive Bayes counts the number of times each token appears in each class
# trailing underscore is scikit convention for attributes that are learned during model fitting
X_train_dtm = vect.transform(X_train)
nb = nb.fit(X_train_dtm, y_train)
print(nb.feature_count_)
# rows represent classes, columns represent tokens
print(nb.feature_count_.shape)
# number of times each token appears across all HAM messages
pos_token_count = nb.feature_count_[0, :]
print(pos_token_count)
# number of times each token appears across all SPAM messages
neg_token_count = nb.feature_count_[1, :]
print(neg_token_count)
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({"token":X_train_tokens, "pos":pos_token_count, "neg":neg_token_count}).set_index("token")
print(tokens.head())
# examine 5 random DataFrame rows
print(tokens.sample(5, random_state=6))
# Naive Bayes counts the number of observations in each class
print(nb.class_count_)
# add 1 to ham and spam counts to avoid 0 probabilities
tokens['pos'] = tokens['pos'] + 1
tokens['neg'] = tokens['neg'] + 1
tokens.sample(5, random_state=6)
# convert the ham and spam counts into frequencies
tokens['pos'] = tokens['pos'] / nb.class_count_[0]
tokens['neg'] = tokens['neg'] / nb.class_count_[1]
tokens.sample(5, random_state=6)
# calculate the ratio of neg-to-pos for each token
tokens['neg_ratio'] = tokens['neg'] / tokens['pos']
print(tokens.sample(5, random_state=6))
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
print(tokens.sort_values('neg_ratio', ascending=False))
# look up the spam_ratio for a given token
print(tokens.loc["good", "neg_ratio"])

['00', '000', '007', '00s', '03', '04', '05', '05425', '10', '100', '1000', '100m', '101', '102', '103', '104', '105', '106', '107', '108', '109', '10b', '10s', '10th', '11', '110', '111', '112', '113', '114', '115', '117', '118', '11th', '12', '123', '125', '126', '127', '1272', '128', '129', '1298', '12th', '13', '130', '1305', '131', '132', '133']
['zi', 'zidler', 'ziegler', 'ziembicki', 'zigged', 'ziggy', 'zilch', 'zimbabwe', 'zimmely', 'zimmer', 'zimmerly', 'zinger', 'zingers', 'zinnia', 'zip', 'zipped', 'zippel', 'zipper', 'zippers', 'zippy', 'zips', 'ziyi', 'zodiac', 'zoe', 'zombie', 'zombies', 'zombified', 'zone', 'zones', 'zoo', 'zookeeper', 'zoolander', 'zoologist', 'zoom', 'zooming', 'zooms', 'zoot', 'zophres', 'zorg', 'zorro', 'zsigmond', 'zucker', 'zuehlke', 'zuko', 'zukovsky', 'zulu', 'zwick', 'zwigoff', 'zycie', 'zzzzzzz']
[[  5.  34.   3. ...,   0.   0.   1.]
 [  1.  33.   6. ...,   1.   2.   0.]]
(2, 34197)
[  5.  34.   3. ...,   0.   0.   1.]
[  1.  33.   6. ...,   1.

In [None]:
#same parameter to vectorize data in order to compare to precedent methods
# Logistic regression
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)),
                     ('nb', LogisticRegression()),
                    ])

text_clf.fit(X_train, y_train)
y_pred_class = text_clf.predict(X_test)

y_pred_prob = text_clf.predict(X_test)

metrics.confusion_matrix(y_test, y_pred_class)

print(metrics.classification_report(y_test, y_pred_class,
    target_names=["pos","neg"]))
metrics.accuracy_score(y_test, y_pred_class)
metrics.roc_auc_score(y_test, y_pred_prob)

In [31]:
# With TF
from sklearn.feature_extraction.text import TfidfTransformer

vect = CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)

# combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)
#look at the training data
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

#Compute TF
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_dtm)
X_train_tf.shape
X_test_tf = TfidTransformer(use_idf=False).fit_transform(X_train_dtm)
#Train logistic regression with TF representation
lr = LogisticRegression().fit(X_train_tf, y_train)

y_pred_class = lr.predict(X_test_tf)

y_pred_prob = lr.predict_proba(X_test_tf)

metrics.confusion_matrix(y_test, y_pred_class)

print(metrics.classification_report(y_test, y_pred_class,
    target_names=["pos","neg"]))
metrics.accuracy_score(y_test, y_pred_class)
metrics.roc_auc_score(y_test, y_pred_prob)

NameError: name 'TfidTransformer' is not defined

In [46]:

### Computing TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)
X_train_tfidf.shape

nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_tfidf = tfidf_transformer.transform(X_test_dtm)
print(X_test_tfidf.shape)
y_pred_class_tfidf = nb_tfidf.predict(X_test_tfidf)

metrics.accuracy_score(y_test, y_pred_class_tfidf)

plot_confusion_matrix(metrics.confusion_matrix(y_test, y_pred_class_tfidf),[])

(660, 50456)


ValueError: Found input variables with inconsistent numbers of samples: [2, 0]

AttributeError: module 'matplotlib.colors' has no attribute 'to_rgba'

In [None]:
#USE STRING KERNELS : not done until now in others group and suggested at the very end of the teacher's notebook

In [25]:
from sklearn.neighbors import KNeighborsClassifier
#test a range of hyperparameters
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', max_df =0.7, min_df=2)),
                     ('tfidf', TfidfTransformer()),
                     ('knn', KNeighborsClassifier()),
                    ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf' : (True,False),
              'knn__n_neighbors': (10, 25, 50),
              'knn__p' : (1,2),
              'knn__weights': ('uniform', 'distance'),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

gs_clf.fit(X_train, y_train)





GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'knn__n_neighbors': (10, 25, 50), 'tfidf__use_idf': (True, False), 'vect__ngram_range': [(1, 1), (1, 2)], 'knn__weights': ('uniform', 'distance'), 'knn__p': (1, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [26]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
print(gs_clf.best_score_)
df = pd.DataFrame(gs_clf.cv_results_)
print(df.sort_values('rank',ascending=True))

knn__n_neighbors: 50
knn__p: 2
knn__weights: 'distance'
tfidf__use_idf: True
vect__ngram_range: (1, 1)
0.764179104478


KeyError: 'rank'

In [None]:
from sklearn.ensemble import RandomForestClassifier
#test a range of hyperparameters
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', max_df =0.7, min_df=2)),
                     ('tfidf', TfidfTransformer()),
                     ('knn', RandomForestClassifier()),
                    ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf' : (True,False),
              'n_estimators': (10, 25, 50),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

gs_clf.fit(X_train, y_train)


Jupyter Notebook
saveBeforeMerge (auto-sauvegardé) Current Kernel Logo 

Python 3

    Fichier
    Édition
    Affichage
    Insérer
    Cellule
    Noyau
    Widgets
    Aide

import numpy as np

import matplotlib.pyplot as plt

from glob import glob

import pandas as pd

import itertools

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

from scikitplot.metrics import plot_confusion_matrix

/usr/local/lib/python3.5/dist-packages/sklearn/utils/fixes.py:313: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.
  _nan_object_mask = _nan_object_array != _nan_object_array

​

#get all files path

posFiles = glob('review_polarity/txt_sentoken/pos/*')

negFiles = glob('review_polarity/txt_sentoken/neg/*')

#read text files

posReviews = np.array([open(f).read() for f in posFiles])

negReviews = np.array([open(f).read() for f in negFiles])

#use pandas to label and mix the data

polarity_files_df = pd.DataFrame({'pos':posReviews,'neg':negReviews})

polarity_files_df = pd.melt(polarity_files_df, value_vars=['pos','neg'],value_name="text",var_name="label")

polarity_files_df["label_num"] = polarity_files_df.label.map({"neg":0, "pos":1})

polarity_files_df.sample(5)

	label 	text 	label_num
729 	pos 	while watching " shallow grave , " i found mys... 	1
770 	pos 	when i saw the trailer for this film , i laugh... 	1
1944 	neg 	" spice world " is just one long promotional ... 	0
1623 	neg 	i read the new yorker magazine and i enjoy som... 	0
70 	pos 	when i first heard that kevin costner was maki... 	1

# split X and y into training and testing sets

​

X_train, X_test, y_train, y_test = train_test_split(polarity_files_df.text, polarity_files_df.label_num, test_size=0.33, random_state=42)

print(X_train.shape)

print(X_test.shape)

print(y_train.shape)

print(y_test.shape)

(1340,)
(660,)
(1340,)
(660,)

# remove English stop words

# include 1-grams and 2-grams (if 3-grams : seems not better because of the length of the vector)

# ignore terms that appear in more than 70% of the documents (intuitively meaningful, it is indeed the best multiple of 10% to have a good score )

# only keep terms that appear in at least 2 documents

​

# TODO : remove 'not' etc. from stopwords to not remove 'not' from sentences like 'this film is not bad'

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)),

                     ('nb', MultinomialNB()),

                    ])

​

text_clf.fit(X_train, y_train)

y_pred = text_clf.predict(X_test)

​

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

# à vérifier : pos et neg sont biens les pos et neg

print(metrics.classification_report(y_test, y_pred,

    target_names=["pos","neg"]))

metrics.accuracy_score(y_test, y_pred)

​

# Plot non-normalized confusion matrix

#plt.figure()

#plot_confusion_matrix(cnf_matrix, classes=['pos','neg'],

#                      title='Confusion matrix, without normalization')

​

# Plot normalized confusion matrix

#plt.figure()

#plot_confusion_matrix(cnf_matrix, classes=['pos','neg'], normalize=True,

#                      title='Normalized confusion matrix')

​

#plt.show()

              precision    recall  f1-score   support

         pos       0.80      0.78      0.79       335
         neg       0.78      0.80      0.79       325

   micro avg       0.79      0.79      0.79       660
   macro avg       0.79      0.79      0.79       660
weighted avg       0.79      0.79      0.79       660

0.78939393939393943

​

#test a range of hyperparameters

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,3), max_df =0.7, min_df=2)),

                     ('nb', MultinomialNB()),

                    ])

​

​

text_clf.get_params()

{'memory': None,
 'nb': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.7, max_features=None, min_df=2,
           ngram_range=(1, 3), preprocessor=None, stop_words='english',
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.7, max_features=None, min_df=2,
         ngram_range=(1, 3), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.7,
 'vect__max_features': None,
 'vect__min_df': 2,
 'vect__ngram_range': (1, 3),
 'vect__preprocessor': None,
 'vect__stop_words': 'english',
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None}

from sklearn.model_selection import GridSearchCV

#TO ADAPT depending on what text_clf.get_params() displays

#how did we look for a good parameter alpha : try with 1e-3 and 1e-2 : 1e-2 is the best so we tried with 1e0 and 1e-1 and 1e-2 : 1e-1 is the best one

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],

              'nb__alpha': (1e-2, 1e-1),

}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

gs_clf = gs_clf.fit(X_train, y_train)

​

gs_clf.best_score_  

/usr/local/lib/python3.5/dist-packages/sklearn/model_selection/_split.py:1943: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)

0.79552238805970155

for param_name in sorted(parameters.keys()):

    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

nb__alpha: 0.1
tfidf__use_idf: False
vect__ngram_range: (1, 1)

gs_clf.best_estimator_

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...False,
         use_idf=False)), ('nb', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])

gs_clf.best_estimator_.get_params()["nb"]

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

gs_clf.cv_results_

{'mean_fit_time': array([ 0.90705363,  2.97942503,  0.81350295,  3.52606408,  0.84179568,
         3.05628093,  0.86787113,  2.99055672,  0.8450772 ,  2.92341105,
         0.79071728,  2.67280912]),
 'mean_score_time': array([ 0.36827596,  0.56174795,  0.35869686,  0.54121161,  0.4139332 ,
         0.64748732,  0.40891202,  0.57405368,  0.3593332 ,  0.57712452,
         0.34732358,  0.47334862]),
 'mean_test_score': array([ 0.80074627,  0.79328358,  0.82835821,  0.82014925,  0.77835821,
         0.77537313,  0.80149254,  0.79626866,  0.76119403,  0.76343284,
         0.78283582,  0.77835821]),
 'mean_train_score': array([ 0.99291072,  0.9981353 ,  0.98656841,  0.99515037,  0.99776203,
         1.        ,  0.99253787,  0.99925429,  0.99925429,  1.        ,
         0.99776203,  1.        ]),
 'param_nb__alpha': masked_array(data = [0.1 0.1 0.1 0.1 0.01 0.01 0.01 0.01 0.001 0.001 0.001 0.001],
              mask = [False False False False False False False False False False False False],
        fill_value = ?),
 'param_tfidf__use_idf': masked_array(data = [True True False False True True False False True True False False],
              mask = [False False False False False False False False False False False False],
        fill_value = ?),
 'param_vect__ngram_range': masked_array(data = [(1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2)
  (1, 1) (1, 2)],
              mask = [False False False False False False False False False False False False],
        fill_value = ?),
 'params': [{'nb__alpha': 0.1,
   'tfidf__use_idf': True,
   'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.1, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)},
  {'nb__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)},
  {'nb__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)},
  {'nb__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)},
  {'nb__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)},
  {'nb__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}],
 'rank_test_score': array([ 4,  6,  1,  2,  8, 10,  3,  5, 12, 11,  7,  8], dtype=int32),
 'split0_test_score': array([ 0.81431767,  0.80760626,  0.8344519 ,  0.82997763,  0.80089485,
         0.80089485,  0.81655481,  0.81879195,  0.7852349 ,  0.78747204,
         0.80313199,  0.80089485]),
 'split0_train_score': array([ 0.99216125,  1.        ,  0.98656215,  0.99552072,  0.99888018,
         1.        ,  0.99216125,  1.        ,  1.        ,  1.        ,
         0.99888018,  1.        ]),
 'split1_test_score': array([ 0.81655481,  0.80536913,  0.85011186,  0.83221477,  0.79418345,
         0.77628635,  0.82102908,  0.8098434 ,  0.76510067,  0.76286353,
         0.79642058,  0.77852349]),
 'split1_train_score': array([ 0.9944009 ,  0.99888018,  0.98992161,  0.99776036,  0.99888018,
         1.        ,  0.9944009 ,  1.        ,  1.        ,  1.        ,
         0.99888018,  1.        ]),
 'split2_test_score': array([ 0.77130045,  0.76681614,  0.80044843,  0.79820628,  0.73991031,
         0.74887892,  0.76681614,  0.76008969,  0.73318386,  0.73991031,
         0.74887892,  0.75560538]),
 'split2_train_score': array([ 0.99217002,  0.99552573,  0.98322148,  0.99217002,  0.99552573,
         1.        ,  0.99105145,  0.99776286,  0.99776286,  1.        ,
         0.99552573,  1.        ]),
 'std_fit_time': array([ 0.05344825,  0.22817698,  0.08103768,  0.14229889,  0.06774117,
         0.15168821,  0.05769988,  0.10017736,  0.0529942 ,  0.07123096,
         0.01431095,  0.09777595]),
 'std_score_time': array([ 0.01425502,  0.01814319,  0.0188923 ,  0.02524657,  0.05019106,
         0.05670544,  0.05020673,  0.05297161,  0.0030775 ,  0.04299396,
         0.01706694,  0.08716292]),
 'std_test_score': array([ 0.02081809,  0.01871667,  0.02072461,  0.01552557,  0.02729431,
         0.02124127,  0.02456054,  0.02581377,  0.0214248 ,  0.01941753,
         0.02414039,  0.01848626]),
 'std_train_score': array([ 0.00105372,  0.00190104,  0.00273532,  0.00229722,  0.0015813 ,
         0.        ,  0.00139309,  0.0010546 ,  0.0010546 ,  0.        ,
         0.0015813 ,  0.        ])}

import pandas as pd

df = pd.DataFrame(gs_clf.cv_results_)

print(df)

    mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0        0.907054         0.368276         0.800746          0.992911   
1        2.979425         0.561748         0.793284          0.998135   
2        0.813503         0.358697         0.828358          0.986568   
3        3.526064         0.541212         0.820149          0.995150   
4        0.841796         0.413933         0.778358          0.997762   
5        3.056281         0.647487         0.775373          1.000000   
6        0.867871         0.408912         0.801493          0.992538   
7        2.990557         0.574054         0.796269          0.999254   
8        0.845077         0.359333         0.761194          0.999254   
9        2.923411         0.577125         0.763433          1.000000   
10       0.790717         0.347324         0.782836          0.997762   
11       2.672809         0.473349         0.778358          1.000000   

   param_nb__alpha param_tfidf__use_idf param_vect__ngram_range  \
0              0.1                 True                  (1, 1)   
1              0.1                 True                  (1, 2)   
2              0.1                False                  (1, 1)   
3              0.1                False                  (1, 2)   
4             0.01                 True                  (1, 1)   
5             0.01                 True                  (1, 2)   
6             0.01                False                  (1, 1)   
7             0.01                False                  (1, 2)   
8            0.001                 True                  (1, 1)   
9            0.001                 True                  (1, 2)   
10           0.001                False                  (1, 1)   
11           0.001                False                  (1, 2)   

                                               params  rank_test_score  \
0   {'nb__alpha': 0.1, 'vect__ngram_range': (1, 1)...                4   
1   {'nb__alpha': 0.1, 'vect__ngram_range': (1, 2)...                6   
2   {'nb__alpha': 0.1, 'vect__ngram_range': (1, 1)...                1   
3   {'nb__alpha': 0.1, 'vect__ngram_range': (1, 2)...                2   
4   {'nb__alpha': 0.01, 'vect__ngram_range': (1, 1...                8   
5   {'nb__alpha': 0.01, 'vect__ngram_range': (1, 2...               10   
6   {'nb__alpha': 0.01, 'vect__ngram_range': (1, 1...                3   
7   {'nb__alpha': 0.01, 'vect__ngram_range': (1, 2...                5   
8   {'nb__alpha': 0.001, 'vect__ngram_range': (1, ...               12   
9   {'nb__alpha': 0.001, 'vect__ngram_range': (1, ...               11   
10  {'nb__alpha': 0.001, 'vect__ngram_range': (1, ...                7   
11  {'nb__alpha': 0.001, 'vect__ngram_range': (1, ...                8   

    split0_test_score  split0_train_score  split1_test_score  \
0            0.814318            0.992161           0.816555   
1            0.807606            1.000000           0.805369   
2            0.834452            0.986562           0.850112   
3            0.829978            0.995521           0.832215   
4            0.800895            0.998880           0.794183   
5            0.800895            1.000000           0.776286   
6            0.816555            0.992161           0.821029   
7            0.818792            1.000000           0.809843   
8            0.785235            1.000000           0.765101   
9            0.787472            1.000000           0.762864   
10           0.803132            0.998880           0.796421   
11           0.800895            1.000000           0.778523   

    split1_train_score  split2_test_score  split2_train_score  std_fit_time  \
0             0.994401           0.771300            0.992170      0.053448   
1             0.998880           0.766816            0.995526      0.228177   
2             0.989922           0.800448            0.983221      0.081038   
3             0.997760           0.798206            0.992170      0.142299   
4             0.998880           0.739910            0.995526      0.067741   
5             1.000000           0.748879            1.000000      0.151688   
6             0.994401           0.766816            0.991051      0.057700   
7             1.000000           0.760090            0.997763      0.100177   
8             1.000000           0.733184            0.997763      0.052994   
9             1.000000           0.739910            1.000000      0.071231   
10            0.998880           0.748879            0.995526      0.014311   
11            1.000000           0.755605            1.000000      0.097776   

    std_score_time  std_test_score  std_train_score  
0         0.014255        0.020818         0.001054  
1         0.018143        0.018717         0.001901  
2         0.018892        0.020725         0.002735  
3         0.025247        0.015526         0.002297  
4         0.050191        0.027294         0.001581  
5         0.056705        0.021241         0.000000  
6         0.050207        0.024561         0.001393  
7         0.052972        0.025814         0.001055  
8         0.003077        0.021425         0.001055  
9         0.042994        0.019418         0.000000  
10        0.017067        0.024140         0.001581  
11        0.087163        0.018486         0.000000  

##TODO : neg are neg ? pos are pos ?

nb = MultinomialNB()

# store the vocabulary of X_train

vect = CountVectorizer().fit(X_train)

X_train_tokens = vect.get_feature_names()

len(X_train_tokens)

# examine the first 50 tokens

print(X_train_tokens[0:50])

# examine the last 50 tokens

print(X_train_tokens[-50:])

# Naive Bayes counts the number of times each token appears in each class

# trailing underscore is scikit convention for attributes that are learned during model fitting

X_train_dtm = vect.transform(X_train)

nb = nb.fit(X_train_dtm, y_train)

print(nb.feature_count_)

# rows represent classes, columns represent tokens

print(nb.feature_count_.shape)

# number of times each token appears across all HAM messages

pos_token_count = nb.feature_count_[0, :]

print(pos_token_count)

# number of times each token appears across all SPAM messages

neg_token_count = nb.feature_count_[1, :]

print(neg_token_count)

# create a DataFrame of tokens with their separate ham and spam counts

tokens = pd.DataFrame({"token":X_train_tokens, "pos":pos_token_count, "neg":neg_token_count}).set_index("token")

print(tokens.head())

# examine 5 random DataFrame rows

print(tokens.sample(5, random_state=6))

# Naive Bayes counts the number of observations in each class

print(nb.class_count_)

# add 1 to ham and spam counts to avoid 0 probabilities

tokens['pos'] = tokens['pos'] + 1

tokens['neg'] = tokens['neg'] + 1

tokens.sample(5, random_state=6)

# convert the ham and spam counts into frequencies

tokens['pos'] = tokens['pos'] / nb.class_count_[0]

tokens['neg'] = tokens['neg'] / nb.class_count_[1]

tokens.sample(5, random_state=6)

# calculate the ratio of neg-to-pos for each token

tokens['neg_ratio'] = tokens['neg'] / tokens['pos']

print(tokens.sample(5, random_state=6))

# examine the DataFrame sorted by spam_ratio

# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier

print(tokens.sort_values('neg_ratio', ascending=False))

# look up the spam_ratio for a given token

print(tokens.loc["good", "neg_ratio"])

['00', '000', '007', '00s', '03', '04', '05', '05425', '10', '100', '1000', '100m', '101', '102', '103', '104', '105', '106', '107', '108', '109', '10b', '10s', '10th', '11', '110', '111', '112', '113', '114', '115', '117', '118', '11th', '12', '123', '125', '126', '127', '1272', '128', '129', '1298', '12th', '13', '130', '1305', '131', '132', '133']
['zi', 'zidler', 'ziegler', 'ziembicki', 'zigged', 'ziggy', 'zilch', 'zimbabwe', 'zimmely', 'zimmer', 'zimmerly', 'zinger', 'zingers', 'zinnia', 'zip', 'zipped', 'zippel', 'zipper', 'zippers', 'zippy', 'zips', 'ziyi', 'zodiac', 'zoe', 'zombie', 'zombies', 'zombified', 'zone', 'zones', 'zoo', 'zookeeper', 'zoolander', 'zoologist', 'zoom', 'zooming', 'zooms', 'zoot', 'zophres', 'zorg', 'zorro', 'zsigmond', 'zucker', 'zuehlke', 'zuko', 'zukovsky', 'zulu', 'zwick', 'zwigoff', 'zycie', 'zzzzzzz']
[[  5.  34.   3. ...,   0.   0.   1.]
 [  1.  33.   6. ...,   1.   2.   0.]]
(2, 34197)
[  5.  34.   3. ...,   0.   0.   1.]
[  1.  33.   6. ...,   1.   2.   0.]
        neg   pos
token            
00      1.0   5.0
000    33.0  34.0
007     6.0   3.0
00s     0.0   1.0
03      0.0   2.0
                 neg  pos
token                    
psyches          1.0  0.0
finnegan         0.0  3.0
trustworthiness  1.0  0.0
patton           2.0  2.0
salivate         0.0  1.0
[ 665.  675.]
                      neg       pos  neg_ratio
token                                         
psyches          0.002963  0.001504   1.970370
finnegan         0.001481  0.006015   0.246296
trustworthiness  0.002963  0.001504   1.970370
patton           0.004444  0.004511   0.985185
salivate         0.001481  0.003008   0.492593
                    neg       pos  neg_ratio
token                                       
mulan          0.142222  0.001504  94.577778
flynt          0.118519  0.001504  78.814815
sweetback      0.045926  0.001504  30.540741
ordell         0.044444  0.001504  29.555556
hedwig         0.042963  0.001504  28.570370
argento        0.041481  0.001504  27.585185
taran          0.040000  0.001504  26.600000
pleasantville  0.038519  0.001504  25.614815
lambeau        0.038519  0.001504  25.614815
fei            0.038519  0.001504  25.614815
lebowski       0.075556  0.003008  25.122222
chad           0.037037  0.001504  24.629630
mallory        0.035556  0.001504  23.644444
matilda        0.034074  0.001504  22.659259
rounders       0.032593  0.001504  21.674074
carver         0.032593  0.001504  21.674074
lumumba        0.032593  0.001504  21.674074
redford        0.031111  0.001504  20.688889
rico           0.031111  0.001504  20.688889
cauldron       0.062222  0.003008  20.688889
shrek          0.029630  0.001504  19.703704
maximus        0.028148  0.001504  18.718519
dolores        0.028148  0.001504  18.718519
bubby          0.028148  0.001504  18.718519
capone         0.026667  0.001504  17.733333
gale           0.026667  0.001504  17.733333
hen            0.026667  0.001504  17.733333
bianca         0.026667  0.001504  17.733333
motta          0.026667  0.001504  17.733333
damon          0.053333  0.003008  17.733333
...                 ...       ...        ...
silverman      0.001481  0.021053   0.070370
zach           0.001481  0.021053   0.070370
bont           0.001481  0.021053   0.070370
vikings        0.001481  0.021053   0.070370
silverstone    0.001481  0.021053   0.070370
sphere         0.001481  0.021053   0.070370
dwayne         0.001481  0.021053   0.070370
grinch         0.001481  0.022556   0.065679
caulder        0.001481  0.022556   0.065679
musketeer      0.001481  0.022556   0.065679
webb           0.002963  0.046617   0.063560
jericho        0.001481  0.024060   0.061574
brenner        0.001481  0.024060   0.061574
macdonald      0.001481  0.025564   0.057952
psychlo        0.001481  0.025564   0.057952
bilko          0.001481  0.025564   0.057952
sinise         0.001481  0.027068   0.054733
eszterhas      0.001481  0.027068   0.054733
mandingo       0.001481  0.027068   0.054733
kersey         0.001481  0.027068   0.054733
bronson        0.001481  0.027068   0.054733
alicia         0.002963  0.057143   0.051852
schumacher     0.002963  0.061654   0.048058
hewitt         0.001481  0.034586   0.042834
crawford       0.001481  0.036090   0.041049
freddie        0.001481  0.040602   0.036488
jakob          0.001481  0.042105   0.035185
prinze         0.001481  0.042105   0.035185
seagal         0.002963  0.094737   0.031276
nbsp           0.001481  0.088722   0.016698

[34197 rows x 3 columns]
1.0544946957

#same parameter to vectorize data in order to compare to precedent methods

# Logistic regression

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)),

                     ('nb', LogisticRegression()),

                    ])

​

text_clf.fit(X_train, y_train)

y_pred_class = text_clf.predict(X_test)

​

y_pred_prob = text_clf.predict(X_test)

​

metrics.confusion_matrix(y_test, y_pred_class)

​

print(metrics.classification_report(y_test, y_pred_class,

    target_names=["pos","neg"]))

metrics.accuracy_score(y_test, y_pred_class)

metrics.roc_auc_score(y_test, y_pred_prob)

# With TF

from sklearn.feature_extraction.text import TfidfTransformer

​

vect = CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)

​

# combine fit and transform into a single step

X_train_dtm = vect.fit_transform(X_train)

#look at the training data

pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

# transform testing data (using fitted vocabulary) into a document-term matrix

X_test_dtm = vect.transform(X_test)

X_test_dtm

​

#Compute TF

tf_transformer = TfidfTransformer(use_idf=False)

X_train_tf = tf_transformer.fit_transform(X_train_dtm)

X_train_tf.shape

X_test_tf = TfidTransformer(use_idf=False).fit_transform(X_train_dtm)

#Train logistic regression with TF representation

lr = LogisticRegression().fit(X_train_tf, y_train)

​

y_pred_class = lr.predict(X_test_tf)

​

y_pred_prob = lr.predict_proba(X_test_tf)

​

metrics.confusion_matrix(y_test, y_pred_class)

​

print(metrics.classification_report(y_test, y_pred_class,

    target_names=["pos","neg"]))

metrics.accuracy_score(y_test, y_pred_class)

metrics.roc_auc_score(y_test, y_pred_prob)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-31-5aced182d9df> in <module>
     16 X_train_tf = tf_transformer.fit_transform(X_train_dtm)
     17 X_train_tf.shape
---> 18 X_test_tf = TfidTransformer(use_idf=False).fit_transform(X_train_dtm)
     19 #Train logistic regression with TF representation
     20 lr = LogisticRegression().fit(X_train_tf, y_train)

NameError: name 'TfidTransformer' is not defined

​

### Computing TF-IDF

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)

X_train_tfidf.shape

​

nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

​

X_test_tfidf = tfidf_transformer.transform(X_test_dtm)

print(X_test_tfidf.shape)

y_pred_class_tfidf = nb_tfidf.predict(X_test_tfidf)

​

metrics.accuracy_score(y_test, y_pred_class_tfidf)

​

plot_confusion_matrix(metrics.confusion_matrix(y_test, y_pred_class_tfidf),[])

(660, 50456)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-46-7c07a568b88d> in <module>
     14 metrics.accuracy_score(y_test, y_pred_class_tfidf)
     15 
---> 16 plot_confusion_matrix(metrics.confusion_matrix(y_test, y_pred_class_tfidf),[])

/usr/local/lib/python3.5/dist-packages/scikitplot/metrics.py in plot_confusion_matrix(y_true, y_pred, labels, true_labels, pred_labels, title, normalize, hide_zeros, hide_counts, x_tick_rotation, ax, figsize, cmap, title_fontsize, text_fontsize)
    115         fig, ax = plt.subplots(1, 1, figsize=figsize)
    116 
--> 117     cm = confusion_matrix(y_true, y_pred, labels=labels)
    118     if labels is None:
    119         classes = unique_labels(y_true, y_pred)

/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight)
    251 
    252     """
--> 253     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    254     if y_type not in ("binary", "multiclass"):
    255         raise ValueError("%s is not supported" % y_type)

/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
     69     y_pred : array or indicator matrix
     70     """
---> 71     check_consistent_length(y_true, y_pred)
     72     type_true = type_of_target(y_true)
     73     type_pred = type_of_target(y_pred)

/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    228     if len(uniques) > 1:
    229         raise ValueError("Found input variables with inconsistent numbers of"
--> 230                          " samples: %r" % [int(l) for l in lengths])
    231 
    232 

ValueError: Found input variables with inconsistent numbers of samples: [2, 0]

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/ipykernel/pylab/backend_inline.py in show(close, block)
     37             display(
     38                 figure_manager.canvas.figure,
---> 39                 metadata=_fetch_figure_metadata(figure_manager.canvas.figure)
     40             )
     41     finally:

/usr/local/lib/python3.5/dist-packages/ipykernel/pylab/backend_inline.py in _fetch_figure_metadata(fig)
    172     """Get some metadata to help with displaying a figure."""
    173     # determine if a background is needed for legibility
--> 174     if _is_transparent(fig.get_facecolor()):
    175         # the background is transparent
    176         ticksLight = _is_light([label.get_color()

/usr/local/lib/python3.5/dist-packages/ipykernel/pylab/backend_inline.py in _is_transparent(color)
    193 def _is_transparent(color):
    194     """Determine transparency from alpha."""
--> 195     rgba = colors.to_rgba(color)
    196     return rgba[3] < .5

AttributeError: module 'matplotlib.colors' has no attribute 'to_rgba'

#USE STRING KERNELS : not done until now in others group and suggested at the very end of the teacher's notebook

from sklearn.neighbors import KNeighborsClassifier

#test a range of hyperparameters

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', max_df =0.7, min_df=2)),

                     ('tfidf', TfidfTransformer()),

                     ('knn', KNeighborsClassifier()),

                    ])

​

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],

              'tfidf__use_idf' : (True,False),

              'knn__n_neighbors': (10, 25, 50),

              'knn__p' : (1,2),

              'knn__weights': ('uniform', 'distance'),

}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

​

gs_clf.fit(X_train, y_train)

​

​

/usr/local/lib/python3.5/dist-packages/sklearn/model_selection/_split.py:1943: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'knn__n_neighbors': (10, 25, 50), 'tfidf__use_idf': (True, False), 'vect__ngram_range': [(1, 1), (1, 2)], 'knn__weights': ('uniform', 'distance'), 'knn__p': (1, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

for param_name in sorted(parameters.keys()):

    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

print(gs_clf.best_score_)

df = pd.DataFrame(gs_clf.cv_results_)

print(df.sort_values('rank',ascending=True))

knn__n_neighbors: 50
knn__p: 2
knn__weights: 'distance'
tfidf__use_idf: True
vect__ngram_range: (1, 1)
0.764179104478

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-26-cf35602fc9d0> in <module>
      3 print(gs_clf.best_score_)
      4 df = pd.DataFrame(gs_clf.cv_results_)
----> 5 print(df.sort_values('rank',ascending=True))

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in sort_values(self, by, axis, ascending, inplace, kind, na_position)
   4419             by = by[0]
   4420             k = self._get_label_or_level_values(by, axis=axis,
-> 4421                                                 stacklevel=stacklevel)
   4422 
   4423             if isinstance(ascending, (tuple, list)):

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_label_or_level_values(self, key, axis, stacklevel)
   1380             values = self.axes[axis].get_level_values(key)._values
   1381         else:
-> 1382             raise KeyError(key)
   1383 
   1384         # Check for duplicates

KeyError: 'rank'

from sklearn.ensemble import RandomForestClassifier

#test a range of hyperparameters

text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', max_df =0.7, min_df=2)),

                     ('tfidf', TfidfTransformer()),

                     ('knn', RandomForestClassifier()),

                    ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],

              'tfidf__use_idf' : (True,False),

              'n_estimators': (10, 25, 50),

}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

gs_clf.fit(X_train, y_train)

In [None]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
print(gs_clf.best_score_)
df = pd.DataFrame(gs_clf.cv_results_)
print(df.sort_values('rank',ascending=True))