In [1]:
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scikitplot.metrics import plot_confusion_matrix

In [2]:
#get all files path
posFiles = glob('review_polarity/txt_sentoken/pos/*')
negFiles = glob('review_polarity/txt_sentoken/neg/*')
#read text files
posReviews = np.array([open(f).read() for f in posFiles])
negReviews = np.array([open(f).read() for f in negFiles])
#use pandas to label and mix the data
polarity_files_df = pd.DataFrame({'pos':posReviews,'neg':negReviews})
polarity_files_df = pd.melt(polarity_files_df, value_vars=['pos','neg'],value_name="text",var_name="label")
polarity_files_df["label_num"] = polarity_files_df.label.map({"neg":0, "pos":1})
polarity_files_df.sample(5)

Unnamed: 0,label,text,label_num
1420,neg,capsule : a ham-handed and over/underwritten m...,0
978,pos,""" when you get out of jail , you can kill him...",1
637,pos,natural born killers is really a very simple s...,1
1123,neg,note : some may consider portions of the follo...,0
1429,neg,a couple of criminals ( mario van peebles and ...,0


In [3]:
# split X and y into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(polarity_files_df.text, polarity_files_df.label_num, test_size=0.33, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1340,)
(660,)
(1340,)
(660,)


In [4]:
# remove English stop words
# include 1-grams and 2-grams (if 3-grams : seems not better because of the length of the vector)
# ignore terms that appear in more than 70% of the documents (intuitively meaningful, it is indeed the best multiple of 10% to have a good score )
# only keep terms that appear in at least 2 documents

# TODO : remove 'not' etc. from stopwords to not remove 'not' from sentences like 'this film is not bad'
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)),
                     ('nb', MultinomialNB()),
                    ])

text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
# à vérifier : pos et neg sont biens les pos et neg
print(metrics.classification_report(y_test, y_pred,
    target_names=["pos","neg"]))
metrics.accuracy_score(y_test, y_pred)

# Plot non-normalized confusion matrix
#plt.figure()
#plot_confusion_matrix(cnf_matrix, classes=['pos','neg'],
#                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
#plt.figure()
#plot_confusion_matrix(cnf_matrix, classes=['pos','neg'], normalize=True,
#                      title='Normalized confusion matrix')

#plt.show()

              precision    recall  f1-score   support

         pos       0.81      0.80      0.80       335
         neg       0.79      0.80      0.80       325

   micro avg       0.80      0.80      0.80       660
   macro avg       0.80      0.80      0.80       660
weighted avg       0.80      0.80      0.80       660



0.8

In [5]:
#test a range of hyperparameters
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,3), max_df =0.7, min_df=2)),
                     ('nb', MultinomialNB()),
                    ])


text_clf.get_params()

{'memory': None,
 'steps': [('vect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.7, max_features=None, min_df=2,
           ngram_range=(1, 3), preprocessor=None, stop_words='english',
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
 'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.7, max_features=None, min_df=2,
         ngram_range=(1, 3), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'nb': MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
 'vect__analyzer': 'word',
 'v

In [6]:
from sklearn.model_selection import GridSearchCV
#TO ADAPT depending on what text_clf.get_params() displays
#how did we look for a good parameter alpha : try with 1e-3 and 1e-2 : 1e-2 is the best so we tried with 1e0 and 1e-1 and 1e-2 : 1e-1 is the best one
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'nb__alpha': (1e-2, 1e-1),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

In [7]:
gs_clf = gs_clf.fit(X_train, y_train)

gs_clf.best_score_  



0.7805970149253731

In [8]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

nb__alpha: 0.1
vect__ngram_range: (1, 1)


In [9]:
gs_clf.best_estimator_

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('nb', MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))])

In [10]:
gs_clf.best_estimator_.get_params()["nb"]

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [11]:
gs_clf.cv_results_

{'mean_fit_time': array([0.86149295, 2.63325866, 0.78922749, 2.75457239]),
 'std_fit_time': array([0.04743894, 0.08169105, 0.03576745, 0.24032973]),
 'mean_score_time': array([0.35999886, 0.56723094, 0.37510864, 0.51553424]),
 'std_score_time': array([0.00871117, 0.03332667, 0.02796307, 0.10340577]),
 'param_nb__alpha': masked_array(data=[0.01, 0.01, 0.1, 0.1],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range': masked_array(data=[(1, 1), (1, 2), (1, 1), (1, 2)],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'nb__alpha': 0.01, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.01, 'vect__ngram_range': (1, 2)},
  {'nb__alpha': 0.1, 'vect__ngram_range': (1, 1)},
  {'nb__alpha': 0.1, 'vect__ngram_range': (1, 2)}],
 'split0_test_score': array([0.75615213, 0.74496644, 0.78299776, 0.75838926]),
 'split1_test_score': array([0.7360179 , 0.75167785, 0.760

In [12]:
import pandas as pd
df = pd.DataFrame(gs_clf.cv_results_)
print(df)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.861493      0.047439         0.359999        0.008711   
1       2.633259      0.081691         0.567231        0.033327   
2       0.789227      0.035767         0.375109        0.027963   
3       2.754572      0.240330         0.515534        0.103406   

  param_nb__alpha param_vect__ngram_range  \
0            0.01                  (1, 1)   
1            0.01                  (1, 2)   
2             0.1                  (1, 1)   
3             0.1                  (1, 2)   

                                             params  split0_test_score  \
0  {'nb__alpha': 0.01, 'vect__ngram_range': (1, 1)}           0.756152   
1  {'nb__alpha': 0.01, 'vect__ngram_range': (1, 2)}           0.744966   
2   {'nb__alpha': 0.1, 'vect__ngram_range': (1, 1)}           0.782998   
3   {'nb__alpha': 0.1, 'vect__ngram_range': (1, 2)}           0.758389   

   split1_test_score  split2_test_score  mean_test_score  std_test_

In [13]:
##TODO : neg are neg ? pos are pos ?
nb = MultinomialNB()
# store the vocabulary of X_train
vect = CountVectorizer().fit(X_train)
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)
# examine the first 50 tokens
print(X_train_tokens[0:50])
# examine the last 50 tokens
print(X_train_tokens[-50:])
# Naive Bayes counts the number of times each token appears in each class
# trailing underscore is scikit convention for attributes that are learned during model fitting
X_train_dtm = vect.transform(X_train)
nb = nb.fit(X_train_dtm, y_train)
print(nb.feature_count_)
# rows represent classes, columns represent tokens
print(nb.feature_count_.shape)
# number of times each token appears across all HAM messages
pos_token_count = nb.feature_count_[0, :]
print(pos_token_count)
# number of times each token appears across all SPAM messages
neg_token_count = nb.feature_count_[1, :]
print(neg_token_count)
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({"token":X_train_tokens, "pos":pos_token_count, "neg":neg_token_count}).set_index("token")
print(tokens.head())
# examine 5 random DataFrame rows
print(tokens.sample(5, random_state=6))
# Naive Bayes counts the number of observations in each class
print(nb.class_count_)
# add 1 to ham and spam counts to avoid 0 probabilities
tokens['pos'] = tokens['pos'] + 1
tokens['neg'] = tokens['neg'] + 1
tokens.sample(5, random_state=6)
# convert the ham and spam counts into frequencies
tokens['pos'] = tokens['pos'] / nb.class_count_[0]
tokens['neg'] = tokens['neg'] / nb.class_count_[1]
tokens.sample(5, random_state=6)
# calculate the ratio of neg-to-pos for each token
tokens['neg_ratio'] = tokens['neg'] / tokens['pos']
print(tokens.sample(5, random_state=6))
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
print(tokens.sort_values('neg_ratio', ascending=False))
# look up the spam_ratio for a given token
print(tokens.loc["good", "neg_ratio"])

['00', '000', '0009f', '007', '00s', '05', '10', '100', '1000', '10000', '100m', '101', '102', '103', '104', '105', '106', '107', '108', '109', '10b', '10s', '10th', '11', '110', '111', '112', '113', '1138', '114', '115', '118', '11th', '12', '121', '125', '126', '127', '1272', '129', '1298', '12th', '13', '130', '1305', '132', '135', '137', '138', '13th']
['ziembicki', 'ziggy', 'zigs', 'zigzagged', 'zilch', 'zillion', 'zimbabwe', 'zimmer', 'zimmerman', 'zinger', 'zingers', 'zinnia', 'zip', 'zipped', 'zippel', 'zipper', 'zippers', 'zippy', 'zit', 'zodiac', 'zoe', 'zombie', 'zombies', 'zone', 'zones', 'zoo', 'zookeeper', 'zookeepers', 'zoolander', 'zoologist', 'zoologists', 'zoom', 'zooming', 'zooms', 'zoot', 'zorg', 'zorro', 'zsigmond', 'zucker', 'zuehlke', 'zuko', 'zukovsky', 'zulu', 'zundel', 'zurg', 'zweibel', 'zwick', 'zwigoff', 'zycie', 'zzzzzzz']
[[ 3. 49.  0. ...  1.  0.  1.]
 [ 1. 39.  1. ...  1.  2.  0.]]
(2, 33778)
[ 3. 49.  0. ...  1.  0.  1.]
[ 1. 39.  1. ...  1.  2.  0.]
 

In [14]:
#same parameter to vectorize data in order to compare to precedent methods
# Logistic regression
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)),
                     ('lr', LogisticRegression()),
                    ])

text_clf.fit(X_train, y_train)
y_pred_class = text_clf.predict(X_test)

y_pred_prob = text_clf.predict(X_test)

metrics.confusion_matrix(y_test, y_pred_class)

print(metrics.classification_report(y_test, y_pred_class,
    target_names=["pos","neg"]))
metrics.accuracy_score(y_test, y_pred_class)
metrics.roc_auc_score(y_test, y_pred_prob)



              precision    recall  f1-score   support

         pos       0.83      0.84      0.84       335
         neg       0.84      0.83      0.83       325

   micro avg       0.84      0.84      0.84       660
   macro avg       0.84      0.84      0.84       660
weighted avg       0.84      0.84      0.84       660



0.8362342135476464

In [31]:
# With TF
from sklearn.feature_extraction.text import TfidfTransformer

vect = CountVectorizer(stop_words='english', ngram_range =(1,2), max_df =0.7, min_df=2)

# combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)
#look at the training data
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

#Compute TF
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_dtm)
X_train_tf.shape
X_test_tf = TfidfTransformer(use_idf=False).fit_transform(X_train_dtm)
#Train logistic regression with TF representation
lr = LogisticRegression().fit(X_train_tf, y_train)

y_pred_class = lr.predict(X_test_tf)

y_pred_prob = lr.predict_proba(X_test_tf)

metrics.confusion_matrix(y_test, y_pred_class)

print(metrics.classification_report(y_test, y_pred_class,
    target_names=["pos","neg"]))
metrics.accuracy_score(y_test, y_pred_class)
metrics.roc_auc_score(y_test, y_pred_prob)



ValueError: Found input variables with inconsistent numbers of samples: [660, 1340]

In [None]:

### Computing TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)
X_train_tfidf.shape

nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

X_test_tfidf = tfidf_transformer.transform(X_test_dtm)
print(X_test_tfidf.shape)
y_pred_class_tfidf = nb_tfidf.predict(X_test_tfidf)

metrics.accuracy_score(y_test, y_pred_class_tfidf)

plot_confusion_matrix(metrics.confusion_matrix(y_test, y_pred_class_tfidf),[])

In [18]:
#USE STRING KERNELS : not done until now in others group and suggested at the very end of the teacher's notebook

In [19]:
from sklearn.neighbors import KNeighborsClassifier
#test a range of hyperparameters
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', max_df =0.7, min_df=2)),
                     ('tfidf', TfidfTransformer()),
                     ('knn', KNeighborsClassifier()),
                    ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf' : (True,False),
              'knn__n_neighbors': (10, 25, 50),
              'knn__p' : (1,2),
              'knn__weights': ('uniform', 'distance'),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

gs_clf.fit(X_train, y_train)





GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ki',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'knn__n_neighbors': (10, 25, 50), 'knn__p': (1, 2), 'knn__weights': ('uniform', 'distance')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
print(gs_clf.best_score_)
df = pd.DataFrame({'rank':gs_clf.cv_results_['rank_test_score'], 
                  'ngram_range':gs_clf.cv_results_['param_vect__ngram_range'],
                  'tfidf':gs_clf.cv_results_['param_tfidf__use_idf'],
                  'knn__n_neighbors': gs_clf.cv_results_['param_knn__n_neighbors'],
                  'knn__p': gs_clf.cv_results_['param_knn__p'], 
                  'knn__weights': gs_clf.cv_results_['param_knn__weights'], 
                  'mean_test_score':gs_clf.cv_results_['mean_test_score'], 
                  'mean_train_score':gs_clf.cv_results_['mean_train_score']}).set_index('rank')
df.sort_values('rank',ascending=True).head(10)

knn__n_neighbors: 50
knn__p: 2
knn__weights: 'distance'
tfidf__use_idf: True
vect__ngram_range: (1, 2)
0.7626865671641792


Unnamed: 0_level_0,ngram_range,tfidf,knn__n_neighbors,knn__p,knn__weights,mean_test_score,mean_train_score
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"(1, 2)",True,50,2,distance,0.762687,1.0
2,"(1, 2)",True,50,2,uniform,0.75597,0.767172
3,"(1, 1)",True,50,2,distance,0.753731,1.0
4,"(1, 2)",False,50,2,distance,0.749254,1.0
5,"(1, 1)",False,50,2,distance,0.747761,1.0
6,"(1, 1)",True,50,2,uniform,0.741791,0.76941
7,"(1, 2)",False,25,2,distance,0.73806,1.0
8,"(1, 2)",False,25,2,uniform,0.736567,0.760822
8,"(1, 2)",False,50,2,uniform,0.736567,0.749627
10,"(1, 1)",False,50,2,uniform,0.734328,0.743657


In [None]:
from sklearn.ensemble import RandomForestClassifier
#test a range of hyperparameters
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english', max_df =0.7, min_df=2)),
                     ('tfidf', TfidfTransformer()),
                     ('knn', RandomForestClassifier()),
                    ])

parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf' : (True,False),
              'n_estimators': (10, 25, 50),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, return_train_score=True)

gs_clf.fit(X_train, y_train)

In [None]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
print(gs_clf.best_score_)
df = pd.DataFrame(gs_clf.cv_results_)
print(df.sort_values('rank',ascending=True))