In [1]:
import pandas as pd

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='unicode_escape')

df['target'] = df.is_there_an_emotion_directed_at_a_brand_or_product

unique_emotions = df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts().index

for emotion in unique_emotions:
    if emotion == "Positive emotion":
        df.target.replace(to_replace=emotion, value=1, inplace=True)
    elif emotion == "Negative emotion":
        df.target.replace(to_replace=emotion, value=0, inplace=True)
    else:
        df.target.replace(to_replace=emotion, value=0, inplace=True)

display(df)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,1
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,0
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0


Early EDA revealed a single null row: nan in tweet text, nan in emotion, no emotion listed. Just dropping that one.

In [3]:
df = df[df.tweet_text.notnull()]

In [4]:
df.tweet_text.dropna(inplace=True)

Similarly, these one tweet hiding in the data that isn't a str, which throws off the tokenizer. So...

In [5]:
df.tweet_text = df.tweet_text.map(str)

In [6]:
nltk_sw = stopwords.words('english')
nltk_sw.extend(['sxsw', 'mention'])

In [7]:
X = df.tweet_text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42)

In [8]:
cvdict = {}

In [9]:
def reportcrossval(model, modelname, xtrain, ytrain):
    cvmean = cross_val_score(model, xtrain, ytrain).mean()
    cvdict[modelname] = cvmean 
    print(f"The cv mean of {modelname} is {cvmean}") 


In [10]:
pipeline_dummy = Pipeline([('dummy', DummyClassifier(strategy = 'most_frequent'))] ,  verbose = True)
pipeline_dummy.fit(X_train, y_train)
reportcrossval(pipeline_dummy, 'Dummy', X_train, y_train)

[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
The cv mean of Dummy is 0.6751722847006022


In [11]:
pipeline_count_vec_mnnb =Pipeline([('countvec', CountVectorizer()), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [12]:
pipeline_count_vec_mnnb.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec', CountVectorizer()), ('mnnb', MultinomialNB())],
         verbose=True)

In [13]:
reportcrossval(pipeline_count_vec_mnnb, 'Pipeline 1', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Pipeline 1 is 0.7212172131941144


In [14]:
grid1dict = {
   'countvec__min_df': [.01,.05,.1,.15, None],
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)],
   'countvec__stop_words': [None, 'english', nltk_sw]
}

Ran grid1 below, it took a long time. I've commented it out so it won't try to run again, but the best score and params are preserved as bestmodelfromgrid1.

In [56]:
grid1 = GridSearchCV(pipeline_count_vec_mnnb, grid1dict, verbose=1, n_jobs = 3)

In [57]:
grid1.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.5s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   14.1s
[Parallel(n_jobs=3)]: Done 445 out of 450 | elapsed:   29.8s remaining:    0.2s
[Parallel(n_jobs=3)]: Done 450 out of 450 | elapsed:   30.0s finished


[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


GridSearchCV(estimator=Pipeline(steps=[('countvec', CountVectorizer()),
                                       ('mnnb', MultinomialNB())],
                                verbose=True),
             n_jobs=3,
             param_grid={'countvec__min_df': [0.01, 0.05, 0.1, 0.15, None],
                         'countvec__ngram_range': [(1, 1), (1, 2), (1, 3),
                                                   (2, 2), (2, 3), (3, 3)],
                         'countvec__stop_words': [None, 'english',
                                                  ['i', 'me', 'my', 'myself',
                                                   'we', 'our', 'ours',
                                                   'ourselves', 'you', "you're",
                                                   "you've", "you'll", "you'd",
                                                   'your', 'yours', 'yourself',
                                                   'yourselves', 'he', 'him',
                            

In [58]:
grid1.best_estimator_ 

Pipeline(steps=[('countvec',
                 CountVectorizer(min_df=0.01,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [59]:
grid1.best_score_

0.6904211212544349

In [19]:
grid1.best_params_

{'countvec__min_df': 0.01,
 'countvec__ngram_range': (1, 1),
 'countvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',

In [20]:
bestmodelfromgrid1 = Pipeline([('countvec', CountVectorizer(
                                            min_df = 0.01,
                                            stop_words = nltk_sw)), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [21]:
bestmodelfromgrid1.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec',
                 CountVectorizer(min_df=0.01,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [22]:
reportcrossval(bestmodelfromgrid1, 'Best from Grid 1', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Best from Grid 1 is 0.6904211212544349


In [23]:
testmodelfromgrid = Pipeline([('countvec', CountVectorizer(
                                            ngram_range= (1, 3),
                                            stop_words = None)), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [24]:
reportcrossval(testmodelfromgrid, 'Test', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Test is 0.7332426099921899


Seems like whatever was going on with min/max df really hurt the model. Rerunning a grid with defaults on those params. (Should be much faster!)

In [25]:
grid2dict = {
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3),(1,4),(2,4),(3,4),(4,4)],
   'countvec__stop_words': [None, 'english', nltk_sw]
}

In [26]:
grid2 = GridSearchCV(pipeline_count_vec_mnnb, grid2dict, verbose=1)

In [27]:
grid2.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   47.4s finished


[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


GridSearchCV(estimator=Pipeline(steps=[('countvec', CountVectorizer()),
                                       ('mnnb', MultinomialNB())],
                                verbose=True),
             param_grid={'countvec__ngram_range': [(1, 1), (1, 2), (1, 3),
                                                   (2, 2), (2, 3), (3, 3),
                                                   (1, 4), (2, 4), (3, 4),
                                                   (4, 4)],
                         'countvec__stop_words': [None, 'english',
                                                  ['i', 'me', 'my', 'myself',
                                                   'we', 'our', 'ours',
                                                   'ourselves', 'you', "you're",
                                                   "you've", "you'll", "you'd",
                                                   'your', 'yours', 'yourself',
                                                   'yourselves', 'he', 

In [28]:
grid2.best_estimator_

Pipeline(steps=[('countvec', CountVectorizer(ngram_range=(1, 2))),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [29]:
grid2.best_score_

0.7338295505644569

In [30]:
grid2.best_params_

{'countvec__ngram_range': (1, 2), 'countvec__stop_words': None}

In [31]:
bestmodelfromgrid2 = Pipeline([('countvec', CountVectorizer(ngram_range=(1, 2))), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [32]:
bestmodelfromgrid2.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec', CountVectorizer(ngram_range=(1, 2))),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [33]:
reportcrossval(bestmodelfromgrid2, 'Best from Grid 2', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Best from Grid 2 is 0.7338295505644569


Commenting out the grid search again, to prevent it from running every time.

Bar some more preprocessing, futzing with RegEx, stemming and lemming etc, this might be the best we can get with CountVectorizing. Now to try some different models, and tf dif.

In [34]:
pipeline_count_vec_rf =Pipeline([('countvec', CountVectorizer()), 
                              ('rf', RandomForestClassifier(random_state = 42))] ,  verbose = True)

In [35]:
pipeline_count_vec_rf.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.2s


Pipeline(steps=[('countvec', CountVectorizer()),
                ('rf', RandomForestClassifier(random_state=42))],
         verbose=True)

In [36]:
reportcrossval(pipeline_count_vec_rf, 'Default CV and RF', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
The cv mean of Default CV and RF is 0.7256180841381893


Random Forest is a fair bit slower, but has decent results untuned. Going to run a modest gridsearch.

In [37]:
grid3dict = {
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)],
   'countvec__stop_words': [None, 'english', nltk_sw]
}

In [38]:
grid3 = GridSearchCV(pipeline_count_vec_rf, grid3dict, verbose=1)

In [39]:
grid3.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   8.2s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   7.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  14.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  12.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.4s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.4s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipel

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 13.8min finished


[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.5s


GridSearchCV(estimator=Pipeline(steps=[('countvec', CountVectorizer()),
                                       ('rf',
                                        RandomForestClassifier(random_state=42))],
                                verbose=True),
             param_grid={'countvec__ngram_range': [(1, 1), (1, 2), (1, 3),
                                                   (2, 2), (2, 3), (3, 3)],
                         'countvec__stop_words': [None, 'english',
                                                  ['i', 'me', 'my', 'myself',
                                                   'we', 'our', 'ours',
                                                   'ourselves', 'you', "you're",
                                                   "you've", "you'll", "you'd",
                                                   'your', 'yours', 'yourself',
                                                   'yourselves', 'he', 'him',
                                                   'his', 'himsel

In [40]:
grid3.best_estimator_

Pipeline(steps=[('countvec',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf', RandomForestClassifier(random_state=42))],
         verbose=True)

In [41]:
grid3.best_score_

0.7273767543133033

In [42]:
grid3.best_params_

{'countvec__ngram_range': (1, 1),
 'countvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out',
  'on',
 

In [43]:
bestmodelfromgrid3 = Pipeline([('countvec', CountVectorizer(stop_words= nltk_sw)), 
                              ('rf', RandomForestClassifier(random_state=42))] ,  verbose = True)

In [44]:
bestmodelfromgrid3.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.4s


Pipeline(steps=[('countvec',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf', RandomForestClassifier(random_state=42))],
         verbose=True)

In [45]:
reportcrossval(bestmodelfromgrid3, 'Best from Grid 3', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
The cv mean of Best from Grid 3 is 0.7273767543133033


In [46]:
grid4dict = {
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)],
   'countvec__stop_words': [None, 'english', nltk_sw],
   'rf__n_estimators': [50, 100, 150],
   'rf__max_features': ['auto', 'log2']
}

In [47]:
grid4 = GridSearchCV(pipeline_count_vec_rf, grid4dict, verbose=1)

In [48]:
grid4.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   1.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.3s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.2s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  15.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  14.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  14.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  15.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  14.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   6.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=  11.3s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  11.3s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  11.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  17.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  16.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  16.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  17.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.6s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.3s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  14.3s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.4s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   9.3s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   8.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  11.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  11.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  11.2s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=  14.5s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   6.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   6.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   6.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  12.1s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.4s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  13.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=  12.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipel

[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed: 85.5min finished


[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   7.8s


GridSearchCV(estimator=Pipeline(steps=[('countvec', CountVectorizer()),
                                       ('rf',
                                        RandomForestClassifier(random_state=42))],
                                verbose=True),
             param_grid={'countvec__ngram_range': [(1, 1), (1, 2), (1, 3),
                                                   (2, 2), (2, 3), (3, 3)],
                         'countvec__stop_words': [None, 'english',
                                                  ['i', 'me', 'my', 'myself',
                                                   'we', 'our', 'ours',
                                                   'ourselves', 'you', "you're",
                                                   "you've", "you'll", "you'd",
                                                   'your', 'yours', 'yourself',
                                                   'yourselves', 'he', 'him',
                                                   'his', 'himsel

In [49]:
print(grid4.best_estimator_)
grid4.best_score_

Pipeline(steps=[('countvec',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf',
                 RandomForestClassifier(n_estimators=150, random_state=42))],
         verbose=True)


0.7279639100397389

In [50]:
grid4.best_params_

{'countvec__ngram_range': (1, 1),
 'countvec__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out',
  'on',
 

In [51]:
bestfromgrid4 = Pipeline(steps=[
                ('countvec', CountVectorizer(stop_words = nltk_sw)),
                ('rf', RandomForestClassifier(random_state=42))])

In [52]:
bestfromgrid4.fit(X_train, y_train)

Pipeline(steps=[('countvec',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf', RandomForestClassifier(random_state=42))])

In [53]:
reportcrossval(bestfromgrid4, 'Best From Grid 4', X_train, y_train)

The cv mean of Best From Grid 4 is 0.7273767543133033


In [54]:
cvdict

{'Dummy': 0.6751722847006022,
 'Pipeline 1': 0.7212172131941144,
 'Best from Grid 1': 0.6904211212544349,
 'Test': 0.7332426099921899,
 'Best from Grid 2': 0.7338295505644569,
 'Default CV and RF': 0.7256180841381893,
 'Best from Grid 3': 0.7273767543133033,
 'Best From Grid 4': 0.7273767543133033}

In [55]:
bestmodelfromgrid2

Pipeline(steps=[('countvec', CountVectorizer(ngram_range=(1, 2))),
                ('mnnb', MultinomialNB())],
         verbose=True)