In [34]:
import pandas as pd

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding = 'unicode_escape')

df['target'] = df.is_there_an_emotion_directed_at_a_brand_or_product
df.target.replace(to_replace = "Positive emotion", value = 1, inplace= True)
df.target.replace(to_replace = "Negative emotion", value = 2, inplace= True)
df.target.replace(to_replace = "No emotion toward brand or product", value = 0, inplace= True)
df.target.replace(to_replace = "I can't tell", value = 0, inplace= True)
df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,2
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,1
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,0
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0


Early EDA revealed a single null row: nan in tweet text, nan in emotion, no emotion listed. Just dropping that one.

In [3]:
df.tweet_text.dropna(inplace=True)

Similarly, these one tweet hiding in the data that isn't a str, which throws off the tokenizer. So...

In [4]:
df.tweet_text = df.tweet_text.map(str)

In [5]:
nltk_sw = stopwords.words('english')

In [6]:
X = df.tweet_text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42)

In [7]:
pipeline_count_vec_mnnb =Pipeline([('countvec', CountVectorizer()), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [8]:
pipeline_count_vec_mnnb.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec', CountVectorizer()), ('mnnb', MultinomialNB())],
         verbose=True)

In [9]:
cvdict = {}
def reportcrossval(model, modelname, xtrain, ytrain):
    print(cross_val_score(model, xtrain, ytrain))
    cvmean = cross_val_score(model, xtrain, ytrain).mean()
    cvdict[modelname] = cvmean 
    print(f"The cv mean of {modelname} is {cvmean}") 
reportcrossval(pipeline_count_vec_mnnb, 'Pipeline 1', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[0.66055718 0.6744868  0.6568915  0.65909091 0.67204696]
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .........

In [10]:
grid1dictl = {
   'countvec__max_df': [.2,.3,.4,.5,.6,.7,.8,.9],
   'countvec__min_df': [.01,.05,.1,.15],
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)],
   'countvec__max_features': [None, 5,10,15,20,30,50,100],
   'countvec__stop_words': [None, 'english', nltk_sw]
}

Ran grid1 below, it took a long time. I've commented it out so it won't try to run again, but the best score and params are preserved as bestmodelfromgrid1.

In [11]:
#grid1 = GridSearchCV(pipeline_count_vec_mnnb, grid1dictlist, verbose=1)

In [12]:
#grid1.fit(X=X_train, y=y_train)

In [13]:
#grid1.best_estimator_ 

In [14]:
#grid1.best_score_

In [15]:
#grid1.best_params_

In [16]:
bestmodelfromgrid1 = Pipeline([('countvec', CountVectorizer(
                                            max_df = 0.2,
                                            max_features = None,
                                            min_df = 0.01,
                                            ngram_range= (1, 1),
                                            stop_words = nltk_sw)), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [17]:
bestmodelfromgrid1.fit(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec',
                 CountVectorizer(max_df=0.2, min_df=0.01,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [18]:
reportcrossval(bestmodelfromgrid1, 'Best from Grid 1', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[0.63489736 0.61583578 0.62609971 0.62096774 0.60895084]
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .........

In [19]:
testmodelfromgrid = Pipeline([('countvec', CountVectorizer(
                                            ngram_range= (1, 3),
                                            stop_words = None)), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [20]:
reportcrossval(testmodelfromgrid, 'Test from Grid 1', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[0.69941349 0.6957478  0.67668622 0.67228739 0.68451944]
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .........

Seems like whatever was going on with min/max df really hurt the model. Rerunning gris with defaults on those params. (Should be much faster!)

In [28]:
grid2dict = {
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3),(1,4),(2,4),(3,4),(4,4)],
   'countvec__stop_words': [None, 'english', nltk_sw]
}

In [29]:
grid2 = GridSearchCV(pipeline_count_vec_mnnb, grid2dict, verbose=1)

In [30]:
grid2.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.



[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipe

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   45.3s finished


[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.5s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


GridSearchCV(estimator=Pipeline(steps=[('countvec', CountVectorizer()),
                                       ('mnnb', MultinomialNB())],
                                verbose=True),
             param_grid={'countvec__ngram_range': [(1, 1), (1, 2), (1, 3),
                                                   (2, 2), (2, 3), (3, 3),
                                                   (1, 4), (2, 4), (3, 4),
                                                   (4, 4)],
                         'countvec__stop_words': [None, 'english',
                                                  ['i', 'me', 'my', 'myself',
                                                   'we', 'our', 'ours',
                                                   'ourselves', 'you', "you're",
                                                   "you've", "you'll", "you'd",
                                                   'your', 'yours', 'yourself',
                                                   'yourselves', 'he', 

In [31]:
grid2.best_estimator_

Pipeline(steps=[('countvec', CountVectorizer(ngram_range=(1, 3))),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [32]:
grid2.best_score_

0.685730867953432

In [33]:
grid2.best_params_

{'countvec__ngram_range': (1, 3), 'countvec__stop_words': None}

Bar some more preprocessing, futzing with RegEx, stemming and lemming etc, this might be the best we can get with CountVectorizing. Now to try some different models, and tf dif.

In [None]:
pipeline_count_vec_mnnb =Pipeline([('countvec', CountVectorizer()), 
                              ('rf', RandomForestClassifier(random_state = 42))] ,  verbose = True)