In [1]:
import pandas as pd

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='unicode_escape')

unique_emotions = df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts().index

# Numeric sentiment codes: 1 = positive, 2 = negative, 0 = every other label
# present in the column.
emotion_codes = {"Positive emotion": 1, "Negative emotion": 2}

# Build the target in one vectorised assignment. Calling
# `df.target.replace(..., inplace=True)` mutates a Series reached through
# attribute access (chained assignment): pandas warns about it and, with
# Copy-on-Write enabled, the change never reaches `df`.
# na_action="ignore" leaves missing labels as NaN instead of coding them 0.
df['target'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map(
    lambda label: emotion_codes.get(label, 0), na_action="ignore"
)

display(df)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,2
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,1
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,0
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0


Early EDA revealed a single null row: nan in tweet text, nan in emotion, no emotion listed. Just dropping that one.

In [3]:
# Keep only rows that have tweet text. The explicit .copy() makes `df` an
# independent frame, so assigning to its columns afterwards does not raise
# SettingWithCopyWarning.
df = df[df.tweet_text.notnull()].copy()

In [4]:
# An in-place dropna on `df.tweet_text` only affects a temporary Series and
# leaves `df` unchanged, so check the invariant explicitly instead.
assert df.tweet_text.notna().all(), "tweet_text still contains missing values"

Similarly, there's one tweet hiding in the data that isn't a str, which throws off the tokenizer. So cast every tweet to str:

In [5]:
# Coerce every tweet to a Python str so no non-string value reaches the tokenizer.
tweets_as_text = df["tweet_text"].map(str)
df["tweet_text"] = tweets_as_text

In [6]:
# NLTK's English stop words plus two extra tokens ('sxsw', 'mention') to ignore.
nltk_sw = stopwords.words('english') + ['sxsw', 'mention']

In [7]:
# Features are the raw tweet strings; labels are the numeric sentiment codes.
X, y = df["tweet_text"], df["target"]

# Hold out a test set using scikit-learn's default split size; the fixed seed
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Baseline: default bag-of-words counts feeding a default multinomial naive Bayes.
pipeline_count_vec_mnnb = Pipeline(
    steps=[
        ('countvec', CountVectorizer()),
        ('mnnb', MultinomialNB()),
    ],
    verbose=True,
)

In [9]:
# Fit the baseline on the training split; the fitted pipeline is the cell output.
pipeline_count_vec_mnnb.fit(X=X_train, y=y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec', CountVectorizer()), ('mnnb', MultinomialNB())],
         verbose=True)

In [10]:
# Maps a model label to its mean cross-validation score.
cvdict = dict()

In [11]:
def reportcrossval(model, modelname, xtrain, ytrain, cv=None, scoring=None):
    """Cross-validate ``model`` and record its mean score under ``modelname``.

    The mean score is stored in the module-level ``cvdict`` (replacing any
    earlier entry with the same key) and printed.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    modelname : key used in ``cvdict`` and in the printed message.
    xtrain, ytrain : features and labels to cross-validate on.
    cv : optional, forwarded to ``cross_val_score``; ``None`` keeps
        scikit-learn's default splitting.
    scoring : optional, forwarded to ``cross_val_score``; ``None`` keeps the
        estimator's default scorer.
    """
    fold_scores = cross_val_score(model, xtrain, ytrain, cv=cv, scoring=scoring)
    cvmean = fold_scores.mean()
    cvdict[modelname] = cvmean
    print(f"The cv mean of {modelname} is {cvmean}")


reportcrossval(pipeline_count_vec_mnnb, 'Pipeline 1', X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Pipeline 1 is 0.6709166428204129


In [12]:
# Search space for the CountVectorizer step of the naive Bayes pipeline.
grid1dict = {
   # min_df must be a float in [0, 1] or an int; None is not a valid value.
   # 1 is CountVectorizer's default, i.e. no document-frequency floor.
   'countvec__min_df': [.01,.05,.1,.15, 1],
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)],
   'countvec__stop_words': [None, 'english', nltk_sw]
}

Ran grid1 below, it took a long time. I've commented it out so it won't try to run again, but the best score and params are preserved as bestmodelfromgrid1.

In [13]:
#grid1 = GridSearchCV(pipeline_count_vec_mnnb, grid1dict, verbose=1)

In [14]:
#grid1.fit(X=X_train, y=y_train)

In [15]:
#grid1.best_estimator_ 

In [16]:
#grid1.best_score_

In [17]:
#grid1.best_params_

In [18]:
# Hand-built copy of the first grid search's winning configuration, so the
# search does not have to be re-run.
bestmodelfromgrid1 = Pipeline(
    steps=[
        ('countvec', CountVectorizer(min_df=0.01, stop_words=nltk_sw)),
        ('mnnb', MultinomialNB()),
    ],
    verbose=True,
)

In [19]:
# Fit on the training split; the fitted pipeline is the cell output.
bestmodelfromgrid1.fit(X=X_train, y=y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec',
                 CountVectorizer(min_df=0.01,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [20]:
# Record the cross-validated mean score for the grid-1 configuration.
reportcrossval(
    model=bestmodelfromgrid1,
    modelname='Best from Grid 1',
    xtrain=X_train,
    ytrain=y_train,
)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Best from Grid 1 is 0.626043443929748


In [21]:
# Trial configuration: uni- to tri-grams, no stop-word removal, default min_df.
testmodelfromgrid = Pipeline(
    steps=[
        ('countvec', CountVectorizer(ngram_range=(1, 3), stop_words=None)),
        ('mnnb', MultinomialNB()),
    ],
    verbose=True,
)

In [22]:
# Record the cross-validated mean score for the trial configuration.
reportcrossval(
    model=testmodelfromgrid,
    modelname='Test',
    xtrain=X_train,
    ytrain=y_train,
)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Test is 0.6858750212464741


Seems like whatever was going on with min/max df really hurt the model. Rerunning a grid with defaults on those params. (Should be much faster!)

In [23]:
# Second search space: n-gram ranges up to 4-grams and the three stop-word
# options, with the document-frequency cut-offs left at their defaults.
grid2dict = dict(
    countvec__ngram_range=[
        (1, 1), (1, 2), (1, 3),
        (2, 2), (2, 3), (3, 3),
        (1, 4), (2, 4), (3, 4), (4, 4),
    ],
    countvec__stop_words=[None, 'english', nltk_sw],
)

In [24]:
#grid2 = GridSearchCV(pipeline_count_vec_mnnb, grid2dict, verbose=1)

In [25]:
#grid2.fit(X=X_train, y=y_train)

In [26]:
#grid2.best_estimator_

In [27]:
#grid2.best_score_

In [28]:
#grid2.best_params_

In [29]:
# Hand-built copy of the second grid search's pick: unigrams plus bigrams.
bestmodelfromgrid2 = Pipeline(
    steps=[
        ('countvec', CountVectorizer(ngram_range=(1, 2))),
        ('mnnb', MultinomialNB()),
    ],
    verbose=True,
)

In [30]:
# Fit on the training split; the fitted pipeline is the cell output.
bestmodelfromgrid2.fit(X=X_train, y=y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('countvec', CountVectorizer(ngram_range=(1, 2))),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [31]:
# Record the cross-validated mean score for the grid-2 configuration.
reportcrossval(
    model=bestmodelfromgrid2,
    modelname='Best from Grid 2',
    xtrain=X_train,
    ytrain=y_train,
)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Best from Grid 2 is 0.6860228321603845


Commenting out the grid search again, to prevent it from running every time.

Barring some more preprocessing (futzing with RegEx, stemming, lemmatizing, etc.), this might be the best we can get with CountVectorizer. Now to try some different models, and TF-IDF.

In [32]:
# Same default bag-of-words features, now feeding a seeded random forest.
pipeline_count_vec_rf = Pipeline(
    steps=[
        ('countvec', CountVectorizer()),
        ('rf', RandomForestClassifier(random_state=42)),
    ],
    verbose=True,
)

In [33]:
# Fit on the training split; the fitted pipeline is the cell output.
pipeline_count_vec_rf.fit(X=X_train, y=y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.4s


Pipeline(steps=[('countvec', CountVectorizer()),
                ('rf', RandomForestClassifier(random_state=42))],
         verbose=True)

In [34]:
# Record the cross-validated mean score for the untuned random-forest pipeline.
reportcrossval(
    model=pipeline_count_vec_rf,
    modelname='Default CV and RF',
    xtrain=X_train,
    ytrain=y_train,
)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
The cv mean of Default CV and RF is 0.6761965261207918


Random Forest is a fair bit slower, but has decent results untuned. Going to run a modest gridsearch.

In [35]:
# Vectorizer-only search space for the random-forest pipeline.
grid3dict = dict(
    countvec__ngram_range=[(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    countvec__stop_words=[None, 'english', nltk_sw],
)

In [36]:
# Build (but do not fit) the grid search over the random-forest pipeline.
grid3 = GridSearchCV(
    estimator=pipeline_count_vec_rf,
    param_grid=grid3dict,
    verbose=1,
)

In [37]:
#grid3.fit(X=X_train, y=y_train)

In [38]:
#grid3.best_estimator_

In [39]:
#grid3.best_score_

In [40]:
#grid3.best_params_

In [41]:
# Hand-built copy of the third grid search's pick: custom stop words, unigrams.
bestmodelfromgrid3 = Pipeline(
    steps=[
        ('countvec', CountVectorizer(stop_words=nltk_sw)),
        ('rf', RandomForestClassifier(random_state=42)),
    ],
    verbose=True,
)

In [42]:
# Fit on the training split; the fitted pipeline is the cell output.
bestmodelfromgrid3.fit(X=X_train, y=y_train)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.7s


Pipeline(steps=[('countvec',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf', RandomForestClassifier(random_state=42))],
         verbose=True)

In [43]:
# Record the cross-validated mean score for the grid-3 configuration.
reportcrossval(
    model=bestmodelfromgrid3,
    modelname='Best from Grid 3',
    xtrain=X_train,
    ytrain=y_train,
)

[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.0s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.9s
[Pipeline] .......... (step 1 of 2) Processing countvec, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
The cv mean of Best from Grid 3 is 0.6838230959393954


In [44]:
# Joint search space over the vectorizer and the random forest.
grid4dict = {
   'countvec__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3)],
   'countvec__stop_words': [None, 'english', nltk_sw],
   'rf__n_estimators': [50, 100, 150],
   # 'auto' was an alias for 'sqrt' on classifiers and has been deprecated and
   # then removed from scikit-learn; 'sqrt' selects the same behaviour.
   'rf__max_features': ['sqrt', 'log2']
}

In [45]:
#grid4 = GridSearchCV(pipeline_count_vec_rf, grid4dict, verbose=1)

In [46]:
#grid4.fit(X=X_train, y=y_train)

In [47]:
#print(grid4.best_estimator_)
#grid4.best_score_

In [48]:
#grid4.best_params_

In [49]:
# Hand-built copy of the fourth grid search's pick: custom stop words with an
# otherwise default, seeded random forest.
grid4_vectorizer = CountVectorizer(stop_words=nltk_sw)
grid4_forest = RandomForestClassifier(random_state=42)
bestfromgrid4 = Pipeline([('countvec', grid4_vectorizer), ('rf', grid4_forest)])

In [50]:
# Fit on the training split; the fitted pipeline is the cell output.
bestfromgrid4.fit(X=X_train, y=y_train)

Pipeline(steps=[('countvec',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf', RandomForestClassifier(random_state=42))])

In [51]:
# Record the cross-validated mean score for the grid-4 configuration.
reportcrossval(
    model=bestfromgrid4,
    modelname='Best From Grid 4',
    xtrain=X_train,
    ytrain=y_train,
)

The cv mean of Best From Grid 4 is 0.6838230959393954


In [52]:
# Show every recorded mean cross-validation score, keyed by model label.
cvdict

{'Pipeline 1': 0.6709166428204129,
 'Best from Grid 1': 0.626043443929748,
 'Test': 0.6858750212464741,
 'Best from Grid 2': 0.6860228321603845,
 'Default CV and RF': 0.6761965261207918,
 'Best from Grid 3': 0.6838230959393954,
 'Best From Grid 4': 0.6838230959393954}