In [1]:
import pandas as pd

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='unicode_escape')

df['target'] = df.is_there_an_emotion_directed_at_a_brand_or_product

unique_emotions = df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts().index

for emotion in unique_emotions:
    if emotion == "Positive emotion":
        df.target.replace(to_replace=emotion, value=1, inplace=True)
    elif emotion == "Negative emotion":
        df.target.replace(to_replace=emotion, value=2, inplace=True)
    else:
        df.target.replace(to_replace=emotion, value=0, inplace=True)

display(df)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,2
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,1
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,0
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0


In [3]:
df.tweet_text.dropna(inplace=True)

In [4]:
df.tweet_text = df.tweet_text.map(str)

In [5]:
nltk_sw = stopwords.words('english')
nltk_sw.extend(['sxsw', 'mention'])


In [6]:
X = df.tweet_text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42)

In [7]:
cvdict = {}
def reportcrossval(model, modelname, xtrain, ytrain):
    cvmean = cross_val_score(model, xtrain, ytrain).mean()
    cvdict[modelname] = cvmean 
    print(f"The cv mean of {modelname} is {cvmean}") 

In [8]:
pipeline_dummy = Pipeline([('dummy', DummyClassifier(strategy = 'most_frequent'))] ,  verbose = True)

In [9]:
pipeline_dummy.fit(X_train, y_train)

[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s


Pipeline(steps=[('dummy', DummyClassifier(strategy='most_frequent'))],
         verbose=True)

In [10]:
reportcrossval(pipeline_dummy, 'Dummy', X_train, y_train)

[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
[Pipeline] ............. (step 1 of 1) Processing dummy, total=   0.0s
The cv mean of Dummy is 0.6094735607799769


In [11]:
pipeline_tfidf_mnnb = Pipeline([('Tfidf', TfidfVectorizer()), 
                              ('mnnb', MultinomialNB())] ,  verbose = True)

In [12]:
pipeline_tfidf_mnnb.fit(X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('Tfidf', TfidfVectorizer()), ('mnnb', MultinomialNB())],
         verbose=True)

In [13]:
reportcrossval(pipeline_tfidf_mnnb, 'Default Tfidf/Naive Bayes', X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Default Tfidf/Naive Bayes is 0.6473088516576553


Default Tfidf is worse than CountVec, going to do some gridsearching.

In [14]:
grid2dict = {
   'Tfidf__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3),(1,4),(2,4),(3,4),(4,4)],
   'Tfidf__stop_words': [None, 'english', nltk_sw]
    }

In [15]:
#tfgrid1 = GridSearchCV(pipeline_tfidf_mnnb, grid2dict, verbose=1)

In [16]:
#tfgrid1.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.3s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.4s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:   44.2s finished



[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


GridSearchCV(estimator=Pipeline(steps=[('Tfidf', TfidfVectorizer()),
                                       ('mnnb', MultinomialNB())],
                                verbose=True),
             param_grid={'Tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2),
                                                (2, 3), (3, 3), (1, 4), (2, 4),
                                                (3, 4), (4, 4)],
                         'Tfidf__stop_words': [None, 'english',
                                               ['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
                                                'yours', 'yourself',
                                                'yourselves', 'he', 'him',
                                                'his', 'himself', 'she',
    

In [17]:
#tfgrid1.best_estimator_

Pipeline(steps=[('Tfidf',
                 TfidfVectorizer(ngram_range=(2, 2),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [18]:
#tfgrid1.best_score_

0.6593361848432494

In [19]:
#tfgrid1.best_params_

{'Tfidf__ngram_range': (2, 2),
 'Tfidf__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  'from',
  'up',
  'down',
  'in',
  'out',
  'on',
  'off'

In [20]:
tfgrid1best = Pipeline(steps=[('Tfidf',
                 TfidfVectorizer(ngram_range=(2, 2),
                                 stop_words= nltk_sw)),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [21]:
tfgrid1best.fit(X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('Tfidf',
                 TfidfVectorizer(ngram_range=(2, 2),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [22]:
reportcrossval(tfgrid1best, 'Best from tfgrid1', X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ............. (step 1 of 2) Processing Tfidf, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
The cv mean of Best from tfgrid1 is 0.6593361848432494
