In [1]:
import pandas as pd

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

import re

In [2]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv', encoding='unicode_escape')
df['target'] = df.is_there_an_emotion_directed_at_a_brand_or_product

unique_emotions = df.is_there_an_emotion_directed_at_a_brand_or_product.value_counts().index
for emotion in unique_emotions:
    if emotion == "Positive emotion":
        df.target.replace(to_replace=emotion, value=1, inplace=True)
    elif emotion == "Negative emotion":
        df.target.replace(to_replace=emotion, value=2, inplace=True)
    else:
        df.target.replace(to_replace=emotion, value=0, inplace=True)

display(df)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,2
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1
...,...,...,...,...
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion,1
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,0
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,0
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,0


Early EDA revealed a single null row: nan in tweet text, nan in emotion, no emotion listed. Just dropping that one.

In [3]:
display(df.info())
display(df.iloc[6])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
 3   target                                              9093 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 284.3+ KB


None

tweet_text                                                                           NaN
emotion_in_tweet_is_directed_at                                                      NaN
is_there_an_emotion_directed_at_a_brand_or_product    No emotion toward brand or product
target                                                                                 0
Name: 6, dtype: object

In [4]:
df = df[df.tweet_text.notnull()]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9092 non-null   object
 3   target                                              9092 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 355.2+ KB


In [5]:
df.target.value_counts()

0    5544
1    2978
2     570
Name: target, dtype: int64

Now let's find very common words that begin with @ or #

In [6]:
tweet_num = 5
print(df.iloc[tweet_num])
re.findall('@[a-zA-Z0-9]+', df.tweet_text.iloc[tweet_num])

tweet_text                                            @teachntech00 New iPad Apps For #SpeechTherapy...
emotion_in_tweet_is_directed_at                                                                     NaN
is_there_an_emotion_directed_at_a_brand_or_product                   No emotion toward brand or product
target                                                                                                0
Name: 5, dtype: object


['@teachntech00']

In [7]:
list_hashtags = []
for num in range(0, df.shape[0]):
    temp = []
    temp = re.findall('#[a-zA-Z0-9]+', df.tweet_text.iloc[num])
    temp = list(set(temp))
    if len(temp) > 0:
        list_hashtags.extend(temp)

list_hashtags

['#RISE',
 '#SXSW',
 '#SXSW',
 '#SXSW',
 '#iPad',
 '#sxsw',
 '#SXSW',
 '#iear',
 '#asd',
 '#edchat',
 '#SpeechTherapy',
 '#SXSW',
 '#googleio',
 '#SXSW',
 '#android',
 '#CTIA',
 '#hollergram',
 '#sxsw',
 '#sxsw',
 '#sxsw',
 '#fail',
 '#SXSW',
 '#SXSW',
 '#SXSW',
 '#sxsw',
 '#hollergram',
 '#sxsw',
 '#sxsw',
 '#SXSW',
 '#iPhone',
 '#SXSW',
 '#SXSW',
 '#sxsw',
 '#SXSW',
 '#Android',
 '#sxsw',
 '#SXSW',
 '#SXSW',
 '#ChevySMC',
 '#SMCDallas',
 '#SXSW',
 '#iPad',
 '#hollergram',
 '#sxsw',
 '#sxsw',
 '#austin',
 '#SXSW',
 '#4sq3',
 '#KeepAustinWeird',
 '#sxsw',
 '#android',
 '#hollergram',
 '#sxsw',
 '#hollergram',
 '#SXSW',
 '#GDGTLive',
 '#SXSW',
 '#sxsw',
 '#SXSW',
 '#sxsw',
 '#pcma',
 '#engage365',
 '#eventprofs',
 '#ipad2',
 '#sxsw',
 '#iTunes',
 '#sxsw',
 '#Circles',
 '#Social',
 '#Google',
 '#SXSW',
 '#sxsw',
 '#sxsw',
 '#SXSW',
 '#SXSW',
 '#zazzlesxsw',
 '#gadgets',
 '#sxsw',
 '#ipad',
 '#UberSocial',
 '#iPhone',
 '#SXSW',
 '#IPad2',
 '#SmartCover',
 '#SXSW',
 '#apple',
 '#SXSW',
 '#

In [8]:
list_ats = []
for num in range(0, df.shape[0]):
    temp = []
    temp = re.findall('@[a-zA-Z0-9]+', df.tweet_text.iloc[num])
    temp = list(set(temp))
    if len(temp) > 0:
        list_ats.extend(temp)

display(list_ats)
display(set(list_ats))

['@wesley83',
 '@jessedee',
 '@fludapp',
 '@swonderlin',
 '@sxsw',
 '@sxtxstate',
 '@teachntech00',
 '@madebymany',
 '@thenextweb',
 '@samsungmobileus',
 '@HurricaneParty',
 '@Gowalla',
 '@ischafer',
 '@hamsandwich',
 '@madebymany',
 '@madebymany',
 '@marc',
 '@planely',
 '@KLM',
 '@malbonster',
 '@Foursquare',
 '@LaurieShook',
 '@madebymany',
 '@michaelpiliero',
 '@PartnerHub',
 '@gowalla',
 '@madebymany',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@mention',
 '@m

{'@AARPbulletin',
 '@ACLU',
 '@CMmeetup',
 '@DesignerDrugs',
 '@Dr',
 '@EricSchu',
 '@Foursquare',
 '@GapPacker',
 '@Gowalla',
 '@H0U5T0N',
 '@HurricaneParty',
 '@IndySaha',
 '@JordanBell',
 '@KLM',
 '@LaurieShook',
 '@LenaShaw',
 '@MarcusRoss',
 '@NJdoc',
 '@PartnerHub',
 '@PeterApokotos',
 '@Schmittastic',
 '@WuChangTX',
 '@anniemal',
 '@eightbit',
 '@flavugm',
 '@fludapp',
 '@garyvee',
 '@glove',
 '@gowalla',
 '@hamsandwich',
 '@iAmPaintedFace',
 '@iangogame',
 '@ischafer',
 '@jeremie',
 '@jerranalley',
 '@jessedee',
 '@juntao',
 '@kshepherd',
 '@lepanele',
 '@madebymany',
 '@malbonster',
 '@marc',
 '@maura',
 '@mention',
 '@mentionR',
 '@mentionW',
 '@mentionc',
 '@mentione',
 '@mentionm',
 '@mentionn',
 '@mentions',
 '@michaelpiliero',
 '@mobileroadie',
 '@planely',
 '@samsungmobileus',
 '@sehugg',
 '@sheltongreen',
 '@swonderlin',
 '@sxsw',
 '@sxtxstate',
 '@tbalinas',
 '@teachntech00',
 '@thenextweb',
 '@webdoc',
 '@wesley83'}

Make sure to add 'sxsw' and 'mention' to stopwords list at the least.

Now onto modeling!

In [9]:
nltk_sw = stopwords.words('english')
nltk_sw.extend(['sxsw', 'mention'])
nltk_sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
X = df.tweet_text
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
pipe_cv_mnnb = Pipeline(steps=[('cv', CountVectorizer()),\
                               ('mnnb', MultinomialNB())],
                        verbose=True)

In [12]:
pipe_cv_mnnb.fit(X_train, y_train)

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('cv', CountVectorizer()), ('mnnb', MultinomialNB())],
         verbose=True)

In [13]:
def reportcrossval(model, model_name, X, Y):
        
    cvs_mean = cross_val_score(model, X, Y).mean()
    
    print(cross_val_score(model, X, Y))
    print(f"The mean of the cross validation scores of {model_name} is {cvs_mean}")
    
reportcrossval(pipe_cv_mnnb, 'Pipeline 1', X_train, y_train)

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.1s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

In [14]:
param_grid = {'cv__ngram_range': [(1,1),(1,2),(1,3),(2,2),(2,3),(3,3),(1,4),(2,4),(3,4),(4,4)],
              'cv__stop_words': [None, nltk_sw]
             }

Ran grid1 below, it took a long time. I've commented it out so it won't try to run again, but the best score and params are preserved as bestmodelfromgrid1.

In [15]:
# grid = GridSearchCV(pipe_cv_mnnb, param_grid, verbose=1)

In [16]:
# grid.fit(X=X_train, y=y_train)

In [17]:
# grid.best_estimator_

In [18]:
# grid.best_score_

In [19]:
# grid.best_params_

In [20]:
final_model = Pipeline([('cv', CountVectorizer(ngram_range= (1, 2))),\
                        ('mnnb', MultinomialNB())],
                       verbose = True)

In [21]:
final_model.fit(X_train, y_train)

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s


Pipeline(steps=[('cv', CountVectorizer(ngram_range=(1, 2))),
                ('mnnb', MultinomialNB())],
         verbose=True)

In [22]:
reportcrossval(final_model, 'Final Model', X_train, y_train)

[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipeline] ................ (step 1 of 2) Processing cv, total=   0.2s
[Pipeline] .............. (step 2 of 2) Processing mnnb, total=   0.0s
[Pipel

In [23]:
final_model.score(X_test, y_test)

0.6704795424549054