### EDA and Preprocessing Notebook

In [115]:
import numpy as np
import pandas as pd
import spacy
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer, TfidfTransformer

from nltk.stem.wordnet import WordNetLemmatizer
import string
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [116]:
nlp = spacy.load('en_core_web_sm')

In [117]:
print(stopwords)
print(nlp.Defaults.stop_words)

<WordListCorpusReader in 'C:\\Users\\josep\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>
{'onto', 'two', 'be', 'back', 'whereupon', 'we', 'indeed', 'mine', 'move', 'hence', 'your', 'his', "'ll", 'at', 'below', 'being', 'well', 'into', 'same', 'seemed', 'they', 'eight', 'yet', 'down', 'less', 'whence', 'more', 'about', 'latter', 'used', 'if', 'then', 'anywhere', 'five', 'all', '‘re', 'cannot', 'alone', 'thereafter', 'something', 'again', '‘s', 'everyone', 'ten', 'whither', 'before', 'just', 'bottom', 'are', 'herself', 'hereby', 'whenever', 'while', 'keep', 'up', 'this', "'re", 'could', 'here', 'will', 'my', 'fifty', '‘d', '‘ve', 'whatever', 'was', 'anything', 'last', 'ours', "'ve", 'been', 'it', 'yourselves', '‘ll', 'no', 'various', 'behind', 'of', 'please', 'thus', 'themselves', 'even', 'yours', 'go', 'thereupon', 'beforehand', 'former', 'somewhere', 'now', 'whoever', 'or', 'where', 'side', 'as', 'empty', 'there', 'seem', 'whom', 'further', 'becomes', 'above', 'by', 'four', 'seems

In [118]:
df = pd.read_csv('data/product_tweets.csv',encoding='latin1')

In [119]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [120]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [121]:
df = df.rename(columns= {'is_there_an_emotion_directed_at_a_brand_or_product'
                         :'Emotion','emotion_in_tweet_is_directed_at': 'Platform'})


In [122]:
df = df.rename(columns= {'tweet_text': 'Tweet'})

In [123]:
df.head() # want to remove the @'name' in the tweet 

Unnamed: 0,Tweet,Platform,Emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [124]:
df_dummify = pd.get_dummies(df['Emotion'])

In [125]:
df_dummify.head()

Unnamed: 0,I can't tell,Negative emotion,No emotion toward brand or product,Positive emotion
0,0,1,0,0
1,0,0,0,1
2,0,0,0,1
3,0,1,0,0
4,0,0,0,1


In [126]:
df_dummify.sum() # class bias 

I can't tell                           156
Negative emotion                       570
No emotion toward brand or product    5389
Positive emotion                      2978
dtype: int64

In [127]:
df.info()
df = pd.merge(df, df_dummify, how='outer',on=df.index) # ran this code, dummify emotion data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Tweet     9092 non-null   object
 1   Platform  3291 non-null   object
 2   Emotion   9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9093 entries, 0 to 9092
Data columns (total 8 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   key_0                               9093 non-null   int64 
 1   Tweet                               9092 non-null   object
 2   Platform                            3291 non-null   object
 3   Emotion                             9093 non-null   object
 4   I can't tell                        9093 non-null   uint8 
 5   Negative emotion                    9093 non-null   uint8 
 6   No emotion toward brand or product  9093 non-null   uint8 
 7   Positive emotion                    9093 non-null   uint8 
dtypes: int64(1), object(3), uint8(4)
memory usage: 390.7+ KB


In [129]:
df.head()

Unnamed: 0,key_0,Tweet,Platform,Emotion,I can't tell,Negative emotion,No emotion toward brand or product,Positive emotion
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,1,0,0
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,0,0,1
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,0,0,1
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,1,0,0
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,0,0,1


In [130]:
df = df.rename(columns = {"I can't tell": "Uncertain", 'Negative emotion': 'Negative'
                          , 'No emotion toward brand or product': 'No Emotion'
                          , 'Positive emotion':'Positive'})

In [131]:
df = df.drop(columns='key_0')
df.head()

Unnamed: 0,Tweet,Platform,Emotion,Uncertain,Negative,No Emotion,Positive
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,1,0,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,0,0,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,0,0,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,1,0,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,0,0,1


In [132]:
corpus = list(df['Tweet'])
corpus[:10]

['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
 "@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
 '@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.',
 "@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw",
 "@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",
 '@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd',
 nan,
 '#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan',
 'Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.l

### Tokenize the Words

In [133]:
tokenz = word_tokenize(','.join(str(v) for v in corpus))

In [134]:
tokenz[:10]

['.', '@', 'wesley83', 'I', 'have', 'a', '3G', 'iPhone', '.', 'After']

### Create Stopwords List

In [135]:
stopword_list = list(nlp.Defaults.stop_words)
len(nlp.Defaults.stop_words)

326

In [136]:
stopword_list

['onto',
 'two',
 'be',
 'back',
 'whereupon',
 'we',
 'indeed',
 'mine',
 'move',
 'hence',
 'your',
 'his',
 "'ll",
 'at',
 'below',
 'being',
 'well',
 'into',
 'same',
 'seemed',
 'they',
 'eight',
 'yet',
 'down',
 'less',
 'whence',
 'more',
 'about',
 'latter',
 'used',
 'if',
 'then',
 'anywhere',
 'five',
 'all',
 '‘re',
 'cannot',
 'alone',
 'thereafter',
 'something',
 'again',
 '‘s',
 'everyone',
 'ten',
 'whither',
 'before',
 'just',
 'bottom',
 'are',
 'herself',
 'hereby',
 'whenever',
 'while',
 'keep',
 'up',
 'this',
 "'re",
 'could',
 'here',
 'will',
 'my',
 'fifty',
 '‘d',
 '‘ve',
 'whatever',
 'was',
 'anything',
 'last',
 'ours',
 "'ve",
 'been',
 'it',
 'yourselves',
 '‘ll',
 'no',
 'various',
 'behind',
 'of',
 'please',
 'thus',
 'themselves',
 'even',
 'yours',
 'go',
 'thereupon',
 'beforehand',
 'former',
 'somewhere',
 'now',
 'whoever',
 'or',
 'where',
 'side',
 'as',
 'empty',
 'there',
 'seem',
 'whom',
 'further',
 'becomes',
 'above',
 'by',
 'four'

In [137]:
stopword_list.extend(string.punctuation)

In [138]:
len(stopword_list)

358

In [139]:
stopword_list.extend(stopwords.words('english'))

In [140]:
len(stopword_list)

537

In [141]:
additional_punc = ['“','”','...',"''",'’','``','https','rt','\.+']
stopword_list.extend(additional_punc)
stopword_list[-10:]

["wouldn't", '“', '”', '...', "''", '’', '``', 'https', 'rt', '\\.+']

### Remove stopwords and additional punctuation from the data

In [142]:
stopped_tokenz = [word.lower() for word in tokenz if word.lower() not in stopword_list]

In [143]:
freq = FreqDist(stopped_tokenz)
freq.most_common(50)

[('sxsw', 9418),
 ('mention', 7120),
 ('link', 4313),
 ('google', 2593),
 ('ipad', 2432),
 ('apple', 2301),
 ('quot', 1696),
 ('iphone', 1516),
 ('store', 1472),
 ('2', 1114),
 ('new', 1090),
 ('austin', 959),
 ('amp', 836),
 ('app', 810),
 ('circles', 658),
 ('launch', 653),
 ('social', 647),
 ('android', 574),
 ('today', 574),
 ('network', 465),
 ('ipad2', 457),
 ('pop-up', 420),
 ('line', 405),
 ('free', 387),
 ('called', 361),
 ('party', 346),
 ('sxswi', 340),
 ('mobile', 338),
 ('major', 301),
 ('like', 290),
 ('time', 271),
 ('temporary', 264),
 ('opening', 257),
 ('possibly', 240),
 ('people', 226),
 ('downtown', 225),
 ('apps', 224),
 ('great', 222),
 ('maps', 219),
 ('going', 217),
 ('check', 216),
 ('mayer', 214),
 ('day', 214),
 ('open', 210),
 ('popup', 209),
 ('need', 205),
 ('marissa', 189),
 ('got', 185),
 ('w/', 182),
 ('know', 180)]

### Lemmatize the Data and use Regex to find and remove URL's, Tags, other misc

In [144]:
additional_misc = ['sxsw','mention',r'[a-zA-Z]+\'?s]',r"(http[s]?://\w*\.\w*/+\w+)"
                   ,r'\#\w*',r'RT [@]?\w*:',r'\@\w*',r"\d$",r"^\d"
                   ,r"([a-zA-Z]+(?:'[a-z]+)?)",r'\d.',r'\d','RT'] #[A-Z]{2,20} remove caps like MAGA and CDT
stopword_list.extend(additional_misc)
stopword_list.extend(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])

In [145]:
lemmatizer = WordNetLemmatizer()

In [146]:
clean_stopped_tokenz = [word.lower() for word in stopped_tokenz if word not in stopword_list]
clean_lemmatized_tokenz = [lemmatizer.lemmatize(word.lower()) for word in stopped_tokenz if word not in stopword_list]

In [147]:
freq_clean_lemma = FreqDist(clean_lemmatized_tokenz)
freq_lemma = freq_clean_lemma.most_common(5000)
freq_lemma2 = freq_clean_lemma.most_common(25)

In [148]:
total_word_count = len(clean_lemmatized_tokenz)

In [149]:
lemma_word_count = sum(freq_clean_lemma.values()) # just a number

In [150]:
for word in freq_lemma2:
    normalized_freq = word[1] / lemma_word_count
    print(word, "----", "{:.3f}".format(normalized_freq*100),"%")

('link', 4324) ---- 5.004 %
('google', 2594) ---- 3.002 %
('ipad', 2432) ---- 2.814 %
('apple', 2304) ---- 2.666 %
('quot', 1696) ---- 1.963 %
('iphone', 1516) ---- 1.754 %
('store', 1511) ---- 1.749 %
('new', 1090) ---- 1.261 %
('austin', 960) ---- 1.111 %
('amp', 836) ---- 0.967 %
('app', 810) ---- 0.937 %
('launch', 691) ---- 0.800 %
('circle', 673) ---- 0.779 %
('social', 647) ---- 0.749 %
('android', 574) ---- 0.664 %
('today', 574) ---- 0.664 %
('network', 473) ---- 0.547 %
('ipad2', 457) ---- 0.529 %
('line', 442) ---- 0.512 %
('pop-up', 422) ---- 0.488 %
('free', 387) ---- 0.448 %
('party', 386) ---- 0.447 %
('called', 361) ---- 0.418 %
('mobile', 340) ---- 0.393 %
('sxswi', 340) ---- 0.393 %


In [151]:
# from wordcloud import WordCloud

# ## Initalize a WordCloud with our stopwords_list and no bigrams
# wordcloud = WordCloud(stopwords=stopword_list,collocations=False)

# ## Generate wordcloud from stopped_tokens
# wordcloud.generate(','.join(clean_lemmatized_tokenz))

# ## Plot with matplotlib
# plt.figure(figsize = (12, 12), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis('off')

In [152]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
tweet_finder = nltk.BigramCollocationFinder.from_words(clean_lemmatized_tokenz)
tweets_scored = tweet_finder.score_ngrams(bigram_measures.raw_freq)

In [153]:
pd.DataFrame(tweets_scored, columns=["Word","Freq"]).head(10)

Unnamed: 0,Word,Freq
0,"(apple, store)",0.00692
1,"(social, network)",0.005277
2,"(new, social)",0.004837
3,"(google, launch)",0.003912
4,"(link, google)",0.003877
5,"(network, called)",0.003784
6,"(called, circle)",0.003634
7,"(today, link)",0.003437
8,"(major, new)",0.003356
9,"(iphone, app)",0.003333


In [154]:
# Bar Chart

In [155]:
tweet_pmi_finder = nltk.BigramCollocationFinder.from_words(clean_lemmatized_tokenz)
tweet_pmi_finder.apply_freq_filter(5)

tweet_pmi_scored = tweet_pmi_finder.score_ngrams(bigram_measures.pmi)

In [156]:
pd.DataFrame(tweet_pmi_scored, columns=["Words","PMI"]).head(10)

Unnamed: 0,Words,PMI
0,"(û÷sxsw, goûª)",14.076983
1,"(jc, penney)",13.813948
2,"(knitted, staircase)",13.813948
3,"(naomi, campbell)",13.813948
4,"(parking, 5-10)",13.813948
5,"(pauly, celebs)",13.813948
6,"(98, accuracy)",13.591556
7,"(aron, pilhofer)",13.591556
8,"(charlie, sheen)",13.591556
9,"(lynn, teo)",13.591556


In [157]:
# Bar Chart

In [174]:
df1 = df
df.head()

Unnamed: 0,Tweet,Platform,Emotion,Uncertain,Negative,No Emotion,Positive
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,1,0,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,0,0,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,0,0,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,1,0,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,0,0,1


In [192]:
# df1 = df1.drop(columns=['Uncertain','No Emotion'])
# Turn negative and positive columns into one column of just negatives and positive.
df1 = df1[df1['Emotion'] != "No emotion toward brand or product"]
df1 = df1[df1['Emotion'] != "I can't tell"]
# df1 = df1.drop(columns='Negative')
df1 = df1.rename(columns={'Positive': 'Positive_Bin'})
df1.head()

Unnamed: 0,Tweet,Platform,Emotion,Positive_Bin
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,1


In [193]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3548 entries, 0 to 9088
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Tweet         3548 non-null   object
 1   Platform      3191 non-null   object
 2   Emotion       3548 non-null   object
 3   Positive_Bin  3548 non-null   uint8 
dtypes: object(3), uint8(1)
memory usage: 114.3+ KB


In [159]:
freq_lemma2

[('link', 4324),
 ('google', 2594),
 ('ipad', 2432),
 ('apple', 2304),
 ('quot', 1696),
 ('iphone', 1516),
 ('store', 1511),
 ('new', 1090),
 ('austin', 960),
 ('amp', 836),
 ('app', 810),
 ('launch', 691),
 ('circle', 673),
 ('social', 647),
 ('android', 574),
 ('today', 574),
 ('network', 473),
 ('ipad2', 457),
 ('line', 442),
 ('pop-up', 422),
 ('free', 387),
 ('party', 386),
 ('called', 361),
 ('mobile', 340),
 ('sxswi', 340)]

### Train/Test Split

In [194]:
X = df1['Tweet']
y = df1['Positive_Bin']

In [195]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [216]:
print(len(y_train), X_train_tf_idf.shape)

2661 (2661, 5362)


In [197]:
X_train.shape

(2661,)

In [198]:
y_train.value_counts(1)

1    0.837279
0    0.162721
Name: Positive_Bin, dtype: float64

### Vectorize the two sets, lemmatized and not lemmatized using count vectorizer and TfidfVectorizer

In [205]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize,
                             stop_words=stopword_list,decode_error='ignore')
tokenizer = nltk.TweetTokenizer(preserve_case=False)

In [206]:
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.fit_transform(X_test)



In [207]:
tf_idf_vectorizer = TfidfVectorizer(tokenizer=tokenizer.tokenize,
                                    stop_words=stopword_list,decode_error='ignore')

In [208]:
X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf_idf = tf_idf_vectorizer.fit_transform(X_test)

In [210]:
from sklearn.ensemble import RandomForestClassifier

In [211]:
ran_for = RandomForestClassifier(class_weight='balanced')
ran_for.fit(X_train_tf_idf,y_train)

RandomForestClassifier(class_weight='balanced')

In [212]:
y_hat_test = ran_for.predict(X_test_tf_idf)

ValueError: Number of features of the model must match the input. Model n_features is 5362 and input n_features is 2740 

In [None]:
twt_token = TweetTokenizer()

# pattern = r"([a-zA-Z]+(?:'[a-z]+)?)"

# for sentence in list(df['Tweet']):
#     for word in str(sentence):
#         word_tokenize(word.lower())

In [None]:
# tokens = regexp_tokenize(','.join(str(v) for v in corpus), pattern=r"\w+")

In [None]:
# tokens

In [None]:
len(set(stopped_tokenz))

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
tweet_finder = nltk.BigramCollocationFinder.from_words(stopped_tokenz)
tweets_scored = tweet_finder.score_ngrams(bigram_measures.raw_freq)

In [None]:
pd.DataFrame(tweets_scored, columns=['Words','Freq']).head(10)

### Remove punctuation, numbers, uppercase letters, tags, #hashtags, stopwords, and numbers

In [None]:
# # create a function that takes a dataframe column input and cleans that column
# def clean_column(i=(0,len(corpus)-1)):
#     from nltk.corpus import stopwords
#     import string
#     from nltk import word_tokenize,regexp_tokenize
    
#     print(f"- Tweet #{i}:\n")
#     print(corpus[i],'\n')
#     tokens = word_tokenize(corpus[i])

#     # Get all the stop words in the English language
#     stopwords_list = stopwords.words('english')
#     stopwords_list += list(nlp.Defaults.stop_words)
#     stopwords_list += string.punctuation
#     stopwords_list += additional_punc
#     stopped_tokenz = [w.lower() for w in tokenz if w.lower() not in stopwords_list]
    
#     print(tokenz,end='\n\n')
#     print(stopped_tokenz)
                
# # run function



In [None]:
# clean_column(df['Tweet'])

In [None]:
tokens = word_tokenize(','.join(corpus))