### EDA and Preprocessing Notebook

In [108]:
import numpy as np
import pandas as pd
import spacy
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer, TfidfTransformer

from nltk.stem.wordnet import WordNetLemmatizer
import string
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
print(stopwords)
print(nlp.Defaults.stop_words)

<WordListCorpusReader in '.../corpora/stopwords' (not loaded yet)>
{'regarding', 'seeming', 'whole', 'yours', 'then', 'have', 'thereby', 'thru', 'for', 'whoever', 'becoming', 'though', 'next', 'toward', 'everyone', 'now', 're', 'somewhere', 'become', 'and', 'every', 'latterly', 'thereupon', 'just', 'yourself', 'cannot', 'with', 'towards', 'would', 'another', 'here', 'hereafter', 'since', 'he', 'its', 'therefore', 'how', 'rather', 'at', 'are', 'done', 'whereafter', 'over', 'below', 'take', 'whom', 'nine', 'nowhere', 'around', 'off', 'doing', 'amongst', 'eleven', 'call', 'meanwhile', 'not', 'his', 'if', 'an', 'used', 'beyond', 'throughout', 'many', 'upon', 'most', 'down', 'ours', 'otherwise', 'keep', 'nothing', 'beside', 'top', 'made', 'to', 'being', 'up', 'whereupon', 'a', 'further', 'get', 'once', 'twenty', 'mine', 'no', 'been', 'me', 'six', 'anyhow', 'had', 'does', 'indeed', 'into', 'or', 'they', 'under', 'somehow', 'very', 'besides', 'amount', 'although', 'hence', 'less', 'may', 'whe

In [5]:
df = pd.read_csv('data/product_tweets.csv',encoding='latin1')

In [6]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [7]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

In [8]:
df = df.rename(columns= {'is_there_an_emotion_directed_at_a_brand_or_product'
                         :'Emotion','emotion_in_tweet_is_directed_at': 'Platform'})


In [9]:
df = df.rename(columns= {'tweet_text': 'Tweet'})

In [10]:
df.head() # want to remove the @'name' in the tweet 

Unnamed: 0,Tweet,Platform,Emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [11]:
df_dummify = pd.get_dummies(df['Emotion'])

In [12]:
df_dummify.head()

Unnamed: 0,I can't tell,Negative emotion,No emotion toward brand or product,Positive emotion
0,0,1,0,0
1,0,0,0,1
2,0,0,0,1
3,0,1,0,0
4,0,0,0,1


In [13]:
df_dummify.sum() # class bias 

I can't tell                           156
Negative emotion                       570
No emotion toward brand or product    5389
Positive emotion                      2978
dtype: int64

In [14]:
df.info()
df = pd.merge(df, df_dummify, how='outer',on=df.index) # ran this code, dummify emotion data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
Tweet       9092 non-null object
Platform    3291 non-null object
Emotion     9093 non-null object
dtypes: object(3)
memory usage: 213.2+ KB


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9093 entries, 0 to 9092
Data columns (total 8 columns):
key_0                                 9093 non-null int64
Tweet                                 9092 non-null object
Platform                              3291 non-null object
Emotion                               9093 non-null object
I can't tell                          9093 non-null uint8
Negative emotion                      9093 non-null uint8
No emotion toward brand or product    9093 non-null uint8
Positive emotion                      9093 non-null uint8
dtypes: int64(1), object(3), uint8(4)
memory usage: 390.7+ KB


In [16]:
df.head()

Unnamed: 0,key_0,Tweet,Platform,Emotion,I can't tell,Negative emotion,No emotion toward brand or product,Positive emotion
0,0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,1,0,0
1,1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,0,0,1
2,2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,0,0,1
3,3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,1,0,0
4,4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,0,0,1


In [17]:
df = df.rename(columns = {"I can't tell": "Uncertain", 'Negative emotion': 'Negative'
                          , 'No emotion toward brand or product': 'No Emotion'
                          , 'Positive emotion':'Positive'})

In [18]:
df = df.drop(columns='key_0')
df.head()

Unnamed: 0,Tweet,Platform,Emotion,Uncertain,Negative,No Emotion,Positive
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,0,1,0,0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,0,0,0,1
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,0,0,0,1
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,0,1,0,0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,0,0,0,1


In [19]:
corpus = list(df['Tweet'])
corpus[:10]

['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
 "@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
 '@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.',
 "@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw",
 "@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",
 '@teachntech00 New iPad Apps For #SpeechTherapy And Communication Are Showcased At The #SXSW Conference http://ht.ly/49n4M #iear #edchat #asd',
 nan,
 '#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan',
 'Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.l

### Tokenize the Words

In [20]:
tokenz = word_tokenize(','.join(str(v) for v in corpus))

In [21]:
tokenz[:10]

['.', '@', 'wesley83', 'I', 'have', 'a', '3G', 'iPhone', '.', 'After']

### Create Stopwords List

In [22]:
stopword_list = list(nlp.Defaults.stop_words)
len(nlp.Defaults.stop_words)

305

In [23]:
stopword_list

['regarding',
 'seeming',
 'whole',
 'yours',
 'then',
 'have',
 'thereby',
 'thru',
 'for',
 'whoever',
 'becoming',
 'though',
 'next',
 'toward',
 'everyone',
 'now',
 're',
 'somewhere',
 'become',
 'and',
 'every',
 'latterly',
 'thereupon',
 'just',
 'yourself',
 'cannot',
 'with',
 'towards',
 'would',
 'another',
 'here',
 'hereafter',
 'since',
 'he',
 'its',
 'therefore',
 'how',
 'rather',
 'at',
 'are',
 'done',
 'whereafter',
 'over',
 'below',
 'take',
 'whom',
 'nine',
 'nowhere',
 'around',
 'off',
 'doing',
 'amongst',
 'eleven',
 'call',
 'meanwhile',
 'not',
 'his',
 'if',
 'an',
 'used',
 'beyond',
 'throughout',
 'many',
 'upon',
 'most',
 'down',
 'ours',
 'otherwise',
 'keep',
 'nothing',
 'beside',
 'top',
 'made',
 'to',
 'being',
 'up',
 'whereupon',
 'a',
 'further',
 'get',
 'once',
 'twenty',
 'mine',
 'no',
 'been',
 'me',
 'six',
 'anyhow',
 'had',
 'does',
 'indeed',
 'into',
 'or',
 'they',
 'under',
 'somehow',
 'very',
 'besides',
 'amount',
 'althoug

In [24]:
stopword_list.extend(string.punctuation)

In [25]:
len(stopword_list)

337

In [26]:
stopword_list.extend(stopwords.words('english'))

In [27]:
len(stopword_list)

516

In [28]:
additional_punc = ['“','”','...',"''",'’','``','https','rt','\.+']
stopword_list.extend(additional_punc)
stopword_list[-10:]

["wouldn't", '“', '”', '...', "''", '’', '``', 'https', 'rt', '\\.+']

### Remove stopwords and additional punctuation from the data

In [29]:
stopped_tokenz = [word.lower() for word in tokenz if word.lower() not in stopword_list]

In [30]:
freq = FreqDist(stopped_tokenz)
freq.most_common(50)

[('sxsw', 9414),
 ('mention', 7120),
 ('link', 4313),
 ('google', 2592),
 ('ipad', 2431),
 ('apple', 2300),
 ('quot', 1696),
 ('iphone', 1513),
 ('store', 1469),
 ("'s", 1236),
 ('2', 1114),
 ('new', 1087),
 ('austin', 959),
 ('amp', 836),
 ('app', 810),
 ('launch', 653),
 ('circles', 651),
 ('social', 647),
 ('android', 574),
 ('today', 574),
 ("n't", 481),
 ('network', 465),
 ('ipad2', 457),
 ('pop-up', 420),
 ('line', 402),
 ('free', 387),
 ('called', 361),
 ('party', 346),
 ('sxswi', 340),
 ('mobile', 338),
 ('major', 301),
 ('like', 290),
 ('time', 271),
 ("'re", 265),
 ('temporary', 264),
 ('opening', 257),
 ("'m", 254),
 ('possibly', 240),
 ('people', 226),
 ('downtown', 225),
 ('apps', 224),
 ('great', 222),
 ('maps', 219),
 ('going', 217),
 ('check', 215),
 ('mayer', 214),
 ('day', 214),
 ('open', 210),
 ('popup', 209),
 ('need', 205)]

### Lemmatize the Data and use Regex to find and remove URL's, Tags, other misc

In [31]:
additional_misc = ['sxsw','mention',r'[a-zA-Z]+\'?s]',r"(http[s]?://\w*\.\w*/+\w+)"
                   ,r'\#\w*',r'RT [@]?\w*:',r'\@\w*',r"\d$",r"^\d"
                   ,r"([a-zA-Z]+(?:'[a-z]+)?)",r'\d.',r'\d','RT'] #[A-Z]{2,20} remove caps like MAGA and CDT
stopword_list.extend(additional_misc)
stopword_list.extend(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])

In [32]:
lemmatizer = WordNetLemmatizer()

In [33]:
clean_stopped_tokenz = [word.lower() for word in stopped_tokenz if word not in stopword_list]
clean_lemmatized_tokenz = [lemmatizer.lemmatize(word.lower()) for word in stopped_tokenz if word not in stopword_list]

In [34]:
freq_clean_lemma = FreqDist(clean_lemmatized_tokenz)
freq_lemma = freq_clean_lemma.most_common(5000)
freq_lemma2 = freq_clean_lemma.most_common(25)

In [35]:
total_word_count = len(clean_lemmatized_tokenz)

In [36]:
lemma_word_count = sum(freq_clean_lemma.values()) # just a number

In [37]:
for word in freq_lemma2:
    normalized_freq = word[1] / lemma_word_count
    print(word, "----", "{:.3f}".format(normalized_freq*100),"%")

('link', 4324) ---- 4.865 %
('google', 2593) ---- 2.918 %
('ipad', 2431) ---- 2.735 %
('apple', 2303) ---- 2.591 %
('quot', 1696) ---- 1.908 %
('iphone', 1513) ---- 1.702 %
('store', 1508) ---- 1.697 %
("'s", 1236) ---- 1.391 %
('new', 1087) ---- 1.223 %
('austin', 960) ---- 1.080 %
('amp', 836) ---- 0.941 %
('app', 810) ---- 0.911 %
('launch', 691) ---- 0.777 %
('circle', 666) ---- 0.749 %
('social', 647) ---- 0.728 %
('android', 574) ---- 0.646 %
('today', 574) ---- 0.646 %
("n't", 481) ---- 0.541 %
('network', 473) ---- 0.532 %
('ipad2', 457) ---- 0.514 %
('line', 439) ---- 0.494 %
('pop-up', 422) ---- 0.475 %
('free', 387) ---- 0.435 %
('party', 386) ---- 0.434 %
('called', 361) ---- 0.406 %


In [None]:
# from wordcloud import WordCloud

# ## Initalize a WordCloud with our stopwords_list and no bigrams
# wordcloud = WordCloud(stopwords=stopword_list,collocations=False)

# ## Generate wordcloud from stopped_tokens
# wordcloud.generate(','.join(clean_lemmatized_tokenz))

# ## Plot with matplotlib
# plt.figure(figsize = (12, 12), facecolor = None) 
# plt.imshow(wordcloud) 
# plt.axis('off')

In [38]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
tweet_finder = nltk.BigramCollocationFinder.from_words(clean_lemmatized_tokenz)
tweets_scored = tweet_finder.score_ngrams(bigram_measures.raw_freq)

In [39]:
pd.DataFrame(tweets_scored, columns=["Word","Freq"]).head(10)

Unnamed: 0,Word,Freq
0,"(apple, store)",0.006639
1,"(social, network)",0.005131
2,"(new, social)",0.004703
3,"(link, google)",0.003769
4,"(google, launch)",0.003736
5,"(network, called)",0.003679
6,"(called, circle)",0.003499
7,"(today, link)",0.003342
8,"(major, new)",0.003263
9,"(iphone, app)",0.003241


In [40]:
tweet_pmi_finder = nltk.BigramCollocationFinder.from_words(clean_lemmatized_tokenz)
tweet_pmi_finder.apply_freq_filter(5)

tweet_pmi_scored = tweet_pmi_finder.score_ngrams(bigram_measures.pmi)

In [41]:
pd.DataFrame(tweet_pmi_scored, columns=["Words","PMI"]).head(10)

Unnamed: 0,Words,PMI
0,"(û÷sxsw, goûª)",14.117562
1,"(jc, penney)",13.854528
2,"(knitted, staircase)",13.854528
3,"(naomi, campbell)",13.854528
4,"(parking, 5-10)",13.854528
5,"(pauly, celebs)",13.854528
6,"(98, accuracy)",13.632135
7,"(aron, pilhofer)",13.632135
8,"(charlie, sheen)",13.632135
9,"(lynn, teo)",13.632135


In [None]:
df.to_csv('Cleaned_Tweets.csv)
        

### Vectorize the two sets, lemmatized and not lemmatized

In [None]:
4 y columns 

In [44]:
vectorizer = TfidfVectorizer(stop_words=stopword_list,decode_error='ignore')

# Vectorize data and make X_train_tfidf and X_test_tfidf
X_train_tfidf = vectorizer.fit_transform(clean_lemmatized_tokenz)
# X_test_tfidf = vectorizer.transform(X_test)
# X_train_tfidf#.todense()

In [45]:
X_train_tfidf

<88875x8846 sparse matrix of type '<class 'numpy.float64'>'
	with 85700 stored elements in Compressed Sparse Row format>

In [None]:
twt_token = TweetTokenizer()

# pattern = r"([a-zA-Z]+(?:'[a-z]+)?)"

# for sentence in list(df['Tweet']):
#     for word in str(sentence):
#         word_tokenize(word.lower())

In [None]:
# tokens = regexp_tokenize(','.join(str(v) for v in corpus), pattern=r"\w+")

In [None]:
# tokens

In [None]:
len(set(stopped_tokenz))

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
tweet_finder = nltk.BigramCollocationFinder.from_words(stopped_tokenz)
tweets_scored = tweet_finder.score_ngrams(bigram_measures.raw_freq)

In [None]:
pd.DataFrame(tweets_scored, columns=['Words','Freq']).head(10)

### Remove punctuation, numbers, uppercase letters, tags, #hashtags, stopwords, and numbers

In [None]:
# # create a function that takes a dataframe column input and cleans that column
# def clean_column(i=(0,len(corpus)-1)):
#     from nltk.corpus import stopwords
#     import string
#     from nltk import word_tokenize,regexp_tokenize
    
#     print(f"- Tweet #{i}:\n")
#     print(corpus[i],'\n')
#     tokens = word_tokenize(corpus[i])

#     # Get all the stop words in the English language
#     stopwords_list = stopwords.words('english')
#     stopwords_list += list(nlp.Defaults.stop_words)
#     stopwords_list += string.punctuation
#     stopwords_list += additional_punc
#     stopped_tokenz = [w.lower() for w in tokenz if w.lower() not in stopwords_list]
    
#     print(tokenz,end='\n\n')
#     print(stopped_tokenz)
                
# # run function



In [None]:
# clean_column(df['Tweet'])

In [None]:
tokens = word_tokenize(','.join(corpus))