## NLP exercise using Tableau

Take U.S. Election tweets between AUG and OCT Look at

*   Common words
  * for high number of likes
  * trump or biden
      * sentiment analysis (textblob gives you subjectivity as well)
* LDA topic analysis

In [21]:
import pandas as pd

df_orig = pd.read_csv('../data/us_election-subset20like.csv')
df_orig

Unnamed: 0,Created At,Hashtags,Id,Language,Link,Name,Reply To,Tweet,User Id,Username,F1,Likes Count,Replies Count,Retweets Count
0,2020-10-28 19:13:51 IST,['uselection'],1321447588688584704,en,https://twitter.com/mariawirth1/status/1321447...,Maria Wirth,"{'user_id': None, 'username': None}","In 2016, almost all Germans were against Trump...",1358917686,mariawirth1,118,119,4,20
1,2020-10-28 19:09:31 IST,"['covid', 'nagornokarabakh']",1321446498794569728,en,https://twitter.com/AliTahmizian/status/132144...,Alison Meuse,"{'user_id': None, 'username': None}",Azerbaijan's offensive may have been timed to ...,394593711,alitahmizian,141,25,3,13
2,2020-10-28 19:06:28 IST,[],1321445733904617472,en,https://twitter.com/rcolvile/status/1321445733...,Robert Colvile,"{'user_id': None, 'username': None}",US election veterans - what is the best strate...,18331985,rcolvile,157,20,34,1
3,2020-10-28 18:47:09 IST,['uselection'],1321440873113112576,en,https://twitter.com/SkyNews/status/13214408731...,SkyNews,"{'user_id': None, 'username': None}",Eyewitness: Voters with doubts about both cand...,7587032,skynews,243,20,19,3
4,2020-10-28 18:38:01 IST,[],1321438571585245184,en,https://twitter.com/VABVOX/status/132143857158...,Victoria Brownworth #AntiFascistVoter,"{'user_id': None, 'username': None}",Make it all blue. Build your own US election...,138168339,vabvox,299,24,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5045,2020-10-24 00:51:08 IST,[],1319720532607143936,en,https://twitter.com/talkRADIO/status/131972053...,talkRADIO,"{'user_id': None, 'username': None}","Following last night's presidential debate, wh...",3380282686,talkradio,20424,137,60,155
5046,2020-10-24 00:46:06 IST,[],1319719262580330496,en,https://twitter.com/thehill/status/13197192625...,The Hill,"{'user_id': None, 'username': None}",NASA astronaut Kate Rubins votes from space in...,1917731,thehill,20435,623,26,82
5047,2020-10-24 00:42:35 IST,[],1319718380841144320,en,https://twitter.com/akhilkom/status/1319718380...,Akhil,"{'user_id': '18690738', 'username': '_bikerchi...",@_bikerchick @dbarrett @businessinsider I live...,32863133,akhilkom,20444,20,0,0
5048,2020-10-24 00:40:17 IST,[],1319717802182348800,en,https://twitter.com/gathara/status/13197178021...,gathara,"{'user_id': None, 'username': None}","There's nothing normal about being forced to ""...",15659814,gathara,20455,82,8,35


In [22]:
df_orig['Hashtags'].value_counts().head(15)

[]                             4035
['uselection']                   81
['untrendoctober']               20
['uselection2020']               17
['election2020']                 14
['covid19']                      12
['justvote']                     12
['vote']                         11
['breaking']                     10
['newsnight']                     9
['debates2020']                   9
['gravitas']                      8
['uselection', 'kayburley']       7
['uselections2020']               7
['coronavirus']                   7
Name: Hashtags, dtype: int64

In [23]:
# Use spacy's tokenizer to preproccess the data
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_lg")
tokenizer = Tokenizer(nlp.vocab)

In [24]:
df = df_orig.copy()

## All words

In [25]:
stop_words = nlp.Defaults.stop_words.union(['election', 'election.', '&amp;', '#uselection', "it's", "election?", "election,", "|",
                                            "election:", "biden's", "biden:", "biden."])
# tokenizer pipe
tokens = []

for doc in tokenizer.pipe(df['Tweet'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if (token.text.lower() not in stop_words) & (token.is_punct == False) & (token.is_space is False):
            doc_tokens.append(token.lemma_.lower())
    tokens.append(doc_tokens)
    
df['tokens'] = tokens

In [26]:
from collections import Counter

word_count = Counter()
for doc in tokens:
    word_count.update(doc)
word_count.most_common(30)

[('trump', 926),
 ('vote', 798),
 ('biden', 472),
 ('win', 320),
 ('day', 312),
 ('say', 287),
 ('like', 260),
 ('2020', 259),
 ('people', 253),
 ('donald', 245),
 ('think', 242),
 ('go', 241),
 ('week', 239),
 ('president', 228),
 ('american', 220),
 ('time', 212),
 ('2020:', 206),
 ('joe', 204),
 ('know', 198),
 ('poll', 195),
 ('new', 194),
 ('want', 182),
 ('voter', 164),
 ('world', 162),
 ('debate', 160),
 ('twitter', 157),
 ('live', 156),
 ('facebook', 155),
 ('upcoming', 154),
 ('result', 154)]

## Tweets that mention Trump

In [27]:
# Let's see what words are associated with Trump and Biden
df_trump = df.copy()
df_trump = df_trump[df_trump['Tweet'].str.contains('Trump')]
tokens = []
for doc in tokenizer.pipe(df_trump['Tweet'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if (token.text.lower() not in stop_words) & (token.is_punct == False) & (token.is_space is False):
            doc_tokens.append(token.lemma_.lower())
    tokens.append(doc_tokens)
    
df_trump['trump_tokens'] = tokens

In [28]:
trump_count = Counter()
for doc in tokens:
    trump_count.update(doc)
trump_count.most_common(30)

[('trump', 894),
 ('biden', 298),
 ('donald', 241),
 ('win', 162),
 ('vote', 156),
 ('president', 151),
 ('2020:', 110),
 ('joe', 109),
 ('say', 103),
 ("trump's", 89),
 ('@realdonaldtrump', 87),
 ('debate', 87),
 ('poll', 71),
 ('day', 68),
 ('campaign', 66),
 ('presidential', 65),
 ('think', 55),
 ('trump’s', 55),
 ('go', 54),
 ('voter', 53),
 ('live', 52),
 ('like', 51),
 ('people', 50),
 ('lose', 50),
 ('want', 50),
 ('2020', 47),
 ('#trump', 46),
 ('trump.', 45),
 ('american', 45),
 ('week', 45)]

## Tweets that mention Biden

In [29]:
# Now for Biden
df_biden = df.copy()
df_biden = df_biden[df_biden['Tweet'].str.contains('Biden')]
tokens = []
for doc in tokenizer.pipe(df_biden['Tweet'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if (token.text.lower() not in stop_words) & (token.is_punct is False) & (token.is_space is False):
            doc_tokens.append(token.lemma_.lower())
    tokens.append(doc_tokens)
    
df_biden['biden_tokens'] = tokens

In [30]:
biden_count = Counter()
for doc in tokens:
    biden_count.update(doc)
biden_count.most_common(30)

[('biden', 453),
 ('trump', 303),
 ('joe', 193),
 ('vote', 119),
 ('win', 110),
 ('donald', 78),
 ('debate', 68),
 ('2020:', 63),
 ('president', 60),
 ('poll', 58),
 ('presidential', 50),
 ('say', 48),
 ('@joebiden', 42),
 ('campaign', 41),
 ('day', 37),
 ('live', 33),
 ('know', 30),
 ('science', 30),
 ('#trump', 28),
 ('2020', 28),
 ('@realdonaldtrump', 28),
 ('people', 27),
 ('🇺🇸', 27),
 ('state', 27),
 ('democratic', 26),
 ('like', 26),
 ('new', 26),
 ('trump:', 26),
 ('want', 26),
 ('covid', 24)]

In [16]:
# How about words with the top 10% of likes (from those above 20 likes)
df_top = df.copy()
df_top = df_top[df_top['Likes Count'] > df_top['Likes Count'].quantile(0.9)]
tokens = []
for doc in tokenizer.pipe(df_top['Tweet'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if (token.text.lower() not in stop_words) & (token.is_punct is False) & (token.is_space is False):
            doc_tokens.append(token.lemma_.lower())
    tokens.append(doc_tokens)
    
df_top['like_tokens'] = tokens

top_count = Counter()
for doc in tokens:
    top_count.update(doc)
top_count.most_common(30)

[('trump', 121),
 ('vote', 102),
 ('biden', 66),
 ('people', 38),
 ('say', 37),
 ('president', 37),
 ('joe', 33),
 ('like', 33),
 ('russian', 31),
 ('win', 30),
 ('day', 30),
 ('week', 29),
 ('foreign', 28),
 ('think', 26),
 ('twitter', 25),
 ('donald', 23),
 ('tell', 23),
 ('go', 22),
 ('new', 22),
 ('year', 22),
 ('want', 22),
 ('time', 21),
 ('american', 21),
 ('campaign', 20),
 ('help', 20),
 ('voter', 20),
 ('result', 20),
 ('world', 19),
 ('2020', 19),
 ('influence', 19)]

## Sentiment analysis using TextBlob

In [17]:
# !pip3 install -U textblob

In [25]:
from textblob import TextBlob

subj = lambda x: TextBlob(x).subjectivity
senta = lambda x: TextBlob(x).polarity

df['Subjectivity'] = df['Tweet'].apply(subj)
df['Sentiment'] = df['Tweet'].apply(senta)

df_trump['Subjectivity'] = df_trump['Tweet'].apply(subj)
df_trump['Sentiment'] = df_trump['Tweet'].apply(senta)

df_biden['Subjectivity'] = df_biden['Tweet'].apply(subj)
df_biden['Sentiment'] = df_biden['Tweet'].apply(senta)


In [35]:
print("Overall,", df['Sentiment'].mean())
print("Trump,", df_trump['Sentiment'].mean())
print("Biden, ", df_biden['Sentiment'].mean())

Overall, 0.0835517950935405
Trump, 0.09077470197483492
Biden, 0.10920995477507763


In [33]:
print("Overall", df['Subjectivity'].mean())
print("Trump", df_trump['Subjectivity'].mean())
print("Biden", df_biden['Subjectivity'].mean())

Overall 0.3652150355861506
Trump 0.38315128055326403
Biden 0.39690158037524315


In [49]:
# Now think do you want to scatter just the mean or do you want to try and scatter all the scores?
df['Tweet'].apply(lambda x: TextBlob(x).sentiment)

0          (0.1259259259259259, 0.2851851851851852)
1                                        (0.0, 0.0)
2        (0.26666666666666666, 0.39999999999999997)
3                                        (0.5, 0.5)
4                         (0.4666666666666666, 0.5)
                           ...                     
5045     (0.26666666666666666, 0.18888888888888888)
5046                                     (0.0, 0.0)
5047     (0.11212121212121212, 0.30833333333333335)
5048    (-0.03750000000000001, 0.46249999999999997)
5049    (-0.09722222222222221, 0.10555555555555556)
Name: Tweet, Length: 5050, dtype: object

In [53]:
df_scatter = pd.DataFrame(data=[[0.0835517950935405, 0.3652150355861506], [0.09077470197483492, 0.38315128055326403], [0.10920995477507763, 0.39690158037524315]],
                         index=["Overall", "Trump", "Biden"], columns=['Sentiment', 'Subjectivity'])
df_scatter#.plot.scatter(x=df_scatter.Sentiment, y=df_scatter.Subjectivity)

Unnamed: 0,Sentiment,Subjectivity
Overall,0.083552,0.365215
Trump,0.090775,0.383151
Biden,0.10921,0.396902


In [61]:
# Export to use in Tableau
# df_trump['Tweet'].to_csv('../data/dftrump.csv')
# df_biden['Tweet'].to_csv('../data/dfbiden.csv')

## LDA topic modeling

In [6]:
# You can use the bag of words method or TF-IDF for topic modeling. I will just use BoW for this exercise
import gensim


In [33]:
stop_words_extra = ('trump', "trump's", "trump.", "trump\'s", 'joe', 'biden', 'donald', '2020:')

trump_docs = []
for tokens in df_trump['trump_tokens']:
    trump_list = []
    for token in tokens:
        if token not in stop_words_extra:
            trump_list.append(token)
    trump_docs.append(trump_list)

In [34]:
trump_words = gensim.corpora.Dictionary(trump_docs)
trump_corpus = [trump_words.doc2bow(doc) for doc in trump_docs]

In [35]:
lda_model = gensim.models.ldamodel.LdaModel(
                corpus=trump_corpus,
                id2word=trump_words,
                num_topics=10,
                random_state=42,
                update_every=1,
                passes=10,
                alpha='auto',
                per_word_topics=True
                )

In [36]:
for topic in lda_model.show_topics():
    print(topic)

(0, '0.007*"win" + 0.006*"lose" + 0.005*"think" + 0.005*"vote" + 0.005*"go" + 0.004*"know" + 0.003*"say" + 0.003*"want" + 0.003*"uk" + 0.003*"hope"')
(1, '0.009*"trump:" + 0.006*"vote" + 0.005*"win" + 0.004*"chance" + 0.004*"attack" + 0.004*"2020" + 0.003*"day" + 0.003*"college" + 0.003*"electoral" + 0.003*"russian"')
(2, '0.006*"president" + 0.005*"campaign" + 0.004*"announce" + 0.004*"state" + 0.004*"@realdonaldtrump" + 0.003*"week" + 0.003*"vote" + 0.003*"foreign" + 0.003*"hold" + 0.003*"talk"')
(3, '0.010*"vote" + 0.007*"debate" + 0.007*"presidential" + 0.005*"president" + 0.005*"campaign" + 0.005*"win" + 0.005*"day" + 0.005*"@realdonaldtrump" + 0.003*"🇺🇸" + 0.003*"#trump"')
(4, '0.007*"poll" + 0.005*"say" + 0.004*"debate" + 0.004*"watch" + 0.003*"follow" + 0.003*"president" + 0.003*"day" + 0.003*"white" + 0.003*"lead" + 0.003*"people"')
(5, '0.006*"say" + 0.005*"president" + 0.005*"win" + 0.005*"american" + 0.004*"poll" + 0.004*"@realdonaldtrump" + 0.004*"supreme" + 0.004*"2" + 0.

In [37]:
biden_docs = []
for tokens in df_biden['biden_tokens']:
    biden_list = []
    for token in tokens:
        if token not in stop_words_extra:
            biden_list.append(token)
    biden_docs.append(trump_list)
    
biden_words = gensim.corpora.Dictionary(biden_docs)
biden_corpus = [biden_words.doc2bow(doc) for doc in biden_docs]

lda_model = gensim.models.ldamodel.LdaModel(
                corpus=biden_corpus,
                id2word=biden_words,
                num_topics=10,
                random_state=42,
                update_every=1,
                passes=10,
                alpha='auto',
                per_word_topics=True
                )
for topic in lda_model.show_topics():
    print(topic)

(0, '0.049*"american" + 0.048*"incumbent" + 0.047*"dakota" + 0.047*"announce" + 0.047*"&lt;-" + 0.046*"south" + 0.046*"in:" + 0.046*"normalize" + 0.045*"relation" + 0.045*"peace"')
(1, '0.047*"dakota" + 0.047*"normalize" + 0.046*"relevance." + 0.046*"achievement" + 0.046*"incumbent" + 0.046*"relation" + 0.046*"in:" + 0.046*"agree" + 0.046*"american" + 0.045*"@jim_jordan"')
(2, '0.047*"@jim_jordan" + 0.046*"in:" + 0.046*"north" + 0.046*"foreign" + 0.046*"here." + 0.046*"announce" + 0.046*"dying." + 0.046*"week" + 0.046*"achievement" + 0.046*"&lt;-"')
(3, '0.051*"agree" + 0.049*"broke" + 0.048*"peace" + 0.048*"dakota" + 0.048*"dying." + 0.047*"relation" + 0.046*"normalize" + 0.046*"announce" + 0.046*"week" + 0.046*"american"')
(4, '0.046*"relevance." + 0.046*"dying." + 0.046*"achievement" + 0.046*"normalize" + 0.046*"incumbent" + 0.046*"in:" + 0.046*"agree" + 0.046*"@jim_jordan" + 0.046*"foreign" + 0.046*"broke"')
(5, '0.047*"relevance." + 0.046*"north" + 0.046*"week" + 0.046*"agree" + 0