In [1]:
import numpy as np
import pandas as pd
import re, emoji, string
from nltk.corpus import stopwords
import nltk, time
from biterm.cbtm import oBTM
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
import gensim

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Load the pre-processed travel-ban tweets, then show the first rows and the frame's shape.
# NOTE(review): the path is absolute and machine-specific; the notebook only runs where it exists.
csv_path = '/home/nguyen/data/processed_travel_ban.csv'
data = pd.read_csv(csv_path)
print(data.head(), data.shape, sep='\n')

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# Normalise the raw tweets: strip the @MENTION / @URL / @EMAIL placeholders, lowercase,
# collapse repeated spaces, drop a leading "rt" marker and a leading colon, then remove emoji.
def _clean_tweet(text):
    """Return `text` with placeholders, leading retweet marker/colon and emoji characters removed."""
    for placeholder in ('@MENTION', "@URL", "@EMAIL"):
        text = text.replace(placeholder, "")
    text = text.lower()
    text = re.sub("  +", " ", text)        # runs of two or more spaces -> one space
    text = re.sub("^ ?(rt ?)+", "", text)  # one or more leading "rt" tokens
    text = re.sub('^( ?: ?)', '', text)    # leading colon with optional surrounding space
    # NOTE(review): this per-character test assumes emoji.UNICODE_EMOJI is keyed by the emoji
    # characters themselves -- confirm against the installed `emoji` package version.
    return ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)


data['Tweet'] = data['Tweet'].apply(_clean_tweet)
print(data.head())

                   Id                                              Tweet
0  824941360449015808  emergency rally against trump's muslim travel ...
1  824941519857610752  theresa may has not apologized to trump for in...
2  824941616314122240  trump's immigration ban excludes countries wit...
3  824942056741167105  trump's immigration order expands the definiti...
4  824942966875774976  alert : senator john mccain threatens action o...


In [4]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')

In [None]:

# data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [5]:
# Build the 'Tweet1' column from 'Tweet': drop English stop words, delete ASCII punctuation
# and typographic quote/ellipsis/dash characters, then collapse and trim whitespace.
stopWords = stopwords.words('english')
# Set copy used for the per-token membership test; scanning the list for every token of every
# tweet is needlessly slow. `stopWords` itself is left as the list NLTK returns.
stopWordSet = set(stopWords)
# Translation table that deletes every character in string.punctuation; built once, not per row.
punctTable = str.maketrans('', '', string.punctuation)

data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWordSet))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(punctTable))
# Curly quotes, ellipsis and em dash are not part of string.punctuation, so remove them separately.
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

In [6]:
# Count how often every token occurs across all cleaned tweets and list the tokens by
# descending frequency.
cv = CountVectorizer()
cv_fit = cv.fit_transform(list(data['Tweet1']))
# NOTE(review): get_feature_names() is gone in recent scikit-learn releases
# (get_feature_names_out() replaces it) -- confirm the pinned version.
word_list = cv.get_feature_names()
# Sum the sparse document-term matrix column-wise directly. Calling .toarray() first would
# allocate a dense (documents x vocabulary) array only to reduce it to one row.
count_list = np.asarray(cv_fit.sum(axis=0)).ravel()
wCount = dict(zip(word_list, count_list))
textCount = sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount)

48886 48886


In [7]:
# Remove the generic words listed in topicCommonWords, and any token that contains
# non-ASCII characters, from every cleaned tweet.
#
# A longer alternative list, currently disabled, is kept here for reference:
# topicCommonWords = ['trump', 'trumps', 'ban', 'muslim', 'people', 'refugees', 'us', 'immigration',
#                     'muslimban', 'travel', 'countries', 'donald', 'via', 'muslims', 'world', 'news', 
#                     'order', 'white', 'president', 'america', 'americans',
#                     'like', 'https', 'htt', 'get', 'would', 'im', 'know', 'say',
#                    'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
#                    '–', 'much', 'many', 'feel', 'go', 'take', 'like', 'hate']
topicCommonWords = ['like', 'https', 'htt', 'get', 'would', 'im', 'know', 'says',
                   'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
                   '–', 'much', 'many', 'feel', 'go', 'take', 'like', 'hate', 'news']


def _keep_token(token):
    """Return True when `token` is not a listed common word and is pure ASCII."""
    return (token not in topicCommonWords) and token.isascii()


data['Tweet1'] = data['Tweet1'].apply(
    lambda tweet: ' '.join(filter(_keep_token, tweet.split(" ")))
)

In [8]:
# Remove tweets whose number of unique words is at most half of their total word count.
# 'nWords' holds the word count of a tweet, or 0 for tweets that fail this check.
def _word_count_or_zero(text):
    """Return the number of space-separated words, or 0 if unique words / words <= 0.5."""
    words = text.split(" ")  # split once instead of three times per tweet
    return 0 if len(set(words)) / len(words) <= 0.5 else len(words)


data['nWords'] = data['Tweet1'].apply(_word_count_or_zero)
# .copy() makes the filtered frame independent of the original, so the column assignments
# performed in later cells do not operate on a slice (SettingWithCopyWarning).
data = data[data['nWords'] != 0].copy()
print(data.shape)

(123343, 4)


In [9]:
# Remove tweets whose cleaned text ('Tweet1') duplicates an earlier row; the first one is kept.
# Rebinding the result instead of using inplace=True avoids mutating a frame that was
# produced by boolean filtering and keeps the cell safe to re-run.
data = data.drop_duplicates(subset=['Tweet1'], keep='first')
print(data.shape)

(104690, 4)


In [10]:
# Restrict tweets to a frequency-filtered vocabulary and remove tweets with fewer than 3 words.
# Dropping short tweets can push some words below min_df, and dropping those words can make
# more tweets too short, so the two filters are repeated until the vocabulary stops changing.

# Initial vocabulary: words in at least 10 tweets and in at most 3.5% of tweets,
# minus scikit-learn's English stop words.
cv = CountVectorizer(stop_words='english', min_df = 10, max_df = 0.035) 
cv_fit = cv.fit(list(data['Tweet1']))
vocab = set(cv.get_feature_names())
# assign() returns a new frame, so the update never writes into a slice of an earlier frame.
data = data.assign(
    Tweet1=data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in vocab))
)
while True:
    # An empty string still counts as one "word" here, which is below the threshold anyway.
    data['nWords'] = data['Tweet1'].apply(lambda x: len(x.split(" ")))
    # .copy() keeps the filtered frame independent so the column assignments below are safe.
    data = data[data['nWords'] > 2].copy()
    # max_df is not re-applied: the overly frequent words were already removed from the text.
    cv = CountVectorizer(stop_words='english', min_df = 10)  
    cv.fit(list(data['Tweet1']))
    newVocab = set(cv.get_feature_names())
    
    print("Len: ", len(newVocab))
    print("data.shape: ", data.shape)
    # Converged once the vocabulary itself is unchanged (compares contents, not just sizes).
    if vocab == newVocab:
        break
    data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in newVocab))
    vocab = newVocab.copy()

Len:  7977
data.shape:  (101157, 4)
Len:  7974
data.shape:  (101128, 4)
Len:  7974
data.shape:  (101128, 4)


In [11]:
texts = [x.split(" ") for x in data['Tweet1']]
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.3)

corpus = [dictionary.doc2bow(listWords) for listWords in texts]

num_topics = 20
%time
lda = models.LdaModel(corpus, num_topics = num_topics,
                     id2word = dictionary,
                     passes = 10, alpha=[0.01]*num_topics, 
                                  eta=[0.01]*len(dictionary.keys()), random_state = 10)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.53 µs


In [12]:
# Bag-of-words encoding of every tokenised tweet.
# NOTE(review): this is the same expression that builds `corpus` in the LDA training cell,
# so it appears to be redundant -- confirm before removing.
corpus = [dictionary.doc2bow(text) for text in texts]

In [13]:
# Number of tokens left in the gensim dictionary after filter_extremes.
len(dictionary)

7974

In [14]:
# Print the 10 highest-weighted words of every topic, one topic per paragraph.
topic_rows = lda.show_topics(formatted=True, num_topics=num_topics, num_words=10)
for topic_id, topic_words in topic_rows:
    # The trailing newline produces the blank separator line between topics.
    print(f"{topic_id}: {topic_words}\n")

0: 0.042*"years" + 0.041*"family" + 0.035*"nationwide" + 0.032*"students" + 0.026*"democrats" + 0.026*"year" + 0.023*"true" + 0.022*"woman" + 0.022*"laws" + 0.021*"allowed"

1: 0.133*"protest" + 0.040*"stand" + 0.039*"thousands" + 0.032*"presidents" + 0.030*"iraqi" + 0.027*"google" + 0.027*"seattle" + 0.027*"employees" + 0.024*"right" + 0.019*"mom"

2: 0.086*"obama" + 0.066*"americans" + 0.041*"advisory" + 0.032*"banned" + 0.029*"list" + 0.028*"rights" + 0.027*"states" + 0.027*"starbucks" + 0.027*"saudi" + 0.024*"human"

3: 0.056*"support" + 0.049*"safe" + 0.045*"temporary" + 0.040*"defend" + 0.032*"work" + 0.027*"rally" + 0.026*"open" + 0.025*"yes" + 0.021*"aclu" + 0.019*"act"

4: 0.055*"american" + 0.045*"legal" + 0.040*"green" + 0.038*"affected" + 0.038*"vetting" + 0.037*"card" + 0.034*"come" + 0.033*"islam" + 0.032*"justice" + 0.025*"supremacist"

5: 0.094*"muslims" + 0.058*"terrorist" + 0.052*"general" + 0.051*"attorney" + 0.030*"freedom" + 0.027*"thats" + 0.026*"religion" + 0.022

In [15]:
# Determine, for every document, the topic with the highest probability and that probability.
topics, probs = [], []
for doc_topics in lda[corpus]:
    # doc_topics is a sequence of (topic id, probability) pairs; max() returns the first
    # pair with the largest probability, the same pair a descending stable sort puts first.
    best_topic, best_prob = max(doc_topics, key=lambda pair: pair[1])
    topics.append(best_topic)
    probs.append(round(best_prob, 4))

In [16]:
# Attach the dominant topic id and its probability to each tweet.
# The lists are assigned positionally, so they must be in the same row order as `data`.
data['topic'] = topics
data['prob'] = probs

In [17]:
# Preview the first rows, now including the 'topic' and 'prob' columns.
data.head()

Unnamed: 0,Id,Tweet,Tweet1,nWords,topic,prob
0,824941360449015808,emergency rally against trump's muslim travel ...,emergency rally nyc 125 pm,5,13,0.7167
1,824941519857610752,theresa may has not apologized to trump for in...,theresa insulting fails today send,5,16,0.3045
2,824941616314122240,trump's immigration ban excludes countries wit...,excludes business ties,3,7,0.9406
3,824942056741167105,trump's immigration order expands the definiti...,expands definition criminal,3,15,0.6281
4,824942966875774976,alert : senator john mccain threatens action o...,alert senator john mccain threatens action,6,18,0.3242


In [18]:
# For every dominant topic, keep the single tweet with the highest topic probability.
data_topic = data.groupby('topic')

# Collect the per-topic rows first and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on every iteration, and
# seeding it with an empty frame can disturb the column dtypes.
top_rows = [
    group.sort_values(['prob'], ascending=False).head(1)
    for _, group in data_topic
]
# pd.concat raises on an empty list, so fall back to an empty frame when there are no groups.
sent_topic_sorted = pd.concat(top_rows, axis=0) if top_rows else pd.DataFrame()

In [19]:
# Display the highest-probability tweet of each topic (one row per topic).
sent_topic_sorted

Unnamed: 0,Id,Tweet,Tweet1,nWords,topic,prob
1559,825077176219402240,trump plans to designate the muslim brotherhoo...,plans designate brotherhood terrorist organiza...,5,0,0.9635
49579,825824815101521920,"across the country , thousands protest trump's...",country thousands protest thousands,4,1,0.9548
405,824997991958323200,"911 terrorists were from saudi arabia , united...",911 terrorists saudi arabia united arab emirat...,12,2,0.9844
1190,825053075761090560,sheryl sandberg criticizes trump’s ban on glob...,sheryl sandberg criticizes global abortion fun...,8,3,0.9736
29743,825568945801412608,lazer legal immigrants built this country ! n...,legal immigrants built country come illegally ...,8,4,0.9635
5671,825240741484122112,i don't see how he's generalizing . he's not ...,hes generalizing hes saying muslims terrorists...,8,5,0.9736
18665,825479925893324801,"how many ctizens of syria , iraq , iran , sud...",syria iraq iran sudan somalia yemen libya appl...,9,6,0.9736
364,824992468059975680,immigration law is supposed to benefit america...,law supposed benefit supposed welfare world,6,7,0.9694
1976,825103717783441408,f*ck trump . and f*ck all he stands for . and ...,fck fck stands fck voted fck,6,8,0.9694
1481,825070079452835840,oh trump . you do make me laugh . #theresistan...,oh laugh theresistance whitehouse resist,5,9,0.9635


In [20]:
# Print the position, tweet id and lowercased tweet text of each topic's top tweet.
id_text_pairs = zip(sent_topic_sorted['Id'], sent_topic_sorted['Tweet'])
for position, (tweet_id, tweet_text) in enumerate(id_text_pairs):
    print("#{}. {}\t {}".format(position, tweet_id, tweet_text))

#0. 825077176219402240	 trump plans to designate the muslim brotherhood a terrorist organization 
#1. 825824815101521920	 across the country , thousands protest trump's " muslim ban " : thousands of people poured into the … | 
#2. 824997991958323200	 911 terrorists were from saudi arabia , united arab emirates & egypt : countries not on trump ' s banned list . cuz he has investments there !
#3. 825053075761090560	 sheryl sandberg criticizes trump’s ban on global abortion funding #fashion #style 
#4. 825568945801412608	  lazer legal immigrants built this country ! not those who come here illegally or with the intent 2 harm o … 
#5. 825240741484122112	  i don't see how he's generalizing . he's not saying " all muslims are terrorists " . neither is he saying " only in europe "
#6. 825479925893324801	  how many ctizens of syria , iraq , iran , sudan , somalia , yemen and libya does apple want to employ in the us ?
#7. 824992468059975680	 immigration law is supposed to benefit america . it 