In [1]:
import numpy as np
import pandas as pd
import re, emoji, string
from nltk.corpus import stopwords
import nltk, time
from biterm.cbtm import oBTM
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary
from nltk.stem import WordNetLemmatizer
import pickle

In [2]:
# read data
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head())
print(data.shape)

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())
print(data.head())

                   Id                                              Tweet
0  824941360449015808  emergency rally against trump's muslim travel ...
1  824941519857610752  theresa may has not apologized to trump for in...
2  824941616314122240  trump's immigration ban excludes countries wit...
3  824942056741167105  trump's immigration order expands the definiti...
4  824942966875774976  alert : senator john mccain threatens action o...


In [4]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')

# data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [5]:
stop_words = stopwords.words('english')
stop_words.extend(['like', 'https', 'htt', 'get', 'would', 'im', 'know', 'says',
                   'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
                   '–', 'much', 'many', 'feel', 'go', 'take', 'like', 'hate', 'news', 'rt'])
for item in 'abcdefghijklmnopqrstuvwxyz':
    stop_words.append(item)

In [6]:
# remove stopwords, punctuation

data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stop_words))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

In [7]:
data['Tweet1'].head()

0    emergency rally trumps muslim travel ban nyc 1...
1    theresa may apologized trump insulting fails t...
2    trumps immigration ban excludes countries busi...
3    trumps immigration order expands definition cr...
4    alert senator john mccain threatens action pre...
Name: Tweet1, dtype: object

In [8]:
# view most frequent words
cv = CountVectorizer()  
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)   
wCount = dict(zip(word_list,count_list))
textCount =  sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount[0:100])

48879 48879
[('trump', 36613), ('ban', 28016), ('trumps', 20850), ('muslim', 13374), ('people', 13119), ('order', 12963), ('us', 12287), ('refugees', 12088), ('immigration', 11977), ('muslimban', 10531), ('travel', 7473), ('countries', 7232), ('president', 7037), ('executive', 6266), ('donald', 6231), ('america', 5920), ('via', 5788), ('refugee', 5480), ('white', 5016), ('muslims', 4255), ('obama', 3883), ('world', 3851), ('protest', 3677), ('state', 3436), ('new', 3111), ('country', 3004), ('one', 2633), ('visit', 2627), ('immigrants', 2405), ('support', 2394), ('banned', 2354), ('may', 2314), ('house', 2269), ('quebec', 2247), ('americans', 2242), ('acting', 2229), ('uk', 2223), ('mosque', 2138), ('right', 2136), ('american', 2123), ('general', 2117), ('terrorist', 2108), ('iran', 2071), ('judge', 2070), ('breaking', 2056), ('attorney', 1998), ('airport', 1954), ('petition', 1921), ('yates', 1898), ('orders', 1883), ('nobannowall', 1867), ('media', 1803), ('say', 1779), ('im', 1753),

In [9]:
data['len'] = data['Tweet1'].apply(lambda x: len(x.split(" ")))
data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))

In [10]:
# remove tweets #unique words less than haft of length
# data['len'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['#uniWord']/data['len']>0.5]
print(data.shape)

(123341, 5)


In [11]:
# remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.shape)

(104687, 5)


In [12]:
# remove tweets with lengths < 3
cv = CountVectorizer(stop_words='english', min_df = 10, max_df = 0.035) 
cv_fit = cv.fit(list(data['Tweet1']))
vocab = set(cv.get_feature_names())
data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in vocab))
while True:
    data['len'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
    data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))
    data = data[data['len'] >2]
    cv = CountVectorizer(stop_words='english', min_df = 10)  
    cv.fit(list(data['Tweet1']))
    newVocab = set(cv.get_feature_names())
    
    print("Len: ", len(newVocab))
    print("data.shape: ", data.shape)
    if len(vocab) == len(newVocab):
        break
    data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in newVocab))
    vocab = newVocab.copy()

Len:  7979
data.shape:  (101133, 5)
Len:  7976
data.shape:  (101104, 5)
Len:  7976
data.shape:  (101104, 5)


In [13]:
# most frequent words after removing short tweets, highly/low frequent words
cv = CountVectorizer() 
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)   
wCount = dict(zip(word_list,count_list))
textCount =  sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount[0:100])

7976 7976
[('muslims', 3843), ('obama', 3238), ('protest', 3180), ('world', 3179), ('country', 2793), ('new', 2515), ('state', 2475), ('support', 2064), ('immigrants', 2051), ('right', 2022), ('americans', 1996), ('banned', 1987), ('american', 1941), ('uk', 1925), ('quebec', 1902), ('terrorist', 1879), ('visit', 1836), ('house', 1812), ('acting', 1793), ('nobannowall', 1782), ('iran', 1729), ('mosque', 1704), ('airport', 1701), ('yates', 1673), ('general', 1653), ('today', 1624), ('say', 1620), ('im', 1618), ('attorney', 1590), ('terrorists', 1589), ('breaking', 1583), ('judge', 1540), ('green', 1535), ('media', 1487), ('orders', 1463), ('stop', 1463), ('stand', 1419), ('saudi', 1386), ('syrian', 1386), ('detained', 1343), ('vetting', 1325), ('citizens', 1321), ('sally', 1315), ('card', 1304), ('killed', 1291), ('jfk', 1270), ('protests', 1270), ('usa', 1270), ('list', 1263), ('petition', 1251), ('business', 1250), ('said', 1242), ('time', 1220), ('attack', 1212), ('ceo', 1204), ('hold

In [None]:
num_topics = 20
if __name__ == "__main__":

    texts = list(data['Tweet1'])

    # vectorize texts
    vec = CountVectorizer()
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())
    print("Vocab: {}".format(len(vocab)))

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=num_topics, V=vocab)
    print("Len(biterms):", len(biterms))

    print("\n\n Train Online BTM ..")
    start = time.time()
    for i in range(0, len(biterms), 2000): # prozess chunk of 200 texts
        
        biterms_chunk = biterms[i:i + 2000]
        btm.fit(biterms_chunk, iterations=50)
        
        if i%2000 ==0:
            print("....Line:{}, {}".format(i, (time.time()-start)))
            start = time.time()
    topics = btm.transform(biterms)

#     print("\n\n Visualize Topics ..")
#     vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
#     pyLDAvis.save_html(vis, 'online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

#     print("\n\n Texts & Topics ..")
#     for i in range(len(texts)):
#         print("{}. {} (topic: {})".format(i, texts[i], topics[i].argmax()))
#     print(topics.max())

Vocab: 7976
Len(biterms): 101104


 Train Online BTM ..


100%|██████████| 50/50 [14:17<00:00, 17.15s/it]


....Line:0, 857.9769999980927


 78%|███████▊  | 39/50 [10:41<02:59, 16.33s/it]

In [15]:
topic_summuary(btm.phi_wz.T, X, vocab, 10)

Topic 0 | Coherence=-194.48 | Top words= quebec judge mosque detained federal court new today stay airport
Topic 1 | Coherence=-187.88 | Top words= obama immigrants list banned muslims illegal americans right maga committed
Topic 2 | Coherence=-193.51 | Top words= country world american law stop religious nobannowall im proud today
Topic 3 | Coherence=-178.03 | Top words= ceo starbucks uber hire google tech 10000 advisory apple council
Topic 4 | Coherence=-173.88 | Top words= green saudi card holders iran arabia iraq 911 syria banned
Topic 5 | Coherence=-183.16 | Top words= world stand country im americans immigrants good american say media
Topic 6 | Coherence=-180.90 | Top words= world support maga american muslims iran time media right obama
Topic 7 | Coherence=-181.13 | Top words= state acting yates visit general attorney sally petition uk defend
Topic 8 | Coherence=-137.39 | Top words= uk theresa minister pm british prime house press conference visit
Topic 9 | Coherence=-196.60 | T

{'coherence': [-194.47972203810775,
  -187.87835510311297,
  -193.50702655434074,
  -178.02936579862066,
  -173.88418514240198,
  -183.1572053868775,
  -180.90024534308355,
  -181.1284999771516,
  -137.3860702297489,
  -196.5957772744971,
  -185.51397900970758,
  -157.3085725728535,
  -162.50106103283855,
  -169.2734176893231,
  -201.2784702591572,
  -189.97232865891166,
  -214.78068470269673,
  -209.2743847980392,
  -165.69335127384565,
  -230.15177437752845],
 'top_words': [array(['quebec', 'judge', 'mosque', 'detained', 'federal', 'court', 'new',
         'today', 'stay', 'airport'], dtype='<U28'),
  array(['obama', 'immigrants', 'list', 'banned', 'muslims', 'illegal',
         'americans', 'right', 'maga', 'committed'], dtype='<U28'),
  array(['country', 'world', 'american', 'law', 'stop', 'religious',
         'nobannowall', 'im', 'proud', 'today'], dtype='<U28'),
  array(['ceo', 'starbucks', 'uber', 'hire', 'google', 'tech', '10000',
         'advisory', 'apple', 'council'], dtyp

In [16]:
 btm.phi_wz.shape

(7976, 20)

In [19]:
data = data.reset_index(drop= True)
data['len'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))
data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))

## 1. Extract instances/tweets that have highest topic-document prob

In [20]:
x = pd.DataFrame(topics)
print(x.shape)
x.shape
for i in range(num_topics):
    print("#{}. {}\t {} {}".format(i, x[i].idxmax(),  x[i].max(), data.iloc[x[i].idxmax()]['Tweet1']))

(101104, 20)
#0. 87155	 0.9998515193421754 filed restraining filed
#1. 26277	 0.9917291554326706 results prescription online online pharmacy online
#2. 3365	 0.9939585193668687 heartbroken malala yousafzai
#3. 41468	 0.9987054701533128 apple tim cook
#4. 43147	 0.9934031065985757 uae saudia arabia
#5. 90325	 0.9886550666162748 elliptical bike cross trainer exercise fitness machine
#6. 5934	 0.9999027156281527 ausopen venus williams serena williams
#7. 71895	 0.9976800923081895 dana boente yates
#8. 8164	 0.9677912070854435 shinzo abe discusses importance japan alliance
#9. 55852	 0.9969359003546336 approval rating unpopular gallup
#10. 1424	 0.9780894955283705 persecuted christians given priority
#11. 25590	 0.9964889451041298 filling arrivals hall
#12. 6976	 0.9357109170151493 shoot theaters theater shooters coincidence woke
#13. 47120	 0.9997776447959211 malevolence tempered incompetence
#14. 56208	 0.9994641209076541 kal penn insulted
#15. 66787	 0.910216629282378 villains ashamed m

In [23]:
# extract only instance with > 5 uniWords
data1 = data[data['#uniWord']>4].copy()
print(data1.index)
x = pd.DataFrame(topics)
x = x.iloc[data1.index]
print(x.index)


Int64Index([     0,      1,      4,      6,      7,     10,     12,     15,
                16,     17,
            ...
            101093, 101094, 101095, 101096, 101097, 101099, 101100, 101101,
            101102, 101103],
           dtype='int64', length=83624)
Int64Index([     0,      1,      4,      6,      7,     10,     12,     15,
                16,     17,
            ...
            101093, 101094, 101095, 101096, 101097, 101099, 101100, 101101,
            101102, 101103],
           dtype='int64', length=83624)


In [24]:
data1 = data1.reset_index(drop=True)
x = x.reset_index(drop=True)

In [25]:
for i in range(num_topics):
    print("#{}. {}\t {} {}".format(i, x[i].idxmax(),  x[i].max(), data1.iloc[x[i].idxmax()]['Tweet1']))

#0. 25579	 0.9248268232756537 motion darweesh judge donnellys tro
#1. 1481	 0.9507302032856352 publishing crimes committed illegals democrats converting
#2. 1973	 0.9726844119312122 nobel peace winner malala heartbroken
#3. 12864	 0.9974272535683989 apple ceo cook apple exist recode
#4. 61653	 0.9820632675108047 sudan somalia libya banned traveling
#5. 74555	 0.9886550666162748 elliptical bike cross trainer exercise fitness machine
#6. 4361	 0.9947723003797263 venus serena williams tennis world
#7. 63226	 0.9941030929454294 fires acting attorney general betrayed
#8. 6708	 0.9677912070854435 shinzo abe discusses importance japan alliance
#9. 83241	 0.9864087866838271 poll job approval rating lags
#10. 40694	 0.9312212606196182 nowplaying listeners web media np
#11. 31536	 0.9876325257478071 thousands gathered bostons copley square protest
#12. 5743	 0.9357109170151493 shoot theaters theater shooters coincidence woke
#13. 24212	 0.938962215869479 memorandum boost military readiness rt
#1

## 2. Extract instances/tweets that maximize P(d|z)
   *   $P(d|z) = \ln{(len(d))}*\prod_{b_i \in d}P(b_i|z)$ # give more weight to long sentences
   
   *   $d = argmax_dP(d|z) = argmax_d \ln{(P(d|z))}$
   *   $d = argmax_d\ln{(\ln{(len(d)))}} + \sum_{b_i}\ln{P(b_i|z)}$
   *   $d = argmax_d\ln{(\ln{(len(d)))}} + \sum_{b_i, w_{i0, i1} \in b_i}\ln{(P(w_{i0}|z)*P(w_{i1}|z)))}$ 

In [27]:
data1 = data[data['#uniWord'] >4].copy()
remained_index = data1.index

In [35]:
data1 =data1.reset_index(drop=True)

In [28]:
remained_index

Int64Index([     0,      1,      4,      6,      7,     10,     12,     15,
                16,     17,
            ...
            101093, 101094, 101095, 101096, 101097, 101099, 101100, 101101,
            101102, 101103],
           dtype='int64', length=83624)

In [29]:
biterms1 = np.array(biterms).copy()

In [30]:
biterms1 = biterms1[remained_index]

In [31]:
num_topics = 20
P_dz = np.zeros([len(biterms1), num_topics])
# iterate documents
for i, d in enumerate(biterms1):
    n_biterms = len(d)
    P_bz = np.zeros([len(d), num_topics])
    for j, b in enumerate(d):
        P_bz[j] = np.log(btm.phi_wz[b[0], :] * btm.phi_wz[b[1], :])
    P_dz[i] = P_bz.sum(axis = 0)

print(P_dz.shape)

(83624, 20)


In [32]:
# extract most representative sentences for each topic
indices = P_dz.argmax(axis = 0)
    

In [36]:
for i, idx in enumerate(indices):
    print("{}. {}\t{}".format(i, idx, data1.iloc[idx]['Tweet']))

0. 39638	several people killed in shooting at quebec city mosque
1. 65407	obama did enforce the list and he deported more immigrants than any president so …
2. 5221	#ireland just became the world's first country to stop investing in #fossilfuels
3. 77954	uber ceo quits trump advisory council
4. 8563	these are us citizens . not green card holders not people on work/student/travel visas . us citizens . …
5. 50980	everything is not always about americans . i love my country but there is a whole big world out there .
6. 25972	world : we want you to win this #ausopen federer : roger that .
7. 54419	acting attorney general sally yates to trump's #muslimban
8. 640	watch : president donald trump and prime minister of uk theresa may insid ... via
9. 80696	new poll : majority of americans disapprove of trump's immigration ban
10. 17391	obama banned all iraqi refugees for 6 months in 2011 .
11. 17760	thousands at #newyork’s jfk airport protest visa and refugee suspensions
12. 27232	does trump kno

In [37]:
data[data['Tweet'].str.contains('')]

Unnamed: 0,Id,Tweet,Tweet1,len,#uniWord
0,824941360449015808,emergency rally against trump's muslim travel ...,emergency rally nyc 125 pm,5,5
1,824941519857610752,theresa may has not apologized to trump for in...,theresa insulting fails today send,5,5
2,824941616314122240,trump's immigration ban excludes countries wit...,excludes business ties,3,3
3,824942056741167105,trump's immigration order expands the definiti...,expands definition criminal,3,3
4,824942966875774976,alert : senator john mccain threatens action o...,alert senator john mccain threatens action,6,6
...,...,...,...,...,...
101099,827675291698880512,well done washington attorney general . federa...,washington attorney general federal judge seat...,9,9
101100,827675363006414848,"yesterday , i rted a clip about syrian refugee...",yesterday clip syrian working german hospitals...,8,8
101101,827675396577521666,judge in seattle blocks trump travel ban plan ...,judge seattle blocks plan resist,5,5
101102,827675522385612800,"trump responds to louvre attack in paris , urg...",responds louvre attack paris urging smart new ...,9,9


In [38]:
with open('biterm.pkl', 'wb') as f:
    pickle.dump(btm, f)