In [1]:
import numpy as np
import pandas as pd
import re, emoji, string
from nltk.corpus import stopwords
import nltk, time
from biterm.cbtm import oBTM
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary
from nltk.stem import WordNetLemmatizer
import pickle

In [2]:
# read data
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head())
print(data.shape)

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())
print(data.head())

                   Id                                              Tweet
0  824941360449015808  emergency rally against trump's muslim travel ...
1  824941519857610752  theresa may has not apologized to trump for in...
2  824941616314122240  trump's immigration ban excludes countries wit...
3  824942056741167105  trump's immigration order expands the definiti...
4  824942966875774976  alert : senator john mccain threatens action o...


In [4]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')

In [5]:

# data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [9]:
stop_words = stopwords.words('english')
stop_words.extend(['like', 'https', 'htt', 'get', 'would', 'im', 'know', 'says',
                   'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
                   '–', 'much', 'many', 'feel', 'go', 'take', 'like', 'hate', 'news', 'rt'])
for item in 'abcdefghijklmnopqrstuvwxyz':
    stop_words.append(item)

In [10]:
# remove stopwords, punctuation

data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stop_words))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

In [7]:
cv = CountVectorizer()  
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)   
wCount = dict(zip(word_list,count_list))
textCount =  sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount[0:100])

48886 48886
[('trump', 36613), ('ban', 28016), ('trumps', 20850), ('muslim', 13374), ('people', 13119), ('order', 12963), ('us', 12287), ('refugees', 12088), ('immigration', 11977), ('muslimban', 10531), ('travel', 7473), ('countries', 7232), ('president', 7037), ('executive', 6266), ('donald', 6231), ('america', 5920), ('via', 5788), ('refugee', 5480), ('white', 5016), ('muslims', 4255), ('obama', 3883), ('world', 3851), ('see', 3819), ('protest', 3677), ('news', 3554), ('says', 3471), ('state', 3436), ('new', 3111), ('country', 3004), ('like', 2913), ('one', 2633), ('visit', 2627), ('immigrants', 2405), ('support', 2394), ('banned', 2354), ('may', 2314), ('house', 2269), ('quebec', 2247), ('americans', 2242), ('acting', 2229), ('uk', 2223), ('https', 2187), ('get', 2178), ('mosque', 2138), ('right', 2136), ('would', 2131), ('american', 2123), ('general', 2117), ('terrorist', 2108), ('iran', 2071), ('judge', 2070), ('breaking', 2056), ('attorney', 1998), ('airport', 1954), ('petition'

In [12]:
data['len'] = data['Tweet1'].apply(lambda x: len(x.split(" ")))
data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))

In [13]:
# remove tweets #unique words less than haft of length
# data['len'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['#uniWord']/data['len']>0.5]
print(data.shape)

(44, 5)


In [10]:
# remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.shape)

(104690, 4)


In [11]:
# remove tweets with lengths < 3
cv = CountVectorizer(stop_words='english', min_df = 10, max_df = 0.035) 
cv_fit = cv.fit(list(data['Tweet1']))
vocab = set(cv.get_feature_names())
data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in vocab))
while True:
    data['len'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
    data = data[data['len'] >2]
    cv = CountVectorizer(stop_words='english', min_df = 10)  
    cv.fit(list(data['Tweet1']))
    newVocab = set(cv.get_feature_names())
    
    print("Len: ", len(newVocab))
    print("data.shape: ", data.shape)
    if len(vocab) == len(newVocab):
        break
    data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in newVocab))
    vocab = newVocab.copy()

Len:  7976
data.shape:  (101104, 4)
Len:  7973
data.shape:  (101076, 4)
Len:  7973
data.shape:  (101076, 4)


In [12]:
cv = CountVectorizer() 
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)   
wCount = dict(zip(word_list,count_list))
textCount =  sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount[0:100])

7973 7973
[('muslims', 3849), ('obama', 3239), ('protest', 3180), ('world', 3180), ('country', 2794), ('new', 2510), ('state', 2479), ('support', 2064), ('immigrants', 2049), ('right', 2025), ('americans', 2000), ('banned', 1990), ('american', 1943), ('uk', 1924), ('quebec', 1903), ('terrorist', 1879), ('visit', 1838), ('house', 1811), ('acting', 1795), ('nobannowall', 1779), ('iran', 1732), ('mosque', 1707), ('airport', 1697), ('yates', 1674), ('general', 1652), ('today', 1628), ('say', 1620), ('attorney', 1589), ('terrorists', 1589), ('breaking', 1585), ('judge', 1542), ('green', 1532), ('media', 1489), ('stop', 1462), ('orders', 1460), ('stand', 1419), ('saudi', 1386), ('syrian', 1385), ('detained', 1340), ('vetting', 1325), ('citizens', 1320), ('sally', 1315), ('card', 1300), ('killed', 1295), ('usa', 1270), ('protests', 1269), ('jfk', 1267), ('list', 1261), ('petition', 1256), ('business', 1249), ('said', 1241), ('time', 1220), ('attack', 1211), ('ceo', 1201), ('holders', 1184), (

In [None]:
num_topics = 20
if __name__ == "__main__":

    texts = list(data['Tweet1'])

    # vectorize texts
    vec = CountVectorizer()
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())
    print("Vocab: {}".format(len(vocab)))

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=num_topics, V=vocab)
    print("Len(biterms):", len(biterms))

    print("\n\n Train Online BTM ..")
    start = time.time()
    for i in range(0, len(biterms), 2000): # prozess chunk of 200 texts
        
        biterms_chunk = biterms[i:i + 2000]
        btm.fit(biterms_chunk, iterations=10)
        
        if i%2000 ==0:
            print("....Line:{}, {}".format(i, (time.time()-start)))
            start = time.time()
    topics = btm.transform(biterms)

#     print("\n\n Visualize Topics ..")
#     vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
#     pyLDAvis.save_html(vis, 'online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

#     print("\n\n Texts & Topics ..")
#     for i in range(len(texts)):
#         print("{}. {} (topic: {})".format(i, texts[i], topics[i].argmax()))
#     print(topics.max())

Vocab: 7973
Len(biterms): 101076


 Train Online BTM ..


100%|██████████| 10/10 [02:40<00:00, 16.08s/it]


....Line:0, 161.35531544685364


100%|██████████| 10/10 [02:36<00:00, 15.64s/it]


....Line:2000, 157.07841205596924


100%|██████████| 10/10 [02:43<00:00, 16.37s/it]


....Line:4000, 164.43683385849


100%|██████████| 10/10 [02:37<00:00, 15.74s/it]


....Line:6000, 158.04039192199707


100%|██████████| 10/10 [02:40<00:00, 16.06s/it]


....Line:8000, 161.1889991760254


100%|██████████| 10/10 [02:35<00:00, 15.60s/it]


....Line:10000, 156.5211992263794


100%|██████████| 10/10 [02:39<00:00, 15.99s/it]


....Line:12000, 160.46943521499634


100%|██████████| 10/10 [02:37<00:00, 15.75s/it]


....Line:14000, 158.08185601234436


  0%|          | 0/10 [00:00<?, ?it/s]

In [30]:
topic_summuary(btm.phi_wz.T, X, vocab, 10)

Topic 0 | Coherence=-176.08 | Top words= quebec yates acting general mosque sally attorney judge house breaking
Topic 1 | Coherence=-190.19 | Top words= muslims world mosque obama country right americans attack said come
Topic 2 | Coherence=-197.96 | Top words= american stand world muslims right obama say business religious freedom
Topic 3 | Coherence=-174.08 | Top words= green card holders obama visa visas muslims security legal dhs
Topic 4 | Coherence=-193.99 | Top words= ceo tech country apple policy support google americans saudi employees
Topic 5 | Coherence=-183.73 | Top words= world free right muslims rights country law media american human
Topic 6 | Coherence=-141.32 | Top words= saudi iran arabia list iraq syria 911 banned yemen libya
Topic 7 | Coherence=-188.38 | Top words= country world americans obama syrian support canada american terrorist say
Topic 8 | Coherence=-204.41 | Top words= world safe chaos support airport rights immigrants ceo muslims war
Topic 9 | Coherence=-1

{'coherence': [-176.08016471423312,
  -190.1937594555519,
  -197.9640699729928,
  -174.07745424971364,
  -193.98989017688382,
  -183.72932518254785,
  -141.32139861827272,
  -188.37986070483106,
  -204.41298364504763,
  -180.98183711881407,
  -152.7192894991019,
  -176.8443527583473,
  -196.57747297389776,
  -201.99922037965052,
  -176.49111915891885,
  -114.95383430765486,
  -233.69011665121653,
  -169.28420384175527,
  -220.96502459575262,
  -177.02743426369048],
 'top_words': [array(['quebec', 'yates', 'acting', 'general', 'mosque', 'sally',
         'attorney', 'judge', 'house', 'breaking'], dtype='<U28'),
  array(['muslims', 'world', 'mosque', 'obama', 'country', 'right',
         'americans', 'attack', 'said', 'come'], dtype='<U28'),
  array(['american', 'stand', 'world', 'muslims', 'right', 'obama', 'say',
         'business', 'religious', 'freedom'], dtype='<U28'),
  array(['green', 'card', 'holders', 'obama', 'visa', 'visas', 'muslims',
         'security', 'legal', 'dhs'], dt

In [24]:
 btm.phi_wz.shape

(7973, 20)

## 1. Extract instances/tweets that have highest topic-document prob

In [18]:
x = pd.DataFrame(topics)
print(x.shape)
x.shape

for i in range(num_topics):
    print("#{}. {}\t {} {}".format(i, x[i].idxmax(),  x[i].max(), data.iloc[x[i].idxmax()]['Tweet1']))

(101076, 20)
#0. 18843	 0.9979276631701551 101 thx jnj
#1. 18122	 0.9999057484680026 bitfinex btce gdax bitstamp bitcoin
#2. 54580	 0.9842965979863711 explained geneva conventions
#3. 9569	 0.9992765005704639 earned pin pins
#4. 18831	 0.9927730948432226 apple exist tim cook
#5. 2990	 0.9925449103022643 tired poor huddled masses yearning breathe free
#6. 48818	 0.967278832793375 19 saudi arabia saudi arabia
#7. 40708	 0.8959245126108748 kal penn raises counting
#8. 7505	 0.9458090158492217 causes chaos panic anger worldwide
#9. 25999	 0.979730228046799 status saudis status
#10. 10747	 0.9789923712458503 gather terminal4 arrivals jfk nomuslimbanjfk
#11. 36510	 0.99155574191433 fucked niteflirt 1800toflirt ext
#12. 100675	 0.957457306404343 lowest approval rating 44
#13. 10153	 0.9954020869385828 asghar farhadi attending
#14. 39875	 0.9231368907491159 races religions ethnicities
#15. 38582	 0.9878202952438125 queen knight alien
#16. 57098	 0.9791957838194373 starbucks hire 10000 supports

## 2. Extract instances/tweets that maximize P(d|z)
   *   $P(d|z) = \ln{(len(d))}*\prod_{b_i \in d}P(b_i|z)$ # give more weight to long sentences
   
   *   $d = argmax_dP(d|z) = argmax_d \ln{(P(d|z))}$
   *   $d = argmax_d\ln{(\ln{(len(d)))}} + \sum_{b_i}\ln{P(b_i|z)}$
   *   $d = argmax_d\ln{(\ln{(len(d)))}} + \sum_{b_i, w_{i0, i1} \in b_i}\ln{(P(w_{i0}|z)*P(w_{i1}|z)))}$ 

In [62]:
data = data.reset_index(drop= True)

In [105]:
data1 = data.copy()
remained_index = data1.index

In [106]:
data1['uniW'] = data1['Tweet1'].apply(lambda x: len(set(x.split(" "))))

In [107]:
data1[data1['uniW']>4].head()

Unnamed: 0,Id,Tweet,Tweet1,len,uniW
0,824941360449015808,emergency rally against trump's muslim travel ...,emergency rally nyc 125 pm,5,5
1,824941519857610752,theresa may has not apologized to trump for in...,theresa insulting fails today send,5,5
4,824942966875774976,alert : senator john mccain threatens action o...,alert senator john mccain threatens action,6,6
6,824944363587395584,ty for bailing on gmb & today . piers morgan d...,ty today piers morgan aid vocal,6,6
7,824944376182927360,#trump to sign eo temporary ban suspending vi...,sign eo temporary suspending visas syria afric...,8,8


In [108]:
data1 = data1[data1['uniW'] >4]
remained_index = data1.index

In [109]:
remained_index

Int64Index([     0,      1,      4,      6,      7,     10,     12,     15,
                16,     17,
            ...
            101065, 101066, 101067, 101068, 101069, 101071, 101072, 101073,
            101074, 101075],
           dtype='int64', length=83513)

In [110]:
biterms1 = np.array(biterms).copy()

In [111]:
biterms1 = biterms1[remained_index]

In [112]:
num_topics = 20
P_dz = np.zeros([len(biterms1), num_topics])
# iterate documents
for i, d in enumerate(biterms1):
    n_biterms = len(d)
    P_bz = np.zeros([len(d), num_topics])
    for j, b in enumerate(d):
        P_bz[j] = np.log(btm.phi_wz[b[0], :] * btm.phi_wz[b[1], :])
    P_dz[i] = P_bz.sum(axis = 0)

print(P_dz.shape)

(83513, 20)


In [113]:
# extract most representative sentences for each topic
indices = P_dz.argmax(axis = 0)
    

In [116]:
for i, idx in enumerate(indices):
    print("{}. {}\t{}".format(i, idx, data1.iloc[idx]['Tweet']))

0. 54342	acting attorney general sally yates to trump's #muslimban 
1. 72995	#canadian_terrorist what you think about the person who has been killing six muslims in a mosque in canada #occidental_terrorist
2. 74138	i stand against the persecution of muslims . this is not only against our values as muslims , but also against the american values . #muslimban
3. 15657	trump’s immigration order strands green card holders who were outside country via mia …
4. 23354	breaking tim cook : " apple does not support trumps ban on refugees ! "
5. 16864	true leader of the free world . #welcometocanada 
6. 79841	 yet trump hasn't banned saudi arabia , who did 9/11 as he does business with them .
7. 79561	not one person from 7 countries banned ever came here and killed an american . but from the countries where psychotr … 
8. 29249	' refugees are welcome here ' : protesters rally at jfk airport 
9. 1497	 : trump says new vetting will keep ‘ radical islamic terrorists ’ out of u.s. 
10. 15508	protest a