In [1]:
import numpy as np
import pandas as pd
import re, emoji, string
from nltk.corpus import stopwords
import nltk, time
from biterm.cbtm import oBTM
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the pre-processed travel-ban tweets, then preview the first rows and the frame's shape.
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere.
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head(), data.shape, sep='\n')

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# Normalise the raw tweet text in a single pass per tweet.
def _normalise_tweet(text):
    """Return `text` with placeholders, leading retweet markers and emoji removed.

    Steps, in order: drop the '@MENTION', '@URL' and '@EMAIL' placeholders,
    lowercase, collapse runs of spaces, strip leading 'rt' markers and one
    leading ':', then drop every character found in `emoji.UNICODE_EMOJI`.
    """
    for placeholder in ('@MENTION', "@URL", "@EMAIL"):
        text = text.replace(placeholder, "")
    text = text.lower()
    text = re.sub("  +", " ", text)
    text = re.sub("^ ?(rt ?)+", "", text)
    text = re.sub('^( ?: ?)', '', text)
    # NOTE(review): the layout of emoji.UNICODE_EMOJI differs between releases of
    # the emoji package; confirm the installed one is keyed by emoji characters.
    return ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)


data['Tweet'] = data['Tweet'].apply(_normalise_tweet)
print(data.head())

                   Id                                              Tweet
0  824941360449015808  emergency rally against trump's muslim travel ...
1  824941519857610752  theresa may has not apologized to trump for in...
2  824941616314122240  trump's immigration ban excludes countries wit...
3  824942056741167105  trump's immigration order expands the definiti...
4  824942966875774976  alert : senator john mccain threatens action o...


In [4]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')

In [5]:

# data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [5]:
# remove stopwords, punctuation
# `stopWords` stays a list as before; the frozenset copy is what the per-token
# membership test uses, so each lookup is a hash probe instead of a list scan.
stopWords = stopwords.words('english')
_stopWordSet = frozenset(stopWords)
# Built once instead of once per tweet.
_punctTable = str.maketrans('', '', string.punctuation)
_fancyPunct = re.compile('“|…|’|‘|”|—')


def _strip_stopwords_and_punct(text):
    """Return `text` without English stop words and punctuation.

    Tokens are obtained by splitting on single spaces. Stop words are matched
    before punctuation is stripped, so a stop word glued to punctuation
    (e.g. "the,") is not removed by this step. Afterwards ASCII punctuation
    and curly quotes / ellipsis / em dash are deleted, runs of spaces are
    collapsed and the result is trimmed.
    """
    kept = ' '.join(tok for tok in text.split(" ") if tok not in _stopWordSet)
    kept = kept.translate(_punctTable)
    kept = _fancyPunct.sub("", kept)
    return re.sub(' +', ' ', kept).strip()


# The cleaned text goes into a new column; 'Tweet' keeps the normalised original.
data['Tweet1'] = data['Tweet'].apply(_strip_stopwords_and_punct)

In [6]:
# Count how often each token occurs across all cleaned tweets, most frequent first.
cv = CountVectorizer()
cv_fit = cv.fit_transform(list(data['Tweet1']))
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version before upgrading.
word_list = cv.get_feature_names()
# Sum the sparse document-term matrix directly. Calling .toarray() first would
# allocate a dense (n_tweets x n_tokens) array only to add up its columns.
count_list = np.asarray(cv_fit.sum(axis=0)).ravel()
wCount = dict(zip(word_list, count_list))
# Sort by descending count; ties keep the order of word_list.
textCount = sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount)

48886 48886


In [7]:
# remove common topic words
# Disabled alternative list, kept for reference:
# topicCommonWords = ['trump', 'trumps', 'ban', 'muslim', 'people', 'refugees', 'us', 'immigration',
#                     'muslimban', 'travel', 'countries', 'donald', 'via', 'muslims', 'world', 'news', 
#                     'order', 'white', 'president', 'america', 'americans',
#                     'like', 'https', 'htt', 'get', 'would', 'im', 'know', 'say',
#                    'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
#                    '–', 'much', 'many', 'feel', 'go', 'take', 'like', 'hate']
# Active list (the duplicated 'like' entry has been removed).
topicCommonWords = ['like', 'https', 'htt', 'get', 'would', 'im', 'know', 'says',
                   'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
                   '–', 'much', 'many', 'feel', 'go', 'take', 'hate', 'news']
# Hash-based lookup for the per-token test below instead of scanning the list.
_topicCommonWordSet = frozenset(topicCommonWords)
# Keep a token only if it is not a common word and is pure ASCII. The ASCII
# test alone already drops '–' and any other non-ASCII token.
data['Tweet1'] = data['Tweet1'].apply(
    lambda x: ' '.join(y for y in x.split(" ") if y not in _topicCommonWordSet and y.isascii()))

In [8]:
# remove tweets whose number of unique words is at most half of their length
def _length_unless_repetitive(text):
    """Return the token count of `text`, or 0 when at most half of its tokens are distinct.

    Tokens are obtained by splitting on single spaces, so an empty string
    counts as one (empty) token and yields 1.
    """
    tokens = text.split(" ")
    if len(set(tokens)) / len(tokens) <= 0.5:
        return 0
    return len(tokens)


data['uniWords'] = data['Tweet1'].apply(_length_unless_repetitive)
# .copy() makes the filtered frame independent of the original, so later
# in-place edits of `data` do not raise pandas' SettingWithCopyWarning.
data = data[data['uniWords'] != 0].copy()
print(data.shape)

(123343, 4)


In [9]:
# remove duplicates
# Keep the first tweet for every distinct cleaned text. The result is rebound
# rather than using inplace=True: mutating a frame that was itself produced by
# filtering is what makes pandas emit SettingWithCopyWarning here.
data = data.drop_duplicates(subset=['Tweet1'], keep='first')
print(data.shape)

(104690, 4)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
# Prune the vocabulary and remove tweets with lengths < 3, repeating until the
# vocabulary size no longer changes.
# Work on an independent frame so the column assignments below do not raise
# SettingWithCopyWarning when `data` arrives as a filtered slice.
data = data.copy()
# First pass: English stop words removed, min_df=10 and max_df=0.035 applied.
cv = CountVectorizer(stop_words='english', min_df = 10, max_df = 0.035)
cv_fit = cv.fit(list(data['Tweet1']))
vocab = set(cv.get_feature_names())
data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in vocab))
while True:
    # In this cell 'uniWords' is the number of space-separated tokens left in the tweet.
    data['uniWords'] = data['Tweet1'].apply(lambda x: len(x.split(" ")))
    # .copy() keeps the filtered frame independent for the assignments in the next pass.
    data = data[data['uniWords'] > 2].copy()
    # Dropping tweets can push words below min_df, so the vocabulary is refit
    # (this time without max_df).
    cv = CountVectorizer(stop_words='english', min_df = 10)
    cv.fit(list(data['Tweet1']))
    newVocab = set(cv.get_feature_names())

    print("Len: ", len(newVocab))
    print("data.shape: ", data.shape)
    # Stop once a pass leaves the vocabulary size unchanged.
    if len(vocab) == len(newVocab):
        break
    data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in newVocab))
    vocab = newVocab.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Len:  7977
data.shape:  (101157, 4)
Len:  7974
data.shape:  (101128, 4)
Len:  7974
data.shape:  (101128, 4)


In [14]:
# Recount token frequencies on the current contents of data['Tweet1'], most frequent first.
cv = CountVectorizer()
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
# Sum the sparse document-term matrix directly. Calling .toarray() first would
# allocate a dense (n_tweets x n_tokens) array only to add up its columns.
count_list = np.asarray(cv_fit.sum(axis=0)).ravel()
wCount = dict(zip(word_list, count_list))
# Sort by descending count; ties keep the order of word_list.
textCount = sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount)

7974 7974


In [None]:
# Train an online Biterm Topic Model on the cleaned tweets and report the topics.
num_topics = 20
CHUNK_SIZE = 2000     # number of texts (one biterm list each) passed to every fit() call
N_ITERATIONS = 50     # `iterations` argument of btm.fit for every chunk
if __name__ == "__main__":

    texts = list(data['Tweet1'])

    # vectorize texts
    vec = CountVectorizer()
    # NOTE(review): the matrix is densified here; presumably vec_to_biterms and
    # topic_summuary need a dense array -- confirm before switching to sparse.
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())
    print("Vocab: {}".format(len(vocab)))

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=num_topics, V=vocab)
    print("Len(biterms):", len(biterms))

    print("\n\n Train Online BTM ..")
    start = time.time()
    for i in range(0, len(biterms), CHUNK_SIZE):  # process one chunk of CHUNK_SIZE texts

        biterms_chunk = biterms[i:i + CHUNK_SIZE]
        btm.fit(biterms_chunk, iterations=N_ITERATIONS)

        # `i` is always a multiple of CHUNK_SIZE, so progress is reported for
        # every chunk; the timer then restarts to time the next chunk alone.
        print("....Line:{}, {}".format(i, (time.time()-start)))
        start = time.time()
    # Document-topic output for all texts; each row's argmax is used below as its topic.
    topics = btm.transform(biterms)

#     print("\n\n Visualize Topics ..")
#     vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
#     pyLDAvis.save_html(vis, 'online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

    print("\n\n Texts & Topics ..")
    # Prints one line per text, so the output is as long as `texts`.
    for i in range(len(texts)):
        print("{}. {} (topic: {})".format(i, texts[i], topics[i].argmax()))
    print(topics.max())

Vocab: 7974
Len(biterms): 101128


 Train Online BTM ..


  6%|▌         | 3/50 [00:41<10:52, 13.88s/it]

In [23]:
# Re-print the topic summary and the per-text topic assignment from the
# already trained model (uses btm, X, vocab, texts and topics from the kernel).
print("\n\n Topic coherence ..")
topic_summuary(btm.phi_wz.T, X, vocab, 10)

print("\n\n Texts & Topics ..")
for i, text in enumerate(texts):
    # the topic of a text is the argmax of its row in `topics`
    print("{}. {} (topic: {})".format(i, text, topics[i].argmax()))
print(topics.max())



 Topic coherence ..
Topic 0 | Coherence=-186.65 | Top words= yates quebec acting sally attorney mosque general judge state breaking
Topic 1 | Coherence=-209.17 | Top words= obama protest entry days family immigrants visa plan denied syrian
Topic 2 | Coherence=-172.18 | Top words= country muslims christians religious world christian religion states banned obama
Topic 3 | Coherence=-164.02 | Top words= green card holders iran iraq syria visa visas legal yemen
Topic 4 | Coherence=-153.48 | Top words= protest airport jfk nobannowall protests protesters thousands today terminal outside
Topic 5 | Coherence=-193.67 | Top words= iraqi obama right detained airport jfk american iraq thousands lives
Topic 6 | Coherence=-190.20 | Top words= visit state petition uk wind rain humidity temp million mm
Topic 7 | Coherence=-192.81 | Top words= obama banned immigrants terrorist 2011 right law illegal obamas support
Topic 8 | Coherence=-165.92 | Top words= vetting new terrorists extreme islamic signs m

In [24]:
# Wrap the output of btm.transform in a DataFrame with a default 0..n-1 row
# index and one integer-labelled column per topic, then show its shape.
x = pd.DataFrame(topics)
x.shape

(101128, 20)

In [25]:
# Inspect the first 50 rows: original text ('Tweet'), cleaned text ('Tweet1') and token count ('uniWords').
data.head(n=50)

Unnamed: 0,Id,Tweet,Tweet1,uniWords
0,824941360449015808,emergency rally against trump's muslim travel ...,emergency rally nyc 125 pm,5
1,824941519857610752,theresa may has not apologized to trump for in...,theresa insulting fails today send,5
2,824941616314122240,trump's immigration ban excludes countries wit...,excludes business ties,3
3,824942056741167105,trump's immigration order expands the definiti...,expands definition criminal,3
4,824942966875774976,alert : senator john mccain threatens action o...,alert senator john mccain threatens action,6
5,824943226931052545,kiva still distracted while trump gets on wit...,gets peoples business,3
6,824944363587395584,ty for bailing on gmb & today . piers morgan d...,ty today piers morgan aid vocal,6
7,824944376182927360,#trump to sign eo temporary ban suspending vi...,sign eo temporary suspending visas syria afric...,8
8,824944577480126464,did we have a moral obligation to stop hitler ...,moral obligation stop hitler moral obligation ...,7
9,824945223402938374,are these people just now getting radicalized ...,getting radicalized freedom,3


In [81]:
# Renumber the rows 0..n-1 (discarding the old labels) so that row labels of
# `data` coincide with row positions; later cells look tweets up by position.
data = data.reset_index(drop =True)

In [85]:
# Per-tweet summary: cleaned text, the column label of the highest-scoring topic
# and that score. The new columns are aligned on data's row labels.
# NOTE(review): assumes data has a 0..n-1 index matching the rows of x.
z = data[['Tweet1']].copy().assign(
    IdxMax=x.idxmax(axis=1),
    Max=x.max(axis=1),
)

In [86]:
# Number of tweets whose highest-scoring topic is each topic (non-null counts per column).
z.groupby(by = ['IdxMax']).count()

Unnamed: 0_level_0,Tweet1,Max
IdxMax,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,25313,25313
1.0,1666,1666
2.0,7685,7685
3.0,7191,7191
4.0,8712,8712
5.0,3019,3019
6.0,2974,2974
7.0,3551,3551
8.0,2167,2167
9.0,3594,3594


In [93]:
# num_topics = 20, max_df = 1.00, min_df = 10
# For every topic, print the row with the highest score for it, that score,
# and the cleaned text of the corresponding tweet.
for i in range(num_topics):
    top_row = x[i].idxmax()
    # top_row is a row label of x, used here as a position into data
    print("#{}. {}\t {} {}".format(i, top_row, x[i].max(), data.iloc[top_row]['Tweet1']))

#0. 37676	 0.9957509573008058 phobia stairs slopes
#1. 181	 0.948694578589058 days false claims inaccurate statements exaggerations wapo
#2. 1792	 0.9810873575636864 prioritize resettling christians
#3. 48374	 0.9892335752298087 green card holders exempted
#4. 21246	 0.992934508566851 protest msp airport
#5. 70331	 0.9032726213908937 served interpreters military risked lives soldiers
#6. 91076	 0.9999915992949862 weather mph wind
#7. 81229	 0.881807783303756 hurt bid 2024 olympics
#8. 3373	 0.9955400133031459 heartbroken malala yousafzai
#9. 3609	 0.9704561272337524 average hindu average
#10. 3771	 0.8705160050322539 common sense limiting
#11. 64176	 0.9822973401444547 tagged otps tag
#12. 33484	 0.9929721423400943 excludes linked businesses coincidence
#13. 92664	 0.9898193146700082 starbucks hiring 10000
#14. 8063	 0.9997020061838784 asghar farhadi attending oscars
#15. 2994	 0.9918070633215995 tired poor huddled masses yearning breathe free
#16. 18134	 0.999960425077053 bitfinex btc

In [113]:
# Keep only tweets with more than 5 remaining tokens and select the matching
# rows of the topic matrix, then renumber both from 0.
# NOTE(review): data1.index is used as positions into x, which assumes data
# carries a 0..n-1 index aligned with x.
is_long = data['uniWords'] > 5
data1 = data[is_long].copy()
x1 = x.iloc[data1.index, :].copy().reset_index(drop=True)
data1 = data1.reset_index(drop=True)

In [115]:
# num_topics = 20, max_df = 1.00, min_df = 10
# For every topic, print the row of x1 with the highest score for it, that
# score, and the original (uncleaned) text of the matching row in data1.
for i in range(num_topics):
    top_row = x1[i].idxmax()
    print("#{}. {}\t {} {}".format(i, top_row, x1[i].max(), data1.iloc[top_row]['Tweet']))

#0. 508	 0.9788986777738731 #news president trump holds a joint press conference with prime minister … 
#1. 122	 0.948694578589058 president trump’s first 7 days of false claims , inaccurate statements & exaggerations - wapo 
#2. 1297	 0.9308245320235031 trump says persecuted christians will be given priority over other refugees seeking ... by #cnnbrk via 
#3. 62395	 0.9778161520368038 elliptical bike 2 in 1 cross trainer exercise fitness machine gym workout 
#4. 8355	 0.9810572711040205 #muslimban protest at sfo 3pm , international terminal #notinourname 
#5. 48546	 0.9032726213908937  many of these people served as interpreters for the us military & risked there lives as much as us soldiers .
#6. 3755	 0.9998476539557648 11:41 cet : temperature : - , wind : s , 2 kph ( ave ) , 2 kph ( gust ) , humidity : , rain ( hourly ) 0.0 mm , pressure : 1012 hpa , falling slowly
#7. 46590	 0.8697040061736917 trump's travel ban could impact ioc vote on 2024 summer olympics host , but other factor

In [98]:
# Spot check: highest topic score of row 181 of x.
x.iloc[181].max()

0.948694578589058

In [107]:
# Small toy frame with row labels 1..3, used below to check what idxmin() returns.
df = pd.DataFrame(
    data={
        'consumption': [10.51, 103.11, 55.48],
        'co2_emissions': [37.2, 19.66, 1712],
    },
    index=[1, 2, 3],
)

In [108]:
# Display the toy frame.
df

Unnamed: 0,consumption,co2_emissions
1,10.51,37.2
2,103.11,19.66
3,55.48,1712.0


In [110]:
# idxmin() returns the row label (not the position) of the smallest value.
df['co2_emissions'].idxmin()

2

In [116]:
import pickle

# Persist the trained model object to 'biterm.pkl' in the working directory.
# NOTE(review): only unpickle this file from a trusted source; loading a
# pickle can execute arbitrary code.
with open("biterm.pkl", mode="wb") as model_file:
    pickle.dump(btm, model_file)

In [120]:
# Number of biterm lists; vec_to_biterms was given one row of X per text.
len(biterms)

101128