In [6]:
import os
import sys

nlp_path = os.path.abspath('../')
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
from utils import tokenizeRawTweetText

In [2]:
import numpy as np
import pandas as pd
import re, emoji, string
from nltk.corpus import stopwords
import nltk, time
from biterm.cbtm import oBTM
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary
from nltk.stem import WordNetLemmatizer
import pickle

In [3]:
# read data
data = pd.read_csv('/home/ehoang/git/python/tweet_classification/data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv')
print(data.head())
print(data.shape)

               tweet id                                              tweet  \
0  '262596552399396864'  I've got enough candles to supply a Mexican fa...   
1  '263044104500420609'  Sandy be soooo mad that she be shattering our ...   
2  '263309629973491712'  @ibexgirl thankfully Hurricane Waugh played it...   
3  '263422851133079552'  @taos you never got that magnificent case of B...   
4  '262404311223504896'  I'm at Mad River Bar &amp; Grille (New York, N...   

       label  
0  off-topic  
1   on-topic  
2  off-topic  
3  off-topic  
4  off-topic  
(10008, 3)


In [4]:
data.columns = ['TweetId', 'Tweet', 'label']
data = data[data['label'] == 'on-topic']
data = data.reset_index(drop=True)

In [7]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: ' '.join(tokenizeRawTweetText(x)))
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('TWEETMENTION', "").replace("HTTPURL", "").
                                    replace("EMAILADDRESS", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())
print(data.head())

                TweetId                                              Tweet  \
0  '263044104500420609'  sandy be soooo mad that she be shattering our ...   
1  '263101347421888513'  neighborly duties . arrives to the rescue spor...   
2  '263298821189156865'  i don't know how i'm getting back to jersey si...   
3  '262914476989358080'           already flooded so much #sandy @ hoboken   
4  '262991999911743490'  on that note , i pray that everyone stays safe...   

      label  
0  on-topic  
1  on-topic  
2  on-topic  
3  on-topic  
4  on-topic  


In [8]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')

# data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [9]:
stop_words = stopwords.words('english')
stop_words.extend(['like', 'https', 'htt', 'get', 'would', 'im', 'know', 'says',
                   'want', 'see', 'make', 'need', 'think', 'going', 'please', 'let', 'w',
                   '–', 'much', 'many', 'feel', 'go', 'take', 'like', 'hate', 'news', 'rt'])
for item in 'abcdefghijklmnopqrstuvwxyz':
    stop_words.append(item)

In [10]:
# remove stopwords, punctuation

data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stop_words))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

In [11]:
data['Tweet1'].head()

0    sandy soooo mad shattering doors shiet hurrica...
1    neighborly duties arrives rescue sporting spel...
2    im getting back jersey since trains subways ru...
3                        already flooded sandy hoboken
4    note pray everyone stays safe keeps positive a...
Name: Tweet1, dtype: object

In [12]:
# view most frequent words
cv = CountVectorizer()  
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)   
wCount = dict(zip(word_list,count_list))
textCount =  sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount[0:100])

8241 8241
[('hurricane', 5119), ('sandy', 3194), ('im', 402), ('new', 299), ('everyone', 287), ('power', 272), ('safe', 269), ('people', 266), ('frankenstorm', 247), ('storm', 229), ('coast', 224), ('east', 217), ('even', 203), ('hope', 201), ('us', 189), ('york', 184), ('school', 177), ('shit', 173), ('nyc', 171), ('name', 170), ('stay', 166), ('cant', 143), ('hit', 142), ('really', 142), ('aint', 139), ('lol', 135), ('time', 127), ('rain', 125), ('still', 124), ('tomorrow', 116), ('affected', 114), ('got', 113), ('bad', 112), ('coming', 110), ('fuck', 107), ('due', 105), ('god', 103), ('gonna', 102), ('way', 102), ('house', 100), ('water', 100), ('bitch', 99), ('good', 99), ('via', 99), ('day', 97), ('jersey', 96), ('one', 95), ('irene', 93), ('praying', 92), ('weather', 92), ('home', 91), ('hurricanesandy', 91), ('prayers', 91), ('right', 88), ('help', 86), ('come', 85), ('getting', 84), ('path', 84), ('obama', 83), ('back', 82), ('call', 82), ('oh', 81), ('romney', 81), ('watch', 8

In [13]:
data['len'] = data['Tweet1'].apply(lambda x: len(x.split(" ")))
data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))

In [14]:
# remove tweets #unique words less than haft of length
# data['len'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['#uniWord']/data['len']>0.5]
print(data.shape)

(6137, 6)


In [15]:
# remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.shape)

(5383, 6)


In [16]:
# remove tweets with lengths < 3
cv = CountVectorizer(stop_words='english', min_df = 10, max_df = 0.035) 
cv_fit = cv.fit(list(data['Tweet1']))
vocab = set(cv.get_feature_names())
data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in vocab))
while True:
    data['len'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
    data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))
    data = data[data['len'] >2]
    cv = CountVectorizer(stop_words='english', min_df = 10)  
    cv.fit(list(data['Tweet1']))
    newVocab = set(cv.get_feature_names())
    
    print("Len: ", len(newVocab))
    print("data.shape: ", data.shape)
    if len(vocab) == len(newVocab):
        break
    data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(y for y in x.split(" ") if y in newVocab))
    vocab = newVocab.copy()

Len:  533
data.shape:  (3257, 6)
Len:  513
data.shape:  (3028, 6)
Len:  507
data.shape:  (2977, 6)
Len:  507
data.shape:  (2960, 6)


In [17]:
# most frequent words after removing short tweets, highly/low frequent words
cv = CountVectorizer() 
cv_fit = cv.fit_transform(list(data['Tweet1']))
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)   
wCount = dict(zip(word_list,count_list))
textCount =  sorted(wCount.items(), key=lambda k: -k[1])
print(len(word_list), len(textCount))
print(textCount[0:100])

507 507
[('coast', 157), ('school', 144), ('york', 139), ('hope', 135), ('nyc', 132), ('stay', 122), ('really', 107), ('shit', 106), ('lol', 103), ('tomorrow', 102), ('hit', 98), ('bad', 94), ('good', 85), ('time', 84), ('affected', 83), ('coming', 82), ('gonna', 82), ('got', 79), ('fuck', 75), ('jersey', 75), ('day', 74), ('water', 74), ('house', 72), ('home', 70), ('right', 70), ('way', 69), ('prayers', 68), ('rain', 68), ('getting', 66), ('city', 65), ('hurricanesandy', 65), ('today', 65), ('come', 64), ('live', 63), ('weather', 62), ('praying', 61), ('god', 60), ('ny', 60), ('theres', 60), ('romney', 57), ('love', 56), ('twitter', 54), ('watch', 54), ('bitch', 52), ('obama', 51), ('path', 49), ('wind', 49), ('work', 48), ('relief', 47), ('away', 46), ('damn', 46), ('friends', 46), ('monday', 46), ('sandys', 46), ('thoughts', 46), ('ass', 45), ('thing', 45), ('yall', 45), ('night', 44), ('real', 44), ('thanks', 44), ('closed', 43), ('damage', 42), ('thats', 42), ('winds', 42), ('day

In [18]:
num_topics = 20
if __name__ == "__main__":

    texts = list(data['Tweet1'])

    # vectorize texts
    vec = CountVectorizer()
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())
    print("Vocab: {}".format(len(vocab)))

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=num_topics, V=vocab)
    print("Len(biterms):", len(biterms))

    print("\n\n Train Online BTM ..")
    start = time.time()
    for i in range(0, len(biterms), 2000): # prozess chunk of 200 texts
        
        biterms_chunk = biterms[i:i + 2000]
        btm.fit(biterms_chunk, iterations=50)
        
        if i%2000 ==0:
            print("....Line:{}, {}".format(i, (time.time()-start)))
            start = time.time()
    topics = btm.transform(biterms)

#     print("\n\n Visualize Topics ..")
#     vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1), vocab, np.sum(X, axis=0))
#     pyLDAvis.save_html(vis, 'online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)

#     print("\n\n Texts & Topics ..")
#     for i in range(len(texts)):
#         print("{}. {} (topic: {})".format(i, texts[i], topics[i].argmax()))
#     print(topics.max())

  0%|          | 0/50 [00:00<?, ?it/s]

Vocab: 507
Len(biterms): 2960


 Train Online BTM ..


100%|██████████| 50/50 [00:37<00:00,  1.33it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

....Line:0, 37.4647696018219


100%|██████████| 50/50 [00:17<00:00,  2.86it/s]


....Line:2000, 17.65606117248535


 Topic coherence ..
Topic 0 | Coherence=-150.57 | Top words= shit school hope bad nyc really gonna coming good lol
Topic 2 | Coherence=-130.64 | Top words= got food water streets survival major driving trees black powers
Topic 3 | Coherence=-111.83 | Top words= katrina david stern said irene bout fuck aint lol niggas
Topic 4 | Coherence=-110.19 | Top words= romney relief obama campaign fema google mitt president bus disaster
Topic 6 | Coherence=-126.81 | Top words= jersey monday shore coast school tuesday blow closed snooki tomorrow
Topic 7 | Coherence=-140.79 | Top words= away hurricanes fuck things come little romney blown mitt house
Topic 8 | Coherence=-131.91 | Top words= york city nyc subway times service video approaches shut view
Topic 9 | Coherence=-140.29 | Top words= today relief time media million buy social local emergency president
Topic 10 | Coherence=-95.21 | Top words= tomb unknown soldier guard national winds center mph amazing contin

In [19]:
topic_summuary(btm.phi_wz.T, X, vocab, 10)

Topic 0 | Coherence=-150.57 | Top words= shit school hope bad nyc really gonna coming good lol
Topic 2 | Coherence=-130.64 | Top words= got food water streets survival major driving trees black powers
Topic 3 | Coherence=-111.83 | Top words= katrina david stern said irene bout fuck aint lol niggas
Topic 4 | Coherence=-110.19 | Top words= romney relief obama campaign fema google mitt president bus disaster
Topic 6 | Coherence=-126.81 | Top words= jersey monday shore coast school tuesday blow closed snooki tomorrow
Topic 7 | Coherence=-140.79 | Top words= away hurricanes fuck things come little romney blown mitt house
Topic 8 | Coherence=-131.91 | Top words= york city nyc subway times service video approaches shut view
Topic 9 | Coherence=-140.29 | Top words= today relief time media million buy social local emergency president
Topic 10 | Coherence=-95.21 | Top words= tomb unknown soldier guard national winds center mph amazing continues
Topic 11 | Coherence=-140.60 | Top words= school to

{'coherence': [-150.5675121148826,
  -141.99736434095607,
  -130.63707085423516,
  -111.82654720697106,
  -110.18830082802589,
  -142.83451239286268,
  -126.81212794987185,
  -140.78897075064302,
  -131.91192497611104,
  -140.29405872670708,
  -95.21145973414153,
  -140.59996714430235,
  -130.7235404361064,
  -139.1203029459656,
  -141.371086568604,
  -72.20884531450177,
  -130.28437099577383,
  -109.83143262724565,
  -147.21603447326498,
  -128.57804468210708],
 'top_words': [array(['shit', 'school', 'hope', 'bad', 'nyc', 'really', 'gonna',
         'coming', 'good', 'lol'], dtype='<U14'),
         'school', 'jersey'], dtype='<U14'),
  array(['got', 'food', 'water', 'streets', 'survival', 'major', 'driving',
         'trees', 'black', 'powers'], dtype='<U14'),
  array(['katrina', 'david', 'stern', 'said', 'irene', 'bout', 'fuck',
         'aint', 'lol', 'niggas'], dtype='<U14'),
  array(['romney', 'relief', 'obama', 'campaign', 'fema', 'google', 'mitt',
         'president', 'bus', 'd

In [20]:
 btm.phi_wz.shape

(507, 20)

In [21]:
data = data.reset_index(drop= True)
data['len'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))
data['#uniWord'] = data['Tweet1'].apply(lambda x: len(set(x.split(" "))))

## 1. Extract instances/tweets that have highest topic-document prob

In [30]:
x = pd.DataFrame(topics)
print(x.shape)
x.shape
for i in range(num_topics):
    print("#{}. {}\t {} {}".format(i, x[i].idxmax(),  x[i].max(), data.iloc[x[i].idxmax()]['Tweet1']))

(2960, 20)
#0. 1745	 0.9994337396495933 live watch watch
#2. 1936	 0.8953393075771464 survival water candles black
#3. 1668	 0.999079775815964 katrina david stern
#4. 2638	 0.9865459762056363 campaign bus relief
#5. 86	 0.9732178360935507 apocalypse york ny
#6. 846	 0.9579061675484792 snooki lot theyre heading jersey shore blow
#8. 2381	 0.9577537247925334 york city subway
#9. 2688	 0.9973449888132752 social media approaches
#10. 2313	 0.9999937572115121 tomb unknown soldier
#11. 1840	 0.8394613494687058 wont bad pm
#12. 876	 0.9509203485826587 jokes funny dying
#13. 1654	 0.9861082589019909 blog ave ave
#14. 298	 0.7720940769865772 evacuation time evacuate
#15. 1616	 0.9995726635530693 gangnam style rain dance brought
#16. 2621	 0.9754705635626476 apple maps told approaching
#17. 2495	 0.9418282436916979 thoughts affected stay
#18. 28	 0.9088484860487207 town fine driving
#19. 271	 0.3339305138506348 way stop bus


In [31]:
# extract only instance with > 5 uniWords
data1 = data[data['#uniWord']>4].copy()
print(data1.index)
x = pd.DataFrame(topics)
x = x.iloc[data1.index]
print(x.index)


Int64Index([   4,    5,    6,    8,   10,   11,   12,   18,   23,   24,
            ...
            2931, 2932, 2936, 2938, 2940, 2946, 2947, 2948, 2952, 2953],
           dtype='int64', length=1000)
Int64Index([   4,    5,    6,    8,   10,   11,   12,   18,   23,   24,
            ...
            2931, 2932, 2936, 2938, 2940, 2946, 2947, 2948, 2952, 2953],
           dtype='int64', length=1000)


In [32]:
data1 = data1.reset_index(drop=True)
x = x.reset_index(drop=True)

In [33]:
for i in range(num_topics):
    print("#{}. {}\t {} {}".format(i, x[i].idxmax(),  x[i].max(), data1.iloc[x[i].idxmax()]['Tweet1']))

#0. 940	 0.9982797060849918 star coming thought worst week
#1. 306	 0.8026268554040461 great parents mt care disaster
#2. 127	 0.725340194253373 thank house telling major streets closed trees powers driving
#3. 181	 0.9917232640501498 nigga david stern said katrina
#4. 564	 0.976738199492876 mitt romney talk fema event
#6. 285	 0.9579061675484792 snooki lot theyre heading jersey shore blow
#7. 656	 0.7779909011227468 blow away mean send huge wind
#8. 263	 0.8854107129358217 video york city view times square
#9. 284	 0.88376932954098 social media residents approaches business
#10. 540	 0.9998254588776453 guard tomb unknown soldier usa
#11. 735	 0.6939992015094045 12 pm 12 10 days halloween cancelled
#12. 876	 0.8468227396384019 tsunami canada earthquake hawaii 2012
#13. 64	 0.7551031927107661 streets flooded rain came far
#14. 37	 0.6082512620000926 strong tree hard strong hurricanesandy jersey
#15. 538	 0.9995726635530693 gangnam style rain dance brought
#16. 933	 0.8931241022853917 ri

## 2. Extract instances/tweets that maximize P(d|z)
   *   $P(d|z) = \ln{(len(d))}*\prod_{b_i \in d}P(b_i|z)$ # give more weight to long sentences
   
   *   $d = argmax_dP(d|z) = argmax_d \ln{(P(d|z))}$
   *   $d = argmax_d\ln{(\ln{(len(d)))}} + \sum_{b_i}\ln{P(b_i|z)}$
   *   $d = argmax_d\ln{(\ln{(len(d)))}} + \sum_{b_i, w_{i0, i1} \in b_i}\ln{(P(w_{i0}|z)*P(w_{i1}|z)))}$ 

In [34]:
data1 = data[data['#uniWord'] >4].copy()
remained_index = data1.index

In [35]:
data1 =data1.reset_index(drop=True)

In [36]:
remained_index

Int64Index([   4,    5,    6,    8,   10,   11,   12,   18,   23,   24,
            ...
            2931, 2932, 2936, 2938, 2940, 2946, 2947, 2948, 2952, 2953],
           dtype='int64', length=1000)

In [37]:
biterms1 = np.array(biterms).copy()

In [38]:
biterms1 = biterms1[remained_index]

In [39]:
num_topics = 20
P_dz = np.zeros([len(biterms1), num_topics])
# iterate documents
for i, d in enumerate(biterms1):
    n_biterms = len(d)
    P_bz = np.zeros([len(d), num_topics])
    for j, b in enumerate(d):
        P_bz[j] = np.log(btm.phi_wz[b[0], :] * btm.phi_wz[b[1], :])
    P_dz[i] = P_bz.sum(axis = 0)

print(P_dz.shape)

(1000, 20)


In [40]:
# extract most representative sentences for each topic
indices = P_dz.argmax(axis = 0)
    

In [41]:
for i, idx in enumerate(indices):
    print("{}. {}\t{}".format(i, idx, data1.iloc[idx]['Tweet']))

0. 320	watch this hurricane not do shit and we all end up going to school tomorrow . #weallknowitsgoingtohappen
1. 500	since 1937 , the tomb guards have never left their post . hurricane sandy will be no exception . god bless our military ht ...
2. 750	#hurricanesandy rt : my hurricane survival kit : bottled water , candles , radar invisible black jetboat .
3. 181	lmfao this nigga david stern said hurricane katrina
4. 741	mitt romney donates campaign bus to hurricane sandy relief |
5. 554	frankenstorm in ny , earthquake in canada , tsunami in hawaii .... #mayans
6. 402	what do snooki and hurricane #sandy have in common ?… they will blow the entire east coast to get on tv .
7. 83	she gon blow yo ass away #hahaha #goodluck
8. 528	new york city shuts all subway , bus and train services at 19:00 est ( 23:00 gmt ) as hurricane #sandy approaches ...
9. 582	#thanksdonald for choosing to donate the 5 million to hurricane relief . tell your friends about this wonderful deed #s ...
10. 540	soldi

In [38]:
with open('biterm.pkl', 'wb') as f:
    pickle.dump(btm, f)