# Summarization
1. Built-in Textrank with gensim
2. Built-in Lexrank
3. Lexrank with tfidf & LSH
4. Lexrank with sentence embedding & LSH

In [1]:
# %pip install emoji

In [1]:
import os
import sys

# Make the parent directory importable so the project-local `utils` package
# can be resolved from this notebook's location.
nlp_path = os.path.abspath('../')
already_on_path = nlp_path in sys.path
if not already_on_path:
    sys.path.insert(0, nlp_path)
from utils import tokenizeRawTweetText

In [2]:
from numpy.linalg import norm
from fast_pagerank import pagerank
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from fast_lexrank import Lexrank
import numpy as np
from utils.lsh import LSH
import pandas as pd
import scipy
import pickle
import time
import re
import emoji, string
# import nltk
# nltk.download('stopwords')

In [3]:
# Load the labelled Hurricane Sandy tweets (CrisisLexT6) and preview them.
# Alternative input kept from the author (travel-ban corpus):
# data = pd.read_csv('/home/ehoang/hnt/data/processed_travel_ban.csv')
sandy_csv_path = '/home/ehoang/git/python/tweet_classification/data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv'
data = pd.read_csv(sandy_csv_path)
for preview in (data.head(), data.shape):
    print(preview)

               tweet id                                              tweet  \
0  '262596552399396864'  I've got enough candles to supply a Mexican fa...   
1  '263044104500420609'  Sandy be soooo mad that she be shattering our ...   
2  '263309629973491712'  @ibexgirl thankfully Hurricane Waugh played it...   
3  '263422851133079552'  @taos you never got that magnificent case of B...   
4  '262404311223504896'  I'm at Mad River Bar &amp; Grille (New York, N...   

       label  
0  off-topic  
1   on-topic  
2  off-topic  
3  off-topic  
4  off-topic  
(10008, 3)


In [4]:
# Rename the three CSV columns to identifier-friendly names.
data.columns = ['TweetId', 'tweet', 'label']
# Keep only the on-topic tweets. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells do not
# operate on a slice of the original (no SettingWithCopyWarning, no ambiguity
# about whether the write reaches the parent frame).
data = data[data['label'] == 'on-topic'].copy()
data.shape

(6138, 3)

In [5]:
# Work on an independent copy so the column assignments below never write into
# a view of an earlier filtered frame (avoids pandas' SettingWithCopyWarning).
data = data.copy()

# Tokenize each raw tweet and re-join the tokens with single spaces.
data['Tweet'] = data['tweet'].apply(lambda x: ' '.join(tokenizeRawTweetText(x)))
# remove rt, @USER, @URL, emoji
# (TWEETMENTION / EMAILADDRESS / HTTPURL are presumably the placeholder tokens
# produced by tokenizeRawTweetText -- verify against utils.)
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('TWEETMENTION', "").
                                    replace("EMAILADDRESS", "").replace('HTTPURL', ''))
data['Tweet'] = data['Tweet'].apply(lambda x: x.lower().strip())
# Drop leading retweet markers ("rt rt ...") and then a leading ":".
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
# Collapse runs of spaces left behind by the removals above.
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
# NOTE(review): emoji.UNICODE_EMOJI is version dependent -- in emoji>=1.0 it is
# keyed by language code (so this per-character test would filter nothing) and
# it was removed in emoji 2.0 in favour of emoji.EMOJI_DATA. Confirm the
# installed version before relying on this step.
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())
# remove stopwords, punctuation
stopWords = stopwords.words('english')
# Set gives O(1) membership tests; `stopWords` itself is kept as the list.
stopWordSet = set(stopWords)
data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWordSet))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '',  string.punctuation)))
# string.punctuation is ASCII only, so typographic quotes/dashes are removed separately.
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())


def _unique_word_score(text):
    """Return 0 if at most half of the words in `text` are distinct, else the word count."""
    words = text.split(" ")
    return 0 if len(set(words)) / len(words) <= 0.5 else len(words)


# remove tweets whose number of unique words is at most half of their length
# (score 0) and tweets with fewer than 3 words. Despite its name, the column
# holds a word count (or 0), not a percentage. A single `> 2` filter is
# equivalent to the former `!= 0` filter followed by the re-filtering loop,
# because recomputing the score on unchanged text yields the same values.
data['uniWPercent'] = data['Tweet1'].apply(_unique_word_score)
data = data[data['uniWPercent'] > 2]
# remove duplicates (keep the first occurrence of each cleaned text)
data = data.drop_duplicates(subset=['Tweet1'], keep='first')
print(data.head())

                 TweetId                                              tweet  \
1   '263044104500420609'  Sandy be soooo mad that she be shattering our ...   
5   '263101347421888513'  Neighborly duties. @Cory_Kennedy arrives to th...   
7   '263298821189156865'  I don't know how I'm getting back to Jersey si...   
10  '262914476989358080'  Already flooded so much #SANDY @ Hoboken http:...   
12  '262991999911743490'  On that note, i pray that everyone stays safe,...   

       label                                              Tweet  \
1   on-topic  sandy be soooo mad that she be shattering our ...   
5   on-topic  neighborly duties . arrives to the rescue spor...   
7   on-topic  i don't know how i'm getting back to jersey si...   
10  on-topic           already flooded so much #sandy @ hoboken   
12  on-topic  on that note , i pray that everyone stays safe...   

                                               Tweet1  uniWPercent  
1   sandy soooo mad shattering doors shiet hurrica...

In [6]:
data.shape

(5350, 6)

In [7]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')
# data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [8]:
# Remember the original row labels of the tweets that survived preprocessing.
# Later cells use this to select the matching rows of the precomputed sentence
# embeddings, so it must be captured BEFORE the index is reset.
remained_index = data.index

In [9]:
# Renumber the rows 0..n-1 (old labels are discarded) so that the integer ids
# used with data.iloc[...] in later cells line up with the row labels.
data = data.reset_index(drop=True)

In [10]:
# data[data['Tweet'].str.contains("president trump fires acting attorne")]

In [11]:
# Peek at the first ten cleaned tweets (stopwords and punctuation removed).
print(data['Tweet1'].iloc[:10].tolist())

['sandy soooo mad shattering doors shiet hurricanesandy', 'neighborly duties arrives rescue sporting spelunking equipment sandy 300 squad', 'know im getting back jersey since trains subways running', 'already flooded much sandy hoboken', 'note pray everyone stays safe keeps positive attitude godisgood', 'house creeking mean trying break', 'debating going home prep sandy', '11am going 100 chance rain hurricanesandy', '5 blocks water first two blocks evacuated sounds like train went stay safe thanks', 'crazy gonna lie im kind scared']


In [12]:
data.shape

(5350, 6)

In [13]:
# Restrict the experiment to the first 5000 tweets.
data = data.head(5000)

## 1. Lexrank + lsh + tfidf

In [14]:
#extract tfidf vector
# Fit a TF-IDF vocabulary on the cleaned tweets and vectorize them in one step;
# the printed shape is (number of tweets, vocabulary size).
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(5000, 7918)


In [15]:
# Build the project's LSH index (utils.lsh.LSH) over the TF-IDF vectors.
# NOTE(review): num_bits presumably sets the hash length, i.e. 2**5 = 32
# buckets -- consistent with the "#buckets: 32" printed by build_graph below;
# confirm against utils/lsh.py.
lsh_tfidf = LSH(tfidfData)
lsh_tfidf.train(num_bits = 5)


(5000,)


In [16]:
%%time
# Create the Lexrank model over the TF-IDF vectors, using the LSH index trained
# above, and build its sentence graph.
# NOTE(review): cosine_sim is presumably the minimum cosine similarity for an
# edge and search_radius the number of neighbouring LSH buckets searched --
# confirm against fast_lexrank.
lex_tfidf = Lexrank(tfidfData, lsh_tfidf)
lex_tfidf.build_graph(search_radius = 1, cosine_sim = 0.3)

#buckets: 32
.......Buck: 0, vec: (759, 7918)
CPU times: user 27.1 s, sys: 4.18 s, total: 31.3 s
Wall time: 3.97 s


In [17]:
lex_tfidf.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [18]:
sentIds = lex_tfidf.extract_summary(n_sents = 20, cosine_thres=0.5)

Extracting sentences....
Sent scores: 5000
selected one: 3755, 0.002815762292443252
selected one: 4886, 0.0012852135678720485
Sent 324 is similar to a 3755: 0.825667274731069
selected one: 333, 0.0012301626572219628
Sent 4485 is similar to a 333: 0.5544264334842268
Sent 3668 is similar to a 333: 0.7913595602944599
selected one: 3274, 0.0011316542427669603
selected one: 4422, 0.0011024537469009595
Sent 2741 is similar to a 4422: 0.8985201618411257
Sent 4307 is similar to a 4422: 0.6824526293067552
selected one: 933, 0.0010907606399100358
Sent 1560 is similar to a 3274: 0.5419315686379707
selected one: 3104, 0.001017956618676534
selected one: 2997, 0.0010087026029462866
Sent 4027 is similar to a 333: 0.5370259705042414
Sent 3509 is similar to a 4422: 0.8407558739952904
Sent 179 is similar to a 333: 0.5039468700194218
Sent 2021 is similar to a 3274: 0.5098151219529714
selected one: 3316, 0.0009220615371341031
selected one: 1792, 0.0009207151391918578
selected one: 1244, 0.0008708985391661

In [19]:
# One row per selected sentence: rank, number of graph neighbours, LexRank score.
print("Id", "#adjacentEdges", "lexrank")
for rank, sent_id in enumerate(sentIds):
    neighbour_count = len(lex_tfidf.graph[sent_id])
    print(rank, neighbour_count, lex_tfidf.scores[sent_id])

Id #adjacentEdges lexrank
0 94 0.002815762292443252
1 26 0.0012852135678720485
2 65 0.0012301626572219628
3 57 0.0011316542427669603
4 64 0.0011024537469009595
5 20 0.0010907606399100358
6 34 0.001017956618676534
7 24 0.0010087026029462866
8 42 0.0009220615371341031
9 51 0.0009207151391918578
10 24 0.0008708985391661912
11 24 0.000847990973540801
12 40 0.0008379946437504437
13 19 0.0008297303775093536
14 21 0.0008151748075952774
15 16 0.0007738980690878252
16 13 0.0007612240654215764
17 11 0.0007530354481963664
18 30 0.0007092230381072004
19 17 0.0007046751010733506


In [21]:
# nearby_bin = 0
# Show rank, row position and the original (raw) text of every selected tweet.
for rank, row_pos in enumerate(sentIds):
    raw_text = data.iloc[row_pos]['tweet']
    print(rank, row_pos, raw_text)

0 3755 it was a hurricane in new york. ??????
1 4886 The hurricane #Sandy is a bitch
2 333 i hope everyone is safe
3 3274 All you east coast friends stay safe! http://t.co/hJi3q5cn #hurricane #sandy
4 4422 My prayers go out to everyone affected by hurricane sandy .. Stay safe !
5 933 Hurricane SANDY on its way!!!
6 3104 RT @Juan55s: Really? Hurricane Sandy has a twitter.
7 2997 I'm not fucking with this hurricane
8 3316 RT @ItsManiHoe: Praying for those in hurricane sandy
9 1792 RT @FOXSports: For everyone on the East Coast in the path of Hurricane Sandy, please be safe!
10 1244 Fuck you Sandy. Fuck you Insomnia. &amp; Fuck you Twitter.
11 2884 Hope This Hurricane Is Not Bad '
12 1820 @Meowllory6 You're not affected by Hurricane Sandy are you?? Hope not!
13 4830 Why didn't the hurricane blow you away? :)))))))))
14 1835 Im Not Scared Of Hurricane Sandy
15 1088 I cant feel you here hurricane sandy!
16 923 And I just lost power... #sandy
17 1468 RT @DeBell22: Hurricane #sandy - I'd do yo

In [None]:
0 5324 hurricane sandy for east coast
1 3755 it was a hurricane in new york . ??????
2 5029 praying for everyone who were affected by hurricane sandy !!! stay safe !
3 333 i hope everyone is safe
4 4886 the hurricane #sandy is a bitch
5 2884 hope this hurricane is not bad '
6 933 hurricane sandy on its way !!!
7 1832 how you can help after hurricane sandy\
8 5265 there's a hurricane coming ..? where have i been ..
9 1395 i'm scared about this hurricane ...
10 2997 i'm not fucking with this hurricane
11 2003 hurricane sandy isn't funny .
12 3448 i want the hurricane to come so we don't have school ..
13 5307 prayers for those affected by hurricane sandy
14 3565 hurricane sandy : view from above
15 3280 no school tomorrow & tuesday . #hurricane
16 1468 hurricane #sandy - i'd do you .
17 923 and i just lost power ... #sandy
18 1401 that hurricane aint gonna be shit
19 4830 why didn't the hurricane blow you away ? :)))))))))

In [24]:
# nearby_bin = 0
for i, idx in enumerate(sentIds):
    print(i, idx, data.iloc[idx]['Tweet'])

0 2199 here is the " extreme vetting " executive order president trump signed
1 671 trump : don't ban immigration from muslim majority countries !
2 1930 president trump and british prime minister
3 2819 statement on donald trump's executive order targeting muslims .
4 3164 ' we don't want them here ' president trump signs executive order for ' extreme vetting ' of refugees …
5 3238 #resist trump's immigration ban excludes countries with business ties via
6 25 trump's immigration ban excludes countries with business ties via
7 2386 " we don't want them here " : trump signs executive order for ' extreme vetting ' of refugees …
8 2964 trump signs executive order on ‘ extreme vetting ’ via
9 1428 read our full statement on president trump's executive order on immigration and refugees here : …
10 994 president trump meets with british pm theresa may in the white house .
11 2446 is saudi arabia on the list ? how about muslim countries where trump has major investments ?
12 413 #breaking - p

In [18]:
# nearby_bin = 1
for i, idx in enumerate(sentIds):
    print(i, data.iloc[idx]['Tweet'])

0 on executive order immigration
1 green card holders
2 ban all muslim countries
3 what do people in the middle east think about terrorism ? for more on the middle east visit http …
4 our statement on president trump’s executive order on immigration :
5 make america great . again ?
6 from saudi arabia . #trump's ban doesn't include saudi arabia .
7 keep america safe president trump keep the ban on muslim refugees , keep them out and keep us safe .
8 my statement on president trump's executive order on refugees :
9 it is a religion ban trump
10 protest at the airport !!! #muslimban
11 the president of the united states fires the attorney general of the united states .
12 are you from any of the 7 banned countries ? what are you on about ?
13 trump's state visit to the uk :
14 it's not a muslim ban . it's a ban from muslim countries where trump doesn't do business .
15 trump signs executive order on ‘ extreme vetting ’
16 jfk protest . #muslimban #nobannowall
17 it's a temporary ban from

In [21]:
# result for the first 10000 tweets
for i, idx in enumerate(sentIds):
    print(i, data.iloc[idx]['Tweet'])

0 following trump’s executive order , green card , visa holders already blocked at airports …
1 trump executive order : refugees detained at us airports follow for more
2 trump signs executive order for ‘ extreme vetting ’ of refugees
3 trump executive order : refugees detained at us airports
4 trump's state visit to the uk :
5 list of trump's executive orders |
6 trump executive order : refugees detained at us airports - bbc news
7 breaking : prime minister theresa may has arrived at the white house for talks with president trump .
8 trump signs executive actions on immigration , military
9 president trump signs executive order temporarily halting all refugees
10 trump signs ' new vetting ' immigration order
11 trump says syrian christian refugees will be given priority for entering u.s. !
12 ' we don't want them here ' president trump signs executive order for ' extreme vetting ' of refugees …
13 trump’s immigration ban excludes countries with business ties
14 breaking : trump to sig

In [12]:
# Restore a previously saved Lexrank model instead of rebuilding it.
# SECURITY: pickle.load executes arbitrary code embedded in the file -- only
# load pickles that were produced locally by this project.
with open('saved_models/tfidf_model.pkl', "rb") as f:
    lex_tfidf = pickle.load(f)
# To (re)create the file, open it with mode "wb" and use the dump call below.
#     pickle.dump(lex_tfidf, f)

#####

* <b>Sub-events captured:</b>
    1. green card holders
    2. protest at jfk airport
    3. attorney general gets fired
    4. ban 7 countries, muslim countries
    5. trump's ban doesn't include saudi arabia .
    6. trump visit uk
    7. executive order on ‘ extreme vetting ’ 
* <b>Lack</b>:
    1. Starbucks hires 10K refugees
    2. trump’s deportation orders 
    3. washington state will sue to stop trump's immigration
    4. canada will accept the refugees 
    5. quebec city mosque shooting 


## 2. Lexrank with sentEmbeddings and LSH

In [13]:
#load embeddings
# Precomputed sentence embeddings, one entry per tweet of the travel-ban corpus
# (the count is printed below).
# NOTE(review): absolute, user-specific path -- this cell fails with
# FileNotFoundError on any other machine; consider a configurable data dir.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open('/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)
print(len(embeddings))

123385


In [22]:
# embeddings[0].shape
# count = 0
# list = []
# for i in range(1000):
#     cos_sim = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[i].reshape(1, -1))
#     print(cos_sim)
    
#     if cos_sim >0.65:
#         count+=1
#         list.append(i)
# print("Number of sentences that are similar with the first sen:", count)

In [14]:
sentenceEmbs = np.array(embeddings)
print(sentenceEmbs.shape)

(123385, 768)


In [15]:
# Keep only the embeddings of the tweets that survived preprocessing, using the
# original row labels saved in `remained_index`.
# NOTE(review): not idempotent -- running this cell a second time re-indexes
# the already filtered array and selects the wrong rows (or raises IndexError).
sentenceEmbs = sentenceEmbs[remained_index]

In [16]:
sentenceEmbs.shape

(105175, 768)

In [49]:
# Build the project's LSH index over the sentence embeddings, using 14 bits.
lsh_sen = LSH(sentenceEmbs)
lsh_sen.train(num_bits=14)
# Keep a handle on the trained hash table stored under model['table'].
# NOTE(review): presumably maps bucket id -> row ids; confirm in utils/lsh.py.
table = lsh_sen.model['table']

(105175,)


In [50]:
lex_sen = Lexrank(sentenceEmbs, lsh_sen)

In [None]:
%%time
lex_sen.build_graph(search_radius = 1, cosine_sim=0.7)

In [53]:
lex_sen.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [83]:
# 14 bins
sentIds = lex_sen.extract_summary(n_sents = 20, cosine_thres = 0.85, max_sent=9000)

Extracting sentences....
Sent scores: 105175
selected one: 72679, 6.96076591613844e-05
selected one: 42773, 6.741798915014396e-05
Sent 70826 is similar to a 72679: 0.8773021495975561
Sent 66616 is similar to a 72679: 0.884185416560963
Sent 67230 is similar to a 72679: 0.882424157623164
Sent 15821 is similar to a 72679: 0.8550647142535271
Sent 31600 is similar to a 72679: 0.8643050357142914
Sent 15323 is similar to a 72679: 0.8857740382537653
Sent 23584 is similar to a 72679: 0.8856029181528562
selected one: 18628, 6.354185669127839e-05
Sent 65594 is similar to a 42773: 0.8566214339815087
Sent 59451 is similar to a 72679: 0.8539545469585664
Sent 86305 is similar to a 72679: 0.8638024807209533
Sent 2754 is similar to a 72679: 0.8620114943391185
Sent 36918 is similar to a 42773: 0.9902525069804194
selected one: 14812, 6.079135691638127e-05
Sent 15237 is similar to a 14812: 0.9931927816431556
Sent 66394 is similar to a 72679: 0.8836333610668062
Sent 76628 is similar to a 72679: 0.889995278

In [84]:
# List the selected tweets (cleaned text) in rank order.
line_template = "{}. {}"
for rank, row_pos in enumerate(sentIds):
    tweet_text = data.iloc[row_pos]['Tweet']
    print(line_template.format(rank, tweet_text))

0. pragmactivist : trump fired ag #sallyyates for enforcing laws and #resist ing a #muslimban . will he fire seanspi … 
1. " this policy is going to get americans killed : " sen. chris murphy on trump's refugee ban • #antitrumpmvmt …
2. liberals will be upset to learn that clinton , bush , reagan , obama and carter all issued immigrant bans . #muslimban .…
3. tech leaders finally find their voice , opposing trump’s muslim ban : “ so un-american , it pains us all . ” via 
4.  four states sue trump administration over ' un-american ' travel ban #breaking #hope #politics #truth
5. mark levin " with this 7 nation detaining of muslims while vetted , potus trump is trying 2 prevent carnage of america …
6. iraq war vet congressman says he is ashamed trump is president after muslim ban via …
7. 12/8/15 mike pence said : calls to ban muslims from entering the us are offensive and unconstitutional . flip flop man now vp …
8. #muslimban muslims try to force american christians to submit to sharia

In [85]:
# Rank, row position and LexRank score of every selected sentence.
print("Id", "Index", "lexrank")
for rank, row_pos in enumerate(sentIds):
    sent_score = lex_sen.scores[row_pos]
    print(rank, row_pos, sent_score)

Id Index lexrank
0 72679 6.96076591613844e-05
1 42773 6.741798915014396e-05
2 18628 6.354185669127839e-05
3 14812 6.079135691638127e-05
4 86089 5.979216205568747e-05
5 67637 5.948932230295637e-05
6 2115 5.922095935587901e-05
7 17668 5.770827381088001e-05
8 11192 5.6541809956873435e-05
9 555 5.61033781753414e-05
10 25754 5.5482023437627004e-05
11 79690 5.509297971550304e-05
12 28510 5.504408797654373e-05
13 69997 5.4146177612986906e-05
14 38139 5.382959392083732e-05
15 26558 5.351312303136145e-05
16 93289 5.350937065123945e-05
17 14601 5.3446392656902614e-05
18 103112 5.3196643331092124e-05
19 23918 5.221696773719943e-05


In [86]:
str(data.iloc[72679]['Tweet'])

'pragmactivist : trump fired ag #sallyyates for enforcing laws and #resist ing a #muslimban . will he fire seanspi … '

In [87]:
str(data.iloc[67489]['Tweet'])

'trump administration tweaks immigration ban on green card holders after outrage .. #trump . #muslimban '

# debug

In [None]:
#--> problem: cosine similarity does not measure the similarity of the embeddings well
# tweets about "green card" are not in the selected set because they have lower cosine similarity than the selected ones

In [18]:
# Row labels of all tweets whose cleaned text mentions "green card".
mentions_green_card = data['Tweet'].str.contains('green card')
green_indices = data.index[mentions_green_card]

In [70]:
# green_indices holds index *labels* (taken from `.index`), so select by label.
# The former positional .iloc lookup only returned the right rows because the
# index had been reset to 0..n-1.
data.loc[green_indices]

Unnamed: 0,Id,Tweet,Tweet1,uniWPercent
1216,825064765261176832,company sent out a notice about trump's muslim...,company sent notice trumps muslim ban green ca...,10
2323,825137645504172032,visas being denied immediately . chaos at airp...,visas denied immediately chaos airports air mu...,13
2419,825141021906382848,"ban applies if you have a visa , green card , ...",ban applies visa green card even dual citizen ...,9
2609,825147334342303745,current concern : what about people with green...,current concern people green cards currently a...,13
2755,825152564652081157,many elderly come to green card interview with...,many elderly come green card interview suit ti...,12
...,...,...,...,...
103920,827623785662644224,my uncle from syria needs a green card any tak...,uncle syria needs green card takers,6
104019,827626771994533889,why is tomi defending the rights of milo ( a ...,tomi defending rights milo non american rights...,12
104760,827658820658851840,green card coming soon !,green card coming soon,4
104876,827664285862129668,alert : dos issues stmnt clarifying ban dn app...,alert dos issues stmnt clarifying ban dn apply...,18


In [19]:
green_indices

Int64Index([  1216,   2323,   2419,   2609,   2755,   2995,   3099,   3202,
              3259,   3385,
            ...
            102906, 102993, 103099, 103206, 103512, 103920, 104019, 104760,
            104876, 104916],
           dtype='int64', length=1446)

In [30]:
cosine_matrix = cosine_similarity(sentenceEmbs[green_indices], sentenceEmbs[green_indices])

In [31]:
arr = cosine_matrix.flatten()

In [34]:
n = cosine_matrix.shape[0]

In [73]:
# Find the "green card" tweet with the highest LexRank score
# (the first one wins on ties; -1 / -1 if there are no candidates).
max_idx, max_value = -1, -1
for candidate in green_indices:
    candidate_score = lex_sen.scores[candidate]
    if not candidate_score > max_value:
        continue
    max_idx, max_value = candidate, candidate_score
print(max_idx, lex_sen.scores[max_idx], str(data.iloc[max_idx]['Tweet']))

67489 5.265613346268356e-05 trump administration tweaks immigration ban on green card holders after outrage .. #trump . #muslimban 


In [92]:
list(green_indices).index(67489)

1172

In [111]:
# Sanity check: position 1172 of green_indices maps back to tweet 67489.
# (The cell source had lost the `green_indices` prefix; a bare `[1172]` would
# display the list itself, not the recorded output 67489.)
green_indices[1172]

67489

In [102]:
# Cosine similarity between tweet 67489 (row 1172 of the green-card matrix) and
# every other "green card" tweet, keyed by position within green_indices.
x = dict(enumerate(cosine_matrix[1172]))

In [106]:
# Re-order the mapping by similarity, highest first (dicts keep insertion order).
x = dict(sorted(x.items(), key=lambda pair: pair[1], reverse=True))

In [130]:
# Print the 26 most similar "green card" tweets (tweet id, cosine similarity).
for position, (key, value) in enumerate(x.items()):
    if position > 25:
        break
    print(green_indices[key], value)

67489 1.0000000000000004
14568 0.9097172370497544
7672 0.8804448534365881
1216 0.8803281653920463
60084 0.880106057521026
9034 0.8797233520977021
15572 0.8727023496902853
7960 0.8681128812564491
3385 0.8609483496851006
35615 0.8601970847836715
93850 0.8590100880572535
47008 0.8582769108235137
4846 0.852298885976542
87080 0.8510681270339433
86575 0.8509157052733496
13054 0.8500323277896142
8358 0.8484250372807346
5045 0.8479882189362389
11078 0.8473223807416036
41543 0.8470223230647428
52674 0.8419954574442077
91662 0.8376496816713401
89383 0.8376411036700553
32558 0.8372166300390903
11699 0.8352250159227328
93870 0.8349929223778321


In [119]:
str(data.iloc[67489]['Tweet'])

'trump administration tweaks immigration ban on green card holders after outrage .. #trump . #muslimban '

In [117]:
str(data.iloc[72679]['Tweet'])

'pragmactivist : trump fired ag #sallyyates for enforcing laws and #resist ing a #muslimban . will he fire seanspi … '

In [118]:
str(data.iloc[green_indices[79]]['Tweet'])

'the executive order " will ban green card holders " says homeland security . this is religious persecution . …'

## --> the cosine similarity score does not properly reflect the actual similarity
+ Many tweets that are similar to each other have lower similarity scores than tweets about different topics
+ For example, the algorithm does not choose any tweet about the green card event, because these tweets either have low cosine similarity with each other (and hence a low LexRank score) or high cosine similarity with already chosen tweets (which are not about the green card event)
    - many tweets that are about the 'green card event' should have a high similarity score with 67489, but their score with 67489 is lower than cosine(72679, 67489)

In [125]:
# compute cosine similarity of 67489 and all the others, count how many of them are about green card event
cosine_67489 = cosine_similarity(sentenceEmbs[67489].reshape(1, -1), sentenceEmbs)[0]

In [128]:
# Map every tweet position to its cosine similarity with tweet 67489, ordered
# from most to least similar (ties keep their original position order).
cosine_67489_sorted = dict(
    sorted(enumerate(cosine_67489), key=lambda pair: pair[1], reverse=True)
)

In [18]:
str(data.iloc[22430]['Tweet'])

"breaking : federal judge has just blocked trump's islamophobic ban on immigrants & refugees . #heretostay #muslimban https …"

In [19]:
cosine_similarity(sentenceEmbs[67489].reshape(1, -1), sentenceEmbs[22430].reshape(1, -1))

array([[0.91123844]])

In [20]:
count = 0
for key, value in cosine_67489_sorted.items():
    print(key, value, str(data.iloc[key]['Tweet']))
    if count>25:
        break
    count+=1

NameError: name 'cosine_67489_sorted' is not defined

In [134]:
# compare to tfidf representation: find most similar tweets of 67489
# Similarity of tweet 67489 to every tweet in TF-IDF space ...
cosine_67489_tfidf = cosine_similarity(tfidfData[67489], tfidfData)[0]
# ... as a {position: similarity} mapping ...
cosine_67489_tfidf = dict(enumerate(cosine_67489_tfidf))
# ... ordered from most to least similar.
cosine_67489_tfidf = dict(
    sorted(cosine_67489_tfidf.items(), key=lambda pair: pair[1], reverse=True)
)

In [135]:
# Print the 27 tweets closest to 67489 in TF-IDF space
# (position, similarity, cleaned text); the check runs after the print.
for count, (key, value) in enumerate(cosine_67489_tfidf.items()):
    print(key, value, str(data.iloc[key]['Tweet']))
    if count > 25:
        break

67489 1.0 trump administration tweaks immigration ban on green card holders after outrage .. #trump . #muslimban 
91662 0.6409255342486253 white house tweaks trump's travel ban to exempt green card holders 
13109 0.49899079015468784 green card holders 
10126 0.4624729946806367 green card holders ? how can this be legal ?! #muslimban 
35313 0.4560962336083251 trump administration flip-flops : green card holders are not affected by immigration ban 
46384 0.4437992312080231 trump administration says green card holders won’t be barred 
42962 0.44095667986007875 priebus : green card holders will be allowed into us - president donald trump's administration says green card h ... 
7776 0.4360798275840697 green card holders included in trump ban - via 
15131 0.4342351601902188 trump immigration ban will apply to green card holders : report 
36742 0.4302402254560106 president trump's administration says green card holders will be allowed into the country …
46821 0.42907888766047647 trump adminis

In [None]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

In [None]:
lsh_tfidf_embs = LSH(tfidfData)
lsh_tfidf_embs.train(num_bits = 8)


In [None]:
buckets = lsh_tfidf_embs.extract_nearby_bins(max_search_radius = 1)
print(len(buckets[0]))

## 3. Combination of LSH with tfidf and lex rank with sent embeddings

In [12]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(105175, 48876)


In [13]:
#load embeddings
with open('/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)
print(len(embeddings))

FileNotFoundError: [Errno 2] No such file or directory: '/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl'

In [142]:
# Stack the loaded embeddings into one array and keep only the rows of tweets
# that survived preprocessing (selected via their original row labels).
sentenceEmbs = np.array(embeddings)[remained_index]
print(sentenceEmbs.shape)

(105175, 768)


In [143]:
lsh_tfidf_embs = LSH(tfidfData)
lsh_tfidf_embs.train(num_bits = 8)
lex_tfidf_embs = Lexrank(sentenceEmbs, lsh_tfidf_embs)

(105175,)


In [144]:
%%time
lex_tfidf_embs.build_graph(search_radius = 1, cosine_sim = 0.75)

#buckets: 256
.......Buck: 0, vec: (4405, 768)
.......Buck: 100, vec: (3664, 768)
.......Buck: 200, vec: (3694, 768)
CPU times: user 6min 32s, sys: 6min 12s, total: 12min 45s
Wall time: 3min 37s


In [145]:
lex_tfidf_embs.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [146]:
sentIds = lex_tfidf_embs.extract_summary(n_sents = 20, cosine_thres = 0.85, max_sent=20000)

Extracting sentences....
Sent scores: 105175
selected one: 51052, 7.648065808677812e-05
selected one: 1986, 7.608186491893474e-05
Sent 23584 is similar to a 1986: 0.8722660801906754
Sent 29998 is similar to a 51052: 0.8925799228678891
Sent 22430 is similar to a 51052: 0.8751887541384444
Sent 77797 is similar to a 51052: 0.9310951000130575
Sent 25117 is similar to a 51052: 0.8664335156196253
Sent 102111 is similar to a 51052: 0.8992971624392283
Sent 43651 is similar to a 51052: 0.868846460471362
selected one: 9573, 7.086856553357003e-05
Sent 100720 is similar to a 51052: 0.9001413643413779
selected one: 2865, 7.038706088198347e-05
selected one: 45817, 6.927957863849398e-05
Sent 71278 is similar to a 9573: 0.8786017593352146
selected one: 76628, 6.862070452352763e-05
Sent 11548 is similar to a 51052: 0.8628708512619934
Sent 67230 is similar to a 51052: 0.860118524926383
Sent 64996 is similar to a 51052: 0.8689097664591182
selected one: 78022, 6.737707014599702e-05
Sent 37472 is similar t

In [147]:
lex_tfidf_embs.scores[4846]

3.4206957192817335e-05

In [148]:
str(data.iloc[2419]['Tweet'])

'ban applies if you have a visa , green card , or even if you are dual citizen ... #nowallsnobans '

In [149]:
str(data.iloc[2755]['Tweet'])

'many elderly come to green card interview with suit and tie . we ask why ? most answer with : out of respect to america & you .'

In [150]:
cosine_similarity(sentenceEmbs[2755].reshape(1, -1), sentenceEmbs[2419].reshape(1, -1))

array([[0.36787995]])

In [151]:
green_indices = data[data['Tweet'].str.contains('green card')].index

In [152]:
cosine_matrix = cosine_similarity(sentenceEmbs[green_indices], sentenceEmbs[green_indices])

In [153]:
# For every graph neighbour of tweet 2323, show its id, its text and its
# embedding cosine similarity to tweet 2323.
anchor_emb = sentenceEmbs[2323].reshape(1, -1)
for neighbour in lex_tfidf_embs.graph[2323]:
    neighbour_emb = sentenceEmbs[neighbour].reshape(1, -1)
    similarity = cosine_similarity(neighbour_emb, anchor_emb)
    print(neighbour, str(data.iloc[neighbour]['Tweet']), similarity)

5037 trump's #executiveorders banning refugees & muslims is tearing families apart , exploits fear & ignorance & embolden … [[0.78014991]]
6454 refugees who were in the air on the way to the us when trump signed his order were stopped and detained at airports . https :/ … [[0.75339771]]
6525 this --> refugees in the air on the way to the us when trump signed is exec order were stopped & detained at airports https … [[0.77457203]]
8866 " refugees who were in the air on the way to the us when the order was signed were stopped and detained at airports . " https … [[0.75640029]]
10389 this is terrifying . holding back tears on the train . trump anti-refugee order : ' green-card holders included in ban '  [[0.75814093]]
20188 #nobannowall american protests against the #muslimban are spreading . #denver #jfkprotest #neveragainisnow … [[0.75001516]]
39266 trump hotels twitter trolled mercilessly in wake of ' muslim ban ' - rt #twitter #news [[0.75712114]]
45778 houston airport protest going o

66142 eventually we are going to have to boycott everything . #boycottusa #boycottamazon #boycotttrumphotels #boycottusproducts #muslimban #love [[0.75362069]]
7672 trump's racist , anti-muslim executive order bans people with both green cards ( ie , permanent us residents ) and visas ht … [[0.80765511]]
11789 under latest order all syrian #refugees are barred from us , with no reasoning . show your outrage : https :/ … [[0.75896694]]
35577 muslims diss america as trump visa ban bites [[0.75169596]]
5754 in one of darkest moments in #us history #trump signs order banning #syria /n refugees on #holocaustmemorialday https : … [[0.76864175]]
18774 please retweet this ! #muslimban if you are within driving distance of a major int airport protests will continue on su … [[0.75043108]]
19894 #live : more footage of the growing protest outside o'hare airport ag . trump's immigration bans , chanting refugees a … [[0.75235426]]
14488 #muslimban bans people from the countries around the one where

In [154]:
str(data.iloc[50246]['Tweet'])

'several people killed in shooting at quebec city mosque '

In [155]:
# LexRank score of each selected sentence, in selection order.
for rank, row_pos in enumerate(sentIds):
    sent_score = lex_tfidf_embs.scores[row_pos]
    print(sent_score)
#     print("{}. {}".format(i, data.iloc[idx]['Tweet']))

7.648065808677812e-05
7.608186491893474e-05
7.086856553357003e-05
7.038706088198347e-05
6.927957863849398e-05
6.862070452352763e-05
6.737707014599702e-05
6.54830689262678e-05
6.432313107376382e-05
6.389285399164867e-05
6.375720187125975e-05
6.336930322875994e-05
6.284434455581455e-05
6.14903671600921e-05
6.13492683327275e-05
6.038837980504194e-05
5.8794951231091684e-05
5.838637860771925e-05
5.835114072263316e-05
5.803515873934461e-05


In [157]:
# Rank, row position and cleaned text of each selected tweet.
for rank, row_pos in enumerate(sentIds):
    tweet_text = str(data.iloc[row_pos]['Tweet'])
    print(rank, row_pos,  tweet_text)

0 51052 #er_updates sag awards turns into protest against donald trump’s immigration ban | see photos via 
1 1986 trump eos to stop refugee admissions & shut down visas for majority-muslim nations are a smokescreen for this same reli …
2 9573 #theresistance #muslimban #noban #nobannowall breaking : aclu is suing trump over immigration ban . …
3 2865 #muslimban is another lying hashtag . trump is banning unvettables from nations w lots of terrorists . opposing him o …
4 45817 trump's muslim ban proves that his administration is full of idiots via #nomuslimb …
5 76628 wow . trump has fired the acting attorney general for saying his #muslimban is probably illegal . 
6 78022 left protests while #trump junks obama's global immigration plan - breitbart #maga #tcot #deplo …
7 30226 #breaking merkel says trump immigration ban ' not justified ' : spokesman
8 98982 gold star father khizr khan says president trump’s immigration ban has “ alienated patrioti … #news #follow #rt #retw … 
9 88921 lef

# Test: cosine-similarity sanity checks (TF-IDF vs. sentence embeddings)

In [136]:
# Dimensions of the TF-IDF matrix (presumably tweets x vocabulary terms —
# confirm against the cell that builds tfidfData).
tfidfData.shape

(105175, 48876)

In [137]:
# Cosine similarity between row 0 of the TF-IDF matrix and every row
# (row 0 itself included).
first_tweet_vec = tfidfData[0]
tfidfCosine = cosine_similarity(first_tweet_vec, tfidfData)

In [138]:
# Pick the 10 tweets with the highest TF-IDF cosine similarity to tweet 0.
# np.argpartition only guarantees that the last 10 positions hold the 10
# largest values; they are not sorted among themselves.
sims_to_first = tfidfCosine[0]
indices = np.argpartition(sims_to_first, -10)[-10:]
elements = sims_to_first[indices]
print("Indices: {}".format(indices))
print("Cosine values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i, sim in zip(indices, elements):
    print("{}\t{}\t{}".format(i, sim, str(data.iloc[i]['Tweet1'])))

Indices: [69001 89159 45071 97335  8934 47837 23965 97957 79193     0]
Cosine values: [0.29885404 0.31449262 0.31054174 0.32932619 0.35482742 0.37053803
 0.35177892 0.3296877  0.33406314 1.        ]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
69001	0.2988540434810337	watch live protesters gather rally hate nyc response trumps refugee ban
89159	0.31449261555765645	google workers rally trumps travel ban
45071	0.31054173613047253	many american flags todays nyc rally trumps muslim ban america resist htt
97335	0.32932618513296824	defend immigrants trump rally nyc 2 pm today near nyu nobannowall
8934	0.35482741613624674	protesters rally trumps muslim immigration ban
47837	0.37053803108721006	senator chuck schumer slams trumps travel ban nyc rally
23965	0.3517789220943597	blocks call trumps unconstitutional muslim ban comes emergency hearing nyc
97957	0.32968770264912745	judge extends emergency stay blocking trumps travel ban via newyork nyc
79193	0.33406314343351484	know f

In [139]:
# Cosine similarity of sentence embedding 0 against all sentence embeddings,
# then the 10 most similar tweets (unsorted, as returned by np.argpartition).
query_emb = sentenceEmbs[0].reshape(1, -1)
embCosine = cosine_similarity(query_emb, sentenceEmbs)
sims_to_first = embCosine[0]
indices = np.argpartition(sims_to_first, -10)[-10:]
elements = sims_to_first[indices]
print("Indices: {}".format(indices))
print("Cosine values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i, sim in zip(indices, elements):
    print("{}\t{}\t{}".format(i, sim, str(data.iloc[i]['Tweet1'])))

Indices: [ 1279 44958 40096 52607 37999 90081 97777 52571 25520     0]
Cosine values: [0.85526015 0.85747451 0.85783423 0.8714714  0.87335137 0.87723659
 0.88470356 0.89061269 0.897992   1.        ]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
1279	0.8552601497991861	protest today trump dapl downtown 5pm 611 woodward
44958	0.8574745142185886	seattle protest muslimban tomorrow westlake park 5pm
40096	0.8578342286334354	emergency demo york tomorrow 5pm st helens square protest muslimban uk complicity mu
52607	0.8714713959799587	morning half 5 today protest trumps muslim ban hopefully see ️️
37999	0.8733513658784792	cambridge friends rally 5pm tomorrow gsm trumps muslimban uk govts complicity whos coming ht
90081	0.8772365917424746	today 5pm est take questions president trumps immigration ban response new york
97777	0.8847035615336298	west hollywood holding antitrump rally today 5pm pst ill defend great president
52571	0.890612686995518	seattle join us 5pm westlake park 

In [111]:
# Recompute the similarity to tweet 0 one pair at a time, for comparison
# with the values obtained from the full similarity row.
query_emb = sentenceEmbs[0].reshape(1, -1)
for i in indices:
    pair_sim = cosine_similarity(query_emb, sentenceEmbs[i].reshape(1, -1))
    print("{}\t{}\t{}".format(i, pair_sim, str(data.iloc[i]['Tweet1'])))

1279	[[0.85526013]]	protest today trump dapl downtown 5pm 611 woodward
44958	[[0.85747457]]	seattle protest muslimban tomorrow westlake park 5pm
40096	[[0.8578342]]	emergency demo york tomorrow 5pm st helens square protest muslimban uk complicity mu
52607	[[0.8714714]]	morning half 5 today protest trumps muslim ban hopefully see ️️
37999	[[0.87335134]]	cambridge friends rally 5pm tomorrow gsm trumps muslimban uk govts complicity whos coming ht
52571	[[0.8906126]]	seattle join us 5pm westlake park protest president trumps immigration refugee ban
25520	[[0.897992]]	protest muslimban tomorrow dallas city hall 5 pm
97778	[[0.8847035]]	west hollywood holding antitrump rally today 5pm pst ill defend great president
90082	[[0.87723655]]	today 5pm est take questions president trumps immigration ban response new york
0	[[1.]]	emergency rally trumps muslim travel ban nyc 125 5 pm


In [116]:
# Gather the embedding rows at the positions held in `indices`
# (`indices` is set by an earlier cell) and show the resulting shape.
sents = sentenceEmbs[indices]
print(sents.shape)

(10, 768)


In [117]:
# Hand-rolled pairwise cosine similarity over the rows of `sents`:
#     cosine_matrix[i, j] = <s_i, s_j> / (|s_i| * |s_j|)
# Sparse input is densified once up front so that the dot product and the
# norms are computed on the same array; np.dot is not a reliable matrix
# product for scipy.sparse operands.
if scipy.sparse.issparse(sents):
    dense_sents = sents.toarray()
else:
    dense_sents = np.asarray(sents)

num = dense_sents @ dense_sents.T
magnitude = norm(dense_sents, axis = 1)
# Outer product of the row norms: den[i, j] = |s_i| * |s_j|.
den = np.outer(magnitude, magnitude)

# All-zero rows have norm 0; leave their similarities at 0 instead of
# producing NaN/inf through a division by zero.
cosine_matrix = np.divide(
    num,
    den,
    out = np.zeros(num.shape, dtype = np.result_type(num, den)),
    where = den != 0,
)
# Quick sanity check: the matrix is square, one row per selected sentence.
print(cosine_matrix.shape)

(10, 10)


In [118]:
# Row 9: similarities of the last selected sentence to all selected sentences
# (per the printed indices this should be tweet 0 itself — confirm).
cosine_matrix[9]

array([0.8552605 , 0.85747445, 0.85783404, 0.8714711 , 0.8733514 ,
       0.8906124 , 0.89799184, 0.88470346, 0.87723655, 0.9999999 ],
      dtype=float32)

# Test: BERTScore

In [27]:
from bert_score import BERTScorer

# The full tweet corpus is handed to the scorer as idf_sents.
# NOTE(review): the device is pinned to 'cuda:3'; presumably this cell fails
# on machines that do not expose that GPU — confirm before sharing.
idf_corpus = list(data['Tweet'])
scorer = BERTScorer(
    lang='en',
    rescale_with_baseline=True,
    idf=True,
    idf_sents=idf_corpus,
    device='cuda:3',
)

In [28]:
# Pair tweet 0 (the candidate, repeated once per row) with every tweet in the
# corpus (the references), so both lists have one entry per row of `data`.
refs = data['Tweet'].tolist()
cands = [refs[0]] * len(refs)

In [None]:
# Score tweet 0 against every tweet in batches and collect the F1 values.
# `time` is already imported in the notebook's import cell, so it is not
# re-imported here.
batch_size = 5000
f1_scores = []
time0 = time.time()
for i in range(0, data.shape[0], batch_size):
    # scorer.score returns a 3-tuple; only the last element (F1) is kept.
    # Indexing instead of unpacking into `_` avoids overwriting IPython's
    # last-output variable.
    F1 = scorer.score(cands[i:i+batch_size], refs[i:i+batch_size])[2]
    f1_scores.extend(F1.tolist())
    time1 = time.time()
    # Clip the reported end so the last (possibly shorter) batch is logged
    # with its real upper bound.
    print(i, "--", min(i + batch_size, data.shape[0]), time1 - time0)
    time0 = time1

0 -- 5000 14.433952569961548
5000 -- 10000 14.843559980392456
10000 -- 15000 14.934136867523193
15000 -- 20000 15.487010478973389
20000 -- 25000 21.0456702709198
25000 -- 30000 20.31022047996521
30000 -- 35000 21.16242003440857
35000 -- 40000 20.423917293548584
40000 -- 45000 18.55930519104004
45000 -- 50000 20.600172758102417
50000 -- 55000 20.334948539733887
55000 -- 60000 20.02225613594055
60000 -- 65000 20.0243399143219
65000 -- 70000 19.104448556900024
70000 -- 75000 19.304067611694336
75000 -- 80000 20.95208215713501
80000 -- 85000 21.397560834884644


In [31]:
# np.array already builds a fresh array from f1_scores, so the extra .copy()
# was a redundant second copy. The cell displays the mean F1 over all tweets.
f1_scores1 = np.array(f1_scores)
f1_scores1.mean()

-0.08983207250651078

In [None]:
# Top 10 tweets closest to tweet 0 by BERTScore F1
# (original note: "bert score with idf of 0").
# NOTE(review): this cell is repeated verbatim in the next cell, which
# carries the recorded output.
indices = np.argpartition(f1_scores1, -10)[-10:]
elements = f1_scores1[indices]
print("Indices: {}".format(indices))
print("Similarity values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i, score in zip(indices, elements):
    print("{}\t{}\t{}".format(i, score, str(data.iloc[i]['Tweet1'])))

In [63]:
# Top 10 tweets closest to tweet 0 by BERTScore F1
# (original note: "bert score with idf of 0").
# The 10 positions come back unsorted from np.argpartition.
indices = np.argpartition(f1_scores1, -10)[-10:]
elements = f1_scores1[indices]
print("Indices: {}".format(indices))
print("Similarity values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i, score in zip(indices, elements):
    print("{}\t{}\t{}".format(i, score, str(data.iloc[i]['Tweet1'])))

Indices: [49951 52968 41158 94919 41783 53618 52869 53374  8934     0]
Similarity values: [0.37086952 0.37508908 0.40021974 0.39870292 0.37976035 0.38699275
 0.40110406 0.40852752 0.47117564 1.00000072]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
49951	0.370869517326355	thousands protest seattle trumps ban
52968	0.3750890791416168	emergency demo trumps muslimban monday 30 january 530 pm greys monument newcastle
41158	0.40021973848342896	thousands demonstrate boston trumps immigration orders
94919	0.3987029194831848	protesters march nycs federal immigration offices trumps ban
41783	0.37976035475730896	protesters new york trumps travel ban
53618	0.38699275255203247	emergency demo today trumps muslimban
52869	0.40110406279563904	protesters rally trumps immigration orders
53374	0.40852752327919006	demonstrators rally trumps immigration orders indy airport
8934	0.4711756408214569	protesters rally trumps muslim immigration ban
0	1.0000007152557373	emergency rally trumps mu

In [31]:
# Top 10 tweets closest to tweet 0 by BERTScore F1 (original note: "without
# idf"), printed from the 'Tweet' column rather than 'Tweet1'.
indices = np.argpartition(f1_scores1, -10)[-10:]
elements = f1_scores1[indices]
print("Indices: {}".format(indices))
print("Similarity values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet']))
for i, score in zip(indices, elements):
    print("{}\t{}\t{}".format(i, score, str(data.iloc[i]['Tweet'])))

Indices: [46983 52869 46127 41783 49951 41158  8934 94919 53374     0]
Similarity values: [0.44857961 0.4497115  0.46857849 0.45573822 0.45327032 0.47105238
 0.53879553 0.45089778 0.46392944 1.        ]
Tweet 0: emergency rally against trump's muslim travel ban in nyc , 1/25 at 5 p.m.
46983	0.4485796093940735	hundreds gather in nashville to protest trump's travel ban
52869	0.44971150159835815	protesters rally against trump's immigration orders
46127	0.4685784876346588	protesters rally at detroit metro airport against trump's muslim ban
41783	0.45573821663856506	protesters in new york against trump's travel ban
49951	0.4532703161239624	thousands protest in seattle against trump's ban
41158	0.4710523784160614	thousands demonstrate in boston against trump's immigration orders
8934	0.538795530796051	protesters rally against trump's muslim immigration ban .
94919	0.45089778304100037	protesters march to nyc's federal immigration offices against trump’s ban
53374	0.4639294445514679	demonstrat

## 5. Lexrank with tfidf and bert embeddings

In [13]:
# Build an LSH index over the embeddings, train it (num_bits = 32 is passed
# to LSH.train; its exact meaning lives in utils.lsh), then hand the data and
# the index to LexRank.
# NOTE(review): the notebook's import cell brings in `Lexrank` (lower-case r)
# from fast_lexrank, while this cell calls `LexRank` — confirm the name is
# defined elsewhere, otherwise the cell fails on a fresh kernel.
lsh = LSH(embeddingData)
lsh.train(num_bits = 32)
lex = LexRank(embeddingData, lsh)

(123385,)


In [None]:
%%time
# Build the graph that the ranking step runs on. The meaning of
# search_radius and percent is defined by the LexRank class (not shown here).
lex.build_graph(search_radius = 1, percent = 0.05)

In [16]:
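# Disabled check: presumably counts the stored (non-zero) entries per row of the
# graph matrix. The output below comes from an earlier run of this line.
# lex.matrix.getnnz(axis = 1)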

array([  1,   0, 346, ...,   1,   0,  87], dtype=int32)

In [15]:
# Run the ranking step: 100 iterations requested, damping factor 0.85
# (progress is printed every 10 iterations, see the output).
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [16]:
# Ask for 15 sentences; the return value (presumably their ids) is kept in
# sentIds2. The method also prints the extracted sentences, see the output.
sentIds2 = lex.extract_sentences(n_sents = 15)

Extracting sentences.....
0 ,  id:  105661 :  when will we get a prime minister who stands up against trump for british values top q by today
1 ,  id:  108290 :  trumps america is a rogue state the remaining free world nations to impose sanctions before its too late
2 ,  id:  24813 :  at jfk where protesters have totally shut down roads to terminal 4 a cheer erupts as news breaks that the
3 ,  id:  75672 :  farsi speakers needed at sfo see below
4 ,  id:  79361 :  honestly how many people does the uk government deport every single day why would they speak up against the muslimban
5 ,  id:  118736 :  48 of trump voters think airport protesters across the country last weekend were paid to do so by george soros
6 ,  id:  63237 :  why did miami submit to trumps executive order culture
7 ,  id:  63259 :  over 1 million sign u k petition to ban trump from state visit
8 ,  id:  119399 :  48 ppl killed by white terrorists in us while 26 were killed by radical islamists since 911
9 ,  id:  5528

In [19]:
# Compare the number of graph entries stored for two sentence ids
# (105661 is the top-ranked sentence in the extraction output).
for sent_id in (105661, 106760):
    print(len(lex.graph[sent_id]))

3464
5


In [22]:
# Compare the scores stored for the same two sentence ids.
for sent_id in (105661, 106760):
    print(lex.scores[sent_id])

0.00012323896237877713
1.8887369727844597e-06


In [None]:
# Persist the `lex` object to disk so it can be reloaded later.
with open('embedding_lex.pkl', 'wb') as pkl_file:
    pickle.dump(lex, pkl_file)