# Summarization
1. Built-in Textrank with gensim
2. Built-in Lexrank
3. Lexrank with tfidf & LSH
4. Lexrank with sentence embedding & LSH

In [1]:
from numpy.linalg import norm
from fast_pagerank import pagerank
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from fast_lexrank import Lexrank
import numpy as np
from lsh import LSH
import pandas as pd
import scipy
import pickle
import time
import re
import emoji, string

In [2]:
# read data
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head())
print(data.shape)

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI))
# remove stopwords, punctuation
stopWords = stopwords.words('english')
data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWords))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '',  string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

# remove tweets #unique words less than haft of length
data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['uniWPercent']!=0]
# # remove tweets with lengths < 3, duplicates
while data['uniWPercent'].min() <=2:
    data = data[data['uniWPercent'] >2]
    data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
# # # remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.head())

                   Id                                              Tweet  \
0  824941360449015808  emergency rally against trump's muslim travel ...   
1  824941519857610752  theresa may has not apologized to trump for in...   
2  824941616314122240  trump's immigration ban excludes countries wit...   
3  824942056741167105  trump's immigration order expands the definiti...   
4  824942966875774976  alert : senator john mccain threatens action o...   

                                              Tweet1  uniWPercent  
0  emergency rally trumps muslim travel ban nyc 1...           10  
1  theresa may apologized trump insulting fails t...           11  
2  trumps immigration ban excludes countries busi...            9  
3  trumps immigration order expands definition cr...            6  
4  alert senator john mccain threatens action pre...            8  


In [4]:
data.shape

(105175, 4)

In [5]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')
# data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [6]:
data.iloc[21284]

Id                                            825521541756645377
Tweet          if you or someone you know has been affected b...
Tweet1         someone know affected new refugeeimmigrationtr...
uniWPercent                                                   10
Name: 23995, dtype: object

In [7]:
remained_index = data.index

In [8]:
data = data.reset_index(drop=True)

In [9]:
# data[data['Tweet'].str.contains("president trump fires acting attorne")]

In [10]:
print(list(data.iloc[0:10]['Tweet1']))

['emergency rally trumps muslim travel ban nyc 125 5 pm', 'theresa may apologized trump insulting fails today trump send back b', 'trumps immigration ban excludes countries business ties via democracyfor', 'trumps immigration order expands definition criminal', 'alert senator john mccain threatens action president trump', 'kiva still distracted trump gets peoples business', 'ty bailing gmb today piers morgan drank trump kool aid vocal opponent', 'trump sign eo temporary ban suspending visas syria six african countries buildthewall', 'moral obligation stop hitler moral obligation stop trump', 'people getting radicalized trump always hate freedom']


In [11]:
data.shape

(105175, 4)

## 1. Lexrank + lsh + tfidf

In [84]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(105175, 48876)


In [85]:
lsh_tfidf = LSH(tfidfData)
lsh_tfidf.train(num_bits = 8)
lex_tfidf = Lexrank(tfidfData, lsh_tfidf)

(105175,)


In [None]:
%%time
lex_tfidf.build_graph(search_radius = 1, cosine_sim = 0.3)

#buckets: 256
.......Buck: 0, vec: (4405, 48876)


In [17]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [18]:
sentIds = lex.extract_summary(n_sents = 20, cosine_thres=0.5)

Extracting sentences....
Sent scores: 105175
selected one: 62658, 0.00020242707405364153
Sent 80396 is similar to a 62658: 0.7907479705005656
selected one: 13109, 0.00018880050779761126
selected one: 32294, 0.00017162311007124417
selected one: 91027, 0.00015780050458681738
selected one: 16346, 0.00013832732862091305
Sent 62378 is similar to a 32294: 0.800885372372297
selected one: 55536, 0.00012749418907202905
selected one: 22089, 0.0001244623936573965
Sent 10126 is similar to a 13109: 0.8216643750207849
Sent 33510 is similar to a 22089: 0.720695735370632
selected one: 103476, 0.00011871329635948848
selected one: 39127, 0.00011697775526405306
selected one: 41746, 0.00011553935404484163
selected one: 15499, 0.00011492288316361148
Sent 85108 is similar to a 16346: 0.5930587680453396
Sent 61849 is similar to a 62658: 0.672487377445378
Sent 65930 is similar to a 55536: 0.932304152068153
Sent 47447 is similar to a 62658: 0.7400724194755225
selected one: 71398, 0.00010798483103909025
selecte

In [19]:
print("Id", "#adjacentEdges", "lexrank")
for i, idx in enumerate(sentIds):
    print(i, len(lex.graph[idx]), lex.scores[idx])

Id Index lexrank
0 356 0.00020242707405364153
1 342 0.00018880050779761126
2 215 0.00017162311007124417
3 159 0.00015780050458681738
4 259 0.00013832732862091305
5 149 0.00012749418907202905
6 258 0.0001244623936573965
7 102 0.00011871329635948848
8 228 0.00011697775526405306
9 95 0.00011553935404484163
10 149 0.00011492288316361148
11 149 0.00010798483103909025
12 147 0.00010697164164957916
13 475 0.0001059602037582633
14 158 0.00010534054644177753
15 250 0.00010525056852056679
16 129 9.781553837955189e-05
17 103 9.574184662715573e-05
18 105 9.462894645832923e-05
19 78 9.19887782750149e-05


In [20]:
for i, idx in enumerate(sentIds):
    print(i, data.iloc[idx]['Tweet'])

0  on executive order immigration 
1 green card holders 
2 ban all muslim countries 
3 what do people in the middle east think about terrorism ? for more on the middle east visit http …
4 our statement on president trump’s executive order on immigration : 
5  make america great . again ?
6  from saudi arabia . #trump's ban doesn't include saudi arabia .
7 keep america safe president trump keep the ban on muslim refugees , keep them out and keep us safe . 
8 my statement on president trump's executive order on refugees : 
9  it is a religion ban trump
10 protest at the airport !!! #muslimban 
11 the president of the united states fires the attorney general of the united states .
12  are you from any of the 7 banned countries ? what are you on about ?
13 trump's state visit to the uk :
14 it's not a muslim ban . it's a ban from muslim countries where trump doesn't do business . 
15 trump signs executive order on ‘ extreme vetting ’ 
16 jfk protest . #muslimban #nobannowall 
17 it's a tem

In [21]:
with open('saved_models/tfidf_model.pkl', "wb") as f:
    pickle.dump(lex, f)

#####

* <b>Sub-events captured:</b>
    1. green card orders
    2. protest at jfk airport
    3. attorney general get fired
    4. ban 7 countries, muslim countries
    5. trump's ban doesn't include saudi arabia .
    6. trump visit uk
    7. executive order on ‘ extreme vetting ’ 
* <b>Lack</b>:
    1. starbuck hires 10K refugees
    2. trump’s deportation orders 
    3. washington state will sue to stop trump's immigration
    4. canada will accept the refugees 
    5. quebec city mosque shooting 


## 2. Lexrank with sentEmbeddings and LSH

In [41]:
#load embeddings
with open('/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)
print(len(embeddings))

123385


In [35]:
embeddings[0].shape
count = 0
list = []
for i in range(1000):
    cos_sim = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[i].reshape(1, -1))
    print(cos_sim)
    
    if cos_sim >0.65:
        count+=1
        list.append(i)
print("Number of sentences that are similar with the first sen:", count)

[[1.]]
[[0.47645986]]
[[0.55713235]]
[[0.36821206]]
[[0.56801199]]
[[0.28415421]]
[[0.579572]]
[[0.61550595]]
[[0.32550028]]
[[0.3986782]]
[[0.46043204]]
[[0.46043204]]
[[0.46043204]]
[[0.46043204]]
[[0.46043204]]
[[0.42543658]]
[[0.54272559]]
[[0.39507733]]
[[0.30051262]]
[[0.48724031]]
[[0.40539427]]
[[0.51602106]]
[[0.60498151]]
[[0.41438888]]
[[0.52208378]]
[[0.5208903]]
[[0.43861775]]
[[0.57033993]]
[[0.43334224]]
[[0.50714861]]
[[0.54170544]]
[[0.50465723]]
[[0.59245738]]
[[0.56580981]]
[[0.35551427]]
[[0.53469864]]
[[0.55558165]]
[[0.61901905]]
[[0.41438888]]
[[0.49951332]]
[[0.58661521]]
[[0.6123305]]
[[0.38146998]]
[[0.46043204]]
[[0.5397194]]
[[0.50190059]]
[[0.55811953]]
[[0.41143669]]
[[0.44242019]]
[[0.61901905]]
[[0.61901905]]
[[0.3934684]]
[[0.39528469]]
[[0.49174227]]
[[0.47154915]]
[[0.55849551]]
[[0.57376554]]
[[0.52475725]]
[[0.31449625]]
[[0.31449625]]
[[0.31449625]]
[[0.46812818]]
[[0.44458279]]
[[0.50848147]]
[[0.42916242]]
[[0.46043204]]
[[0.46043204]]
[[0.437145

[[0.40194937]]
[[0.43110217]]
[[0.46314861]]
[[0.58674055]]
[[0.39973636]]
[[0.44279201]]
[[0.55467119]]
[[0.53593051]]
[[0.58033127]]
[[0.4441936]]
[[0.47125199]]
[[0.46447765]]
[[0.53427709]]
[[0.36991663]]
[[0.33392634]]
[[0.45792363]]
[[0.36762072]]
[[0.36854702]]
[[0.51165132]]
[[0.58500041]]
[[0.47293453]]
[[0.38445941]]
[[0.43524983]]
[[0.60180543]]
[[0.39516756]]
[[0.60724664]]
[[0.39056568]]
[[0.54069327]]
[[0.47499641]]
[[0.49205983]]
[[0.43036735]]
[[0.50262662]]
[[0.51801518]]
[[0.40101018]]
[[0.5611494]]
[[0.20888474]]
[[0.57373868]]
[[0.38978648]]
[[0.45966654]]
[[0.36654749]]
[[0.39972167]]
[[0.56758638]]
[[0.49962438]]
[[0.45644809]]
[[0.33928732]]
[[0.52883464]]
[[0.55494664]]
[[0.45955047]]
[[0.28206958]]
[[0.4707475]]
[[0.48910634]]
[[0.55579018]]
[[0.34372307]]
[[0.51366078]]
[[0.28566875]]
[[0.37533744]]
[[0.60276693]]
[[0.28629141]]
[[0.49491452]]
[[0.4026682]]
[[0.58374512]]
[[0.45128816]]
[[0.45264705]]
[[0.47585259]]
[[0.28311721]]
[[0.50611512]]
[[0.6806408]]


In [42]:
sentenceEmbs = np.array(embeddings)
print(sentenceEmbs.shape)

(123385, 768)


In [43]:
sentenceEmbs = sentenceEmbs[remained_index]

In [44]:
sentenceEmbs.shape

(105175, 768)

In [46]:
lsh = LSH(sentenceEmbs)
lsh.train(num_bits=14)
table = lsh.model['table']

(105175,)


In [47]:
count = 0
for key, value in table.items():
    print(key, len(value))
    if len(value) > 150:
        count+=1
print(len(table))
print("#buckets with >150 sens: ", count)

11384 28
10973 16
1111 60
2762 42
1754 185
9589 2
1048 607
217 83
3230 160
1678 23
8344 85
1082 190
11996 133
725 55
10888 15
85 152
11992 118
1681 118
119 13
23 68
9370 89
9745 33
3157 102
8922 40
8849 17
1143 62
9809 46
1162 146
3226 228
1210 217
3774 77
2072 207
88 555
1339 34
11994 55
1330 45
3772 125
3608 249
2170 18
1290 16
1691 175
732 186
10388 10
650 74
600 328
152 580
1621 52
1298 148
1694 124
11740 10
11486 67
9434 57
16 204
89 165
3742 183
8312 5
10972 123
2248 61
149 156
2184 45
2266 151
1112 693
11338 6
1079 26
2141 58
3096 199
11004 17
11931 7
11484 66
20 41
1682 105
1736 49
11928 126
2260 69
50 17
11386 20
1114 335
602 99
2790 1
1331 47
11934 60
9371 30
1051 249
3245 1
3154 38
1245 106
11930 63
11420 60
3282 10
410 205
1624 431
1738 33
1720 188
1065 7
2776 416
1224 40
11485 21
551 1
11356 31
7304 5
72 75
11512 19
542 23
2780 380
1147 31
2264 295
1117 159
728 489
3102 72
2252 40
136 121
1598 11
90 179
1047 57
29 51
1434 471
3690 30
1692 246
3740 339
1178 692
1756 183
891

3483 13
12122 2
1322 11
7610 8
9878 6
1140 24
10892 15
7822 7
318 2
1000 6
10296 7
3161 46
1442 14
8824 14
1669 2
1960 11
11350 2
663 33
8540 7
2242 7
5267 11
683 5
8862 13
2172 27
3762 6
529 57
11000 26
2576 42
5146 10
1548 11
1060 1
522 27
681 6
10702 1
8636 12
2164 17
184 60
9978 25
11348 9
499 2
2713 18
2491 3
721 45
595 8
3519 15
6837 2
12280 6
8913 12
15290 1
1024 4
3737 66
2670 3
104 19
1362 40
10744 1
884 1
3223 41
3997 12
122 20
1338 73
731 22
11377 5
520 45
729 64
2684 31
9937 12
8408 117
9682 9
16120 1
14008 4
3596 27
338 39
9490 24
1174 31
4602 4
2458 49
114 13
5806 3
8657 7
11677 4
538 65
1456 31
4504 7
920 71
3481 7
6018 1
1139 70
3796 22
2102 11
9844 3
2003 2
11711 1
11410 2
112 29
539 47
2452 18
2190 20
1528 28
8729 15
211 32
3032 21
9850 16
4187 5
5245 2
526 6
1744 54
1721 37
2618 15
347 14
1424 84
11510 4
8846 1
10325 22
1052 156
2613 20
9050 8
8720 21
9930 15
336 30
1026 24
156 154
12764 3
11162 3
952 14
8341 35
11476 12
1394 16
11358 18
2379 1
9369 34
1152 6
9400 35

2611 2
10352 5
6894 3
371 8
2394 16
9619 14
1206 5
1941 12
9915 14
2015 3
10134 3
22 12
14262 1
3312 4
10865 2
2896 2
3347 4
754 2
10014 3
8670 11
9176 14
14300 1
111 3
3540 4
11447 2
9875 8
10902 5
6407 3
5771 9
9686 1
9247 12
2480 4
386 17
8309 5
4745 2
3725 6
10484 2
3088 35
4107 3
1067 11
2770 8
3803 14
1437 20
11857 10
2089 11
3451 4
11775 3
1230 8
1595 39
6024 3
597 39
2156 25
9554 9
8916 15
8855 3
3900 3
6878 6
6684 6
12124 1
3317 14
12010 6
2671 1
1203 48
2838 1
2004 3
1283 10
13660 1
2604 11
9744 30
1532 7
2642 10
9832 2
7656 2
11801 12
180 10
1577 4
369 5
9427 6
9557 5
6164 7
3222 18
9364 18
100 1
8864 1
10847 3
14298 1
6040 5
9596 4
11921 4
3541 1
488 7
11704 15
16316 6
6392 2
7903 5
6796 7
4239 6
3028 2
3379 4
10257 2
4570 1
6872 13
7722 1
8654 4
3600 22
4062 15
8530 4
11378 4
6038 1
1392 8
1802 5
9813 9
4247 7
4027 2
13960 1
3641 30
9906 5
2223 2
11984 7
8040 1
1843 3
6794 1
108 5
2990 4
5212 2
7326 13
1011 1
8599 6
9183 4
6156 4
10074 4
3826 3
10128 18
5387 3
8404 11
4699

9608 1
4809 1
3928 7
3037 2
2544 4
1809 6
1542 2
11948 4
11644 4
15576 2
8335 2
10216 2
16031 5
2418 4
4426 1
12112 2
3766 4
10987 1
1455 1
690 4
1883 5
8249 4
6792 4
1523 6
2314 11
11321 18
6362 3
10652 3
818 1
1225 2
3185 8
4025 2
10716 4
11882 4
5518 2
3921 1
2105 8
11658 4
8832 4
4688 1
6654 3
2475 2
5663 2
11312 2
4808 5
11927 4
10190 4
9932 5
10390 5
14486 1
39 2
5304 11
9768 8
4125 2
779 1
5062 1
2271 12
6288 1
7562 2
2090 11
11224 3
3411 3
47 3
5662 1
15132 2
4153 1
6301 2
2582 11
9748 4
5783 3
4488 5
1979 7
11530 1
13809 1
12728 3
6856 4
8724 5
7825 1
9567 13
9550 3
10478 1
5264 2
7626 2
11647 5
10651 1
129 3
1872 8
2735 1
5149 2
11066 1
3956 1
4270 1
2364 2
4630 1
7402 3
4540 4
10150 1
3243 3
2807 2
518 3
6638 2
10012 1
4476 1
872 1
3215 8
1935 1
2600 16
11804 10
10129 6
7519 2
15920 2
9482 4
8468 3
6662 1
4749 2
5915 2
3444 1
7355 5
2478 5
9812 2
2378 3
1099 5
4102 1
3636 3
4491 1
9263 1
309 4
2555 2
1815 2
3638 5
9834 1
7598 4
13586 1
7196 4
9521 2
1264 8
2167 11
10005 1
39

15607 1
5214 1
10236 4
9399 2
8298 1
10013 1
16351 1
10437 1
11325 1
8434 1
5054 3
1391 2
8660 4
637 4
496 1
78 3
8915 2
7979 1
14474 1
10305 1
486 2
15956 1
5760 1
5753 2
2861 1
3108 4
12442 1
1541 1
1231 1
12307 1
1961 1
2158 6
4095 2
4085 1
1800 1
15948 1
4697 3
5773 3
4014 8
9612 2
6559 1
11391 4
4721 1
10229 2
5015 1
7752 1
12266 2
11963 3
9788 4
2519 3
8927 7
9333 2
1661 28
574 4
3321 4
3359 7
5305 2
2289 1
2421 2
6623 1
6478 1
7382 2
3719 2
11128 1
8156 2
6552 3
5137 2
5884 4
5823 2
12446 1
4043 1
617 3
2603 1
2444 4
11467 1
9227 3
5298 2
5023 2
14227 2
2057 2
3501 2
2688 1
6678 1
8317 1
7884 2
9344 1
895 1
6926 1
7166 1
2210 1
4778 6
11548 3
2030 1
13275 1
3581 3
6065 1
6276 1
5982 2
2958 4
420 1
3848 1
10289 1
15508 1
8369 1
9395 2
7116 1
10353 1
9150 1
15583 1
14226 1
5682 1
6582 3
433 2
6749 1
12246 1
5515 3
7401 1
8894 4
2299 2
4593 1
1853 3
8506 2
7080 2
15930 2
10698 1
3513 2
679 2
15069 1
191 6
5948 2
3791 2
3573 2
13404 1
2080 1
2420 3
6268 1
4787 1
11798 1
271 3
1642 4

3532 1
8195 1
6591 2
687 1
1611 1
6861 1
7658 1
2793 1
11764 1
1217 1
1609 1
5624 1
4380 1
4526 1
4339 1
1812 1
1261 1
3440 1
9854 1
6583 1
10621 1
2788 1
2059 1
12370 1
8821 1
205 1
11306 1
11572 1
3979 1
4390 1
11101 1
7596 1
8192 1
6981
#buckets with >150 sens:  120


In [48]:
lex = Lexrank(sentenceEmbs, lsh)

In [49]:
%%time
lex.build_graph(search_radius = 1, percent=0.03)

#buckets: 6954
.......Buck: 0, vec: (336, 768)
.......Buck: 100, vec: (35, 768)
.......Buck: 200, vec: (1875, 768)
.......Buck: 300, vec: (264, 768)
.......Buck: 400, vec: (121, 768)
.......Buck: 500, vec: (80, 768)
.......Buck: 600, vec: (234, 768)
.......Buck: 700, vec: (8, 768)
.......Buck: 800, vec: (2141, 768)
.......Buck: 900, vec: (171, 768)
.......Buck: 1000, vec: (1060, 768)
.......Buck: 1100, vec: (114, 768)
.......Buck: 1200, vec: (155, 768)
.......Buck: 1300, vec: (412, 768)
.......Buck: 1400, vec: (1287, 768)
.......Buck: 1500, vec: (54, 768)
.......Buck: 1600, vec: (148, 768)
.......Buck: 1700, vec: (31, 768)
.......Buck: 1800, vec: (187, 768)
.......Buck: 1900, vec: (290, 768)
.......Buck: 2000, vec: (285, 768)
.......Buck: 2100, vec: (150, 768)
.......Buck: 2200, vec: (537, 768)
.......Buck: 2300, vec: (176, 768)
.......Buck: 2400, vec: (295, 768)
.......Buck: 2500, vec: (155, 768)
.......Buck: 2600, vec: (63, 768)
.......Buck: 2700, vec: (74, 768)
.......Buck: 2800, ve

In [50]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [70]:
data[data['Tweet'].str.contains('green card')]

Unnamed: 0,Id,Tweet,Tweet1,uniWPercent
1216,825064765261176832,company sent out a notice about trump's muslim...,company sent notice trumps muslim ban green ca...,10
2323,825137645504172032,visas being denied immediately . chaos at airp...,visas denied immediately chaos airports air mu...,13
2419,825141021906382848,"ban applies if you have a visa , green card , ...",ban applies visa green card even dual citizen ...,9
2609,825147334342303745,current concern : what about people with green...,current concern people green cards currently a...,13
2755,825152564652081157,many elderly come to green card interview with...,many elderly come green card interview suit ti...,12
...,...,...,...,...
103920,827623785662644224,my uncle from syria needs a green card any tak...,uncle syria needs green card takers,6
104019,827626771994533889,why is tomi defending the rights of milo ( a ...,tomi defending rights milo non american rights...,12
104760,827658820658851840,green card coming soon !,green card coming soon,4
104876,827664285862129668,alert : dos issues stmnt clarifying ban dn app...,alert dos issues stmnt clarifying ban dn apply...,18


In [82]:
data.iloc[2146]

Id                                            825128489326088196
Tweet          trump signs executive order on ‘ extreme vetti...
Tweet1               trump signs executive order extreme vetting
uniWPercent                                                    6
Name: 2146, dtype: object

In [83]:
lsh.model[]

214

In [60]:
# 14 bins
sentIds = lex.extract_summary(n_sents = 20, cosine_thres = 0.89, max_sent=500)

Extracting sentences....
Sent scores: 105175
selected one: 42773, 0.00010440142035777392
selected one: 70826, 0.00010220440798520725
selected one: 72679, 9.952317010832082e-05
Sent 67230 is similar to a 70826: 0.9897810275772859
selected one: 23584, 9.790741698338028e-05
Sent 66616 is similar to a 70826: 0.9818513941126814
Sent 65594 is similar to a 23584: 0.8982108068636003
selected one: 15323, 9.108063279018107e-05
selected one: 31600, 8.952282977429021e-05
selected one: 2754, 8.94612728374494e-05
selected one: 38737, 8.916163568577142e-05
Sent 96348 is similar to a 70826: 0.9162692941825311
selected one: 15006, 8.773176537896183e-05
selected one: 34043, 8.760762320177292e-05
selected one: 59451, 8.726155740041375e-05
selected one: 9399, 8.668920580458683e-05
selected one: 49222, 8.600378997886487e-05
Sent 76628 is similar to a 23584: 0.8911665337519232
Sent 101509 is similar to a 42773: 0.8944038170610407
Sent 9573 is similar to a 70826: 0.9093985959365581
Sent 6261 is similar to a 

In [61]:
for i, idx in enumerate(sentIds):
    print("{}. {}".format(i, data.iloc[idx]['Tweet']))

0. " this policy is going to get americans killed : " sen. chris murphy on trump's refugee ban • #antitrumpmvmt …
1. us muslim leaders sue trump over ' fear-mongering ' travel ban via …
2. pragmactivist : trump fired ag #sallyyates for enforcing laws and #resist ing a #muslimban . will he fire seanspi … 
3.  remove trump ! commit/lock trump away ! aclu exec director anthony romero comes out aclu just argued won block trump's muslim ban
4. breaking - #cair planning lawsuit against trump over #muslimban i guess ur lawyers don't know a president is immune ? https :/ …
5. if muslims cause further bloodshed to americans it will be on the hands of the federal judge that blocked the trump enforcement order ! #maga
6. trump blasted by top democrat for imposing a religious test on fleeing refugees via …
7. thr : ' we can't keep pretending ' conway defends trump immigration ban , rips media ' a new one ' for bias …
8. tech finds its voice , opposing trump’s muslim ban : " so un-american , it pai

In [57]:
print("Id", "Index", "lexrank")
for i, idx in enumerate(sentIds):
    print(i, len(lex.graph[idx]), lex.scores[idx])

Id Index lexrank
0 3382 0.00010440142035777392
1 3538 0.00010220440798520725
2 3442 9.952317010832082e-05
3 3391 9.790741698338028e-05
4 3227 9.367025901136787e-05
5 3189 9.108063279018107e-05
6 3096 8.952282977429021e-05
7 3014 8.94612728374494e-05
8 3141 8.916163568577142e-05
9 2817 8.773176537896183e-05
10 2833 8.760762320177292e-05
11 2853 8.726155740041375e-05
12 3063 8.668920580458683e-05
13 2805 8.600378997886487e-05
14 2703 8.509400810913653e-05
15 2810 8.270572045910686e-05
16 2757 8.257767461424093e-05
17 2617 8.183067572493015e-05
18 2689 8.029157895416175e-05
19 2798 8.02714257585991e-05


In [78]:
len(lex.graph[43711])

1205

## Combination of LSH with tfidf and lex rank with sent embeddings

In [29]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(105175, 48876)


In [31]:
lsh = LSH(tfidfData)
lsh.train(num_bits = 8)
lex = LexRank(sentenceEmbs, lsh)

(105175,)


NameError: name 'LexRank' is not defined

In [30]:
for key, value in table.items():
    print(key, len(value))
print(len(table))

11384 28
10973 16
1111 60
2762 42
1754 185
9589 2
1048 607
217 83
3230 160
1678 23
8344 85
1082 190
11996 133
725 55
10888 15
85 152
11992 118
1681 118
119 13
23 68
9370 89
9745 33
3157 102
8922 40
8849 17
1143 62
9809 46
1162 146
3226 228
1210 217
3774 77
2072 207
88 555
1339 34
11994 55
1330 45
3772 125
3608 249
2170 18
1290 16
1691 175
732 186
10388 10
650 74
600 328
152 580
1621 52
1298 148
1694 124
11740 10
11486 67
9434 57
16 204
89 165
3742 183
8312 5
10972 123
2248 61
149 156
2184 45
2266 151
1112 693
11338 6
1079 26
2141 58
3096 199
11004 17
11931 7
11484 66
20 41
1682 105
1736 49
11928 126
2260 69
50 17
11386 20
1114 335
602 99
2790 1
1331 47
11934 60
9371 30
1051 249
3245 1
3154 38
1245 106
11930 63
11420 60
3282 10
410 205
1624 431
1738 33
1720 188
1065 7
2776 417
1224 40
11485 21
551 1
11356 31
7304 5
72 75
11512 19
542 23
2780 380
1147 31
2264 295
1117 159
728 488
3102 72
2252 40
136 121
1598 11
90 179
1047 57
29 51
1434 471
3690 30
1692 246
3740 339
1178 692
1756 183
891

112 29
539 47
2452 18
2190 20
1528 28
8729 15
211 32
3032 21
9850 16
4187 5
5245 2
526 6
1744 54
1721 37
2618 15
347 14
1424 84
11510 4
8846 1
10325 22
1052 156
2613 20
9050 8
8720 21
9930 15
336 30
1026 24
156 154
12764 3
11162 3
952 14
8341 35
11476 12
1394 16
11358 18
2379 1
9369 34
1152 6
9400 35
6087 1
2506 8
468 4
8840 21
9438 24
10394 11
2577 17
1552 126
1726 44
10714 4
3986 7
3125 25
9688 37
3794 6
3318 12
1527 7
2442 14
682 20
1130 12
2229 11
351 7
9322 1
12028 31
15614 3
8785 23
598 6
4751 3
11869 9
3304 13
6862 6
7310 4
4245 5
439 2
649 23
10909 18
387 16
2042 20
3252 10
6685 6
8221 11
656 101
12110 2
1563 85
1742 7
8464 20
2970 15
1104 217
3797 42
64 8
3370 8
9867 3
794 8
15230 1
2492 18
2650 57
11438 2
2774 7
7870 7
370 8
5403 6
392 19
5290 10
8475 4
115 26
10676 3
16121 2
339 20
2206 70
5534 18
1686 29
9407 2
2527 3
2334 11
8407 7
98 1
691 4
1237 47
1100 4
5127 1
5139 12
2126 9
16366 1
8284 25
223 33
8843 3
6904 5
7060 1
2570 15
11381 9
9898 17
1599 14
727 6
3078 6
4634 3

10900 10
2668 8
8440 17
2504 8
5853 4
1122 1
1808 14
2711 27
10200 37
6165 7
2430 7
11795 2
572 8
2153 11
3387 8
2137 22
6106 6
8339 7
13 5
2815 4
6526 2
1664 9
134 8
10195 1
11578 7
9782 1
3227 47
3607 6
2185 1
1589 14
915 4
5819 6
6204 9
9610 9
11835 2
2117 1
1329 11
12634 3
2775 10
1906 4
3120 14
11417 11
623 1
10361 3
1093 1
12703 4
11786 2
9654 2
10387 2
3121 4
6618 2
10453 18
9644 2
8384 3
10778 8
9969 2
3131 32
8248 11
9631 9
703 8
1415 4
7711 3
4106 1
10474 1
10000 8
2898 1
748 3
4023 3
2564 1
1851 6
565 9
378 4
10684 6
5522 10
6236 12
11946 7
3703 9
1153 5
10961 3
15870 2
8624 9
10640 1
4189 6
6701 1
9940 9
9116 8
11678 10
978 7
1358 3
9107 3
2800 7
12191 3
7554 2
5567 6
467 6
9977 6
4248 9
2975 3
9616 23
852 2
7305 2
1354 6
141 6
674 2
9651 6
2009 7
477 3
8334 4
10327 3
8874 5
7901 3
4616 3
907 2
9234 8
9164 1
4496 2
8260 2
6366 7
2542 4
9598 3
3801 28
1788 28
2745 4
9650 11
7796 1
2805 7
10360 11
3992 27
1848 15
2532 1
8857 11
2834 3
9592 7
10201 2
8202 5
4442 1
11285 5
1327

8223 6
10071 1
3566 8
1489 11
10781 5
7148 1
8783 1
4795 1
10894 5
2212 1
4241 2
11050 1
10459 2
5466 3
2484 9
5706 1
9938 11
3077 2
10647 2
10056 1
15547 2
15626 1
15896 1
6865 1
8199 2
7773 3
2358 4
5129 1
5532 4
6029 2
2500 1
7450 4
13337 2
4283 3
12236 1
6384 1
8473 2
10553 1
4682 1
2498 1
253 9
9390 3
5661 5
9493 3
13327 1
1923 1
1696 2
7134 1
6282 2
11929 10
7319 7
5307 7
14188 1
9820 6
9532 4
5177 7
11140 1
2157 5
13912 2
16088 2
9036 1
544 2
8968 1
5898 1
5626 2
3988 4
9439 14
7839 6
3983 1
4936 1
11981 3
2988 2
7163 1
10262 2
6326 4
11703 1
2175 2
441 4
11978 8
569 9
436 5
13744 1
4111 2
8714 1
9393 5
3562 9
942 5
14908 2
15550 3
1250 1
15834 1
3999 5
7223 1
825 1
630 1
9942 1
1582 4
12094 3
10556 1
2403 1
1614 3
2747 3
7608 3
9292 2
5436 2
281 6
4375 3
6188 2
13012 1
1281 1
8735 2
3648 1
6528 1
5406 3
4888 2
9738 6
5270 1
9100 2
7692 2
6173 4
10368 2
11848 4
14360 1
8436 1
10990 3
2699 2
7350 1
1741 1
385 1
35 2
12240 4
1313 1
7706 2
709 2
2189 5
12940 2
4408 3
10312 4
396 4


331 2
14495 2
3586 1
5179 3
7426 2
14029 1
4617 2
305 1
1346 2
6180 1
2675 1
6568 1
10205 3
15679 1
6034 2
2417 1
10788 1
5809 2
13272 2
9994 2
8909 1
8397 2
14748 3
14484 2
7861 1
2876 1
5020 2
741 2
3886 1
384 1
1899 1
883 1
11594 1
9723 2
1003 1
11852 3
6235 1
2227 1
11695 1
8727 1
1387 2
4798 2
10156 1
10231 1
7867 2
4993 1
9739 2
4895 1
4063 2
7824 2
3759 1
13049 1
302 1
14798 1
2159 2
3209 2
8499 1
8500 1
11787 1
7325 2
7831 1
8828 2
10655 1
6264 2
6199 6
7696 2
2878 1
12107 1
7346 1
15513 1
7092 1
14352 1
13718 2
7288 1
13791 1
3072 1
7225 2
4573 1
495 1
6351 1
13692 1
9983 1
1475 1
13981 2
3616 2
6621 1
15382 1
5141 1
11434 3
5334 1
6450 1
7448 2
6192 2
14366 1
4371 1
7230 7
1030 8
10897 3
10963 1
7185 2
4097 1
550 1
7208 2
7283 1
4424 1
2562 2
8244 2
10792 3
7548 1
14584 1
10451 1
3349 3
1804 1
3575 2
946 1
7639 1
14076 1
9890 1
820 1
6604 2
445 2
6664 1
11560 1
4822 1
11897 2
5082 1
1529 4
10992 1
8087 1
6764 2
3835 2
6902 1
7150 1
1838 2
2950 1
3081 2
2061 2
4830 1
4842 1
31

In [None]:
%%time
lex.build_graph(search_radius = 1, percent=0.01)

In [None]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

In [None]:
sentIds = lex.extract_sentences(n_sents = 20, cosThres = 0.85)

In [None]:
sentIds

In [None]:
for i, idx in enumerate(sentIds):
    print(i, len(lex.graph[idx]))

# Test: 

In [107]:
tfidfData.shape

(105176, 48876)

In [108]:
tfidfCosine = cosine_similarity(tfidfData[0], tfidfData)

In [109]:
# extract 10 sentence with highest similarity
indices = np.argpartition(tfidfCosine[0], -10)[-10:]
elements = tfidfCosine[0][indices]
print("Indices: {}".format(indices))
print("Cosine values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i in indices:
    print("{}\t{}\t{}".format(i, tfidfCosine[0][i], str(data.iloc[i]['Tweet1'])))

Indices: [69002 89160 45071 97336  8934 47837 23965 97958 79194     0]
Cosine values: [0.29885418 0.31449294 0.31054197 0.32932623 0.3548277  0.37053829
 0.35177915 0.32968794 0.33406313 1.        ]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
69002	0.29885417674875303	watch live protesters gather rally hate nyc response trumps refugee ban
89160	0.314492944844012	google workers rally trumps travel ban
45071	0.3105419662113536	many american flags todays nyc rally trumps muslim ban america resist htt
97336	0.3293262319915567	defend immigrants trump rally nyc 2 pm today near nyu nobannowall
8934	0.35482769835646444	protesters rally trumps muslim immigration ban
47837	0.3705382876374108	senator chuck schumer slams trumps travel ban nyc rally
23965	0.35177914692132783	blocks call trumps unconstitutional muslim ban comes emergency hearing nyc
97958	0.32968793637635174	judge extends emergency stay blocking trumps travel ban via newyork nyc
79194	0.334063133607824	know friday

In [110]:
embCosine = cosine_similarity(sentenceEmbs[0].reshape(1, -1), sentenceEmbs)
# extract 10 sentence with highest similarity
indices = np.argpartition(embCosine[0], -10)[-10:]
elements = embCosine[0][indices]
print("Indices: {}".format(indices))
print("Cosine values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i in indices:
    print("{}\t{}\t{}".format(i, embCosine[0][i], str(data.iloc[i]['Tweet1'])))

Indices: [ 1279 44958 40096 52607 37999 52571 25520 97778 90082     0]
Cosine values: [0.8552602  0.85747457 0.8578343  0.87147146 0.87335134 0.8906126
 0.897992   0.8847035  0.8772365  1.        ]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
1279	0.8552601933479309	protest today trump dapl downtown 5pm 611 woodward
44958	0.8574745655059814	seattle protest muslimban tomorrow westlake park 5pm
40096	0.8578342795372009	emergency demo york tomorrow 5pm st helens square protest muslimban uk complicity mu
52607	0.8714714646339417	morning half 5 today protest trumps muslim ban hopefully see ️️
37999	0.8733513355255127	cambridge friends rally 5pm tomorrow gsm trumps muslimban uk govts complicity whos coming ht
52571	0.8906126022338867	seattle join us 5pm westlake park protest president trumps immigration refugee ban
25520	0.8979920148849487	protest muslimban tomorrow dallas city hall 5 pm
97778	0.884703516960144	west hollywood holding antitrump rally today 5pm pst ill defend

In [111]:
for i in indices:
    print("{}\t{}\t{}".format(i, cosine_similarity(sentenceEmbs[0].reshape(1, -1), sentenceEmbs[i].reshape(1, -1)), str(data.iloc[i]['Tweet1'])))

1279	[[0.85526013]]	protest today trump dapl downtown 5pm 611 woodward
44958	[[0.85747457]]	seattle protest muslimban tomorrow westlake park 5pm
40096	[[0.8578342]]	emergency demo york tomorrow 5pm st helens square protest muslimban uk complicity mu
52607	[[0.8714714]]	morning half 5 today protest trumps muslim ban hopefully see ️️
37999	[[0.87335134]]	cambridge friends rally 5pm tomorrow gsm trumps muslimban uk govts complicity whos coming ht
52571	[[0.8906126]]	seattle join us 5pm westlake park protest president trumps immigration refugee ban
25520	[[0.897992]]	protest muslimban tomorrow dallas city hall 5 pm
97778	[[0.8847035]]	west hollywood holding antitrump rally today 5pm pst ill defend great president
90082	[[0.87723655]]	today 5pm est take questions president trumps immigration ban response new york
0	[[1.]]	emergency rally trumps muslim travel ban nyc 125 5 pm


In [116]:
sents = sentenceEmbs[indices]
print(sents.shape)

(10, 768)


In [117]:
num = np.dot(sents, sents.T)
            
if scipy.sparse.issparse(sents):
    magnitude = norm(sents.toarray(), axis = 1)
else:
    magnitude = norm(sents, axis = 1)

den = np.dot(magnitude.reshape(-1, 1), magnitude.T.reshape(1, -1))


cosine_matrix = np.array(num/den)

(10, 10)


In [118]:
cosine_matrix[9]

array([0.8552605 , 0.85747445, 0.85783404, 0.8714711 , 0.8733514 ,
       0.8906124 , 0.89799184, 0.88470346, 0.87723655, 0.9999999 ],
      dtype=float32)

## 5. Lexrank with tfidf and bert embeddings

In [13]:
lsh = LSH(embeddingData)
lsh.train(num_bits = 32)
lex = LexRank(embeddingData, lsh)

(123385,)


In [None]:
%%time
lex.build_graph(search_radius = 1, percent = 0.05)

In [16]:
# lex.matrix.getnnz(axis = 1)

array([  1,   0, 346, ...,   1,   0,  87], dtype=int32)

In [15]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [16]:
sentIds2 = lex.extract_sentences(n_sents = 15)

Extracting sentences.....
0 ,  id:  105661 :  when will we get a prime minister who stands up against trump for british values top q by today
1 ,  id:  108290 :  trumps america is a rogue state the remaining free world nations to impose sanctions before its too late
2 ,  id:  24813 :  at jfk where protesters have totally shut down roads to terminal 4 a cheer erupts as news breaks that the
3 ,  id:  75672 :  farsi speakers needed at sfo see below
4 ,  id:  79361 :  honestly how many people does the uk government deport every single day why would they speak up against the muslimban
5 ,  id:  118736 :  48 of trump voters think airport protesters across the country last weekend were paid to do so by george soros
6 ,  id:  63237 :  why did miami submit to trumps executive order culture
7 ,  id:  63259 :  over 1 million sign u k petition to ban trump from state visit
8 ,  id:  119399 :  48 ppl killed by white terrorists in us while 26 were killed by radical islamists since 911
9 ,  id:  5528

In [19]:
print(len(lex.graph[105661]))
print(len(lex.graph[106760]))

3464
5


In [22]:
print(lex.scores[105661])
print(lex.scores[106760])

0.00012323896237877713
1.8887369727844597e-06


In [23]:
lex.graph[105661]

{105583: 0.97628,
 57303: 0.97627753,
 67104: 0.97632,
 10301: 0.9763286,
 31791: 0.9763022,
 43892: 0.9762989,
 50161: 0.9764722,
 53830: 0.976532,
 41023: 0.9764174,
 46918: 0.9765652,
 47171: 0.9765777,
 39246: 0.9765649,
 55682: 0.9763928,
 66840: 0.97648364,
 99827: 0.97657996,
 118794: 0.9763814,
 90836: 0.97684056,
 12303: 0.976931,
 18219: 0.9766817,
 74522: 0.976638,
 5131: 0.97694504,
 76507: 0.9769491,
 14629: 0.9768063,
 29746: 0.9816727,
 113268: 0.97800463,
 17582: 0.97975427,
 39416: 0.97879833,
 30186: 0.97864956,
 13449: 0.97726583,
 123130: 0.98138463,
 64841: 0.982156,
 29567: 0.9792158,
 123296: 0.97778106,
 79791: 0.9867069,
 81831: 0.97874546,
 38027: 0.979946,
 44901: 0.98216265,
 29266: 0.9771631,
 90241: 0.9800799,
 79415: 0.97824323,
 22204: 0.980727,
 17229: 0.9787729,
 30901: 0.9806733,
 41087: 0.9839976,
 21952: 0.97744894,
 122933: 0.97750103,
 6392: 0.9783072,
 79364: 0.9871277,
 12152: 0.978695,
 109107: 0.9776435,
 41223: 0.97880656,
 11196: 0.97715986,

In [26]:
list(lex.graph[105583].keys())

[105661,
 79364,
 83836,
 84909,
 113268,
 109107,
 68403,
 21779,
 21856,
 66034,
 74068,
 75674,
 114106,
 121686,
 77974,
 30186,
 29567,
 79791,
 29266,
 78870,
 11203,
 15640,
 63832,
 77513,
 25830,
 10582,
 25681,
 77394,
 11319,
 32230,
 113082,
 24814,
 23210,
 74656,
 2821,
 90533,
 22313,
 119408,
 69621,
 3144,
 72935,
 45239,
 118794,
 71829,
 118742,
 110774,
 95338,
 105754,
 66840,
 43883,
 59431,
 117818,
 115730,
 110993,
 100946,
 38442,
 68522,
 68189,
 62648,
 60359,
 49601,
 47424,
 39268,
 38995,
 17933,
 5278,
 77262,
 14370,
 82668,
 57205,
 123296,
 28050,
 36493,
 118536,
 105074,
 1503,
 40476,
 76939,
 36791,
 35305,
 82317,
 84770,
 102309,
 55908,
 99827,
 66520,
 5320,
 47005,
 14894,
 44759,
 40557,
 76108,
 73708,
 47882,
 70192,
 38501,
 42196,
 1979,
 77240,
 19000,
 4763,
 35544,
 98908,
 88126,
 13224,
 101956,
 81117,
 50077,
 15202,
 92210,
 86655,
 59696,
 13477,
 87315,
 91455,
 11223,
 33812,
 112417,
 10896,
 62832,
 62914,
 62936,
 15974,
 1

In [None]:
with open('embedding_lex.pkl', 'wb') as f:
    pickle.dump(lex, f)