In [1]:
from bert_score import BERTScorer
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from lsh import LSH
from fast_lexrank import Lexrank
import time, emoji, string
# hide the loading messages
import re
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the pre-processed travel-ban tweets into a DataFrame.
DATA_PATH = '/home/ehoang/hnt/data/processed_travel_ban.csv'
data = pd.read_csv(DATA_PATH)
# Preview the first rows, then report (n_rows, n_columns).
print(data.head())
print(data.shape)

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# remove rt, @MENTION, @URL, @EMAIL, emoji
# Drop the placeholder tokens and lower-case the text.
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
# Strip a leading run of "rt" markers, then a leading colon, then collapse repeated spaces.
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
# NOTE(review): this assumes `emoji.UNICODE_EMOJI` is keyed by single emoji characters;
# later releases of the `emoji` package changed/removed that name -- confirm the
# installed version, otherwise no emoji is filtered (or the lookup fails).
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())

# remove stopwords, punctuation
stopWords = stopwords.words('english')
# Set copy for O(1) membership tests; `stopWords` itself stays a list for any later use.
stopWordSet = set(stopWords)
data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWordSet))
# Build the punctuation-deleting table once instead of once per tweet.
punctuationTable = str.maketrans('', '', string.punctuation)
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(punctuationTable))
# Typographic characters that string.punctuation does not cover.
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())


def _length_or_zero(text):
    """Return the token count of `text`, or 0 when at most half of its tokens are unique."""
    tokens = text.split(" ")  # never empty: "".split(" ") == [""]
    return 0 if len(set(tokens)) / len(tokens) <= 0.5 else len(tokens)


# Despite its name, `uniWPercent` holds the token count (0 flags a repetitive tweet).
data['uniWPercent'] = data['Tweet1'].apply(_length_or_zero)
# Keep tweets with more than 2 tokens. This single filter also removes the 0-flagged
# repetitive tweets; the values do not change when recomputed, so no loop is needed.
# .copy() so later column assignments never write to a slice of the unfiltered frame.
data = data[data['uniWPercent'] > 2].copy()
# remove duplicates (first occurrence wins); the surviving rows keep their row labels
data = data.drop_duplicates(subset=['Tweet1'], keep='first')
print(data.head())

                   Id                                              Tweet  \
0  824941360449015808  emergency rally against trump's muslim travel ...   
1  824941519857610752  theresa may has not apologized to trump for in...   
2  824941616314122240  trump's immigration ban excludes countries wit...   
3  824942056741167105  trump's immigration order expands the definiti...   
4  824942966875774976  alert : senator john mccain threatens action o...   

                                              Tweet1  uniWPercent  
0  emergency rally trumps muslim travel ban nyc 1...           10  
1  theresa may apologized trump insulting fails t...           11  
2  trumps immigration ban excludes countries busi...            9  
3  trumps immigration order expands definition cr...            6  
4  alert senator john mccain threatens action pre...            8  


In [4]:
# (rows, columns) left after cleaning, filtering and de-duplication
data.shape

(105175, 4)

In [5]:
# Remember the row labels of the tweets that survived filtering, before the index is reset.
remained_index = data.index

In [6]:
# Renumber the rows 0..n-1; drop=True discards the old labels instead of keeping them as a column.
data = data.reset_index(drop=True)

In [7]:
# shape check after the index reset
data.shape

(105175, 4)

In [8]:
# Optional: uncomment to keep only the first 10,000 tweets for a quick run.
# NOTE(review): the saved summary outputs further down report "Sent scores: 10000",
# so they appear to come from a run with this line enabled -- confirm before trusting them.
# data = data.iloc[0:10000]

In [9]:
#extract tfidf vector
# Fit a default TfidfVectorizer on the stop-word- and punctuation-free text (Tweet1).
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
# (n_tweets, vocabulary_size)
print(tfidfData.shape)

(105175, 48876)


In [10]:
# Build the project-local LSH index over the TF-IDF matrix.
# NOTE(review): num_bits = 8 presumably yields 2**8 = 256 hash buckets, which would match
# the "#buckets: 256" line logged by the graph-building cell -- confirm in the lsh module.
lsh_tfidf = LSH(tfidfData)
lsh_tfidf.train(num_bits = 8)


(105175,)


In [11]:
# NOTE(review): max_search_radius = 0 presumably returns each bucket on its own, without
# merging neighbouring bins -- confirm against LSH.extract_nearby_bins.
buckets = lsh_tfidf.extract_nearby_bins(max_search_radius = 0)

In [12]:
# Print the number of members of every bucket, one per line.
for bucket_size in map(len, buckets):
    print(bucket_size)

540
535
474
493
539
607
538
628
589
575
496
501
586
600
559
584
370
374
369
457
477
425
491
558
396
429
398
415
409
419
411
491
424
470
469
510
470
495
574
578
550
446
454
437
523
556
558
543
297
378
321
424
412
387
504
519
374
379
347
361
443
422
433
456
407
419
355
400
437
477
386
463
435
480
386
440
441
510
413
467
272
292
245
331
301
321
345
468
323
350
300
379
313
384
337
446
349
407
349
427
372
444
444
470
378
477
363
388
405
458
376
473
278
267
282
368
351
319
376
429
296
315
216
293
313
354
231
393
527
503
484
459
564
570
542
622
571
517
460
457
534
603
467
533
405
406
338
423
425
419
498
546
365
354
351
360
390
418
350
391
421
438
426
445
529
509
582
611
471
479
433
385
509
443
480
429
363
376
358
402
436
478
502
574
317
332
338
333
393
374
411
397
420
385
376
405
366
478
377
481
428
456
350
405
392
465
381
432
312
296
278
314
301
284
303
378
289
295
256
327
290
339
251
367
327
400
314
387
392
360
363
458
353
377
309
324
348
408
342
338
266
303
244
341
295
331
369
362
271
298


In [13]:
# Disabled experiment: one BERTScorer per GPU (cuda:0 and cuda:1), same settings as the
# single scorer created in the next cell.
# scorers = []
# for i in range(2):
#     scorers.append(BERTScorer(lang='en', rescale_with_baseline = True, idf = True, 
#                               idf_sents = list(data['Tweet']), device = 'cuda:'+str(i)))


In [14]:
# Single English BERTScorer on cuda:0 with baseline rescaling and IDF weighting.
# The IDF reference sentences are the cleaned tweets in data['Tweet'] (stop words and
# punctuation still present), not the stripped Tweet1 column.
scorer = BERTScorer(lang='en', rescale_with_baseline = True, idf = True, 
                              idf_sents = list(data['Tweet']), device = 'cuda:0')

In [None]:
%%time
# Build the sentence graph: sentences are the cleaned tweets (data['Tweet']), candidate
# pairs come from the TF-IDF LSH index, and edges are scored with the BERTScorer.
# NOTE(review): sim_thres = 0.2 presumably drops edges below that score, and
# search_radius = 0 presumably restricts comparisons to a sentence's own bucket --
# confirm in fast_lexrank. The log shows roughly 250-315 s per bucket, so this is slow.
lex_tfidf = Lexrank(np.array(data['Tweet']), lsh_tfidf)
lex_tfidf.build_graph_bert_score(scorer, nJobs = 4, search_radius = 0, sim_thres = 0.2)

#buckets: 256
buc: 0-len: 540--315.7909483909607, 0.022743484224965707
buc: 1-len: 535--292.17539715766907, 0.014058869770285615
buc: 2-len: 474--247.8776695728302, 0.014865851270273638
buc: 3-len: 493--247.12881898880005, 0.015774596891984745


In [13]:
# Run LexRank on the graph for 100 iterations with damping factor 0.85
# (progress is logged every 10 iterations).
lex_tfidf.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [17]:
# Select 20 summary sentences; the returned ids are used positionally with data.iloc below.
# NOTE(review): cosine_thres = 0.3 presumably caps the similarity allowed between
# selected sentences (redundancy filter) -- confirm in Lexrank.extract_summary.
sentIds = lex_tfidf.extract_summary(n_sents = 20, cosine_thres=0.3)

Extracting sentences....
Sent scores: 10000
selected one: 1708, 0.000556831422727555
selected one: 1430, 0.0004976549535058439
selected one: 373, 0.00047330930829048157
selected one: 214, 0.0004665775632020086
selected one: 1689, 0.0004551210440695286
selected one: 417, 0.000394654693081975
selected one: 3080, 0.000389478518627584
selected one: 25, 0.0003785667649935931
selected one: 1369, 0.000376440875697881
selected one: 455, 0.0003686040872707963
selected one: 6907, 0.0003566291998140514
selected one: 369, 0.0003566105442587286
selected one: 3029, 0.0003495930868666619
selected one: 80, 0.0003444254689384252
selected one: 6395, 0.00034216122003272176
selected one: 2730, 0.00034100376069545746
selected one: 7144, 0.00033905619056895375
selected one: 1471, 0.0003382976574357599
selected one: 163, 0.00033712407457642257
selected one: 7719, 0.0003328357997816056


In [18]:
# One row per selected sentence: summary rank, sentence id, number of graph
# neighbours, LexRank score. The header now names all four printed columns
# (it previously listed three labels for four values, so every label was shifted).
print("rank", "Id", "#adjacentEdges", "lexrank")
for rank, sent_id in enumerate(sentIds):
    print(rank, sent_id, len(lex_tfidf.graph[sent_id]), lex_tfidf.scores[sent_id])

Id #adjacentEdges lexrank
0 1708 12 tensor(0.0006)
1 1430 20 tensor(0.0005)
2 373 16 tensor(0.0005)
3 214 10 tensor(0.0005)
4 1689 12 tensor(0.0005)
5 417 8 tensor(0.0004)
6 3080 8 tensor(0.0004)
7 25 11 tensor(0.0004)
8 1369 7 tensor(0.0004)
9 455 10 tensor(0.0004)
10 6907 10 tensor(0.0004)
11 369 6 tensor(0.0004)
12 3029 6 tensor(0.0003)
13 80 5 tensor(0.0003)
14 6395 6 tensor(0.0003)
15 2730 6 tensor(0.0003)
16 7144 4 tensor(0.0003)
17 1471 14 tensor(0.0003)
18 163 12 tensor(0.0003)
19 7719 10 tensor(0.0003)


In [19]:
# with idf
# Show the cleaned text of every selected tweet next to its summary rank.
tweet_texts = data['Tweet']
for rank, sent_id in enumerate(sentIds):
    print(rank, tweet_texts.iloc[sent_id])

0 president trump signs executive order targeting refugees
1 trump says syrian christian refugees will be given priority for entering u.s. !
2 statement from on trump's immigration executive orders
3 link : trump's radical immigration plan : enforce the law .
4 president trump has signed an executive action implementing " new vetting measures " for immigrants …
5 breaking : president trump expected to sign executive order restricting immigration from 7 muslim countries !
6 the disastrous consequences of trump’s new immigration rules
7 trump's immigration ban excludes countries with business ties via
8 take action against trump's executive order slamming the door on refugees .
9 watch prime minister theresa may’s meeting with president trump
10 refugees challenge trump executive order after being detained at u.s. airports
11 trump’s proposed refugee ban would abandon iraqis who risked their lives working for the u.s. military …
12 #trump trump suspends refugee entry , vows priority for 

In [17]:
# with idf
# Show the cleaned text of every selected tweet next to its summary rank.
tweet_texts = data['Tweet']
for rank, sent_id in enumerate(sentIds):
    print(rank, tweet_texts.iloc[sent_id])

0 president trump signs executive order targeting refugees
1 trump says syrian christian refugees will be given priority for entering u.s. !
2 statement from on trump's immigration executive orders
3 link : trump's radical immigration plan : enforce the law .
4 president trump has signed an executive action implementing " new vetting measures " for immigrants …
5 breaking : president trump expected to sign executive order restricting immigration from 7 muslim countries !
6 the disastrous consequences of trump’s new immigration rules
7 trump's immigration ban excludes countries with business ties via
8 take action against trump's executive order slamming the door on refugees .
9 watch prime minister theresa may’s meeting with president trump
10 refugees challenge trump executive order after being detained at u.s. airports
11 trump’s proposed refugee ban would abandon iraqis who risked their lives working for the u.s. military …
12 #trump trump suspends refugee entry , vows priority for 

In [None]:
following trump’s executive order , green card , visa holders already blocked at airports …
1 trump executive order : refugees detained at us airports follow for more
2 trump signs executive order for ‘ extreme vetting ’ of refugees
3 trump executive order : refugees detained at us airports
4 trump's state visit to the uk :
5 list of trump's executive orders |
6 trump executive order : refugees detained at us airports - bbc news
7 breaking : prime minister theresa may has arrived at the white house for talks with president trump .
8 trump signs executive actions on immigration , military
9 president trump signs executive order temporarily halting all refugees
10 trump signs ' new vetting ' immigration order
11 trump says syrian christian refugees will be given priority for entering u.s. !
12 ' we don't want them here ' president trump signs executive order for ' extreme vetting ' of refugees …
13 trump’s immigration ban excludes countries with business ties
14 breaking : trump to sign executive order today to temporarily halt refugees from some muslim-majority countries - white house
15 trump orders ' extreme vetting ' of refugees
16 refugees on the way to the u.s. when president trump's executive order was signed were detained at airports …
17 trump signs ' extreme vetting ' order to block refugees
18 prime minister theresa may arrives at the white house for a meeting with president donald trump
19 donald trump has gone too far for dick cheney . dick cheney !!! that dick cheney .

In [21]:
# Neighbours and edge weights of sentence 1708, the first sentence picked by extract_summary.
lex_tfidf.graph[1708]

{318: tensor(0.4154),
 721: tensor(0.1207),
 1072: tensor(0.4366),
 2981: tensor(0.2801),
 3612: tensor(0.1630),
 4336: tensor(0.3717),
 4422: tensor(0.1017),
 4492: tensor(0.1199),
 4604: tensor(0.1046),
 6040: tensor(0.4027),
 6088: tensor(0.1614),
 7588: tensor(0.2022)}

In [18]:
# Neighbours and edge weights of sentence 4375 in the same graph.
lex_tfidf.graph[4375] #first tweet of tfidf model

{3083: tensor(0.2183),
 4099: tensor(0.2883),
 4318: tensor(0.3272),
 4593: tensor(0.3452),
 4962: tensor(0.1903),
 5040: tensor(0.1836),
 5144: tensor(0.1750),
 6179: tensor(0.2547),
 6600: tensor(0.1153),
 6698: tensor(0.3994),
 6907: tensor(0.2541),
 7123: tensor(0.1382),
 7353: tensor(0.6009)}

In [None]:
following trump’s executive order , green card , visa holders already blocked at airports …
1 trump executive order : refugees detained at us airports follow for more
2 trump signs executive order for ‘ extreme vetting ’ of refugees
3 trump executive order : refugees detained at us airports
4 trump's state visit to the uk :
5 list of trump's executive orders |
6 trump executive order : refugees detained at us airports - bbc news
7 breaking : prime minister theresa may has arrived at the white house for talks with president trump .
8 trump signs executive actions on immigration , military
9 president trump signs executive order temporarily halting all refugees
10 trump signs ' new vetting ' immigration order
11 trump says syrian christian refugees will be given priority for entering u.s. !
12 ' we don't want them here ' president trump signs executive order for ' extreme vetting ' of refugees …
13 trump’s immigration ban excludes countries with business ties
14 breaking : trump to sign executive order today to temporarily halt refugees from some muslim-majority countries - white house
15 trump orders ' extreme vetting ' of refugees
16 refugees on the way to the u.s. when president trump's executive order was signed were detained at airports …
17 trump signs ' extreme vetting ' order to block refugees
18 prime minister theresa may arrives at the white house for a meeting with president donald trump
19 donald trump has gone too far for dick cheney . dick cheney !!! that dick cheney .

In [21]:
# without idf
# NOTE(review): the scorer visible in this notebook is built with idf = True, so the
# output saved under this cell presumably comes from a different run -- confirm.
tweet_texts = data['Tweet']
for rank, sent_id in enumerate(sentIds):
    print(rank, tweet_texts.iloc[sent_id])

0 not everyone in the u.k. is overjoyed about may’s meeting with trump
1 link : trump's radical immigration plan : enforce the law .
2 the real danger is the rippling effect of trump’s ban on syrian refugees , both abroad and in the u.s.
3 take action against trump's executive order slamming the door on refugees .
4 trump’s proposed refugee ban would abandon iraqis who risked their lives working for the u.s. military …
5 president trump signs executive order targeting refugees
6 trump says syrian christian refugees will be given priority for entering u.s. !
7 read the full text of trump's executive order limiting muslim entry to the u.s. …
8 breaking : president trump expected to sign executive order restricting immigration from 7 muslim countries !
9 entire senior management of state department quit in apparent gesture of defiance to trump …
10 wsj : trump signs executive action that he says would keep “ radical islamic terrorists ” out of the u.s.
11 donald trump and theresa may hold

In [29]:
# Rows whose cleaned tweet text contains "green card"
# (str.contains interprets the pattern as a regular expression by default).
data[data['Tweet'].str.contains('green card')]

Unnamed: 0,Id,Tweet,Tweet1,uniWPercent
1216,825064765261176832,company sent out a notice about trump's muslim...,company sent notice trumps muslim ban green ca...,10
2323,825137645504172032,visas being denied immediately . chaos at airp...,visas denied immediately chaos airports air mu...,13
2419,825141021906382848,"ban applies if you have a visa , green card , ...",ban applies visa green card even dual citizen ...,9
2609,825147334342303745,current concern : what about people with green...,current concern people green cards currently a...,13
2755,825152564652081157,many elderly come to green card interview with...,many elderly come green card interview suit ti...,12
...,...,...,...,...
9902,825405997065830402,i want to repeat : green card holders were han...,want repeat green card holders handcuffed soci...,12
9918,825406219376525312,"dems , where is your response to what is happe...",dems response happening wrt muslimban refugees...,12
9922,825406307452801028,green card holders included in trump ban : hom...,green card holders included trump ban homeland...,9
9975,825406932395626497,"and here .... green card holders , too . may w...",green card holders may whatever god universe b...,9
