# Summarization
1. Built-in Textrank with gensim
2. Built-in Lexrank
3. Lexrank with tfidf & LSH
4. Lexrank with sentence embedding & LSH

In [1]:
from numpy.linalg import norm
from fast_pagerank import pagerank
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
# from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from fast_lexrank import Lexrank
import numpy as np
from lsh import LSH
import pandas as pd
import scipy
import pickle
import time
import re
import emoji, string

In [2]:
# read data
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head())
print(data.shape)

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI))
# remove stopwords, punctuation
stopWords = stopwords.words('english')
data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWords))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '',  string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

# remove tweets #unique words less than haft of length
data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['uniWPercent']!=0]
# # remove tweets with lengths < 3, duplicates
while data['uniWPercent'].min() <=2:
    data = data[data['uniWPercent'] >2]
    data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
# # # remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.head())

                   Id                                              Tweet  \
0  824941360449015808  emergency rally against trump's muslim travel ...   
1  824941519857610752  theresa may has not apologized to trump for in...   
2  824941616314122240  trump's immigration ban excludes countries wit...   
3  824942056741167105  trump's immigration order expands the definiti...   
4  824942966875774976  alert : senator john mccain threatens action o...   

                                              Tweet1  uniWPercent  
0  emergency rally trumps muslim travel ban nyc 1...           10  
1  theresa may apologized trump insulting fails t...           11  
2  trumps immigration ban excludes countries busi...            9  
3  trumps immigration order expands definition cr...            6  
4  alert senator john mccain threatens action pre...            8  


In [4]:
data.shape

(105175, 4)

In [5]:
# def lemmatize_stemming(text):
#     return WordNetLemmatizer().lemmatize(text, pos='v')
# data['Tweet1'] = data['Tweet1'].apply(lambda x: ' '.join(lemmatize_stemming(y) for y in x.split(" ") if y.strip()!= ""))

In [6]:
data.iloc[21284]

Id                                            825521541756645377
Tweet          if you or someone you know has been affected b...
Tweet1         someone know affected new refugeeimmigrationtr...
uniWPercent                                                   10
Name: 23995, dtype: object

In [7]:
remained_index = data.index

In [8]:
data = data.reset_index(drop=True)

In [9]:
# data[data['Tweet'].str.contains("president trump fires acting attorne")]

In [10]:
print(list(data.iloc[0:10]['Tweet1']))

['emergency rally trumps muslim travel ban nyc 125 5 pm', 'theresa may apologized trump insulting fails today trump send back b', 'trumps immigration ban excludes countries business ties via democracyfor', 'trumps immigration order expands definition criminal', 'alert senator john mccain threatens action president trump', 'kiva still distracted trump gets peoples business', 'ty bailing gmb today piers morgan drank trump kool aid vocal opponent', 'trump sign eo temporary ban suspending visas syria six african countries buildthewall', 'moral obligation stop hitler moral obligation stop trump', 'people getting radicalized trump always hate freedom']


In [11]:
data.shape

(105175, 4)

## 1. Lexrank + lsh + tfidf

In [84]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(105175, 48876)


In [85]:
lsh_tfidf = LSH(tfidfData)
lsh_tfidf.train(num_bits = 8)
lex_tfidf = Lexrank(tfidfData, lsh_tfidf)

(105175,)


In [None]:
%%time
lex_tfidf.build_graph(search_radius = 1, cosine_sim = 0.3)

#buckets: 256
.......Buck: 0, vec: (4405, 48876)
.......Buck: 100, vec: (3664, 48876)


In [17]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [18]:
sentIds = lex.extract_summary(n_sents = 20, cosine_thres=0.5)

Extracting sentences....
Sent scores: 105175
selected one: 62658, 0.00020242707405364153
Sent 80396 is similar to a 62658: 0.7907479705005656
selected one: 13109, 0.00018880050779761126
selected one: 32294, 0.00017162311007124417
selected one: 91027, 0.00015780050458681738
selected one: 16346, 0.00013832732862091305
Sent 62378 is similar to a 32294: 0.800885372372297
selected one: 55536, 0.00012749418907202905
selected one: 22089, 0.0001244623936573965
Sent 10126 is similar to a 13109: 0.8216643750207849
Sent 33510 is similar to a 22089: 0.720695735370632
selected one: 103476, 0.00011871329635948848
selected one: 39127, 0.00011697775526405306
selected one: 41746, 0.00011553935404484163
selected one: 15499, 0.00011492288316361148
Sent 85108 is similar to a 16346: 0.5930587680453396
Sent 61849 is similar to a 62658: 0.672487377445378
Sent 65930 is similar to a 55536: 0.932304152068153
Sent 47447 is similar to a 62658: 0.7400724194755225
selected one: 71398, 0.00010798483103909025
selecte

In [19]:
print("Id", "#adjacentEdges", "lexrank")
for i, idx in enumerate(sentIds):
    print(i, len(lex.graph[idx]), lex.scores[idx])

Id Index lexrank
0 356 0.00020242707405364153
1 342 0.00018880050779761126
2 215 0.00017162311007124417
3 159 0.00015780050458681738
4 259 0.00013832732862091305
5 149 0.00012749418907202905
6 258 0.0001244623936573965
7 102 0.00011871329635948848
8 228 0.00011697775526405306
9 95 0.00011553935404484163
10 149 0.00011492288316361148
11 149 0.00010798483103909025
12 147 0.00010697164164957916
13 475 0.0001059602037582633
14 158 0.00010534054644177753
15 250 0.00010525056852056679
16 129 9.781553837955189e-05
17 103 9.574184662715573e-05
18 105 9.462894645832923e-05
19 78 9.19887782750149e-05


In [20]:
for i, idx in enumerate(sentIds):
    print(i, data.iloc[idx]['Tweet'])

0  on executive order immigration 
1 green card holders 
2 ban all muslim countries 
3 what do people in the middle east think about terrorism ? for more on the middle east visit http …
4 our statement on president trump’s executive order on immigration : 
5  make america great . again ?
6  from saudi arabia . #trump's ban doesn't include saudi arabia .
7 keep america safe president trump keep the ban on muslim refugees , keep them out and keep us safe . 
8 my statement on president trump's executive order on refugees : 
9  it is a religion ban trump
10 protest at the airport !!! #muslimban 
11 the president of the united states fires the attorney general of the united states .
12  are you from any of the 7 banned countries ? what are you on about ?
13 trump's state visit to the uk :
14 it's not a muslim ban . it's a ban from muslim countries where trump doesn't do business . 
15 trump signs executive order on ‘ extreme vetting ’ 
16 jfk protest . #muslimban #nobannowall 
17 it's a tem

In [21]:
with open('saved_models/tfidf_model.pkl', "wb") as f:
    pickle.dump(lex, f)

#####

* <b>Sub-events captured:</b>
    1. green card orders
    2. protest at jfk airport
    3. attorney general get fired
    4. ban 7 countries, muslim countries
    5. trump's ban doesn't include saudi arabia .
    6. trump visit uk
    7. executive order on ‘ extreme vetting ’ 
* <b>Lack</b>:
    1. starbuck hires 10K refugees
    2. trump’s deportation orders 
    3. washington state will sue to stop trump's immigration
    4. canada will accept the refugees 
    5. quebec city mosque shooting 


## 2. Lexrank with sentEmbeddings and LSH

In [41]:
#load embeddings
with open('/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)
print(len(embeddings))

123385


In [None]:
embeddings[0].shape
count = 0
list = []
for i in range(1000):
    cos_sim = cosine_similarity(embeddings[0].reshape(1, -1), embeddings[i].reshape(1, -1))
    print(cos_sim)
    
    if cos_sim >0.65:
        count+=1
        list.append(i)
print("Number of sentences that are similar with the first sen:", count)

In [42]:
sentenceEmbs = np.array(embeddings)
print(sentenceEmbs.shape)

(123385, 768)


In [43]:
sentenceEmbs = sentenceEmbs[remained_index]

In [44]:
sentenceEmbs.shape

(105175, 768)

In [46]:
lsh = LSH(sentenceEmbs)
lsh.train(num_bits=14)
table = lsh.model['table']

(105175,)


In [None]:
count = 0
for key, value in table.items():
    print(key, len(value))
    if len(value) > 150:
        count+=1
print(len(table))
print("#buckets with >150 sens: ", count)

In [48]:
lex = Lexrank(sentenceEmbs, lsh)

In [None]:
%%time
lex.build_graph(search_radius = 1, percent=0.03)

In [50]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [70]:
data[data['Tweet'].str.contains('green card')]

Unnamed: 0,Id,Tweet,Tweet1,uniWPercent
1216,825064765261176832,company sent out a notice about trump's muslim...,company sent notice trumps muslim ban green ca...,10
2323,825137645504172032,visas being denied immediately . chaos at airp...,visas denied immediately chaos airports air mu...,13
2419,825141021906382848,"ban applies if you have a visa , green card , ...",ban applies visa green card even dual citizen ...,9
2609,825147334342303745,current concern : what about people with green...,current concern people green cards currently a...,13
2755,825152564652081157,many elderly come to green card interview with...,many elderly come green card interview suit ti...,12
...,...,...,...,...
103920,827623785662644224,my uncle from syria needs a green card any tak...,uncle syria needs green card takers,6
104019,827626771994533889,why is tomi defending the rights of milo ( a ...,tomi defending rights milo non american rights...,12
104760,827658820658851840,green card coming soon !,green card coming soon,4
104876,827664285862129668,alert : dos issues stmnt clarifying ban dn app...,alert dos issues stmnt clarifying ban dn apply...,18


In [82]:
data.iloc[2146]

Id                                            825128489326088196
Tweet          trump signs executive order on ‘ extreme vetti...
Tweet1               trump signs executive order extreme vetting
uniWPercent                                                    6
Name: 2146, dtype: object

In [83]:
lsh.model[]

214

In [60]:
# 14 bins
sentIds = lex.extract_summary(n_sents = 20, cosine_thres = 0.89, max_sent=500)

Extracting sentences....
Sent scores: 105175
selected one: 42773, 0.00010440142035777392
selected one: 70826, 0.00010220440798520725
selected one: 72679, 9.952317010832082e-05
Sent 67230 is similar to a 70826: 0.9897810275772859
selected one: 23584, 9.790741698338028e-05
Sent 66616 is similar to a 70826: 0.9818513941126814
Sent 65594 is similar to a 23584: 0.8982108068636003
selected one: 15323, 9.108063279018107e-05
selected one: 31600, 8.952282977429021e-05
selected one: 2754, 8.94612728374494e-05
selected one: 38737, 8.916163568577142e-05
Sent 96348 is similar to a 70826: 0.9162692941825311
selected one: 15006, 8.773176537896183e-05
selected one: 34043, 8.760762320177292e-05
selected one: 59451, 8.726155740041375e-05
selected one: 9399, 8.668920580458683e-05
selected one: 49222, 8.600378997886487e-05
Sent 76628 is similar to a 23584: 0.8911665337519232
Sent 101509 is similar to a 42773: 0.8944038170610407
Sent 9573 is similar to a 70826: 0.9093985959365581
Sent 6261 is similar to a 

In [61]:
for i, idx in enumerate(sentIds):
    print("{}. {}".format(i, data.iloc[idx]['Tweet']))

0. " this policy is going to get americans killed : " sen. chris murphy on trump's refugee ban • #antitrumpmvmt …
1. us muslim leaders sue trump over ' fear-mongering ' travel ban via …
2. pragmactivist : trump fired ag #sallyyates for enforcing laws and #resist ing a #muslimban . will he fire seanspi … 
3.  remove trump ! commit/lock trump away ! aclu exec director anthony romero comes out aclu just argued won block trump's muslim ban
4. breaking - #cair planning lawsuit against trump over #muslimban i guess ur lawyers don't know a president is immune ? https :/ …
5. if muslims cause further bloodshed to americans it will be on the hands of the federal judge that blocked the trump enforcement order ! #maga
6. trump blasted by top democrat for imposing a religious test on fleeing refugees via …
7. thr : ' we can't keep pretending ' conway defends trump immigration ban , rips media ' a new one ' for bias …
8. tech finds its voice , opposing trump’s muslim ban : " so un-american , it pai

In [57]:
print("Id", "Index", "lexrank")
for i, idx in enumerate(sentIds):
    print(i, len(lex.graph[idx]), lex.scores[idx])

Id Index lexrank
0 3382 0.00010440142035777392
1 3538 0.00010220440798520725
2 3442 9.952317010832082e-05
3 3391 9.790741698338028e-05
4 3227 9.367025901136787e-05
5 3189 9.108063279018107e-05
6 3096 8.952282977429021e-05
7 3014 8.94612728374494e-05
8 3141 8.916163568577142e-05
9 2817 8.773176537896183e-05
10 2833 8.760762320177292e-05
11 2853 8.726155740041375e-05
12 3063 8.668920580458683e-05
13 2805 8.600378997886487e-05
14 2703 8.509400810913653e-05
15 2810 8.270572045910686e-05
16 2757 8.257767461424093e-05
17 2617 8.183067572493015e-05
18 2689 8.029157895416175e-05
19 2798 8.02714257585991e-05


In [78]:
len(lex.graph[43711])

1205

## Combination of LSH with tfidf and lex rank with sent embeddings

In [29]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(105175, 48876)


In [31]:
lsh = LSH(tfidfData)
lsh.train(num_bits = 8)
lex = LexRank(sentenceEmbs, lsh)

(105175,)


NameError: name 'LexRank' is not defined

In [None]:
for key, value in table.items():
    print(key, len(value))
print(len(table))

In [None]:
%%time
lex.build_graph(search_radius = 1, percent=0.01)

In [None]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

In [None]:
sentIds = lex.extract_sentences(n_sents = 20, cosThres = 0.85)

In [None]:
sentIds

In [None]:
for i, idx in enumerate(sentIds):
    print(i, len(lex.graph[idx]))

# Test: 

In [107]:
tfidfData.shape

(105176, 48876)

In [108]:
tfidfCosine = cosine_similarity(tfidfData[0], tfidfData)

In [109]:
# extract 10 sentence with highest similarity
indices = np.argpartition(tfidfCosine[0], -10)[-10:]
elements = tfidfCosine[0][indices]
print("Indices: {}".format(indices))
print("Cosine values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i in indices:
    print("{}\t{}\t{}".format(i, tfidfCosine[0][i], str(data.iloc[i]['Tweet1'])))

Indices: [69002 89160 45071 97336  8934 47837 23965 97958 79194     0]
Cosine values: [0.29885418 0.31449294 0.31054197 0.32932623 0.3548277  0.37053829
 0.35177915 0.32968794 0.33406313 1.        ]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
69002	0.29885417674875303	watch live protesters gather rally hate nyc response trumps refugee ban
89160	0.314492944844012	google workers rally trumps travel ban
45071	0.3105419662113536	many american flags todays nyc rally trumps muslim ban america resist htt
97336	0.3293262319915567	defend immigrants trump rally nyc 2 pm today near nyu nobannowall
8934	0.35482769835646444	protesters rally trumps muslim immigration ban
47837	0.3705382876374108	senator chuck schumer slams trumps travel ban nyc rally
23965	0.35177914692132783	blocks call trumps unconstitutional muslim ban comes emergency hearing nyc
97958	0.32968793637635174	judge extends emergency stay blocking trumps travel ban via newyork nyc
79194	0.334063133607824	know friday

In [110]:
embCosine = cosine_similarity(sentenceEmbs[0].reshape(1, -1), sentenceEmbs)
# extract 10 sentence with highest similarity
indices = np.argpartition(embCosine[0], -10)[-10:]
elements = embCosine[0][indices]
print("Indices: {}".format(indices))
print("Cosine values: {}".format(elements))
print("Tweet 0:", str(data.iloc[0]['Tweet1']))
for i in indices:
    print("{}\t{}\t{}".format(i, embCosine[0][i], str(data.iloc[i]['Tweet1'])))

Indices: [ 1279 44958 40096 52607 37999 52571 25520 97778 90082     0]
Cosine values: [0.8552602  0.85747457 0.8578343  0.87147146 0.87335134 0.8906126
 0.897992   0.8847035  0.8772365  1.        ]
Tweet 0: emergency rally trumps muslim travel ban nyc 125 5 pm
1279	0.8552601933479309	protest today trump dapl downtown 5pm 611 woodward
44958	0.8574745655059814	seattle protest muslimban tomorrow westlake park 5pm
40096	0.8578342795372009	emergency demo york tomorrow 5pm st helens square protest muslimban uk complicity mu
52607	0.8714714646339417	morning half 5 today protest trumps muslim ban hopefully see ️️
37999	0.8733513355255127	cambridge friends rally 5pm tomorrow gsm trumps muslimban uk govts complicity whos coming ht
52571	0.8906126022338867	seattle join us 5pm westlake park protest president trumps immigration refugee ban
25520	0.8979920148849487	protest muslimban tomorrow dallas city hall 5 pm
97778	0.884703516960144	west hollywood holding antitrump rally today 5pm pst ill defend

In [111]:
for i in indices:
    print("{}\t{}\t{}".format(i, cosine_similarity(sentenceEmbs[0].reshape(1, -1), sentenceEmbs[i].reshape(1, -1)), str(data.iloc[i]['Tweet1'])))

1279	[[0.85526013]]	protest today trump dapl downtown 5pm 611 woodward
44958	[[0.85747457]]	seattle protest muslimban tomorrow westlake park 5pm
40096	[[0.8578342]]	emergency demo york tomorrow 5pm st helens square protest muslimban uk complicity mu
52607	[[0.8714714]]	morning half 5 today protest trumps muslim ban hopefully see ️️
37999	[[0.87335134]]	cambridge friends rally 5pm tomorrow gsm trumps muslimban uk govts complicity whos coming ht
52571	[[0.8906126]]	seattle join us 5pm westlake park protest president trumps immigration refugee ban
25520	[[0.897992]]	protest muslimban tomorrow dallas city hall 5 pm
97778	[[0.8847035]]	west hollywood holding antitrump rally today 5pm pst ill defend great president
90082	[[0.87723655]]	today 5pm est take questions president trumps immigration ban response new york
0	[[1.]]	emergency rally trumps muslim travel ban nyc 125 5 pm


In [116]:
sents = sentenceEmbs[indices]
print(sents.shape)

(10, 768)


In [117]:
num = np.dot(sents, sents.T)
            
if scipy.sparse.issparse(sents):
    magnitude = norm(sents.toarray(), axis = 1)
else:
    magnitude = norm(sents, axis = 1)

den = np.dot(magnitude.reshape(-1, 1), magnitude.T.reshape(1, -1))


cosine_matrix = np.array(num/den)

(10, 10)


In [118]:
cosine_matrix[9]

array([0.8552605 , 0.85747445, 0.85783404, 0.8714711 , 0.8733514 ,
       0.8906124 , 0.89799184, 0.88470346, 0.87723655, 0.9999999 ],
      dtype=float32)

## 5. Lexrank with tfidf and bert embeddings

In [13]:
lsh = LSH(embeddingData)
lsh.train(num_bits = 32)
lex = LexRank(embeddingData, lsh)

(123385,)


In [None]:
%%time
lex.build_graph(search_radius = 1, percent = 0.05)

In [16]:
# lex.matrix.getnnz(axis = 1)

array([  1,   0, 346, ...,   1,   0,  87], dtype=int32)

In [15]:
lex.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [16]:
sentIds2 = lex.extract_sentences(n_sents = 15)

Extracting sentences.....
0 ,  id:  105661 :  when will we get a prime minister who stands up against trump for british values top q by today
1 ,  id:  108290 :  trumps america is a rogue state the remaining free world nations to impose sanctions before its too late
2 ,  id:  24813 :  at jfk where protesters have totally shut down roads to terminal 4 a cheer erupts as news breaks that the
3 ,  id:  75672 :  farsi speakers needed at sfo see below
4 ,  id:  79361 :  honestly how many people does the uk government deport every single day why would they speak up against the muslimban
5 ,  id:  118736 :  48 of trump voters think airport protesters across the country last weekend were paid to do so by george soros
6 ,  id:  63237 :  why did miami submit to trumps executive order culture
7 ,  id:  63259 :  over 1 million sign u k petition to ban trump from state visit
8 ,  id:  119399 :  48 ppl killed by white terrorists in us while 26 were killed by radical islamists since 911
9 ,  id:  5528

In [19]:
print(len(lex.graph[105661]))
print(len(lex.graph[106760]))

3464
5


In [22]:
print(lex.scores[105661])
print(lex.scores[106760])

0.00012323896237877713
1.8887369727844597e-06


In [None]:
with open('embedding_lex.pkl', 'wb') as f:
    pickle.dump(lex, f)