In [1]:
from bert_score import BERTScorer
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from lsh import LSH
from fast_lexrank import Lexrank
import time, emoji, string
# hide the loading messages
import re
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the pre-processed travel-ban tweets, then show the first rows and the frame's shape
data = pd.read_csv('/home/ehoang/hnt/data/processed_travel_ban.csv')
print(data.head(), data.shape, sep='\n')

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# Clean the tweet text in place: strip the @MENTION / @URL / @EMAIL placeholders,
# lower-case, drop leading "rt" markers and a leading ":", collapse runs of spaces,
# and remove emoji characters.
# NOTE(review): `emoji.UNICODE_EMOJI` is a flat {emoji: name} mapping only in old
# releases of the `emoji` package; newer releases key it by language code or drop it
# entirely - confirm the installed version actually filters emoji here.
emojiChars = emoji.UNICODE_EMOJI


def _clean_tweet(text):
    """Return the lower-cased tweet without placeholders, leading rt/colon, extra spaces or emoji."""
    text = text.replace('@MENTION', "").replace("@URL", "").replace("@EMAIL", "").lower()
    text = re.sub("^ ?(rt ?)+", "", text)
    text = re.sub('^( ?: ?)', '', text)
    text = re.sub("  +", " ", text)
    return ''.join(c for c in text if c not in emojiChars).strip()


data['Tweet'] = data['Tweet'].apply(_clean_tweet)

# Build 'Tweet1', the normalised text used for TF-IDF and de-duplication:
# stop words removed, ASCII and typographic punctuation stripped, spaces collapsed.
stopWords = stopwords.words('english')
stopWordSet = frozenset(stopWords)  # set membership instead of scanning the list per token
punctTable = str.maketrans('', '', string.punctuation)  # built once, not once per row


def _normalise_tweet(text):
    """Return `text` without stop words and punctuation, single-spaced and stripped."""
    kept = ' '.join(word for word in text.split(" ") if word not in stopWordSet)
    kept = kept.translate(punctTable)
    kept = re.sub('“|…|’|‘|”|—|→', "", kept)
    return re.sub(' +', ' ', kept).strip()


def _unique_word_score(text):
    """Return 0 when at most half of the tokens are unique, otherwise the token count."""
    words = text.split(" ")
    return 0 if len(set(words)) / len(words) <= 0.5 else len(words)


data['Tweet1'] = data['Tweet'].apply(_normalise_tweet)
data['uniWPercent'] = data['Tweet1'].apply(_unique_word_score)

# Keep only tweets with more than half of their words unique (score != 0) and more than
# two words. The score is a pure function of 'Tweet1', so a single `> 2` filter selects
# exactly the rows the former `!= 0` filter plus re-computation loop converged to.
# Then drop tweets whose normalised text is a duplicate, keeping the first occurrence.
# The original index labels are preserved (no reset here).
data = data[data['uniWPercent'] > 2].drop_duplicates(subset=['Tweet1'], keep='first')
print(data.head())

                   Id                                              Tweet  \
0  824941360449015808  emergency rally against trump's muslim travel ...   
1  824941519857610752  theresa may has not apologized to trump for in...   
2  824941616314122240  trump's immigration ban excludes countries wit...   
3  824942056741167105  trump's immigration order expands the definiti...   
4  824942966875774976  alert : senator john mccain threatens action o...   

                                              Tweet1  uniWPercent  
0  emergency rally trumps muslim travel ban nyc 1...           10  
1  theresa may apologized trump insulting fails t...           11  
2  trumps immigration ban excludes countries busi...            9  
3  trumps immigration order expands definition cr...            6  
4  alert senator john mccain threatens action pre...            8  


In [4]:
data.shape

(105175, 4)

In [5]:
# Keep the index labels of the rows that survived the filtering above; they are taken
# before the index is reset in the next cell.
remained_index = data.index

In [6]:
# Renumber rows 0..n-1 and discard the old labels (drop=True), so that index labels and
# row positions coincide from here on.
data = data.reset_index(drop=True)

In [7]:
data.shape

(105175, 4)

In [8]:
# data = data.iloc[0:10000]

In [None]:
# Extract TF-IDF vectors from the normalised tweet text ('Tweet1') using the default
# TfidfVectorizer settings, and print the shape of the resulting matrix.
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

In [None]:
# Build the project-local LSH index over the TF-IDF matrix.
# NOTE(review): num_bits is presumably the number of hash bits per signature
# - confirm against lsh.LSH.train.
lsh_tfidf = LSH(tfidfData)
lsh_tfidf.train(num_bits = 8)


In [None]:
# Collect the LSH bins; the next cell iterates over them and prints each one's length.
# NOTE(review): max_search_radius = 0 presumably restricts each bin to exact hash
# matches - confirm against lsh.LSH.extract_nearby_bins.
buckets = lsh_tfidf.extract_nearby_bins(max_search_radius = 0)

In [None]:
# Print the number of entries in every bucket, one per line
for bucket_size in map(len, buckets):
    print(bucket_size)

In [None]:
# scorers = []
# for i in range(2):
#     scorers.append(BERTScorer(lang='en', rescale_with_baseline = True, idf = True, 
#                               idf_sents = list(data['Tweet']), device = 'cuda:'+str(i)))


In [None]:
# BERTScore scorer used by the graph-building cell below
# (`lex_tfidf.build_graph_bert_score(scorer, ...)`). It has to be live code: with this
# definition commented out, `scorer` is never assigned and that cell fails with a
# NameError on a fresh kernel. IDF weights are computed from the cleaned tweets.
scorer = BERTScorer(lang='en', rescale_with_baseline = True, idf = True,
                    idf_sents = list(data['Tweet']), device = 'cuda:0')

In [None]:
%%time
# Wrap the cleaned tweets and the LSH index in a Lexrank object and build its sentence
# graph with BERTScore similarities.
# Requires `scorer` to already exist in the kernel - this cell does not create it
# (see the BERTScorer cells above).
# NOTE(review): nJobs / search_radius / sim_thres semantics come from
# fast_lexrank.Lexrank.build_graph_bert_score - confirm there.
lex_tfidf = Lexrank(np.array(data['Tweet']), lsh_tfidf)
lex_tfidf.build_graph_bert_score(scorer, nJobs = 4, search_radius = 0, sim_thres = 0.0)

In [16]:
# Run the LexRank scoring on the built graph (progress is printed every 10 iterations).
# NOTE(review): damping_factor is presumably the PageRank-style damping term - confirm
# against fast_lexrank.Lexrank.train.
lex_tfidf.train(lexrank_iter = 100, damping_factor = 0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [21]:
# Select 20 summary sentences; the returned ids are used below to index
# lex_tfidf.graph, lex_tfidf.scores and data.iloc.
# NOTE(review): cosine_thres presumably rejects candidates too similar to an already
# selected sentence (see the "is similar to" log line) - confirm in fast_lexrank.
sentIds = lex_tfidf.extract_summary(n_sents = 20, cosine_thres=0.05)

Extracting sentences....
Sent scores: 105175
selected one: 2308, 0.00016267717545550078
Sent 2232 is similar to a 2308: 0.06183760240674019
selected one: 6554, 0.0001435023933956082
selected one: 9699, 0.00013920528327345002
selected one: 9266, 0.0001366544223113021
selected one: 1274, 0.00013273544847640603
selected one: 11341, 0.00012631667204310456
selected one: 10777, 0.00012625004801884056
selected one: 214, 0.00012066388655633267
selected one: 3896, 0.00012033727094492562
selected one: 8132, 0.000119573180849666
selected one: 1739, 0.00011814550255408933
selected one: 4945, 0.00011669149593890295
selected one: 8771, 0.00011033140623668407
selected one: 8934, 0.00010782450588396402
selected one: 8536, 0.00010773261463037123
selected one: 5200, 0.00010726311248413626
selected one: 7450, 0.00010631571531539408
selected one: 9432, 0.00010585763972435693
selected one: 20323, 0.00010549149471261904
selected one: 373, 0.00010543462928375536


In [22]:
# Tabulate the selected sentences: rank, sentence id, number of graph neighbours, score.
# NOTE(review): the header carries three labels while each row prints four values
# (the rank comes first).
header = ("Id", "#adjacentEdges", "lexrank")
print(*header)
for rank, sent_id in enumerate(sentIds):
    n_edges = len(lex_tfidf.graph[sent_id])
    print(rank, sent_id, n_edges, lex_tfidf.scores[sent_id])

Id #adjacentEdges lexrank
0 2308 243 0.00016267717545550078
1 6554 344 0.0001435023933956082
2 9699 169 0.00013920528327345002
3 9266 193 0.0001366544223113021
4 1274 201 0.00013273544847640603
5 11341 278 0.00012631667204310456
6 10777 193 0.00012625004801884056
7 214 149 0.00012066388655633267
8 3896 219 0.00012033727094492562
9 8132 190 0.000119573180849666
10 1739 279 0.00011814550255408933
11 4945 342 0.00011669149593890295
12 8771 186 0.00011033140623668407
13 8934 279 0.00010782450588396402
14 8536 256 0.00010773261463037123
15 5200 146 0.00010726311248413626
16 7450 135 0.00010631571531539408
17 9432 163 0.00010585763972435693
18 20323 226 0.00010549149471261904
19 373 199 0.00010543462928375536


In [23]:
# with idf: show the cleaned text of every selected sentence next to its rank
for rank, sent_id in enumerate(sentIds):
    tweet_text = data.iloc[sent_id]['Tweet']
    print(rank, tweet_text)

0 president trump's " extreme vetting " plans is causing anxiety for u.s. muslims
1 breaking : refugees being detained at u.s. airports
2 growing fallout from trump's new immigration crackdown :
3 calls to ban muslims from entering the u.s. are offensive and unconstitutional .
4 obama’s open borders policy undone : trump reverses course
5 iran retaliates against trump order by banning u.s. visitors .
6 please oppose the immigrant ban .
7 link : trump's radical immigration plan : enforce the law .
8 as trump issues his order on refugees and immigration , pro-lifers march in d.c. will they speak out on this issue ?
9 muslim ban : refugees detained at u.s. airports under trump's immigration order ..
10 trump’s immigration actions reverse obama’s open borders policy
11 trump’s immigration ban is illegal
12 . signs executive order banning syrian refugees from entering the u.s.
13 protesters rally against trump's muslim immigration ban .
14 hey america ! the last six presidents have blocked 

In [26]:
len(lex_tfidf.graph[373]) # last selected tweet

199

In [28]:
len(lex_tfidf.graph[74531]) # the one selected by lex_tfidf, but not bert_score

16

In [None]:
#load lex_tfidf

In [14]:
# Restore a previously built Lexrank object from disk instead of rebuilding the graph.
# NOTE(review): pickle.load executes arbitrary code from the file - only load pickles
# produced by a trusted run of this project.
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
import pickle
with open('bert_score_lsh_tfidf.pkl', 'rb') as f:
    lex_tfidf = pickle.load(f)

In [None]:
lex_tfidf

In [8]:
# Index labels of the tweets whose cleaned text contains 'green card'.
# Later cells pass these labels to data.iloc, which is only correct while `data` has a
# default 0..n-1 index (labels equal positions).
green_card_idx = data[data['Tweet'].str.contains('green card')].index

In [9]:
green_card_idx

Int64Index([  1216,   2323,   2419,   2609,   2755,   2995,   3099,   3202,
              3259,   3385,
            ...
            102906, 102993, 103099, 103206, 103512, 103920, 104019, 104760,
            104876, 104916],
           dtype='int64', length=1446)

In [10]:
len(green_card_idx)

1446

In [11]:
green_card_idx[0]

1216

In [12]:
data.iloc[green_card_idx[0]]

Id                                            825064765261176832
Tweet          company sent out a notice about trump's muslim...
Tweet1         company sent notice trumps muslim ban green ca...
uniWPercent                                                   10
Name: 1216, dtype: object

In [13]:
# Output path that compute_bert appends one result line per scored tweet to
file="green_card_bertScore.txt"

In [14]:
def compute_bert(start_idx, batch, device, thres, batch_size=1000):
    """Count BERTScore neighbours for a slice of the 'green card' tweets.

    For every position in ``green_card_idx[start_idx:start_idx + batch]`` the tweet is
    scored against all tweets in ``data`` and against all 'green card' tweets; scores
    strictly above ``thres`` count as neighbours. One line per tweet is appended to
    ``file``: ``<index label>, <#neighbours overall>, <#neighbours among green-card
    tweets>, <ratio of the two rounded to 2 decimals>``.

    Relies on the notebook globals ``data``, ``green_card_idx`` and ``file``.

    Parameters
    ----------
    start_idx : int
        First position in ``green_card_idx`` to process.
    batch : int
        Number of positions to process; clipped at the end of ``green_card_idx``.
    device : int
        CUDA device number; the scorer is created on ``'cuda:<device>'``.
    thres : float
        Score threshold; the first element of ``scorer.score(...)`` is compared to it.
    batch_size : int, optional
        Number of tweets from ``data`` scored per ``scorer.score`` call (default 1000).

    Returns
    -------
    None
    """
    end_idx = min(start_idx + batch, len(green_card_idx))
    scorer = BERTScorer(lang='en', rescale_with_baseline = True, idf = False,
                        device = 'cuda:'+str(device))
    print("device: {}..running {}-{}".format(device, start_idx, end_idx))

    # Loop-invariant candidate texts, materialised once instead of once per scoring call.
    # green_card_idx holds index *labels*, so .loc is used: it selects the intended rows
    # whether or not `data` has a default 0..n-1 index.
    all_tweets = list(data['Tweet'])
    green_card_tweets = list(data.loc[green_card_idx, 'Tweet'])

    for idx in range(start_idx, end_idx):
        time_start = time.time()
        label = green_card_idx[idx]
        query = str(data.loc[label, 'Tweet'])

        # Neighbours among all tweets, scored in chunks of `batch_size`
        count = 0
        for i in range(0, len(all_tweets), batch_size):
            chunk = all_tweets[i:i + batch_size]
            scores = scorer.score([query] * len(chunk), chunk)[0]
            count += int((scores > thres).sum())

        # Neighbours among the 'green card' tweets only
        scores_green = scorer.score([query] * len(green_card_tweets), green_card_tweets)[0]
        count_green_card = int((scores_green > thres).sum())

        print(time.time() - time_start)

        # No tweet above the threshold would make the ratio a division by zero
        ratio = round(count_green_card / count, 2) if count else float('nan')
        with open(file, 'a') as f:
            f.write("{}, {}, {}, {}\n".format(label, count, count_green_card, ratio))
        

In [17]:
# Start offsets into green_card_idx, one per worker, in steps of 145; the step must stay
# equal to the `batch` value passed to compute_bert in the Parallel cell below.
indices = np.arange(0, len(green_card_idx), 145)

In [18]:
indices

array([   0,  145,  290,  435,  580,  725,  870, 1015, 1160, 1305])

In [19]:
from joblib import Parallel, delayed

In [20]:
# Launch one compute_bert job per start offset across 10 workers; job i is given
# CUDA device number i % 10 + 1 and a score threshold of 0.2.
batch = 145
bert_jobs = [
    delayed(compute_bert)(start, batch, i%10+1, 0.2)
    for i, start in enumerate(indices)
]
results_xx = Parallel(n_jobs = 10)(bert_jobs)

KeyboardInterrupt: 