In [1]:
from bert_score import BERTScorer
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from lsh import LSH
from fast_lexrank import Lexrank
import time, emoji, string
import re

In [2]:
# Load the pre-processed travel-ban tweets and show a preview plus the frame size.
# NOTE(review): the path is absolute and user-specific; adjust it when running elsewhere.
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head(), data.shape, sep='\n')

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())
# remove stopwords, punctuation
stopWords = stopwords.words('english')
data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWords))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '',  string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

# remove tweets #unique words less than haft of length
data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['uniWPercent']!=0]
# # remove tweets with lengths < 3, duplicates
while data['uniWPercent'].min() <=2:
    data = data[data['uniWPercent'] >2]
    data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
# # # remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.head())

                   Id                                              Tweet  \
0  824941360449015808  emergency rally against trump's muslim travel ...   
1  824941519857610752  theresa may has not apologized to trump for in...   
2  824941616314122240  trump's immigration ban excludes countries wit...   
3  824942056741167105  trump's immigration order expands the definiti...   
4  824942966875774976  alert : senator john mccain threatens action o...   

                                              Tweet1  uniWPercent  
0  emergency rally trumps muslim travel ban nyc 1...           10  
1  theresa may apologized trump insulting fails t...           11  
2  trumps immigration ban excludes countries busi...            9  
3  trumps immigration order expands definition cr...            6  
4  alert senator john mccain threatens action pre...            8  


In [4]:
# Rows/columns left after cleaning, filtering and de-duplication.
data.shape

(105175, 4)

In [5]:
# Keep the index labels of the surviving rows before the index is reset in the next cell.
remained_index = data.index

In [6]:
# Renumber rows 0..n-1 and discard the old index (drop=True), so labels equal positions.
data = data.reset_index(drop=True)

In [7]:
# Sanity check: resetting the index must not change the frame size.
data.shape

(105175, 4)

In [8]:
# Restrict the experiment to the first 1000 tweets.
data = data.head(1000)

In [9]:
# Extract TF-IDF vectors from the cleaned text (Tweet1); the vectorizer is fitted on
# exactly the rows it transforms. The printed shape is that of the resulting matrix.
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(1000, 3313)


In [10]:
# Build the LSH index over the TF-IDF matrix with the project-local `lsh` module.
# NOTE(review): num_bits=8 is presumably the hash-code length — confirm in lsh.py.
lsh_tfidf = LSH(tfidfData)
lsh_tfidf.train(num_bits = 8)


(1000,)


In [11]:
# BERTScore scorer for English with baseline rescaling; idf weighting is enabled and
# the (uncleaned-of-stopwords) Tweet column is passed as the idf sentences.
# NOTE(review): the device is hard-coded to 'cuda:3' — change it on machines without that GPU.
scorer = BERTScorer(lang='en', rescale_with_baseline = True, idf = True, idf_sents = list(data['Tweet']), device='cuda:3')

In [13]:
%%time
# Build the LexRank graph from the tweets, using the LSH index for candidate pairs and
# BERTScore (threshold sim_thres) for edge weights, via the project-local `fast_lexrank`.
# NOTE(review): the saved output of this cell ends in "IndexError: list index out of range";
# the cells below re-implement the graph construction by hand.
lex_tfidf = Lexrank(np.array(data['Tweet']), lsh_tfidf)
lex_tfidf.build_graph_bert_score(scorer, search_radius = 0, sim_thres = 0.1)

#buckets: 220
.......buck: 0, vec: 3


IndexError: list index out of range

In [24]:
# Tweet texts as a NumPy array so that a list of row positions can index it in one step.
sents = np.array(data['Tweet'])

In [17]:
# Candidate groups from the LSH index; each element is iterated below as a list of
# row positions into `sents`.
# NOTE(review): max_search_radius=1 presumably also merges bins one bit away — confirm in lsh.py.
buckets = lsh_tfidf.extract_nearby_bins(max_search_radius = 1)


In [18]:
# Number of candidate groups returned by the LSH lookup.
len(buckets)

248

In [31]:
# Build an undirected similarity graph over the tweets:
#   graph[u][v] == graph[v][u] == BERTScore F1 of tweets u and v (a plain float)
# for every pair that shares a bucket and scores above `sim_thres`.
print("#buckets: {}".format(len(buckets)))
k = 0  # number of buckets fully processed; useful if the run is interrupted
graph = {}
sim_thres = 0.1
for b in buckets:

    print("Bucket: ", b)
    # compute bert_score and build graph
    for i in range(len(b)-1):
        # Compare b[i] only with the tweets after it in the bucket, so every
        # unordered pair is scored once. `refs` must be defined here: relying on a
        # value left in the kernel breaks on a fresh run.
        refs = list(b[i+1:])
        _, _, f1 = scorer.score([sents[b[i]]]*len(refs), list(sents[refs]))
        # f1[j] is the score of (b[i], refs[j]); pair each score with its own
        # neighbour instead of indexing the bucket with the score position.
        for neighbour, score in zip(refs, f1.tolist()):
            if score > sim_thres:
                graph.setdefault(b[i], {})[neighbour] = score
                graph.setdefault(neighbour, {})[b[i]] = score
    k+=1

#buckets: 248
Bucket:  [48, 492, 723, 305, 522, 600, 202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 52, 372, 894, 468, 916]
[492, 723, 305, 522, 600, 202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 52, 372, 894, 468, 916]
[723, 305, 522, 600, 202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 52, 372, 894, 468, 916]
[305, 522, 600, 202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 52, 372, 894, 468, 916]
[522, 600, 202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 52, 372, 894, 468, 916]
[600, 202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 52, 372, 894, 468, 916]
[202, 486, 527, 998, 101, 250, 303, 347, 450, 632, 634, 661, 762, 16, 18, 493, 547, 810, 846, 863, 967, 5

KeyboardInterrupt: 