In [1]:
from bert_score import BERTScorer
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from lsh import LSH
from fast_lexrank import Lexrank
from numpy.linalg import norm
import time, emoji, string
from joblib import Parallel, delayed
# hide the loading messages
import scipy
import re
import warnings; warnings.simplefilter('ignore')

In [3]:
# read data
# data = pd.read_csv('/home/ehoang/hnt/data/processed_travel_ban.csv')
data = pd.read_csv('/home/ehoang/git/python/tweet_classification/data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv')
print(data.head())
print(data.shape)

               tweet id                                              tweet  \
0  '262596552399396864'  I've got enough candles to supply a Mexican fa...   
1  '263044104500420609'  Sandy be soooo mad that she be shattering our ...   
2  '263309629973491712'  @ibexgirl thankfully Hurricane Waugh played it...   
3  '263422851133079552'  @taos you never got that magnificent case of B...   
4  '262404311223504896'  I'm at Mad River Bar &amp; Grille (New York, N...   

       label  
0  off-topic  
1   on-topic  
2  off-topic  
3  off-topic  
4  off-topic  
(10008, 3)


In [5]:
data.columns = ['TweetId', 'Tweet', 'label']
data = data[data['label'] == 'on-topic']
data.shape

(6138, 3)

In [6]:
# remove rt, @USER, @URL, emoji
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))                              
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())
# remove stopwords, punctuation
stopWords = stopwords.words('english')
data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in stopWords))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(str.maketrans('', '',  string.punctuation)))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())

# remove tweets #unique words less than haft of length
data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
data = data[data['uniWPercent']!=0]
# # remove tweets with lengths < 3, duplicates
while data['uniWPercent'].min() <=2:
    data = data[data['uniWPercent'] >2]
    data['uniWPercent'] = data['Tweet1'].apply(lambda x: 0 if len(set(x.split(" ")))/len(x.split(" ")) <= 0.5 else len(x.split(" ")))
# # # remove duplicates
data.drop_duplicates(subset=['Tweet1'], keep='first', inplace = True)
print(data.head())

                 TweetId                                              Tweet  \
1   '263044104500420609'  sandy be soooo mad that she be shattering our ...   
5   '263101347421888513'  neighborly duties. @cory_kennedy arrives to th...   
7   '263298821189156865'  i don't know how i'm getting back to jersey si...   
10  '262914476989358080'  already flooded so much #sandy @ hoboken http:...   
12  '262991999911743490'  on that note, i pray that everyone stays safe,...   

       label                                             Tweet1  uniWPercent  
1   on-topic  sandy soooo mad shattering doors shiet hurrica...            7  
5   on-topic  neighborly duties corykennedy arrives rescue s...           12  
7   on-topic  know im getting back jersey since trains subwa...            9  
10  on-topic  already flooded much sandy hoboken httptcomphf...            6  
12  on-topic  note pray everyone stays safe keeps positive a...            9  


In [7]:
data.shape

(5563, 5)

In [8]:
remained_index = data.index

In [9]:
data = data.reset_index(drop=True)

In [10]:
data.shape

(5563, 5)

In [257]:
data = data.iloc[0:5000]

In [11]:
class Lexrank:
    """
    lexrank model combined with lsh & cosine similarity
    """

    def __init__(self, data):
        self.data = data
        self.graph = {}
        self.matrix = None
        self.scores = None
           

    def build_graph_bertscore(self, input_file, sim_thres=0.3):        
        
        with open(input_file, 'r') as f:
            for i, line in enumerate(f):
                if i == 4999:
                    break
                content = line[0:line.index('[')].split(',')
                idx = int(content[0])
                sim_scores = [float(x) for x in line[line.index('[')+1:len(line)-2].split(',')]
                if idx not in self.graph:
                    self.graph[idx] = {}
                for j, score in enumerate(sim_scores):
                    if score <=sim_thres:
                        continue
                    self.graph[idx][idx+j+1] = score
                    if idx+j+1 not in self.graph:
                        self.graph[idx+j+1] = {}
                    self.graph[idx+j+1][idx] = score
#                     print(idx, idx+j+1, score)
                if i%200 == 0:
                    print("Line: ", i, idx)

    def build_graph_cosine(self, sim_thres = 0.3, batch_size = 1000):
        for i in range(0, data.shape[0], batch_size):
            rightBound = i+batch_size
            if rightBound > data.shape[0]:
                rightBound = data.shape[0]
            
            sents = self.data[i: rightBound]  # get list of sentVecs
            
            num = np.dot(sents, sents.T)
            if scipy.sparse.issparse(sents):
                magnitude = norm(sents.toarray(), axis=1)
            else:
                magnitude = norm(sents, axis=1)
            den = np.dot(magnitude.reshape(-1, 1), magnitude.T.reshape(1, -1))

            cosine_matrix = np.array(num / den)
            indices = np.where(cosine_matrix > sim_thres)
            if len(indices[0]) == 0:
                continue
            for row, col in zip(indices[0], indices[1]):
                if row != col:  # ignore self-links
#                     matrix_indices.append([b[row], b[col]])
#                     weights.append(cosine_matrix[row][col])
#                     weights.append(1)

                    if i+row not in self.graph:
                        self.graph[i+row] = {}
                    if i+col not in self.graph:
                        self.graph[i+col] = {}
                        
                    self.graph[i+row][i+col] = cosine_matrix[row][col]
                    self.graph[i+col][i+row] = cosine_matrix[col][row]
        
    # using pagerank pagekage
    def page_rank(self, damping_factor=0.85):
        pr = pagerank(self.matrix, p=damping_factor)
        self.scores = pr

    def train(self, lexrank_iter=100, damping_factor=0.85):
        n = self.data.shape[0]

        # for each node: compute sum of weights of adjacent nodes
        sum_weights = {}
        for sent, adj in self.graph.items():
            sum_weights[sent] = sum(adj.values())

        self.scores = [1 / n] * n  # initialize pagerank scores

        for iter in range(lexrank_iter):
            if iter % 10 == 0:
                print("Iteration: {}".format(iter))
            for sent, adjs in self.graph.items():
                score = 0
                for adj, value in adjs.items():
                    score += self.scores[adj] * value / sum_weights[adj]
                self.scores[sent] = (1 - damping_factor)/n +damping_factor * score

    def extract_summary(self, n_sents=10, cosine_thres=0.5, max_sent=100):

        sentIds = []
        sentScores = np.array(self.scores.copy())

        print("Extracting sentences....")
        # get #max_sent maximal scores along with its indices
        print("Sent scores: {}".format(len(sentScores)))

        indices = np.argpartition(sentScores, -max_sent)[-max_sent:]
        values = sentScores[indices]
        max_index_value = {key: value for key, value in zip(indices, values)}
        max_index_value = sorted(max_index_value.items(), key=lambda x: (x[1], x[0]))

        i = 0
        while i < n_sents:
            index, value = max_index_value.pop()
            if index not in self.graph:
                print("Sent {} not in graph".format(index))
                continue
            assign = 1
            # iterate selected sentences
            for idx in sentIds:
                # if new index is not an ajdacent node of the selected one
                if idx not in self.graph[index]:
                    continue
                similarity = self.graph[index][idx]
                if similarity > cosine_thres:
                    print("Sent {} is similar to a {}: {}".format(index, idx, similarity))
                    assign = 0
                    break
            if assign == 1:
#                 print(i, ", ", 'TweetId: ', self.data.iloc[index]['Id'], ": ", self.data.iloc[index]['Tweet'])
                print("selected one: {}, {}".format(index, value))
                sentIds.append(index)
                i += 1
        return sentIds

In [12]:
lex = Lexrank(data)

In [14]:
lex.build_graph_bertscore(input_file = "files/bertscore_sandy.txt", sim_thres=0.15)

Line:  0 40
Line:  200 200
Line:  400 400
Line:  600 600
Line:  800 800
Line:  1000 1000
Line:  1200 1200
Line:  1400 1400
Line:  1600 1600
Line:  1800 1800
Line:  2000 2000
Line:  2200 2200
Line:  2400 2400
Line:  2600 2600
Line:  2800 2800
Line:  3000 3000
Line:  3200 3200
Line:  3400 3400
Line:  3600 3600
Line:  3800 3801
Line:  4000 3978
Line:  4200 4207
Line:  4400 4396
Line:  4600 4537
Line:  4800 4776


In [15]:
lex.train(lexrank_iter=100, damping_factor=0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [16]:
bertscore_dict = {}
for key, value in lex.graph.items():
    bertscore_dict[key] = len(value)


In [17]:
bertscore_dict = {k: v for k, v in sorted(bertscore_dict.items(), key=lambda item: item[1], reverse = True)}

In [18]:
bertscore_dict

{99: 2249,
 620: 1203,
 350: 1172,
 378: 1153,
 1030: 1135,
 327: 1127,
 246: 1045,
 1175: 1042,
 3259: 1040,
 1294: 1016,
 1348: 999,
 1121: 992,
 997: 983,
 2849: 981,
 1606: 975,
 519: 957,
 1633: 954,
 728: 937,
 849: 929,
 2111: 885,
 130: 879,
 1428: 819,
 248: 818,
 2019: 816,
 2069: 811,
 986: 803,
 1181: 794,
 2258: 786,
 2427: 780,
 2132: 778,
 1205: 775,
 2178: 761,
 201: 758,
 2143: 752,
 6: 747,
 959: 744,
 2368: 736,
 1538: 733,
 1873: 703,
 1417: 689,
 2881: 684,
 2113: 683,
 1159: 670,
 2487: 662,
 3053: 649,
 751: 642,
 3161: 630,
 2976: 614,
 1331: 602,
 3323: 602,
 1430: 600,
 2644: 596,
 3096: 593,
 3843: 583,
 2693: 574,
 2335: 571,
 3622: 558,
 1861: 556,
 2562: 554,
 881: 547,
 2024: 544,
 806: 543,
 2200: 534,
 1147: 527,
 3332: 516,
 3913: 498,
 3052: 497,
 3094: 494,
 1940: 489,
 2353: 480,
 2707: 480,
 2105: 475,
 3953: 473,
 1997: 472,
 3208: 469,
 4048: 467,
 3821: 467,
 2280: 456,
 1938: 453,
 4226: 449,
 4105: 446,
 3684: 443,
 4055: 443,
 211: 439,
 1605

In [19]:
count = 0
selected = []
for key, value in bertscore_dict.items():
    
    if count>0:
        added = True
        for k in selected:
            if k in lex.graph[key]:
                if lex.graph[key][k]>0.18:
                    added = False
                    break
        
        if added==True:
            selected.append(key)
            count+=1
            print(count, ".", key, str(data.iloc[key]['Tweet']))
    else:
        selected.append(key)
        count+=1
        print(key, str(data.iloc[key]['Tweet']))
            
    if count > 20: 
        break

99 attention everyone i lost power
2 . 350 she's prepared for the hurricane :d http://t.co/n6imkdfb
3 . 1538 hurricane sandy map shows storm's path towards u.s. northeast http://t.co/ddehpmzy
4 . 1605 u.k. stocks fall on potential hurricane sandy costs http://t.co/fqw6us4a
5 . 3860 hurricane sandy to put 50 million people at risk http://t.co/xvlaxaik
6 . 334 #sandy #longbranch #hurricane @ long branch beach http://t.co/r8srs7ho
7 . 740 the hurricane sucks but these tweets are hilarious
8 . 2135 i'm so ready for this hurricane. this will be awesome.
9 . 1497 praying for the families on the east coast that are affected by hurricane sandy
10 . 276 i feel like this reporter needs to get out of the waist-high water he's in. #hurricanesandy http://t.co/tmqzkfw4
11 . 901 i hope everyone stays safe in tthe storm .
12 . 821 priorities. @ frankenstorm apocalypse - hurricane sandy http://t.co/ufsinzih
13 . 1469 my prayers go out to all those affected by hurricane #sandy
14 . 198 i'm at frankensto

In [20]:
sentIds = lex.extract_summary(n_sents=20, cosine_thres=0.18, max_sent=1500)

Extracting sentences....
Sent scores: 5563
selected one: 99, 0.020362938570255562
Sent 2849 is similar to a 99: 0.24236867
Sent 3259 is similar to a 99: 0.22208646
Sent 248 is similar to a 99: 0.284607
Sent 1417 is similar to a 99: 0.2097813
Sent 751 is similar to a 99: 0.2026759
selected one: 6, 0.004577398203453475
Sent 1331 is similar to a 99: 0.3155448
Sent 1147 is similar to a 99: 0.18579854
Sent 169 is similar to a 99: 0.24745522
Sent 935 is similar to a 99: 0.5884399
Sent 336 is similar to a 99: 0.26807207
selected one: 1030, 0.0028076002278925273
Sent 620 is similar to a 99: 0.21526583
Sent 350 is similar to a 6: 0.18906683
Sent 1876 is similar to a 99: 0.22974372
Sent 1656 is similar to a 99: 0.25457937
Sent 1038 is similar to a 99: 0.2897222
selected one: 740, 0.002532463970487118
Sent 1294 is similar to a 99: 0.18819058
Sent 378 is similar to a 6: 0.26080358
Sent 327 is similar to a 99: 0.20043525
Sent 389 is similar to a 99: 0.45595524
selected one: 2135, 0.0023856271511411

In [21]:
for idx in sentIds:
    print(idx, data.iloc[idx]['Tweet'])

99 attention everyone i lost power
6 debating going home in prep for #sandy
1030 preparing for hurricane sandy obvs @ cafeteria http://t.co/7nknjjfa
740 the hurricane sucks but these tweets are hilarious
2135 i'm so ready for this hurricane. this will be awesome.
1497 praying for the families on the east coast that are affected by hurricane sandy
1522 no classes tomorrow. clutch hurricane!!!
1469 my prayers go out to all those affected by hurricane #sandy
901 i hope everyone stays safe in tthe storm .
3440 when is the hurricane hitting us?
2065 knee surgery in a hurricane ??
211 flood watch issued for west new york, nj http://t.co/xi4smzjr
3861 this hurricane has got me wondering if the world is going to end.
1546 this is the least transparent hurricane in history.
3342 who names a hurricane #sandy
168 the struggle is real for the east coast right now.
11 one more shower before the power goes out lol
3232 that hurricane wasn't shit, honestly.
334 #sandy #longbranch #hurricane @ long br

# Tfidf

In [22]:
#extract tfidf vector
tfidf = TfidfVectorizer()
tfidfData = tfidf.fit_transform(data['Tweet1'])
print(tfidfData.shape)

(5563, 12759)


In [23]:
lex_tfidf = Lexrank(tfidfData)

In [24]:
lex_tfidf.build_graph_cosine(sim_thres = 0.3, batch_size = 1000)

In [25]:
lex_tfidf.train(lexrank_iter=100, damping_factor=0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [26]:
sentIds_cosine = lex_tfidf.extract_summary(n_sents=15, cosine_thres=0.32, max_sent=100)

Extracting sentences....
Sent scores: 5563
selected one: 4654, 0.0007415051510650309
selected one: 3884, 0.0007409162929365912
Sent 4587 is similar to a 4654: 0.6895422698793127
selected one: 935, 0.000673296762688381
selected one: 336, 0.0006604461953503624
selected one: 5077, 0.0005927192696921286
selected one: 1469, 0.0005377706145639321
selected one: 3452, 0.0005317420420181366
selected one: 2823, 0.0005110743244175504
selected one: 3647, 0.0005043781709206044
Sent 2077 is similar to a 2823: 0.5719794184426914
Sent 1903 is similar to a 1469: 0.5281151522125032
selected one: 3370, 0.0004698985784048764
selected one: 169, 0.0004468896492297515
Sent 1586 is similar to a 1469: 0.5701009469766751
selected one: 5520, 0.0004407903318076213
selected one: 4418, 0.00044071521298858
selected one: 1423, 0.00042753521092546914
selected one: 96, 0.00042753521092546914


In [27]:
for idx in sentIds_cosine:
    print(idx, data.iloc[idx]['Tweet'])

4654 our thoughts are with everyone affected by hurricane sandy. stay safe everyone!
3884 it was a hurricane in new york. ??????
935 and i just lost power... #sandy
336 i hope everyone is safe
5077 the hurricane #sandy is a bitch
1469 my prayers go out to all those affected by hurricane #sandy
3452 f u hurricane sandy you r ruining my plans
2823 my thoughts and prayers go out to all those affected by hurricane sandy. everyone stay safe out there.
3647 thoughts and prayers with those seriously affected by hurricane sandy ??
3370 now how do hurricane sandy got a twitter?
169 fuck sandy. im so bored.
5520 prayers for those affected by hurricane sandy
4418 praying for our friends on the east coast and all those affected by hurricane sandy!
1423 that hurricane aint gonna be shit
96 all i want is my power back


## bert sentence + cosine

In [302]:
# extracting bertsen embeddings
from sentence_transformers import SentenceTransformer
import torch

In [190]:
batch_size = 1000
modelSent = SentenceTransformer('bert-base-nli-mean-tokens', device='cuda:3')
embeddings = np.empty((0, 768))

leftBound = 0
with torch.no_grad():
    while  leftBound < data.shape[0]:
        rightBound = leftBound+batch_size
        if rightBound > data.shape[0]:
            rightBound = data.shape[0]
        embeddings = np.concatenate((embeddings, modelSent.encode(list(data.iloc[leftBound:rightBound]['Tweet']))), axis=0)
        leftBound += batch_size
        print("Len: ", embeddings.shape)

Len:  (1000, 768)
Len:  (2000, 768)
Len:  (3000, 768)
Len:  (4000, 768)
Len:  (5000, 768)
Len:  (5563, 768)


In [191]:
embeddings = np.array(embeddings)

In [192]:
lex_tfidf = Lexrank(embeddings)

In [193]:
lex_tfidf.build_graph_cosine(sim_thres = 0.78, batch_size = 1000)

In [194]:
lex_tfidf.train(lexrank_iter=100, damping_factor=0.85)

Iteration: 0
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90


In [199]:
sentIds_cosine = lex_tfidf.extract_summary(n_sents=15, cosine_thres=0.95, max_sent=100)

Extracting sentences....
Sent scores: 5563
selected one: 2049, 0.0009398526444886435
Sent 2056 is similar to a 2049: 0.9754064283235309
selected one: 2175, 0.0008315276926358679
selected one: 1215, 0.0008292351641471199
selected one: 2642, 0.0008085009316160764
selected one: 3757, 0.0007779894939829897
selected one: 3692, 0.0007684638926116131
selected one: 3222, 0.0007609810184126736
selected one: 4393, 0.0007590110479251326
selected one: 3219, 0.0007478553930113903
selected one: 5341, 0.0007394949521352687
selected one: 1171, 0.0007326094482242041
selected one: 3067, 0.0007287842355116134
selected one: 1500, 0.0007233649325520445
selected one: 1357, 0.0007157112723014797
selected one: 1374, 0.0007153152139621953


In [200]:
for idx in sentIds_cosine:
    print(idx, data.iloc[idx]['Tweet'])

2049 @lmao_twitpics: due to hurricane sandy n!ggas be like... http://t.co/srrunufz
2175 wow rt @thenotoriousniv: damn...the news just said hurricane sandy is gonna re-draw the shoreline! smh
1215 getting ready for hurricane sandy :) (@ frankenstorm apocalypse - hurricane sandy w/ 2656 others) http://t.co/95qhqudy
2642 tell me more about this devastating category 1 storm... ?? #hurricane #sandy #sigh http://t.co/jjor0e9z
3757 “@ifly_to_high “@lmao_twitpics due to hurricane sandy n!ggas be like... http://t.co/p7fmttde””
3692 @capitalweather: unbelievable before and after photos from hurricane #sandy: http://t.co/wgd3j8gk
3222 told u xd @therealshouq_ rt@ispeakcomedy: hurricane sandy's coming! http://t.co/xxuscxic
4393 @hurricannesandy what if hurricane sandy destroys you...
3219 @thereporters: news: sandy loses hurricane status & still dangerous will be referred to as superstorm sandy - eye nears landfal ...
5341 so how bad does hurricane sandy have to get before my work gets cancelled? 

In [201]:
bertscore_dict = {}
for key, value in lex_tfidf.graph.items():
    bertscore_dict[key] = len(value)


In [202]:
bertscore_dict = {k: v for k, v in sorted(bertscore_dict.items(), key=lambda item: item[1], reverse = True)}

In [203]:
bertscore_dict

{2049: 144,
 2056: 132,
 1215: 131,
 2642: 121,
 2175: 118,
 1500: 117,
 1171: 116,
 1374: 116,
 1357: 111,
 3757: 110,
 4393: 108,
 3692: 106,
 1124: 103,
 3222: 103,
 1675: 101,
 2168: 100,
 3219: 100,
 3458: 100,
 4190: 98,
 4749: 98,
 4295: 98,
 2414: 97,
 1616: 96,
 3906: 96,
 1518: 95,
 2926: 94,
 4122: 94,
 2401: 93,
 4513: 93,
 2353: 92,
 3824: 91,
 3540: 91,
 1411: 90,
 4274: 90,
 1098: 89,
 1590: 89,
 1806: 89,
 3677: 88,
 4974: 88,
 1134: 87,
 1404: 87,
 3700: 87,
 3394: 87,
 3795: 87,
 1085: 86,
 2123: 86,
 2719: 86,
 3286: 86,
 4348: 86,
 4774: 86,
 4758: 86,
 4765: 86,
 3351: 83,
 4908: 83,
 3067: 82,
 1407: 81,
 1749: 81,
 1872: 81,
 2945: 81,
 2179: 80,
 2863: 80,
 4211: 80,
 4560: 79,
 1009: 78,
 3881: 78,
 3096: 78,
 4866: 78,
 4954: 78,
 4953: 78,
 1543: 77,
 2265: 77,
 3328: 77,
 1848: 76,
 3834: 76,
 4853: 76,
 4632: 76,
 4102: 76,
 4956: 76,
 1899: 75,
 2104: 75,
 2412: 75,
 3737: 75,
 3768: 74,
 4875: 74,
 1990: 73,
 3679: 73,
 3044: 73,
 1786: 72,
 2133: 72,
 32

In [204]:
count = 0
selected = []
for key, value in bertscore_dict.items():
    
    if count>0:
        added = True
        for k in selected:
            if k in lex.graph[key]:
                if lex.graph[key][k]>0.18:
                    added = False
                    break
        
        if added==True:
            selected.append(key)
            count+=1
            print(count, ".", key, str(data.iloc[key]['Tweet']))
    else:
        selected.append(key)
        count+=1
        print(key, str(data.iloc[key]['Tweet']))
            
    if count > 15: 
        break

2049 @lmao_twitpics: due to hurricane sandy n!ggas be like... http://t.co/srrunufz
2 . 1215 getting ready for hurricane sandy :) (@ frankenstorm apocalypse - hurricane sandy w/ 2656 others) http://t.co/95qhqudy
3 . 2642 tell me more about this devastating category 1 storm... ?? #hurricane #sandy #sigh http://t.co/jjor0e9z
4 . 2175 wow rt @thenotoriousniv: damn...the news just said hurricane sandy is gonna re-draw the shoreline! smh
5 . 1500 hurricane sandy: the transformation. rt @reko_trill's photo http://t.co/xkqtp6fw
6 . 1374 @bj_hobbs: and here's olly with a look at what hurricane #sandy is bringing. http://t.co/pjzg0v0c
7 . 1357 @hurrrcanesandy: niggas be like hurricane #sandy http://t.co/vyibllbc
8 . 4393 @hurricannesandy what if hurricane sandy destroys you...
9 . 3222 told u xd @therealshouq_ rt@ispeakcomedy: hurricane sandy's coming! http://t.co/xxuscxic
10 . 2168 @wheeler03: bettman, hurricane sandy is comming for u. take the deal, @nonhllockout12
11 . 3219 @thereporters: new

# 2. Trying with first 5.000 tweets of the travel ban event

In [212]:
data = data.iloc[0:5000]

In [248]:
file = "bertscore_travel_ban.txt"
def compute_bert(start_idx, batch, device, thres):
   
    if start_idx + batch > data.shape[0]:
        end_idx = data.shape[0]
    else:
        end_idx = start_idx + batch
    scorer = BERTScorer(lang='en', rescale_with_baseline = True, idf = False, 
                               device = 'cuda:'+str(device))
    print("device: {}..running {}-{}".format(device, start_idx, end_idx))
    
    for idx in range(start_idx, end_idx):
        # compute bert score
        batch_size = 1000
        sim_score = []
        for i in range(idx+1, data.shape[0], batch_size):
            rightBound = i+batch_size
            if i + batch_size > data.shape[0]:
                rightBound = data.shape[0]
            sim = scorer.score([str(data.iloc[idx]['Tweet'])]*(rightBound -i), list(data.iloc[i:rightBound]['Tweet']))[0]
    
            sim_score+=list(sim.numpy())
        with open(file, 'a') as f:
            f.write("{},{},{}\n".format(idx, len(sim_score), str(sim_score)))


In [243]:
indices = np.arange(0, data.shape[0], 150)

In [244]:
indices

array([   0,  150,  300,  450,  600,  750,  900, 1050, 1200, 1350, 1500,
       1650, 1800, 1950, 2100, 2250, 2400, 2550, 2700, 2850, 3000, 3150,
       3300, 3450, 3600, 3750, 3900, 4050, 4200, 4350, 4500, 4650, 4800,
       4950])

In [246]:
indices=[4950]

In [249]:
batch = 150
results_xx = Parallel(n_jobs = 1)(delayed(compute_bert)(start, batch, i%10+1, 0.15) for i, start in enumerate(indices))

device: 1..running 4950-5000
