In [1]:
from bert_score import BERTScorer
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from lsh import LSH
from fast_lexrank import Lexrank
import time, emoji, string
from joblib import Parallel, delayed
# hide the loading messages
import re
import warnings; warnings.simplefilter('ignore')

In [2]:
# Load the labelled Hurricane Sandy tweets (CrisisLexT6 on-topic/off-topic CSV)
# and preview the first rows together with the frame's dimensions.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable constant.
data = pd.read_csv(
    '/home/ehoang/git/python/tweet_classification/data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv'
)
print(data.head(), data.shape, sep='\n')

               tweet id                                              tweet  \
0  '262596552399396864'  I've got enough candles to supply a Mexican fa...   
1  '263044104500420609'  Sandy be soooo mad that she be shattering our ...   
2  '263309629973491712'  @ibexgirl thankfully Hurricane Waugh played it...   
3  '263422851133079552'  @taos you never got that magnificent case of B...   
4  '262404311223504896'  I'm at Mad River Bar &amp; Grille (New York, N...   

       label  
0  off-topic  
1   on-topic  
2  off-topic  
3  off-topic  
4  off-topic  
(10008, 3)


In [3]:
# Rename the three CSV columns positionally (shown as 'tweet id', 'tweet',
# 'label' in the preview above) to the identifiers used in the rest of the
# notebook. The assignment is positional, so it relies on the column order.
data.columns = ['TweetId', 'Tweet', 'label']

In [4]:
# Keep only the rows whose label is exactly 'on-topic'; the original row
# labels (index values) are preserved by the boolean filter.
data = data[data['label'] == 'on-topic']
# Display (rows, columns) of the filtered frame.
data.shape

(6138, 3)

In [5]:
# --- 1. Normalise the raw text in 'Tweet' -----------------------------------
# Work on an explicit copy so the column assignments below write to an
# independent frame instead of to a slice of the previously filtered frame.
data = data.copy()

# Remove the literal '@MENTION' / '@URL' / '@EMAIL' placeholder tokens and
# lower-case the text.
# NOTE(review): only these placeholders are removed; real handles and links
# (e.g. '@cory_kennedy', 'http:...' in the preview below) are left in place.
data['Tweet'] = data['Tweet'].apply(lambda x: x.replace('@MENTION', "").replace("@URL", "").
                                    replace("@EMAIL", "").lower())
# Strip leading retweet markers ("rt", "rt rt", ...) and then a leading colon.
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("^ ?(rt ?)+", "", x))
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub('^( ?: ?)', '', x))
# Collapse runs of two or more spaces into one.
data['Tweet'] = data['Tweet'].apply(lambda x: re.sub("  +", " ", x))
# Drop every character found in emoji.UNICODE_EMOJI and trim the ends.
# NOTE(review): this assumes emoji.UNICODE_EMOJI is a flat collection of emoji
# characters; newer releases of the `emoji` package changed and later removed
# this attribute -- confirm against the installed version.
data['Tweet'] = data['Tweet'].apply(lambda x: ''.join(c for c in x if c not in emoji.UNICODE_EMOJI).strip())

# --- 2. Build 'Tweet1': stop-word- and punctuation-free text ----------------
stopWords = stopwords.words('english')
# Set for O(1) membership tests; `stopWords` itself stays a list because other
# cells may rely on that type.
_stopWordSet = frozenset(stopWords)
# Translation table built once instead of once per row.
_punctTable = str.maketrans('', '', string.punctuation)

data['Tweet1'] = data['Tweet'].apply(lambda x: ' '.join(y for y in x.split(" ") if y not in _stopWordSet))
data['Tweet1'] = data['Tweet1'].apply(lambda x: x.translate(_punctTable))
# Typographic quotes, ellipsis, dash and arrow are not in string.punctuation.
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub('“|…|’|‘|”|—|→', "", x))
data['Tweet1'] = data['Tweet1'].apply(lambda x: re.sub(' +', ' ',x).strip())


# --- 3. Filter repetitive, short and duplicate tweets -----------------------
def _informative_length(text):
    """Return the token count of `text`, or 0 when at most half of its tokens are unique."""
    tokens = text.split(" ")  # never empty: ''.split(" ") == ['']
    return 0 if len(set(tokens)) / len(tokens) <= 0.5 else len(tokens)


# Despite its name, 'uniWPercent' holds the token count of 'Tweet1'
# (0 flags a tweet whose unique-token ratio is <= 0.5).
data['uniWPercent'] = data['Tweet1'].apply(_informative_length)
# One filter covers both rules: 0 marks repetitive tweets and 1-2 marks tweets
# shorter than three tokens. 'Tweet1' does not change while filtering, so the
# values never need to be recomputed.
data = data[data['uniWPercent'] > 2]
# Remove duplicates on the cleaned text, keeping the first occurrence.
data = data.drop_duplicates(subset=['Tweet1'], keep='first')
print(data.head())

                 TweetId                                              Tweet  \
1   '263044104500420609'  sandy be soooo mad that she be shattering our ...   
5   '263101347421888513'  neighborly duties. @cory_kennedy arrives to th...   
7   '263298821189156865'  i don't know how i'm getting back to jersey si...   
10  '262914476989358080'  already flooded so much #sandy @ hoboken http:...   
12  '262991999911743490'  on that note, i pray that everyone stays safe,...   

       label                                             Tweet1  uniWPercent  
1   on-topic  sandy soooo mad shattering doors shiet hurrica...            7  
5   on-topic  neighborly duties corykennedy arrives rescue s...           12  
7   on-topic  know im getting back jersey since trains subwa...            9  
10  on-topic  already flooded much sandy hoboken httptcomphf...            6  
12  on-topic  note pray everyone stays safe keeps positive a...            9  


In [6]:
# Sanity check: (rows, columns) remaining after cleaning and de-duplication.
data.shape

(5563, 5)

In [7]:
# Remember the row labels of the surviving tweets before the index is
# renumbered; these labels still come from the originally loaded frame
# (e.g. 1, 5, 7, 10, 12 in the preview above).
remained_index = data.index

In [8]:
# Renumber the rows 0..n-1 and discard the old labels (drop=True).
# NOTE(review): presumably so that positional sentence ids used by the
# Lexrank class line up with row positions -- confirm.
data = data.reset_index(drop=True)

In [9]:
# Sanity check: resetting the index must not change (rows, columns).
data.shape

(5563, 5)

In [None]:
class Lexrank:
    """
    LexRank sentence-ranking model combined with LSH and cosine similarity.

    NOTE(review): this definition shadows the ``Lexrank`` imported from
    ``fast_lexrank`` at the top of the notebook -- confirm that is intended.

    Attributes:
        data:   collection of sentences; only ``data.shape[0]`` is read here.
        lsh:    LSH object supplied by the caller (stored, not used by the
                methods defined in this class).
        graph:  ``{sentence_id: {neighbour_id: similarity}}``. Sentence ids
                are used as integer positions into ``scores``.
        matrix: input for ``page_rank``; ``None`` until set by the caller.
        scores: per-sentence scores, filled by ``train`` or ``page_rank``.
    """

    def __init__(self, data, lsh):
        self.data = data
        self.lsh = lsh
        self.graph = {}
        self.matrix = None
        self.scores = None

    def build_graph(self, input_file, sim_thres=0.3):
        """
        Print the first comma-separated field of the first line of ``input_file``.

        NOTE(review): graph construction is not implemented yet. This method
        only peeks at the file; ``self.graph`` is left untouched and
        ``sim_thres`` is unused.

        Args:
            input_file: path of the text file to read.
            sim_thres: similarity threshold (currently unused).
        """
        with open(input_file, 'r') as f:
            for line in f:
                print(line.split(',')[0])
                # Only the first line is inspected.
                break

    # using pagerank package
    def page_rank(self, damping_factor=0.85):
        """
        Store in ``self.scores`` the result of ``pagerank(self.matrix, p=damping_factor)``.

        NOTE(review): ``pagerank`` is neither defined nor imported in the
        visible part of the notebook; presumably it comes from an external
        PageRank package -- confirm the import.

        Raises:
            ValueError: if ``self.matrix`` has not been set.
        """
        if self.matrix is None:
            raise ValueError("self.matrix must be set before calling page_rank()")
        self.scores = pagerank(self.matrix, p=damping_factor)

    def train(self, lexrank_iter=100, damping_factor=0.85):
        """
        Compute scores by iterating the weighted PageRank update over ``self.graph``.

        Scores are updated in place during each sweep, so later nodes in a
        sweep already see the new scores of earlier ones. Progress is printed
        every 10 iterations.

        Args:
            lexrank_iter: number of sweeps over the graph.
            damping_factor: weight of the neighbour contribution; the
                remaining ``1 - damping_factor`` is spread uniformly.
        """
        n = self.data.shape[0]
        if n == 0:
            # Nothing to rank; avoid dividing by zero below.
            self.scores = []
            return

        # For each node: sum of the weights of its adjacent nodes.
        sum_weights = {sent: sum(adj.values()) for sent, adj in self.graph.items()}

        self.scores = [1 / n] * n  # uniform initial scores

        for iteration in range(lexrank_iter):
            if iteration % 10 == 0:
                print("Iteration: {}".format(iteration))
            for sent, adjs in self.graph.items():
                score = 0
                for adj, value in adjs.items():
                    total = sum_weights[adj]
                    if total == 0:
                        # A neighbour whose edges all weigh 0 contributes
                        # nothing; skipping it avoids a ZeroDivisionError.
                        continue
                    score += self.scores[adj] * value / total
                self.scores[sent] = (1 - damping_factor) / n + damping_factor * score

    def extract_summary(self, n_sents=10, cosine_thres=0.5, max_sent=100):
        """
        Greedily pick the highest-scoring sentences that are not too similar to each other.

        The ``max_sent`` best-scoring sentences are considered from highest to
        lowest score (ties: larger id first). A candidate is skipped when it
        is missing from ``self.graph`` or when its similarity to an already
        selected sentence exceeds ``cosine_thres``.

        Args:
            n_sents: maximum number of sentences to select.
            cosine_thres: similarity above which a candidate is rejected.
            max_sent: size of the candidate pool; clamped to the number of
                available scores.

        Returns:
            List of selected sentence ids in selection order. It may hold
            fewer than ``n_sents`` ids when the candidate pool is exhausted.

        Raises:
            ValueError: if no scores have been computed yet.
        """
        if self.scores is None:
            raise ValueError("no scores available: call train() or page_rank() first")

        sentIds = []
        sentScores = np.array(self.scores)

        print("Extracting sentences....")
        print("Sent scores: {}".format(len(sentScores)))

        # np.argpartition rejects a kth outside the array bounds, so never ask
        # for more candidates than there are scores.
        n_candidates = min(max_sent, len(sentScores))
        if n_candidates <= 0 or n_sents <= 0:
            return sentIds

        indices = np.argpartition(sentScores, -n_candidates)[-n_candidates:]
        # Ascending by (score, id), so pop() yields the best remaining candidate.
        candidates = sorted(zip(indices, sentScores[indices]), key=lambda x: (x[1], x[0]))

        # Stop when enough sentences are selected or the pool runs dry.
        while len(sentIds) < n_sents and candidates:
            index, value = candidates.pop()
            if index not in self.graph:
                print("Sent {} not in graph".format(index))
                continue
            neighbours = self.graph[index]
            for idx in sentIds:
                # Selected sentences that are not adjacent cannot be similar.
                similarity = neighbours.get(idx)
                if similarity is not None and similarity > cosine_thres:
                    print("Sent {} is similar to a {}: {}".format(index, idx, similarity))
                    break
            else:
                print("selected one: {}, {}".format(index, value))
                sentIds.append(index)
        return sentIds