# Summarization:
1. K-means on sentence embeddings returned by SentenceTransformer
2. Biterm
3. LDA

In [2]:
import os
import sys

# Put the parent directory at the front of sys.path (only once) so that the
# project-local `utils` module imported right after this can be resolved.
nlp_path = os.path.abspath(os.pardir)
if sys.path.count(nlp_path) == 0:
    sys.path.insert(0, nlp_path)
from utils import tokenizeRawTweetText

In [3]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity
import torch
import re
import nltk
from nltk.corpus import stopwords
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string
import emoji
import nltk, time
# nltk.download('wordnet')
from collections import Counter
import pyLDAvis
from biterm.cbtm import oBTM 
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary # helper functions
# import warnings
# warnings.filterwarnings('ignore')


In [4]:
# read data
# (alternative input kept for reference: '/home/nguyen/data/processed_travel_ban.csv')
sandy_csv_path = '/home/ehoang/git/python/tweet_classification/data/CrisisLexT6/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv'
data = pd.read_csv(sandy_csv_path)
# Quick sanity check: first rows, then (rows, columns).
for preview in (data.head(), data.shape):
    print(preview)

               tweet id                                              tweet  \
0  '262596552399396864'  I've got enough candles to supply a Mexican fa...   
1  '263044104500420609'  Sandy be soooo mad that she be shattering our ...   
2  '263309629973491712'  @ibexgirl thankfully Hurricane Waugh played it...   
3  '263422851133079552'  @taos you never got that magnificent case of B...   
4  '262404311223504896'  I'm at Mad River Bar &amp; Grille (New York, N...   

       label  
0  off-topic  
1   on-topic  
2  off-topic  
3  off-topic  
4  off-topic  
(10008, 3)


In [6]:
# Give the three CSV columns stable names, then keep only the tweets
# labelled 'on-topic' and renumber the rows from 0.
data.columns = ['TweetId', 'Tweet', 'label']
is_on_topic = data['label'].eq('on-topic')
data = data.loc[is_on_topic].reset_index(drop=True)
data.shape

(6138, 3)

In [7]:
# First ten tweets as a plain Python list.
data['Tweet'].head(10).tolist()

['Sandy be soooo mad that she be shattering our doors and shiet #HurricaneSandy',
 'Neighborly duties. @Cory_Kennedy arrives to the rescue sporting some spelunking equipment #sandy @ 300 Squad http://t.co/QbpGdm3w',
 "I don't know how I'm getting back to Jersey since the trains and subways aren't running...",
 'Already flooded so much #SANDY @ Hoboken http://t.co/MPhft4a8',
 'On that note, i pray that everyone stays safe, and keeps a positive attitude ! #godisgood',
 "@codyfinz my house is creeking... Does that mean she's trying to break in?",
 'debating going home in prep for #sandy',
 "By 11am it's going to be 100% chance of rain #HurricaneSandy",
 '@newscaster we are 5 blocks from the water. First two blocks were evacuated. Sounds like a train just went by. Stay safe! Thanks.',
 "It's crazy out there, not gonna lie I'm kind of scared."]

In [8]:
# remove rt, @USER, URL and e-mail placeholders, emoji
def _clean_tweet(raw_text):
    """Normalise one raw tweet string; the steps run in the order listed below."""
    # Tokenise with the project helper and re-join on single spaces.
    text = ' '.join(tokenizeRawTweetText(raw_text))
    # Drop the placeholder tokens, then lowercase.
    for placeholder in ('TWEETMENTION', "HTTPURL", "EMAILADDRESS"):
        text = text.replace(placeholder, "")
    text = text.lower()
    # Collapse runs of spaces left behind by the removals.
    text = re.sub("  +", " ", text)
    # Strip leading "rt" marker(s) and a leading colon.
    text = re.sub("^ ?(rt ?)+", "", text)
    text = re.sub('^( ?: ?)', '', text)
    # NOTE(review): this relies on `emoji.UNICODE_EMOJI` supporting a
    # per-character membership test; confirm against the installed emoji version.
    return ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)


data['Tweet'] = data['Tweet'].apply(_clean_tweet)
print(data.head())

                TweetId                                              Tweet  \
0  '263044104500420609'  sandy be soooo mad that she be shattering our ...   
1  '263101347421888513'  neighborly duties . arrives to the rescue spor...   
2  '263298821189156865'  i don't know how i'm getting back to jersey si...   
3  '262914476989358080'          already flooded so much #sandy @ hoboken    
4  '262991999911743490'  on that note , i pray that everyone stays safe...   

      label  
0  on-topic  
1  on-topic  
2  on-topic  
3  on-topic  
4  on-topic  


In [9]:
# First ten cleaned tweets as a plain Python list.
data['Tweet'].head(10).tolist()

['sandy be soooo mad that she be shattering our doors and shiet #hurricanesandy',
 'neighborly duties . arrives to the rescue sporting some spelunking equipment #sandy @ 300 squad ',
 "i don't know how i'm getting back to jersey since the trains and subways aren't running ...",
 'already flooded so much #sandy @ hoboken ',
 'on that note , i pray that everyone stays safe , and keeps a positive attitude ! #godisgood',
 " my house is creeking ... does that mean she's trying to break in ?",
 'debating going home in prep for #sandy',
 "by 11am it's going to be 100% chance of rain #hurricanesandy",
 ' we are 5 blocks from the water . first two blocks were evacuated . sounds like a train just went by . stay safe ! thanks .',
 "it's crazy out there , not gonna lie i'm kind of scared ."]

In [10]:
def kMean(data, sentenceEmbs, n_clusters = 20, random_states = 10):
    """Cluster sentence embeddings with K-means and pick one representative per cluster.

    Parameters
    ----------
    data : object exposing ``.shape``
        Only used for the diagnostic shape print.
    sentenceEmbs : iterable of arrays, exposing ``.shape``
        One embedding per item; each item is flattened with ``.ravel()``.
        Presumably row ``i`` corresponds to row ``i`` of ``data`` -- verify
        against the caller.
    n_clusters : int
        Number of K-means clusters.
    random_states : int
        Forwarded to ``KMeans`` as ``random_state``.

    Returns
    -------
    labels : list of int
        Cluster id assigned to each embedding, in input order.
    closest_data : list of int
        Positions (into ``sentenceEmbs``) of the member nearest to each
        cluster centre. The list is passed through ``set`` so its order is
        NOT the cluster order. A cluster with no members contributes nothing.
    """
    print("data Shape: ", data.shape)
    print("Embedding shape: ", sentenceEmbs.shape)

    sentences = [x.ravel() for x in sentenceEmbs]
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_states)
    kmeans.fit(sentences)
    labels = kmeans.labels_.tolist()
    centers = np.array(kmeans.cluster_centers_)

    # Stack once (as float64, matching the zero-initialised buffer the
    # distances were previously computed on) instead of copying row by row
    # for every cluster.
    emb_matrix = np.asarray(sentences, dtype=np.float64)
    label_arr = np.asarray(labels)

    # find elements closest to the cluster centers
    closest_data = []
    for i in range(n_clusters):
        member_idx = np.flatnonzero(label_arr == i)
        if member_idx.size == 0:
            # Nothing to compare against; the distance helper would fail on
            # an empty candidate set.
            continue
        center_vec = centers[i].reshape(1, -1)
        closest, _ = pairwise_distances_argmin_min(center_vec, emb_matrix[member_idx])
        # Map the within-cluster position back to the global row position.
        closest_data.append(int(member_idx[closest[0]]))

    closest_data = list(set(closest_data))

    return labels, closest_data


## 1. K-means: BERT first-token embeddings

In [11]:
from utils import embedding_extraction as model

In [12]:
# Embed the cleaned 'Tweet' column with the project helper; the result feeds kMean below.
sentenceEmbs = model.get_bert_first_token_embeddings(
    data,
    cuda="cuda:2",
    dataColumn="Tweet",
    max_len=40,
)

Max_len (99% data): 37.0




Encoded data: 
                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...   
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   

                                           input_ids  \
0  [101, 7525, 2022, 17111, 9541, 5506, 2008, 201...   
1  [101, 11429, 2135, 5704, 1012, 8480, 2000, 199...   
2  [101, 1045, 2123, 1005, 1056, 2113, 2129, 1045...   
3  [101, 2525, 10361, 2061, 2172, 1001, 7525, 103...   
4  [101, 2006, 2008, 3602, 1010, 1045, 11839, 200...   

                                      token_type_ids  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [8]:
# NOTE(review): this cell overwrites `sentenceEmbs` with a pickle whose file
# name refers to the travel-ban data, while `data` is read from the Hurricane
# Sandy CSV, and its execution count (In [8]) is out of order with the cells
# around it. Confirm it is still meant to run here.
# pickle.load can execute arbitrary code; only open files you trust.
first_token_cache = '/home/nguyen/data/travel_ban_first_token_embeddings.pkl'
with open(first_token_cache, 'rb') as f:
    sentenceEmbs = pickle.load(f)

In [13]:
# kMean returns (labels, closest_data): the first value bound here is the
# per-row cluster label list, despite the variable being called cluster_size.
cluster_size, closest_data = kMean(data=data, sentenceEmbs=sentenceEmbs)

data Shape:  (6138, 4)
Embedding shape:  (6138, 768)


In [14]:
# Show each representative tweet next to its row position in `data`.
for row_pos in closest_data:
    tweet_text = data.iloc[row_pos]['Tweet']
    print(row_pos, str(tweet_text))

3843 the latest full disk image of earth puts hurricane sandy into perspective ... it's massive : #fra ...
5257 i can't tweet this enough .... stay safe everyone that will be affected by hurricane sandy . sending prayers .
4237 can we have a hurricane sandy day ? too much wind to be driving ...
3470 here's a copy of the lawsuit give msm a chance to make all the $ off you they can w/hurricane san ...
2201 hope that hurricane is causing toooo much harm ! \ #missyou
1820 real talk all my prayers go out to those dealing with hurricane sandy ! #staysafe
541 me & my new #pet turtle read this together ... #frankenstorm : hurricane pet emergency plan 
2334 nice game : i like that a hurricane ruining all your shit cant ruin your spirit . #positivity #newstuff #sandysa ...
2722 “ : amazing picture from ocean grove , new jersey . check these mofo waves ! #sandy #frankenstorm ” 
3234 hurricane party at the altman resident tomorrow wooooooooh
934 with all of the storms & other events that have left

##### 

* <b>Note:</b> the sub-events below appear to be carried over from the travel-ban dataset; they do not describe the Hurricane Sandy tweets printed above.
* <b>Sub-events captured:</b>
        1. protest at the airport
        2. Starbucks hires 10K refugees
        3. attorney general gets fired
        4. Trump’s deportation orders
        5. restricting refugees & all entrants from 7 countries
        6. Washington state will sue to stop Trump's immigration order
        7. Canada will accept the refugees
* <b>Lack</b>
        1. green card holders
        2. Trump's ban doesn't include Saudi Arabia
        3. Trump visits the UK
        4. Trump signs a new executive order
        5. Quebec City mosque shooting

## 2. K-means: SentenceTransformer embeddings

In [15]:
# Re-embed `data` with the project's sentence-transformers helper; this
# replaces the embeddings held in `sentenceEmbs`.
sentenceEmbs = model.get_sentence_transformers_embedings(
    data,
    cuda='cuda:2',
)

Len:  (1000, 768)
Len:  (2000, 768)
Len:  (3000, 768)
Len:  (4000, 768)
Len:  (5000, 768)
Len:  (6000, 768)
Len:  (6138, 768)


In [7]:
# NOTE(review): this cell overwrites `sentenceEmbs` with a pickle whose file
# name refers to the travel-ban data, while `data` is read from the Hurricane
# Sandy CSV, and its execution count (In [7]) is out of order with the cells
# around it. Confirm it is still meant to run here.
# pickle.load can execute arbitrary code; only open files you trust.
sentence_transformer_cache = '/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl'
with open(sentence_transformer_cache, 'rb') as f:
    sentenceEmbs = pickle.load(f)

In [16]:
# kMean returns (labels, closest_data): the first value bound here is the
# per-row cluster label list, despite the variable being called cluster_size.
cluster_size, closest_data = kMean(data=data, sentenceEmbs=sentenceEmbs)

data Shape:  (6138, 4)
Embedding shape:  (6138, 768)


In [17]:
# Show each representative tweet next to its row position in `data`.
for row_pos in closest_data:
    tweet_text = data.iloc[row_pos]['Tweet']
    print(row_pos, str(tweet_text))

769 according to this i live in a zone-free area . still got power & everything's working fine ! #lucky #staysafenyc 
4225 i'm ready for this hurricane . got my toilet paper , flashlight , and doritos . i should be aight ...
1670 huge . rt : rt the newest satellite view of hurricane sandy shows how massive this storm is : http ...
397 before and after #sandy @ breezy point , ny 
401 i'm at frankenstorm apocalypse - hurricane sandy ( new york , ny ) w/ 2852 others 
4894 what if gahgnam styke was actually a giant rain dance and we've brought this hurricane upon ourselves ... #sandy
3870 “ : r.i.p to the 65 victims who lost their lives because of hurricane sandy . rt for respect <3 ”
4005 i hope this hurricane don't take my power out ??
1462 hurricane sandy ain't even a scary name tho . i need a terrifying name like hurricane shanaynay or sum shit .
3517 we aren't even being hit with the hurricane and classes are still cancelled #soundsguccimynigga
5187 google has set up a “ crisis map ” 

##### 

* <b>Note:</b> the sub-events below appear to be carried over from the travel-ban dataset; they do not describe the Hurricane Sandy tweets printed above.
* <b>Sub-events captured:</b>
        1. protest at the airport
        2. attorney general gets fired
        3. Quebec City mosque shooting
        4. Trump visits the UK
* <b>Lack</b>
        1. Trump’s deportation orders
        2. restricting refugees & all entrants from 7 countries
        3. Washington state will sue to stop Trump's immigration order
        4. Canada will accept the refugees
        5. green card holders
        6. Starbucks hires 10K refugees
        7. Trump's ban doesn't include Saudi Arabia
        8. Trump signs a new executive order