# Summarization:
1. K-means with sentence embeddings returned by SentenceTransformer
2. Biterm
3. LDA

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import cosine_similarity
import torch
import re
import nltk
from nltk.corpus import stopwords
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import string
import emoji
import nltk, time
nltk.download('wordnet')
from collections import Counter
import pyLDAvis
from biterm.cbtm import oBTM 
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary # helper functions
import warnings
warnings.filterwarnings('ignore')


unable to import 'smart_open.gcs', disabling that module
[nltk_data] Downloading package wordnet to /home/nguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the preprocessed travel-ban tweets, then show the first rows
# followed by the frame's (rows, columns) shape.
data = pd.read_csv('/home/nguyen/data/processed_travel_ban.csv')
print(data.head(), data.shape, sep='\n')

                   Id                                              Tweet
0  824941360449015808  RT @MENTION : Emergency Rally Against Trump's ...
1  824941519857610752  RT @MENTION : Theresa May has not apologized t...
2  824941616314122240  RT @MENTION : Trump's Immigration Ban Excludes...
3  824942056741167105  RT @MENTION : Trump's immigration order expand...
4  824942966875774976  ALERT : Senator John McCain Threatens Action O...
(123385, 2)


In [3]:
# Show the first ten raw tweets as a plain Python list.
data['Tweet'].iloc[:10].tolist()

["RT @MENTION : Emergency Rally Against Trump's Muslim Travel Ban in NYC , 1/25 at 5 p.m. @URL",
 'RT @MENTION : Theresa May has not apologized to Trump for insulting him . If she fails to do that today , Trump should just send her back to B …',
 "RT @MENTION : Trump's Immigration Ban Excludes Countries with Business Ties @URL via @MENTION #DemocracyFor …",
 'RT @MENTION : Trump\'s immigration order expands the definition of " criminal " @URL @URL',
 'ALERT : Senator John McCain Threatens Action On President Trump If He Does This @URL',
 "@MENTION @MENTION @MENTION @MENTION @MENTION Kiva still distracted while Trump gets on with people's business .",
 'RT @MENTION : TY @MENTION for bailing on GMB & @MENTION today . Piers Morgan drank the Trump Kool Aid & is a vocal opponent o …',
 'RT @MENTION : ✍🏻 #Trump to sign EO temporary ban suspending visas for Syria & six other ME , African countries #BuildTheWall 👍🏼 …',
 'RT @MENTION : Did we have a moral obligation to stop Hitler ? If so we ha

In [4]:
# Clean the tweet text: drop the @MENTION / @URL / @EMAIL placeholder tokens,
# lower-case, collapse runs of spaces, strip the leading retweet marker ("rt")
# and the ":" that follows it, and finally remove emoji.
def _strip_emoji(text):
    """Return `text` with emoji removed, tolerating different `emoji` package versions."""
    # Newer releases of `emoji` provide replace_emoji() and no longer expose the
    # UNICODE_EMOJI table, so prefer the function whenever it exists.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    known = emoji.UNICODE_EMOJI
    # Some releases nest the table by language code ({'en': {...}, ...}); testing a
    # character against that outer dict would never match, so unwrap it first.
    known = known.get('en', known)
    return ''.join(c for c in text if c not in known)


def _clean_tweet(text):
    """Apply every cleaning step to one tweet, in the same order as the original cell."""
    text = text.replace('@MENTION', "").replace("@URL", "").replace("@EMAIL", "").lower()
    text = re.sub("  +", " ", text)          # collapse runs of two or more spaces
    text = re.sub("^ ?(rt ?)+", "", text)    # leading retweet marker(s)
    text = re.sub('^( ?: ?)', '', text)      # the ":" left behind after "rt @MENTION :"
    return _strip_emoji(text)


data['Tweet'] = data['Tweet'].apply(_clean_tweet)
print(data.head())

                   Id                                              Tweet
0  824941360449015808  emergency rally against trump's muslim travel ...
1  824941519857610752  theresa may has not apologized to trump for in...
2  824941616314122240  trump's immigration ban excludes countries wit...
3  824942056741167105  trump's immigration order expands the definiti...
4  824942966875774976  alert : senator john mccain threatens action o...


In [5]:
# Show the first ten cleaned tweets as a plain Python list.
data['Tweet'].iloc[:10].tolist()

["emergency rally against trump's muslim travel ban in nyc , 1/25 at 5 p.m. ",
 'theresa may has not apologized to trump for insulting him . if she fails to do that today , trump should just send her back to b …',
 "trump's immigration ban excludes countries with business ties via #democracyfor …",
 'trump\'s immigration order expands the definition of " criminal " ',
 'alert : senator john mccain threatens action on president trump if he does this ',
 " kiva still distracted while trump gets on with people's business .",
 'ty for bailing on gmb & today . piers morgan drank the trump kool aid & is a vocal opponent o …',
 ' #trump to sign eo temporary ban suspending visas for syria & six other me , african countries #buildthewall  …',
 'did we have a moral obligation to stop hitler ? if so we have a moral obligation to stop trump .',
 'are these people just now getting radicalized by trump or did they always hate our freedom ? ']

In [6]:
def kMean(data, sentenceEmbs, n_clusters = 20, random_states = 10):
    """Cluster embeddings with K-means and pick one representative row per cluster.

    Parameters
    ----------
    data : object exposing ``.shape``
        Only its shape is printed; it is not otherwise read here.
        Presumably row-aligned with ``sentenceEmbs`` -- verify against the caller.
    sentenceEmbs : array-like exposing ``.shape``
        One embedding per row; each row is flattened with ``ravel()`` before clustering.
    n_clusters : int
        Number of clusters requested from ``KMeans``.
    random_states : int
        Seed forwarded to ``KMeans(random_state=...)``. It was previously ignored in
        favour of a hard-coded 0, so results differ from runs made before this fix.

    Returns
    -------
    labels : list of int
        Cluster id assigned to each row (these are labels, not cluster sizes).
    closest_data : list of int
        Row indices, sorted ascending, each being the member of one cluster that is
        nearest (Euclidean distance) to that cluster's center. Clusters with no
        members contribute nothing.
    """
    print("data Shape: ", data.shape)
    print("Embedding shape: ", sentenceEmbs.shape)

    # Flatten every embedding so the clustering input is a plain 2-D matrix.
    sentences = np.asarray([x.ravel() for x in sentenceEmbs])

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_states)
    kmeans.fit(sentences)
    label_arr = np.asarray(kmeans.labels_)
    centers = np.asarray(kmeans.cluster_centers_)

    # For each cluster, find the member row closest to the cluster center.
    closest_data = []
    for i in range(n_clusters):
        members = np.flatnonzero(label_arr == i)
        if members.size == 0:
            # Nothing was assigned to this cluster, so there is no representative.
            continue
        dists = np.linalg.norm(sentences[members] - centers[i], axis=1)
        closest_data.append(int(members[np.argmin(dists)]))

    # Each row belongs to exactly one cluster, so the indices are already unique;
    # sorting gives a deterministic order.
    return label_arr.tolist(), sorted(closest_data)


## 1. K-means: first-token BERT embeddings

In [8]:
# Load the precomputed first-token embeddings from disk into `sentenceEmbs`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open('/home/nguyen/data/travel_ban_first_token_embeddings.pkl', 'rb') as f:
    sentenceEmbs = pickle.load(f)

In [None]:
# NOTE(review): kMean returns (labels, closest_data). The first value is the list of
# per-row cluster labels, so `cluster_size` holds labels here, not cluster sizes.
cluster_size, closest_data = kMean(data, sentenceEmbs)

data Shape:  (123385, 2)
Embedding shape:  (123385, 768)


In [None]:
# Print each representative row index next to its cleaned tweet text.
for i in closest_data:
    print(i, str(data['Tweet'].iloc[i]))

## 2. K-means: SentenceTransformer embeddings

In [None]:
# Load the precomputed sentence-transformers embeddings, replacing the previous
# contents of `sentenceEmbs`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open('/home/nguyen/data/travel_ban_sentence_transformers_embeddings.pkl', 'rb') as f:
    sentenceEmbs = pickle.load(f)

In [None]:
# NOTE(review): kMean returns (labels, closest_data). The first value is the list of
# per-row cluster labels, so `cluster_size` holds labels here, not cluster sizes.
cluster_size, closest_data = kMean(data, sentenceEmbs)

In [None]:
# Print each representative row index next to its cleaned tweet text.
for i in closest_data:
    print(i, str(data['Tweet'].iloc[i]))