# Unsupervised Tweet Summarization with Network Analysis

In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
import nltk
import networkx as nx
from collections import Counter
import eland as ed
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from numpy import dot
from numpy.linalg import norm
import matplotlib.pyplot as plt
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from networkx.algorithms import community
import pointofview as pov

# Show full tweet text in rendered frames. `None` means "no truncation";
# the legacy sentinel -1 is deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)



In [2]:
# Paths are resolved relative to the notebook's working directory:
# the project root is its parent, and saved models live in <root>/models.
project_dir = os.path.join(os.getcwd(), os.pardir)
models_dir = os.path.join(project_dir, 'models')

In [3]:
# Topic vocabulary: the names that appear in the per-tweet topic scores.
TERMS = [
    'sympathy',
    'complaint',
    'hope',
    'job',
    'relief measures',
    'compensation',
    'evacuation',
    'income',
    'ecosystem',
    'government',
    'corruption',
    'news updates',
    'volunteers',
    'donation',
    'mobile network',
    'housing',
    'farm',
    'utilities',
    'water supply',
    'power supply',
    'food supply',
    'medical assistance',
    'coronavirus',
    'petition',
    'poverty',
]

# Confidence a topic must exceed to be kept as a label for a tweet.
THRESHOLD = 0.7

## Import data from Elasticsearch

In [4]:
# Lazy Eland view over the 'twitter' index on the local Elasticsearch node,
# restricted to the fields used by the rest of the notebook.
ed_df = ed.DataFrame(
    'localhost',
    'twitter',
    columns=[
        'full_text', 'full_text_processed', 'sentiment', 'retweet_count',
        'lang', 'description', 'location', 'name',
    ],
)

# Bool query keeping only original tweets: is_retweet must be false and
# is_quote_status is filtered to false (drops retweets and quote tweets).
query_unique = {
    "bool": {
        "must": {"term": {"is_retweet": "false"}},
        "filter": {"term": {"is_quote_status": "false"}},
    }
}

# Run the query through Eland, then pull every matching record into an
# in-memory pandas DataFrame.
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [5]:
# Token count of the pre-processed text (whitespace-separated words).
# Missing text counts as zero tokens instead of raising AttributeError on
# `None.split()`, and the vectorised `.str` accessors avoid building a
# throw-away list per row.
df_tweets['length'] = (
    df_tweets['full_text_processed']
    .fillna('')
    .str.split()
    .str.len()
)

# Keep only tweets with more than 5 tokens; `.copy()` makes the result an
# independent frame so later column assignments do not hit a slice of the
# original (SettingWithCopyWarning).
df_tweets = df_tweets[df_tweets['length'] > 5].copy()

In [6]:
# (rows, columns) left after dropping tweets with 5 or fewer tokens.
df_tweets.shape

(99241, 9)

In [7]:
# Preview the first rows; the index holds the tweet ids.
df_tweets.head()

Unnamed: 0,full_text,full_text_processed,sentiment,retweet_count,lang,description,location,name,length
1262961673708675072,Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/CvPTrhe4r3,live cyclone amphan map tracking storm’s path,0.0,0,en,,,newspointpn,7
1262961660932894720,NYT Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/t6vyqfQSjS,nyt live cyclone amphan map tracking storm’s path,0.0,0,en,Get The Best,"Varanasi, India",Vishal Tripathi,8
1262961652359729152,"LIVE Now news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n120 km nearly south of Paradip (Odisha), \n200 km south-southwest of Digha (West Bengal) and \n360 km south-southwest of Khepupara (Bangladesh). https://t.co/xi9OImeXCe",live news update super cyclone amphan amphanupdate cycloneamphan amphancyclone cycloneamphanupdate 120 km nearly south paradip odisha 200 km southsouthwest digha west bengal 360 km southsouthwest khepupara bangladesh,0.5994,0,en,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/",India,MJ News,27
1262960808742522880,"আর কাছাকাছি চলে এলো সাইক্লোন ঝড় আমফান।\nLIVE Super Cyclone Amphan update\n--ওড়িশা থেকে ১২০ কিলোমিটার দূরে Paradip (Odisha),\n--পশ্চিমবঙ্গ থেকে ২০০ কিলোমিটার Digha (West Bengal)\n--বাংলাদেশ থেকে ৩৬০ কিলোমিটার Khepupara (Bangladesh).\n#বাংলাদেশ #বাংলা #পশ্চিমবঙ্গ #সাইক্লোন #আমফান https://t.co/vuGPh7rKca",cyclone ampan came closer live super cyclone amphan update pradip odisha 120 km orissa digha west bengal 200 km west bengal khepupara bangladesh 360 km bangladesh bangladesh bangla west bengal cyclone amphan,0.5994,0,und,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/",India,MJ News,32
1262937945214005248,"LIVE news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n125 km nearly south of Paradip (Odisha), \n225 km south-southwest of Digha (West Bengal) and \n380 km south-southwest of Khepupara (Bangladesh). https://t.co/RcApLEBp5K",live news update super cyclone amphan amphanupdate cycloneamphan amphancyclone cycloneamphanupdate 125 km nearly south paradip odisha 225 km southsouthwest digha west bengal 380 km southsouthwest khepupara bangladesh,0.5994,0,en,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/",India,MJ News,27


## Load the Tweet2Vec Model

In [11]:
## Load the trained tweet2vec (Doc2Vec) model from the models directory.
## NOTE(review): `docvecs`, `doctags` and `vectors_docs` look like the gensim 3.x
## attribute names -- confirm against the installed gensim version.
model = Doc2Vec.load(join(models_dir, 'tweet2VecJared.model'))

## Tweet ids, in the order of the model's doctag mapping
doc_tags = list(model.docvecs.doctags)

## Tweet vectors (one per tweet id; presumably row-aligned with `doc_tags` -- verify)
doc_vectors = model.docvecs.vectors_docs

In [12]:
# The two counts are expected to match: one vector per tweet id.
print(len(doc_tags), len(doc_vectors), sep='\n')

113342
113342


## Loading the tweet labels

In [13]:
# Load the per-tweet topic scores: orient='index' makes every top-level JSON key
# (a tweet id) one row, and each cell holds a [topic, score] pair.
# convert_axes=False keeps pandas from coercing the axis labels to another dtype,
# so the ids stay exactly as stored in the file.
df_labels = pd.read_json(join(models_dir,'zstc_labels.json'), orient='index', convert_axes=False)

In [14]:
# Peek at the raw [topic, score] pairs (one column per rank position).
df_labels.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
1264253979002843136,"[relief measures, 0.67]","[complaint, 0.63]","[poverty, 0.48]","[evacuation, 0.46]","[sympathy, 0.44]","[medical assistance, 0.32]","[income, 0.30000000000000004]","[housing, 0.29]","[petition, 0.23]","[corruption, 0.22]",...,"[food supply, 0.07]","[hope, 0.07]","[utilities, 0.05]","[news updates, 0.05]","[coronavirus, 0.04]","[farm, 0.03]","[donation, 0.03]","[volunteers, 0.02]","[government, 0.02]","[job, 0.01]"
1264253959918632960,"[relief measures, 0.5]","[job, 0.06]","[farm, 0.04]","[volunteers, 0.01]","[evacuation, 0.01]","[petition, 0.0]","[complaint, 0.0]","[sympathy, 0.0]","[compensation, 0.0]","[income, 0.0]",...,"[medical assistance, 0.0]","[mobile network, 0.0]","[power supply, 0.0]","[government, 0.0]","[poverty, 0.0]","[housing, 0.0]","[corruption, 0.0]","[food supply, 0.0]","[hope, 0.0]","[water supply, 0.0]"
1264253893632016384,"[government, 0.98]","[hope, 0.97]","[power supply, 0.93]","[sympathy, 0.86]","[ecosystem, 0.77]","[medical assistance, 0.72]","[complaint, 0.6900000000000001]","[relief measures, 0.67]","[petition, 0.5700000000000001]","[income, 0.44]",...,"[evacuation, 0.24]","[housing, 0.21]","[poverty, 0.2]","[job, 0.18]","[farm, 0.16]","[food supply, 0.16]","[coronavirus, 0.12]","[donation, 0.1]","[corruption, 0.08]","[volunteers, 0.0]"


In [15]:
def get_labels(tweet, threshold=THRESHOLD):
    """Return the topics of one tweet whose confidence exceeds ``threshold``.

    Args:
        tweet: Iterable of ``[topic_name, score]`` pairs for a single tweet,
            e.g. a list or one DataFrame row. Entries that are not a
            list/tuple (such as NaN padding) are ignored.
        threshold: Scores must be strictly greater than this value to be kept.

    Returns:
        A list of ``(topic_name, score)`` tuples with scores rounded to two
        decimals. If no topic clears the threshold, the list holds only the
        first pair of ``tweet`` (the top-ranked topic when the pairs are
        ordered by descending score). An input without any pair yields ``[]``.
    """
    # Materialise the pairs so the fallback below can index by position;
    # `tweet[0]` on a pandas Series would be a label lookup, not a positional one.
    entries = [entry for entry in tweet if isinstance(entry, (list, tuple))]

    topics = [
        (entry[0], np.round(entry[1], 2))
        for entry in entries
        if entry[1] > threshold
    ]

    # Never leave a tweet unlabelled: fall back to its first pair, rounded the
    # same way as the thresholded scores.
    if not topics and entries:
        topics.append((entries[0][0], np.round(entries[0][1], 2)))
    return topics

In [16]:
# Collapse the per-rank topic columns into a single 'labels' column holding the
# (topic, score) tuples selected by get_labels for each tweet.
df_labels = (
    df_labels
    .apply(lambda row: get_labels(row, THRESHOLD), axis=1)
    .to_frame(name='labels')
)

In [17]:
# Each row now holds the list of (topic, score) tuples kept for that tweet.
df_labels.head()

Unnamed: 0,labels
1264253979002843136,"[(relief measures, 0.67)]"
1264253959918632960,"[(relief measures, 0.5)]"
1264253893632016384,"[(government, 0.98), (hope, 0.97), (power supply, 0.93), (sympathy, 0.86), (ecosystem, 0.77), (medical assistance, 0.72)]"
1264253882580045824,"[(income, 0.75)]"
1264253658763612160,"[(complaint, 0.94), (relief measures, 0.93), (petition, 0.88), (donation, 0.87), (sympathy, 0.86)]"


## Merging the Tweets with Labels

In [18]:
# Attach the topic labels to the tweets by matching on the index (tweet id);
# tweets without labels, and labels without tweets, are dropped by the default
# inner join.
df_tweet_labels = df_tweets.merge(df_labels, left_index=True, right_index=True)
df_tweet_labels.head(3)

Unnamed: 0,full_text,full_text_processed,sentiment,retweet_count,lang,description,location,name,length,pov,labels
1262961673708675072,Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/CvPTrhe4r3,live cyclone amphan map tracking storm’s path,0.0,0,en,,,newspointpn,7,,"[(sympathy, 0.11)]"
1262961660932894720,NYT Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/t6vyqfQSjS,nyt live cyclone amphan map tracking storm’s path,0.0,0,en,Get The Best,"Varanasi, India",Vishal Tripathi,8,,"[(utilities, 0.25)]"
1262961652359729152,"LIVE Now news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n120 km nearly south of Paradip (Odisha), \n200 km south-southwest of Digha (West Bengal) and \n360 km south-southwest of Khepupara (Bangladesh). https://t.co/xi9OImeXCe",live news update super cyclone amphan amphanupdate cycloneamphan amphancyclone cycloneamphanupdate 120 km nearly south paradip odisha 200 km southsouthwest digha west bengal 360 km southsouthwest khepupara bangladesh,0.5994,0,en,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/",India,MJ News,27,,"[(news updates, 1.0)]"


In [19]:
# (rows, columns) of the merged tweets + labels frame.
df_tweet_labels.shape

(99241, 11)

In [20]:
# Keep English tweets only. `.copy()` detaches the result from the unfiltered
# frame so the column assignment below writes to a real frame rather than a
# slice (avoids SettingWithCopyWarning / silently lost writes).
df_tweet_labels = df_tweet_labels[df_tweet_labels['lang'] == 'en'].copy()

# Point of view of each tweet as reported by pointofview.get_text_pov.
df_tweet_labels['pov'] = df_tweet_labels['full_text'].apply(pov.get_text_pov)

# Keep first-person tweets only; copied again because later cells add columns.
df_tweet_labels = df_tweet_labels[df_tweet_labels['pov'] == 'first'].copy()

## Filter on Label

In [31]:
# Topic to summarise; compared against the topic names stored in each tweet's `labels`.
LABEL = 'housing'

In [32]:
# Mark tweets carrying LABEL: keep their label list, NaN for all other tweets.
df_tweet_labels['labels_list'] = df_tweet_labels['labels'].apply(
    lambda pairs: pairs if any(pair[0] == LABEL for pair in pairs) else np.nan
)

# Keep only the marked tweets and the columns needed for summarisation.
df_label = df_tweet_labels.loc[
    df_tweet_labels['labels_list'].notnull(),
    ['full_text_processed', 'full_text', 'length', 'sentiment',
     'retweet_count', 'labels', 'pov', 'location'],
]

In [33]:
# Preview the tweets tagged with LABEL.
df_label.head()

Unnamed: 0,full_text_processed,full_text,length,sentiment,retweet_count,labels,pov,location
1262961420364386304,dharmorakshathi yes sri im well safe little rain amphan effect everyone stay indoors safe🙏,@DharmoRakshathi Yes Sri. I'm well and safe. Just a little rain here. Amphan effect. Everyone stay indoors and safe.🙏,14,0.7717,0,"[(relief measures, 0.95), (sympathy, 0.88), (housing, 0.72)]",first,Panem
1262959134036688896,hope পদ্মানদীর মাঝি safe amphan malicartoonist aamphan stayhomestaysafe,"I hope all ""পদ্মানদীর মাঝি"" will be safe from Amphan.\n@Malicartoonist \n#Aamphan #StayHomeStaySafe https://t.co/OKM4nXHaw1",8,0.7003,0,"[(hope, 1.0), (housing, 0.98), (medical assistance, 0.95), (relief measures, 0.88), (sympathy, 0.87), (petition, 0.82)]",first,
1262963583501172736,everything wa going fine got panicked think close window room hostel properly feel helpless amphan,"Everything was going fine until I got panicked to think it over again and AGAIN, ""DID I CLOSE THE WINDOW OF MY ROOM IN HOSTEL PROPERLY?"" I feel so helpless! #Amphan",15,-0.6369,0,"[(housing, 0.98), (complaint, 0.76)]",first,"Sherrinford, Britain"
1262970174971363328,kanaknews currentno inverter back paradeep please give u update amphan house wall ha fallen one side,"@kanak_news No current,no inverter back up in Paradeep. please give us more update about AMPHAN.\nHere my house wall has fallen in one side.",16,0.296,0,"[(housing, 0.99), (news updates, 0.96), (power supply, 0.95), (utilities, 0.82), (complaint, 0.73)]",first,paradeep
1262947966467407872,special control room cyclone amphan functional tomorrow 8am onwards case emergency please get touch u number 03322143024 03322141310 03322143230 whatsapp 9432624365 dial100 stayhomestaysafe wecarewedare,A special Control Room for cyclone #Amphan will be functional from tomorrow 8am onwards. \n\nIn case of any emergency please do get in touch with us on the below numbers\n033-2214-3024\n033-2214-1310\n033-2214-3230\nWhatsApp No. 9432624365\n#Dial100\n#StayHomeStaySafe\n#WeCareWeDare,24,0.4019,0,"[(relief measures, 0.74), (housing, 0.73)]",first,"Kolkata, India"


In [34]:
# Number of tweets tagged with LABEL (rows) and retained columns.
df_label.shape

(442, 8)

## Creating Summaries based on Similarity & Connected Components

In [35]:
# Build the id -> row lookup once. `doc_tags.index(idx)` scans the whole tag
# list for every tweet, which is quadratic; a dict lookup is constant time.
# `doc_tags` was built from dictionary keys, so every id maps to exactly one row.
tag_to_row = {tag: row for row, tag in enumerate(doc_tags)}

# Fail with a readable message (same exception type as `list.index`) when a
# selected tweet has no vector in the model.
missing_tags = [idx for idx in df_label.index if idx not in tag_to_row]
if missing_tags:
    raise ValueError(
        f"{len(missing_tags)} tweet id(s) have no vector in the model, e.g. {missing_tags[:3]}"
    )

# One vector per selected tweet, in the same row order as `df_label`.
label_vectors = np.array([doc_vectors[tag_to_row[idx]] for idx in df_label.index])

In [36]:
# Square matrix of cosine distances between every pair of selected tweet vectors
# (row/column i corresponds to row i of `label_vectors`); n_jobs=-1 uses all CPU cores.
cos_distances = pairwise_distances(label_vectors, n_jobs=-1, metric='cosine')

In [37]:
"""Given pairwise distances and length parameter, returns the summary"""
def create_summary(cos_distances , K):
    summary = [] ## Store the summary
    remove = set() ## Set of nodes to be ignored (iteratively)
    threshold = 0.1

    while len(summary)<K and threshold<=1:
        print('Threshold =',threshold)
        G = nx.Graph()   ## Create a new Graph
        result = np.where(cos_distances<=threshold)
        listOfCoordinates = list(zip(result[0], result[1]))
        listOfCoordinates = [item for item in listOfCoordinates if item[0]!=item[1]]

        ## Create graph edges
        for node_1, node_2 in listOfCoordinates:
            item_1 = df_label.iloc[node_1]
            item_2 = df_label.iloc[node_2]
            id_1 = item_1.name
            id_2 = item_2.name

            ## Check if node hasn't been removed
            if id_1 not in remove and id_2 not in remove:
                if not G.has_node(id_1):
                    G.add_node(
                        id_1,
                        node_size=np.log(item_1['length']),
                        sentiment=item_1['sentiment'],
                        labels=item_1['labels'],
                        text=item_1['full_text_processed']
                    )
                if not G.has_node(id_2):
                    G.add_node(
                        id_2,
                        node_size=np.log(item_2['length']),
                        sentiment=item_2['sentiment'],
                        labels=item_2['labels'],
                        text=item_2['full_text_processed']
                    )
                if not G.has_edge(id_1, id_2):
                    G.add_edge(id_1, id_2, weight=(1-cos_distances[node_1][node_2]))
        
        conn_components = nx.connected_components(G)
        conn_components = sorted(conn_components, key=len, reverse=True)  ## Sorting the connected components based on length or size
        print("Number of connected components =",len(conn_components))
        
        ## Sort each connected component based on the aggregated score of node_size and pick the highest scoring tweet from each             component. The other nodes are removed.
        for component in conn_components:
            if len(component)>=3 and len(summary)<K:
                node_pick = sorted(component, 
                            key=lambda x: (np.log(G.degree[x]+1)+G.nodes()[x]['node_size']), 
                            reverse=True)
                summary.append((node_pick[0], df_label.loc[node_pick[0]]['full_text'], 
                                G.nodes()[node_pick[0]]['sentiment'],
                                df_label.loc[node_pick[0]]['location']))
                remove.update(node_pick)
                
        threshold = threshold+0.1
    
    return summary

In [38]:
K = 50 ## Maximum number of tweets in the summary; fewer come back if the threshold reaches 1 first
summary = create_summary(cos_distances, K)

Threshold = 0.1
Number of connected components = 19
Threshold = 0.2
Number of connected components = 26
Threshold = 0.30000000000000004
Number of connected components = 44
Threshold = 0.4
Number of connected components = 20
Threshold = 0.5
Number of connected components = 15
Threshold = 0.6
Number of connected components = 1
Threshold = 0.7
Number of connected components = 2
Threshold = 0.7999999999999999
Number of connected components = 2
Threshold = 0.8999999999999999
Number of connected components = 1
Threshold = 0.9999999999999999
Number of connected components = 1


In [39]:
# Column names follow the order of the tuples returned by create_summary.
df_summary = pd.DataFrame(summary, columns=['tweet_id', 'full_text', 'sentiment', 'location'])

In [40]:
# The generated summary: one representative tweet per picked component.
df_summary

Unnamed: 0,tweet_id,full_text,sentiment,location
0,1264255236404686848,"@siddhagroup Plz don't fool people. We r residents of Siddha Galaxia Oceania block. We r suffering from poor quality windows, bedrooms of residents flooded during Amphan cyclone. Lifts are not working since Amphan cyclone. No update from Siddha when the lifts will be repaired. Shame on u.",-0.8589,"INDIA,KOLKATA."
1,1262757695284629504,A special Control Room for cyclone #Amphan will be functional from tomorrow 8am onwards. \n\nIn case of any emergency please do get in touch with us on the below numbers\n033-2214-3024\n033-2214-1310\n033-2214-3230\nWhatsApp No. 9432624365\n#Dial100\n#StayHomeStaySafe\n#WeCareWeDare,0.4019,"Rajarhat, Bidhan Nagar"
2,1264545617788973056,"Where is Home? What A Pandemic, Lockdown and Cyclone Amphan Taught us https://t.co/cCzcitjhA1",0.0,
3,1265892052450988032,@HYDTP @TelanganaDGP I have applied for epass to travel from Hyderabad to Kolkata on 23rd May but the status is still showing pending...it is very urgent as my house is damaged by amphan cyclone...can it be approved early or can the process be fast tracked?,0.1779,"Hyderabad, India"
4,1264601431484403712,Corona broke the livelihoods and Amphan Cyclone broke the household infrastructure of Bengal. It's time when we as a nation stand with Sundarban &amp; Bengal.\n\nWatch This Full Video of @aviksahaindia : https://t.co/DKnlwYzqDT,-0.5859,India
5,1265290103133540352,"I received these heart-breaking pictures from a Bangladeshi brother. Muslims in Cyclone Amphan affected area performed Eid's prayer within the water. Millions of people lost their house, property and crops. Many of them are shelterless, please pray for them. #CycloneAmphan https://t.co/ByJ2TjmLMH",-0.3182,"Gaya, India"
6,1264493723443818496,"@HardeepSPuri Please do not cancel the flights.We have made arrangements to travel from the origin to the airport and from the airport to the destination.Nobody is a fool who has booked a flight during such times, even we don't want to waste our money.The house broken by Amphan might be ours.",-0.2937,
7,1263791172415377408,"@satyaprad1 \n@osdmaodisha \n@rdmodisha \n@SRC_Odisha \nI inform you that the ODRAF Team has cleared partly the Tamarind tree on yesterday which is fallen on the roof of a domestic house by cyclone AMPHAN in Pakhar GP,Soro,Balasore and after 60 hours cleaning work is incomplete. https://t.co/IodU8e35eP",0.0772,Orissa
8,1263790833238552576,#MyReport | Soon our house became a pond and all of us in the family sat in the corner of a room just waiting and wishing for #AmphanCyclone to pass' narrates citizen journalist Ritankar Mazumder from North Kolkata. \nhttps://t.co/Wvpr4KH1eS,0.2263,
9,1263632356839227392,@narendramodi I lost my home 20.05.2020. because 'amphan' Please help me sir recovery my home . Now It is dengar to stay my house.I lost totaly my house .\nAddress\nSanjib kumar De\nS/o- Ratan kumar De\nVill-Rathikulgeria\nP.o- Barachu\nP.s- Kharagpur local\nDist-Paschim medinipur\nWest bengal\n721301 https://t.co/pU2K0AkdlB,-0.4939,"खड़गपुर, भारत"


In [160]:
 def save_graph(graph, file_name, seed=None):
     """Draw ``graph`` with a spring layout and save the picture to ``file_name``.

     Args:
         graph: networkx graph to draw (nodes, edges and node labels).
         file_name: Destination path passed to ``Figure.savefig``.
         seed: Optional seed forwarded to ``nx.spring_layout`` so the layout is
             reproducible; ``None`` keeps the layout random.

     Returns:
         None. The figure is closed after saving.
     """
     # Draw on an explicit figure/axes pair: `plt.figure(1)` could return a
     # different figure than the one just created.
     fig, ax = plt.subplots(figsize=(20, 20), dpi=80)
     try:
         ax.set_axis_off()
         pos = nx.spring_layout(graph, seed=seed)
         nx.draw_networkx_nodes(graph, pos, ax=ax)
         nx.draw_networkx_edges(graph, pos, ax=ax)
         nx.draw_networkx_labels(graph, pos, ax=ax)

         # Frame the whole layout. Spring-layout coordinates are centred on the
         # origin, so limits starting at 0 would cut off every node with a
         # negative coordinate. Skipped for an empty graph (nothing to frame).
         if pos:
             xs, ys = zip(*pos.values())
             margin = 0.05
             pad_x = margin * ((max(xs) - min(xs)) or 1.0)
             pad_y = margin * ((max(ys) - min(ys)) or 1.0)
             ax.set_xlim(min(xs) - pad_x, max(xs) + pad_x)
             ax.set_ylim(min(ys) - pad_y, max(ys) + pad_y)

         fig.savefig(file_name, bbox_inches="tight")
     finally:
         # `del fig` only drops the local name; closing releases the figure.
         plt.close(fig)