# Analysing top N topic labels for top N users

In [151]:
import os
from os.path import join
import eland as ed
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

project_dir = join(os.getcwd(), os.pardir)
models_dir = join(project_dir, 'models')

pd.set_option('display.max_colwidth', -1)

%config InlineBackend.figure_format = 'svg'

In [2]:
TERMS = ['resource availability', 'volunteers', 'power supply', 'relief measures', 
         'food supply', 'infrastructure', 'medical assistance', 'rescue', 'shelter', 
         'utilities', 'water supply', 'evacuation', 'government', 'crime violence', 
         'mobile network', 'sympathy', 'news updates', 'internet', 'grievance', 
         'livelihood', 'income', 'ecosystem', 'biodiversity', 'agriculture']
         
THRESHOLD = 0.6

## Import data from ES

In [119]:
ed_df = ed.DataFrame('localhost', 'twitter', columns=['user_id', 'name', 'description', 'full_text', 'verified', 'location', 'retweet_count', 'followers_count', 'sentiment'])

# defining the full-text query we need: Retrieving records for full_text_processed with the condition is_retweet=False and is_quote_status=False

query_unique = {
    "bool": {
        "must": {
            "term":{"is_retweet":"false"},
        },
        "filter": {
            "term":{"is_quote_status":"false"},
            "term":{"lang.keyword":"en"}
        },
    }
}
# using full-text search capabilities with Eland:
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas()

In [120]:
#df_tweets = df_tweets.reset_index().rename(columns={'index':'tweet_id'})

In [121]:
df_tweets.head()

Unnamed: 0,user_id,name,description,full_text,verified,location,retweet_count,followers_count,sentiment
1262961673708675072,1245962651630534656,newspointpn,,Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/CvPTrhe4r3,False,,0,2,0.0
1262961660932894720,926838660049158144,Vishal Tripathi,Get The Best,NYT Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/t6vyqfQSjS,False,"Varanasi, India",0,76,0.0
1262961652359729152,1108690827658711040,MJ News,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/","LIVE Now news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n120 km nearly south of Paradip (Odisha), \n200 km south-southwest of Digha (West Bengal) and \n360 km south-southwest of Khepupara (Bangladesh). https://t.co/xi9OImeXCe",False,India,0,48,0.5994
1262937945214005248,1108690827658711040,MJ News,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/","LIVE news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n125 km nearly south of Paradip (Odisha), \n225 km south-southwest of Digha (West Bengal) and \n380 km south-southwest of Khepupara (Bangladesh). https://t.co/RcApLEBp5K",False,India,0,48,0.5994
1262961522994806784,27608007,Sourabh Mathur,"Founder & CEO of @esanosys,\n #MarketingAutomation #MarTech #CRM #Marketing #Startup #Entrepreneur\nमुझे भारतीय होने पर गर्व है 🇮🇳",Stay safe Odisha and West Bengal #AmphanUpdates #Amphan #AmphanSuperCyclone https://t.co/VnWegOcgrD,False,Global,0,2473,0.4404


## Top N most retweeted users

In [122]:
N = 50
df_tweets.groupby(['user_id', 'name', 'description'])['retweet_count'].agg({'sum'}).nlargest(N, 'sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum
user_id,name,description,Unnamed: 3_level_1
18839785,Narendra Modi,Prime Minister of India,124315
939091,Joe Biden,"Senator, Vice President, 2020 candidate for President of the United States, husband to @DrBiden, proud father & grandfather. Loves ice cream, aviators & @Amtrak",42995
471741741,PMO India,Office of the Prime Minister of India,27667
52544275,Ivanka Trump,"Wife, mother, sister, daughter. Advisor to POTUS on job creation + economic empowerment, workforce development & entrepreneurship. Personal Pg. Views are my own",24152
3171712086,Rahul Gandhi,This is the official account of Rahul Gandhi | Member of the Indian National Congress| Member of Parliament,22718
122453931,Imran Khan,Prime Minister of Pakistan,19733
2455740283,MrBeast,I want to make the world a better place before I die.,18841
204832963,God,Unverified. Unverified. Thee/thou/thine. \n\nINTJehovah.\n\nOff through August.\n\nHoly relics for sale at http://tweetofgod.shop.capthat.com/store,18489
355989081,ANI,"Asian News International. Multi-media news agency, content for information platforms: TV, Internet, broadband, newspapers, mobiles https://t.co/PjGRhL4qvg",17114
292558545,Norbert Elekes,"Data storyteller, minimalist, compulsive list-maker. World news and the latest updates on coronavirus.",15866


## Aggregating the columns to create a popularity measure

In [123]:
df_users = df_tweets.groupby('user_id').agg({
            'name': lambda x: x.iloc[0],
            'followers_count': 'max',
            'retweet_count': 'sum',
            'sentiment': 'mean',
            'verified': lambda x: x.iloc[0],
            'description': lambda x: x.iloc[0],
        })

### Normalising the columns using Z-Score

In [124]:
cols = list(df_users.columns)
cols.remove('name')
cols.remove('description')

for col in cols:
    col_zscore = col + '_zscore'
    df_users[col_zscore] = (df_users[col] - df_users[col].mean())/df_users[col].std(ddof=0)

## Popularity Measure = Sum of Z-Scores (to be refined)

In [125]:
df_users['popularity'] = df_users['followers_count_zscore']+df_users['retweet_count_zscore']+df_users['verified_zscore']

In [126]:
N = 50
df_users.nlargest(N, 'popularity')[['name', 'description', 'sentiment', 'popularity']]

Unnamed: 0_level_0,name,description,sentiment,popularity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18839785,Narendra Modi,Prime Minister of India,0.428557,203.051266
25073877,Donald J. Trump,45th President of the United States of America🇺🇸,0.6486,94.913915
471741741,PMO India,Office of the Prime Minister of India,0.28691,70.064629
101311381,Shah Rukh Khan,,0.357333,66.985929
759251,CNN,It’s our job to #GoThere & tell the most difficult stories. Join us! For more breaking news updates follow @CNNBRK & download our app http://cnn.com/apps,0.199322,66.521153
939091,Joe Biden,"Senator, Vice President, 2020 candidate for President of the United States, husband to @DrBiden, proud father & grandfather. Loves ice cream, aviators & @Amtrak",0.0,61.171471
428333,CNN Breaking News,"Breaking news from CNN Digital. Now 58M strong. Check @cnn for all things CNN, breaking and more. Download the app for custom alerts: http://cnn.com/apps",0.0,56.39174
807095,The New York Times,News tips? Share them here: http://nyti.ms/2FVHq9v,-0.091964,54.310789
50393960,Bill Gates,Sharing things I'm learning through my foundation work and other interests.,0.5859,51.191212
31348594,Akshay Kumar,,0.7629,45.007026


## Loading the tweet labels & filtering on a threshold

In [127]:
df_labels = pd.read_json(join(models_dir,'zstc_labels.json'), orient='index', convert_axes=False)

In [128]:
df_labels.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
1264253979002843136,"[livelihood, 0.43]","[relief measures, 0.42]","[medical assistance, 0.31]","[grievance, 0.29]","[ecosystem, 0.28]","[evacuation, 0.14]","[sympathy, 0.1]","[resource availability, 0.08]","[shelter, 0.07]","[agriculture, 0.05]",...,"[rescue, 0.01]","[food supply, 0.01]","[mobile network, 0.0]","[power supply, 0.0]","[water supply, 0.0]","[utilities, 0.0]","[news updates, 0.0]","[volunteers, 0.0]","[government, 0.0]","[internet, 0.0]"
1264253893632016384,"[government, 0.8200000000000001]","[grievance, 0.72]","[crime violence, 0.59]","[ecosystem, 0.53]","[livelihood, 0.52]","[sympathy, 0.42]","[relief measures, 0.4]","[shelter, 0.33]","[news updates, 0.32]","[rescue, 0.28]",...,"[resource availability, 0.11]","[evacuation, 0.09]","[infrastructure, 0.08]","[mobile network, 0.08]","[power supply, 0.05]","[internet, 0.05]","[agriculture, 0.04]","[water supply, 0.02]","[food supply, 0.01]","[volunteers, 0.01]"
1264253882580045824,"[income, 0.91]","[livelihood, 0.88]","[grievance, 0.8200000000000001]","[sympathy, 0.73]","[shelter, 0.22]","[news updates, 0.13]","[evacuation, 0.1]","[infrastructure, 0.09]","[relief measures, 0.07]","[utilities, 0.05]",...,"[internet, 0.01]","[biodiversity, 0.01]","[government, 0.01]","[power supply, 0.01]","[ecosystem, 0.01]","[crime violence, 0.0]","[volunteers, 0.0]","[food supply, 0.0]","[water supply, 0.0]","[medical assistance, 0.0]"
1264253658763612160,"[grievance, 0.98]","[livelihood, 0.96]","[sympathy, 0.92]","[government, 0.88]","[relief measures, 0.8300000000000001]","[resource availability, 0.81]","[rescue, 0.71]","[shelter, 0.6900000000000001]","[income, 0.59]","[infrastructure, 0.59]",...,"[internet, 0.22]","[medical assistance, 0.1]","[mobile network, 0.07]","[biodiversity, 0.05]","[crime violence, 0.03]","[volunteers, 0.03]","[agriculture, 0.01]","[water supply, 0.0]","[food supply, 0.0]","[power supply, 0.0]"
1264253569525592064,"[grievance, 0.98]","[livelihood, 0.96]","[sympathy, 0.91]","[resource availability, 0.8300000000000001]","[relief measures, 0.81]","[rescue, 0.6900000000000001]","[income, 0.6900000000000001]","[government, 0.68]","[shelter, 0.68]","[infrastructure, 0.6000000000000001]",...,"[internet, 0.18]","[volunteers, 0.13]","[mobile network, 0.05]","[medical assistance, 0.05]","[crime violence, 0.03]","[biodiversity, 0.03]","[agriculture, 0.01]","[water supply, 0.0]","[food supply, 0.0]","[power supply, 0.0]"


In [129]:
def get_labels(tweet, threshold=THRESHOLD):
  topics = []
  for topic in tweet:
    topic_name, value = topic[0], topic[1]
    if value>threshold:
      topics.append((topic_name, np.round(value,2)))
  if not topics:
    topics.append((tweet[0][0], tweet[0][1]))
  return topics  

In [130]:
df_labels['labels'] = df_labels.apply(lambda x: get_labels(x, THRESHOLD), axis=1)

In [131]:
df_labels = df_labels[['labels']]

In [132]:
df_labels.head()

Unnamed: 0,labels
1264253979002843136,"[(livelihood, 0.43)]"
1264253893632016384,"[(government, 0.82), (grievance, 0.72)]"
1264253882580045824,"[(income, 0.91), (livelihood, 0.88), (grievance, 0.82), (sympathy, 0.73)]"
1264253658763612160,"[(grievance, 0.98), (livelihood, 0.96), (sympathy, 0.92), (government, 0.88), (relief measures, 0.83), (resource availability, 0.81), (rescue, 0.71), (shelter, 0.69)]"
1264253569525592064,"[(grievance, 0.98), (livelihood, 0.96), (sympathy, 0.91), (resource availability, 0.83), (relief measures, 0.81), (rescue, 0.69), (income, 0.69), (government, 0.68), (shelter, 0.68), (infrastructure, 0.6)]"


## Merging the tweet labels and user information

In [133]:
df_tweet_labels = pd.merge(df_tweets, df_labels, left_index=True, right_index=True)

In [134]:
df_tweet_labels.head()

Unnamed: 0,user_id,name,description,full_text,verified,location,retweet_count,followers_count,sentiment,labels
1262961673708675072,1245962651630534656,newspointpn,,Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/CvPTrhe4r3,False,,0,2,0.0,"[(livelihood, 0.29)]"
1262961660932894720,926838660049158144,Vishal Tripathi,Get The Best,NYT Live Cyclone Amphan Map: Tracking the Storm’s Path https://t.co/t6vyqfQSjS,False,"Varanasi, India",0,76,0.0,"[(news updates, 0.93)]"
1262961652359729152,1108690827658711040,MJ News,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/","LIVE Now news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n120 km nearly south of Paradip (Odisha), \n200 km south-southwest of Digha (West Bengal) and \n360 km south-southwest of Khepupara (Bangladesh). https://t.co/xi9OImeXCe",False,India,0,48,0.5994,"[(news updates, 1.0), (internet, 0.82), (shelter, 0.6)]"
1262937945214005248,1108690827658711040,MJ News,"News, Media, Smartphone, Tech, Review & More https://www.facebook.com/mobilejudgement/","LIVE news update on Super Cyclone Amphan #AmphanUpdate #CycloneAmphan #AmphanCyclone #CycloneAmphanUpdate \n125 km nearly south of Paradip (Odisha), \n225 km south-southwest of Digha (West Bengal) and \n380 km south-southwest of Khepupara (Bangladesh). https://t.co/RcApLEBp5K",False,India,0,48,0.5994,"[(news updates, 1.0), (internet, 0.84)]"
1262961522994806784,27608007,Sourabh Mathur,"Founder & CEO of @esanosys,\n #MarketingAutomation #MarTech #CRM #Marketing #Startup #Entrepreneur\nमुझे भारतीय होने पर गर्व है 🇮🇳",Stay safe Odisha and West Bengal #AmphanUpdates #Amphan #AmphanSuperCyclone https://t.co/VnWegOcgrD,False,Global,0,2473,0.4404,"[(news updates, 0.97), (shelter, 0.89), (sympathy, 0.78), (livelihood, 0.6)]"


## Extracting Top N topics for Top N users

In [143]:
N = 5
topic_users = []
for id, row in df_users.iterrows():
    label_list = df_tweet_labels[df_tweet_labels['user_id']==id]['labels']    
    label_counter = [item[0] for x in label_list for item in x]
    top_n = sorted(Counter(label_counter), key = Counter(label_counter).get, reverse = True)[:N]
    topic_users.append(top_n)

In [144]:
df_users['labels'] = topic_users

In [145]:
N = 500
top_users = df_users.nlargest(N, 'popularity')

In [150]:
top_users[['name', 'description', 'popularity', 'labels']].head(20)

Unnamed: 0_level_0,name,description,popularity,labels
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18839785,Narendra Modi,Prime Minister of India,203.051266,"[sympathy, relief measures, government, shelter, livelihood]"
25073877,Donald J. Trump,45th President of the United States of America🇺🇸,94.913915,[grievance]
471741741,PMO India,Office of the Prime Minister of India,70.064629,"[government, sympathy, relief measures, news updates, rescue]"
101311381,Shah Rukh Khan,,66.985929,"[relief measures, shelter, infrastructure, sympathy]"
759251,CNN,It’s our job to #GoThere & tell the most difficult stories. Join us! For more breaking news updates follow @CNNBRK & download our app http://cnn.com/apps,66.521153,"[relief measures, government, grievance, evacuation, shelter]"
939091,Joe Biden,"Senator, Vice President, 2020 candidate for President of the United States, husband to @DrBiden, proud father & grandfather. Loves ice cream, aviators & @Amtrak",61.171471,[ecosystem]
428333,CNN Breaking News,"Breaking news from CNN Digital. Now 58M strong. Check @cnn for all things CNN, breaking and more. Download the app for custom alerts: http://cnn.com/apps",56.39174,"[news updates, shelter, sympathy, livelihood]"
807095,The New York Times,News tips? Share them here: http://nyti.ms/2FVHq9v,54.310789,"[shelter, news updates, evacuation, livelihood, relief measures]"
50393960,Bill Gates,Sharing things I'm learning through my foundation work and other interests.,51.191212,[relief measures]
31348594,Akshay Kumar,,45.007026,"[relief measures, mobile network, shelter]"


## Label Distribution based on Popularity buckets

In [260]:
'''Extracting the label distribution for users falling in the bucket - popularity>value'''
def popularity_label_dist(value):
    label_list = df_users[df_users['popularity']>value]['labels']
    label_counter = Counter([item for x in label_list for item in x])
    sum_val = sum(label_counter.values())
    for la in label_counter.keys():
        label_counter[la] = np.round(label_counter[la]/sum_val,2)
    return label_counter

In [261]:
label_dist = popularity_label_dist(0)

In [262]:
fig = go.Figure()

fig.add_trace(go.Bar(y=list(label_dist.values()), x=list(label_dist.keys())))

fig.update_layout(
    title='Percentage Contribution per Label',
    autosize=False,
    width=1000,
    height=400,
    margin=dict(l=20, r=20, t=40, b=20),
    showlegend=False
)

fig.show()