# Analysing top N topics for top N users

In [1]:
import os
from os.path import join
import eland as ed
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

project_dir = join(os.getcwd(), os.pardir)
raw_dir = join(project_dir, 'data', 'raw')
interim_dir = join(project_dir, 'data', 'interim')

%config InlineBackend.figure_format = 'svg'

In [29]:
ed_df = ed.DataFrame('localhost', 'twitter', columns=['user_id', 'verified', 'name', 'location', 'retweet_count', 'followers_count', 'description'])

# defining the full-text query we need: Retrieving records for full_text_processed with the condition is_retweet=False and is_quote_status=False

query_unique = {
    "bool": {
        "must": {
            "term":{"is_retweet":"false"},
        },
        "filter": {
            "term":{"is_quote_status":"false"}
        },
    }
}
# using full-text search capabilities with Eland:
df_ed = ed_df.es_query(query_unique)
df_users = df_ed.to_pandas()

In [30]:
df_users = df_users.reset_index().rename(columns={'index':'tweet_id'})

In [31]:
df_users.head()

Unnamed: 0,tweet_id,user_id,verified,name,location,retweet_count,followers_count,description
0,1264160647002103808,1256622599364214786,False,The Meraaki,"Ahmadabad City, India",0,9,We help businesses in revamping there brand or...
1,1264160609668599808,1251934220345208832,False,Newspapers,Dhaka,0,80,http://www.banglanews.news
2,1264121161589415936,1251934220345208832,False,Newspapers,Dhaka,0,80,http://www.banglanews.news
3,1264160569315209216,1113075640499036160,False,netvani,,0,12,केवल खबर
4,1264114187346874368,1113075640499036160,False,netvani,,0,12,केवल खबर


## Top N users with most original tweets

In [32]:
N = 50
top_users = df_users.groupby('name')\
    .size()\
    .to_frame()\
    .reset_index()\
    .rename(columns={0:'Count', 'name':'name'})\
    .set_index('name').nlargest(N, 'Count')

top_users

Unnamed: 0_level_0,Count
name,Unnamed: 1_level_1
The Wealth Home - Let's start building wealth,2439
MEDIAonINDIA,1566
Bharti Airtel India,713
ABP Ananda,491
Hindustan Times,340
Tata Sky,274
News18Bangla,201
Oneindia Bengali,200
S Newz,199
#India Important India News,173


## Top N most retweeted users

In [36]:
N = 50
df_users.groupby(['name', 'description'])['retweet_count'].agg({'sum'}).nlargest(N, 'sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,sum
name,description,Unnamed: 2_level_1
Narendra Modi,Prime Minister of India,41072
PMO India,Office of the Prime Minister of India,11439
Priyanka Gandhi Vadra,"General Secretary, Indian National Congress",6604
Amit Shah,"Union Home Minister, Government of India | MP, Gandhinagar Lok Sabha. http://www.instagram.com/amitshahofficial",4719
Norbert Elekes,"Data storyteller, minimalist, compulsive list-maker. World news and the latest updates on coronavirus.",4233
OpIndia.com,"News and opinions website, bringing the right side of India to you.",3871
BJP Bengal,Official Twitter handle of BJP West Bengal (BJPBengal),3641
Honeypreet Insan,"Philanthropist, Daughter of Saint Dr @gurmeetramrahim Singh Ji Insan. RTs not endorsements.",3639
Mamata Banerjee,"The official Twitter page of Mamata Banerjee, founder Chairperson All India Trinamool Congress. Honourable Chief Minister, West Bengal.",3590
AajTak,"AajTak covers breaking news, latest news in politics, sports, business & cinema. Follow us & stay ahead! Download the App: https://aajtak.app.link/QFAp3ZaHmQ",3194


## Aggregating the columns to create a popularity measure

In [37]:
df_users = df_users.groupby('user_id').agg({
            'name': lambda x: x.iloc[0],
            'followers_count': 'max',
            'retweet_count': 'sum',
            'verified': lambda x: x.iloc[0],
            'description': lambda x: x.iloc[0]
        })

### Normalising the columns using Z-Score

In [42]:
cols = list(df_users.columns)
cols.remove('name')
cols.remove('description')

for col in cols:
    col_zscore = col + '_zscore'
    df_users[col_zscore] = (df_users[col] - df_users[col].mean())/df_users[col].std(ddof=0)

## Popularity Measure = Sum of Z-Scores (to be refined)

In [43]:
df_users['popularity'] = df_users['followers_count_zscore']+df_users['retweet_count_zscore']+df_users['verified_zscore']

In [44]:
N = 50
df_users.nlargest(N, 'popularity')

Unnamed: 0_level_0,name,followers_count,retweet_count,verified,description,followers_count_zscore,retweet_count_zscore,verified_zscore,followers_count_zscore_zscore,retweet_count_zscore_zscore,verified_zscore_zscore,popularity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
18839785,Narendra Modi,60652395,41072,True,Prime Minister of India,78.552852,179.297914,4.338553,78.552852,179.297914,4.338553,262.189319
101311381,Shah Rukh Khan,40859499,15074,True,,52.900567,65.782254,4.338553,52.900567,65.782254,4.338553,123.021373
471741741,PMO India,37731486,11439,True,Office of the Prime Minister of India,48.846552,49.91067,4.338553,48.846552,49.91067,4.338553,103.095775
428333,CNN Breaking News,58390960,119,True,Breaking news from CNN Digital. Now 58M strong...,75.621953,0.483896,4.338553,75.621953,0.483896,4.338553,80.444402
759251,CNN,49342437,532,True,It’s our job to #GoThere & tell the most diffi...,63.894751,2.287187,4.338553,63.894751,2.287187,4.338553,70.520491
807095,The New York Times,47070738,584,True,News tips? Share them here: http://nyti.ms/2FV...,60.95055,2.514236,4.338553,60.95055,2.514236,4.338553,67.803339
5402612,BBC Breaking News,44799065,521,True,Breaking news alerts and updates from the BBC....,58.006382,2.239158,4.338553,58.006382,2.239158,4.338553,64.584093
1447949844,Amit Shah,21985085,4719,True,"Union Home Minister, Government of India | MP,...",28.438666,20.568981,4.338553,28.438666,20.568981,4.338553,53.3462
742143,BBC News (World),28654283,700,True,"News, features and analysis from the World's n...",37.08218,3.02073,4.338553,37.08218,3.02073,4.338553,44.441462
5988062,The Economist,24921516,21,True,News and analysis with a global perspective. S...,32.244383,0.055997,4.338553,32.244383,0.055997,4.338553,36.638932


In [45]:
top_users = df_users.nlargest(N, 'popularity')

## Extracting Top N topics for Top N users

In [46]:
df_topics_lda = pd.read_csv(join(interim_dir, 'tweet_topics_data_lda.csv'), index_col=0) ## For LDA topics
df_topics_nmf = pd.read_csv(join(interim_dir, 'tweet_topics_data_nmf.csv'), index_col=0) ## For NMF topics

In [47]:
df_topics_lda = df_topics_lda[['user_id', 'name', 'verified', 'Topic']]
df_topics_nmf = df_topics_nmf[['user_id', 'name', 'verified', 'Topic']]

In [48]:
N = 5  ## Up to top 5 topics for every user
topics_users_lda = []
topics_users_nmf = []

for index, row in top_users.iterrows():
    topics_lda = df_topics_lda[df_topics_lda['user_id']==int(index)]
    topics_list_lda = topics_lda['Topic'].value_counts().nlargest(N).index.tolist()
    topics_users_lda.append(topics_list_lda)

    topics_nmf = df_topics_nmf[df_topics_nmf['user_id']==int(index)]
    topics_list_nmf = topics_nmf['Topic'].value_counts().nlargest(N).index.tolist()
    topics_users_nmf.append(topics_list_nmf)


In [49]:
top_users['topics_lda'] = topics_users_lda
top_users['topics_nmf'] = topics_users_nmf

In [50]:
top_users[['name', 'description', 'topics_lda', 'topics_nmf', 'popularity']]

Unnamed: 0_level_0,name,description,topics_lda,topics_nmf,popularity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18839785,Narendra Modi,Prime Minister of India,"[4, 7, 5, 1]","[7, 2]",262.189319
101311381,Shah Rukh Khan,,[2],"[7, 2]",123.021373
471741741,PMO India,Office of the Prime Minister of India,[8],[7],103.095775
428333,CNN Breaking News,Breaking news from CNN Digital. Now 58M strong...,[4],[10],80.444402
759251,CNN,It’s our job to #GoThere & tell the most diffi...,"[4, 10, 7]","[10, 7, 2, 8]",70.520491
807095,The New York Times,News tips? Share them here: http://nyti.ms/2FV...,"[5, 10, 7]","[2, 10]",67.803339
5402612,BBC Breaking News,Breaking news alerts and updates from the BBC....,[5],[9],64.584093
1447949844,Amit Shah,"Union Home Minister, Government of India | MP,...","[9, 8]","[7, 5]",53.3462
742143,BBC News (World),"News, features and analysis from the World's n...","[5, 1, 4]","[2, 9, 8]",44.441462
5988062,The Economist,News and analysis with a global perspective. S...,[3],[8],36.638932
