# Network Analysis

In [9]:
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np
import eland as ed
import networkx as nx

# uncomment this if database is not already open (and give ES a couple minutes to set up)
#!make database

ed_df = ed.read_es('localhost', 'twitter')

In [8]:
df = ed_df[
    ['tweet_id', 'user_id', 'name', 'full_text_processed',
    'original_tweet_id_str', 'quoted_status_id_str', 'in_reply_to_status_id_str',
    'is_quote_status', 'is_retweet', 'is_reply']
].to_pandas().fillna(np.nan)

In [85]:
# remove the annoying scientific notation from id columns
#pd.set_option('display.float_format', lambda x: '%.0f' % x)

df['tweet_id'] = df['tweet_id'].astype(float).astype(int).astype(str)

def get_source_id(row):
    """Returns the original Tweet ID from a Quote, Retweet or Reply"""
    if row['is_quote_status']:
        val = (
            row['quoted_status_id_str']
            if ~np.isnan(row['quoted_status_id_str'])
            else row['in_reply_to_status_id_str']
        )
    elif row['is_retweet']:
        val = row['original_tweet_id_str']
    elif row['is_reply']:
        val = row['in_reply_to_status_id_str']
    else:
        val = np.nan
        
    return val
    
df['original_tweet_id'] = df.apply(get_source_id, axis=1)\
    .astype(float).fillna(0).astype(int).astype(str)

# I'm counting replies as original posts...
df['is_original'] = ~df[['is_quote_status', 'is_retweet']].max(1)

KeyError: 'tweet_id'

In [59]:
to_drop = df[~df.is_original & (df.original_tweet_id_str == '0')].index
df = df.drop(index=to_drop, columns=[
        'is_quote_status', 'is_retweet', 'is_reply',
        'original_tweet_id_str', 'quoted_status_id_str',
        'in_reply_to_status_id_str'
    ]
).rename(columns={'tweet_id': 'target', 'original_tweet_id': 'source', 'user_id': 'target_user'})

AttributeError: 'DataFrame' object has no attribute 'original_tweet_id_str'

In [120]:
df_nodes_tweets = df\
    .drop_duplicates('target')\
    .rename(columns={'target': 'node_id'})\
    .drop(columns=['target_user', 'full_text_processed', 'source', 'name', 'is_original'])
df_nodes_tweets['node_id'] = df_nodes_tweets.node_id.astype(str)

df_nodes_users = df\
    .drop_duplicates('target_user')\
    .rename(columns={'target_user': 'node_id'})\
    .drop(columns=['target', 'full_text_processed', 'is_original', 'source'])
df_nodes_users['node_id'] = df_nodes_users.node_id.astype(str)

t2v = pd.read_csv('../data/results/iwmi_tweet2vec.csv')\
    .rename(columns={'tweet_id':'node_id'})
t2v['node_id'] = t2v['node_id'].astype(float).astype(int).astype(str)

u2v = pd.read_csv('../data/results/iwmi_user2vec.csv')\
    .rename(columns={'user_id':'node_id'})
u2v['node_id'] = u2v['node_id'].astype(float).astype(int).astype(str)

df_nodes_tweets = df_nodes_tweets\
    .set_index('node_id')\
    .join(
        t2v.set_index('node_id')
    )\
    .reset_index()\
    .drop(columns=['original_tweet_id_str'])

df_nodes_users = df_nodes_users\
    .set_index('node_id')\
    .join(
        u2v.set_index('node_id')
    )\
    .reset_index()


In [142]:
d2v['node_id'].drop_duplicates()


0         1272212817551265792
1         1272108590048792576
2         1272078856350916608
3         1272033089493987328
4         1271858828494074112
                 ...         
194178                   3249
194179                   2737
194180                    767
194181                    573
194182                     41
Name: node_id, Length: 194183, dtype: object

## Network Analysis (Tweets)
There is possibly little to no information to be taken from this section, but I kept it anyway.

In [86]:
df

Unnamed: 0,target,target_user,name,full_text_processed,is_original,source
1263737818767130624,1263737818767130624,830636185869545472,Nikki 🇮🇳,exsecular dd ki gaan uda di amphan ne ab bhikh...,True,1263723569000000000
1263737812576202752,1263737812576202752,1121047353899622400,A $ H W I K K 🇮🇳,speaking situation wake cyclone amphan,False,1263734564838465536
1263737811460636672,1263737811460636672,1028668637865947136,Vandana. pooja,speaking situation wake cyclone amphan,False,1263734564838465536
1263737810105839616,1263737810105839616,896275238182244352,মৃদুল ফুকন 🇮🇳 मृदुल फुकन 🇮🇳 Mridul Phookan,speaking situation wake cyclone amphan,False,1263734564838465536
1263737809757749248,1263737809757749248,4271835029,MD YOUNUS KHAN,damage sunderbans amphancyclone last quite lon...,False,1263693523619950592
...,...,...,...,...,...,...
1262711874832994304,1262711874832994304,1877831,New York Times World,cyclone amphan ha yet make landfall already la...,True,0
1254077710600679424,1254077710600679424,254206787,Nicola Careem,un fear tremendous loss life…women children…in...,True,0
1262672672946900992,1262672672946900992,1262411428994015232,NIT Bhopal Voice,manitsaysnoexams promotefinalyearstudents mhrd...,True,0
1262778121700495360,1262778121700495360,753274181811249152,🥀,still feel jittery everytime think day seeing ...,False,1262776835873652736


In [32]:
# create a directed graph
G_dir = nx.from_pandas_edgelist(df_network[['source', 'target', 'weight']], create_using=nx.DiGraph())

# create an undirected graph
G = nx.from_pandas_edgelist(df_network[['source', 'target', 'weight']], create_using=nx.Graph())


In [37]:
print(f'Graph basic info: {G_dir.number_of_nodes()} nodes, {G_dir.number_of_edges()} edges.')

Graph basic info: 495233 nodes, 455471 edges.


In [38]:
# compare densities of the two graphs
print(f'Density measurement: \nUndirected: {nx.density(G)}\nDirected: {nx.density(G_dir)}')

Density measurement: 
Undirected: 3.71426127592167e-06
Directed: 1.857130637960835e-06


In [39]:
#out-degree
node_attrs=pd.DataFrame.from_dict(dict(G_dir.out_degree()), 'index').rename(columns={0:'out_degree'})

#in-degree
node_attrs=node_attrs.join(pd.DataFrame.from_dict(dict(G_dir.in_degree()), 'index').rename(columns={0:'in_degree'}))

# weighted out-degree
#node_attrs=node_attrs.join(pd.DataFrame.from_dict(dict(G_dir.out_degree(weight='weight')), 'index').rename(columns={0:'weighted_out_degree'}))

# weighted in-degree
#node_attrs=node_attrs.join(pd.DataFrame.from_dict(dict(G_dir.in_degree(weight='weight')), 'index').rename(columns={0:'weighted_in_degree'}))

# add tweet ids and original info
#node_attrs=node_attrs.join(df.set_index('COMUNE')[["COD_REG","COD_CM", "COD_PRO", "PRO_COM"]])

In [48]:
node_attrs

Unnamed: 0,out_degree,in_degree
126,1,0
278816194440605696,1,0
401812009810800576,1,0
446602988501491712,1,0
521237811643445184,1,0
...,...,...
,1,0
,1,0
,1,0
,1,0


In [None]:
node_attrs.out_degree.unique()

In [40]:
df_network[['source', 'target', 'weight']]

Unnamed: 0,source,target,weight
1263804753286397952,1263801627288521984,1263804753286397952,1
1263820895312732160,1263812617430134785,1263820895312732160,1
1263820892871618560,1263762563566731264,1263820892871618560,1
1263820887897378816,1263788358343499782,1263820887897378816,1
1263820880800595968,1263820878783053056,1263820880800595968,1
...,...,...,...
1263850896217092096,1263848854383849477,1263850896217092096,1
1263850895160348672,1263848854383849477,1263850895160348672,1
1263850832103161856,1263847781094944768,1263850832103161856,1
1263850818689740800,1263848064592183298,1263850818689740800,1


## Network Analysis (users)

In [66]:
df_network = ed_df[
    ['tweet_id', 'name',
    'original_tweet_id_str', 'quoted_status_id_str', 'in_reply_to_status_id_str',
    'is_quote_status', 'is_retweet', 'is_reply', 'is_translator', 'is_original']
].to_pandas().fillna(np.nan)

In [61]:
cols = ed_df.columns
cols[cols.str.contains('name')]

#ed_df[[]]

Index(['derived.locations.full_name', 'entities.user_mentions.name',
       'entities.user_mentions.screen_name', 'name', 'screen_name',
       'users_derived_full_name'],
      dtype='object')

## User2Vec
Based on: https://ieeexplore.ieee.org/document/8875952/

How it works:
- Run doc2vec
- Average vector representations for each user


In [6]:
from src.models import User2Vec, tokenize

# Hyperparameters
vector_size = 10
min_count = 2
epochs = 40

In [5]:
df_network = df[[
    'tweet_id', 'original_tweet_id_str', 
    'user_id', 'name', 'full_text_processed', 
]].copy()

In [7]:
unique_docs = df_network['full_text_processed'].unique()
train_corpus = [tokenize(doc, tag) for doc, tag in zip(unique_docs, range(unique_docs.shape[0]))]

In [8]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = User2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2020-08-13 11:25:06,742 : INFO : collecting all words and their counts
2020-08-13 11:25:06,744 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-08-13 11:25:06,893 : INFO : PROGRESS: at example #10000, processed 142336 words (961522/s), 13739 word types, 10000 tags
2020-08-13 11:25:06,960 : INFO : PROGRESS: at example #20000, processed 281233 words (2116381/s), 20729 word types, 20000 tags
2020-08-13 11:25:07,032 : INFO : PROGRESS: at example #30000, processed 413921 words (1853308/s), 26407 word types, 30000 tags
2020-08-13 11:25:07,088 : INFO : PROGRESS: at example #40000, processed 541298 words (2333120/s), 30667 word types, 40000 tags
2020-08-13 11:25:07,141 : INFO : PROGRESS: at example #50000, processed 676835 words (2587001/s), 36157 word types, 50000 tags
2020-08-13 11:25:07,214 : INFO : PROGRESS: at example #60000, processed 810216 words (1849519/s), 42092 word types, 60000 tags
2020-08-13 11:25:07,271 : INFO : PROGRESS: at example #70000, pr

2020-08-13 11:26:10,747 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-13 11:26:10,747 : INFO : EPOCH - 6 : training on 1133457 raw words (1038508 effective words) took 6.2s, 167605 effective words/s
2020-08-13 11:26:11,831 : INFO : EPOCH 7 - PROGRESS: at 15.98% examples, 163034 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:26:12,886 : INFO : EPOCH 7 - PROGRESS: at 33.88% examples, 168230 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:26:13,934 : INFO : EPOCH 7 - PROGRESS: at 52.08% examples, 169936 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:26:14,966 : INFO : EPOCH 7 - PROGRESS: at 68.92% examples, 170067 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:26:15,996 : INFO : EPOCH 7 - PROGRESS: at 86.54% examples, 171408 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:26:16,750 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-13 11:26:16,773 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-13 11:26:16,775 :

2020-08-13 11:27:04,258 : INFO : EPOCH 14 - PROGRESS: at 54.91% examples, 89939 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:27:05,284 : INFO : EPOCH 14 - PROGRESS: at 66.59% examples, 93814 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:27:06,310 : INFO : EPOCH 14 - PROGRESS: at 77.70% examples, 96561 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:27:07,388 : INFO : EPOCH 14 - PROGRESS: at 86.83% examples, 95180 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:27:08,479 : INFO : EPOCH 14 - PROGRESS: at 98.31% examples, 97030 words/s, in_qsize 2, out_qsize 1
2020-08-13 11:27:08,481 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-13 11:27:08,515 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-13 11:27:08,525 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-13 11:27:08,525 : INFO : EPOCH - 14 : training on 1133457 raw words (1038491 effective words) took 10.6s, 98335 effective words/s
2020-08-13 11:27:09,552 

2020-08-13 11:28:04,654 : INFO : EPOCH 19 - PROGRESS: at 82.19% examples, 95961 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:28:05,725 : INFO : EPOCH 19 - PROGRESS: at 87.58% examples, 91168 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:28:06,779 : INFO : EPOCH 19 - PROGRESS: at 95.36% examples, 89933 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:28:07,466 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-13 11:28:07,498 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-13 11:28:07,570 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-13 11:28:07,574 : INFO : EPOCH - 19 : training on 1133457 raw words (1038440 effective words) took 11.8s, 88112 effective words/s
2020-08-13 11:28:08,679 : INFO : EPOCH 20 - PROGRESS: at 10.13% examples, 100959 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:28:09,684 : INFO : EPOCH 20 - PROGRESS: at 22.81% examples, 117982 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:28:10,70

2020-08-13 11:28:57,856 : INFO : EPOCH - 25 : training on 1133457 raw words (1038809 effective words) took 8.6s, 120304 effective words/s
2020-08-13 11:28:58,882 : INFO : EPOCH 26 - PROGRESS: at 10.92% examples, 116873 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:28:59,888 : INFO : EPOCH 26 - PROGRESS: at 21.02% examples, 113093 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:00,957 : INFO : EPOCH 26 - PROGRESS: at 33.11% examples, 112411 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:01,968 : INFO : EPOCH 26 - PROGRESS: at 45.67% examples, 115770 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:02,991 : INFO : EPOCH 26 - PROGRESS: at 58.83% examples, 119285 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:04,050 : INFO : EPOCH 26 - PROGRESS: at 71.49% examples, 119968 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:05,099 : INFO : EPOCH 26 - PROGRESS: at 84.13% examples, 120130 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:29:06,193 : INFO : EPOCH 26 - PROGRESS: at 95.58% 

2020-08-13 11:29:53,949 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-13 11:29:53,954 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-13 11:29:53,961 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-13 11:29:53,962 : INFO : EPOCH - 32 : training on 1133457 raw words (1038524 effective words) took 7.3s, 142303 effective words/s
2020-08-13 11:29:55,049 : INFO : EPOCH 33 - PROGRESS: at 13.42% examples, 135893 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:56,063 : INFO : EPOCH 33 - PROGRESS: at 28.25% examples, 144254 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:57,181 : INFO : EPOCH 33 - PROGRESS: at 42.68% examples, 139437 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:58,202 : INFO : EPOCH 33 - PROGRESS: at 57.39% examples, 140186 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:29:59,286 : INFO : EPOCH 33 - PROGRESS: at 71.49% examples, 139592 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:30:00

2020-08-13 11:30:46,194 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-13 11:30:46,207 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-13 11:30:46,224 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-13 11:30:46,225 : INFO : EPOCH - 39 : training on 1133457 raw words (1038469 effective words) took 7.3s, 142951 effective words/s
2020-08-13 11:30:47,272 : INFO : EPOCH 40 - PROGRESS: at 10.88% examples, 114447 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:30:48,365 : INFO : EPOCH 40 - PROGRESS: at 23.74% examples, 119986 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:30:49,401 : INFO : EPOCH 40 - PROGRESS: at 36.17% examples, 121300 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:30:50,437 : INFO : EPOCH 40 - PROGRESS: at 49.30% examples, 121639 words/s, in_qsize 6, out_qsize 0
2020-08-13 11:30:51,509 : INFO : EPOCH 40 - PROGRESS: at 62.33% examples, 122997 words/s, in_qsize 5, out_qsize 0
2020-08-13 11:30:52

In [9]:
vec_cols = [f'vec_{i}' for i in range(vector_size)]

users_id, user_vectors = model.infer_user_vectors(
    df_network['user_id'], 
    df_network['full_text_processed']
)

df_user_vecs = pd.DataFrame(user_vectors, columns=vec_cols)
df_user_vecs['user_id'] = users_id

In [None]:
df_user_vecs

**Sanity Checks**: To be removed

In [81]:
tweet_ids = df['target'].tolist()
check_existence = df.apply(lambda x: x['source'] in tweet_ids, axis=1)

In [82]:
from collections import Counter
Counter(check_existence)

Counter({False: 120328, True: 354083})

In [40]:
#pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [84]:
ed_df.shape

(474419, 129)