# Automated network analysis

In [None]:
import os
import pandas as pd
from collections import ChainMap
import networkx as nx
from community import community_louvain
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

## Preparing data: Network data and node attributes

In [None]:
# set directory
# NOTE: machine-specific absolute path; the relative paths used in the cells below are resolved against it
os.chdir(r'C:\Users\maril\Documents\20-21 KU\block 4\DM\twitter')

In [None]:
# load edgelist ('source' and 'target' are read as strings)
# raw strings keep the Windows path separators literal: '\d' and '\p' are not valid
# escape sequences and trigger a warning in ordinary string literals
de_temp_edgelist = pd.read_csv(r'final_data_prepare1\de_edgelist_rt.csv', dtype={'source': str, 'target': str})
da_temp_edgelist = pd.read_csv(r'final_data_prepare1\da_edgelist_rt.csv', dtype={'source': str, 'target': str})
pl_temp_edgelist = pd.read_csv(r'final_data_prepare1\pl_edgelist_rt.csv', dtype={'source': str, 'target': str})

## Prepare the node labels

In [None]:
# load the dataframe
# one CSV of labelled users per dataset (de / da / pl); the cells below read its 'user' and 'labels' columns
de = pd.read_csv(r'network_analysis\automated_network\de_labelled_users.csv')
da = pd.read_csv(r'network_analysis\automated_network\da_labelled_users.csv')
pl = pd.read_csv(r'network_analysis\automated_network\pl_labelled_users.csv')

In [None]:
# keep only the anti-vax, pro-vax and neutral users (i.e. drop every row labelled 'trash')
# and renumber the remaining rows from 0 in the same step
de = de[de['labels'] != 'trash'].reset_index(drop=True)
da = da[da['labels'] != 'trash'].reset_index(drop=True)
pl = pl[pl['labels'] != 'trash'].reset_index(drop=True)

In [None]:
# remove all edges to or from a trash user

def remove_trash_edges(df, edgelist):
    """Keep only the edges whose source AND target are both users listed in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'user' column with the users to keep.
    edgelist : pandas.DataFrame
        Must contain 'source' and 'target' columns.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the columns 'source' and 'target', rows in their
        original order and a fresh 0..n-1 index. The inputs are not modified.
    """
    # create set of all relevant users
    user_set = set(df['user'])

    # vectorised membership test; unlike positional `edgelist['source'][i]`
    # lookups this does not depend on the edgelist having a default index
    keep = edgelist['source'].isin(user_set) & edgelist['target'].isin(user_set)

    return edgelist.loc[keep, ['source', 'target']].reset_index(drop=True)

In [None]:
# apply function
# keeps only the edges whose source and target are both still present in the filtered user tables
de_edgelist = remove_trash_edges(de, de_temp_edgelist)
da_edgelist = remove_trash_edges(da, da_temp_edgelist)
pl_edgelist = remove_trash_edges(pl, pl_temp_edgelist)

In [None]:
# function to create a user and label column
def label_column(df):
    """Build the node-attribute mapping ``{user: {'type': label}}`` from ``df``.

    ``df`` needs a 'user' and a 'labels' column. The result is a dict of
    dicts, the shape expected by ``nx.set_node_attributes``. If a user occurs
    in several rows, the label of the last such row is used.
    """
    return {
        user: {'type': label}
        for user, label in zip(df['user'], df['labels'])
    }

In [None]:
# apply function to datasets
# each result maps user -> {'type': label} and is later attached to the graphs as node attributes
de_node_dict = label_column(de)
da_node_dict = label_column(da)
pl_node_dict = label_column(pl)

# Degree centrality (entire network)

In [None]:
# retrieve the degree centrality of the network (here: undirected degree centrality)

def degree_centrality(edgelist):
    """Return the mean degree centrality over all nodes of the network.

    ``edgelist`` needs a 'source' and a 'target' column. No ``create_using``
    is passed to ``from_pandas_edgelist``, so the graph is built with the
    networkx default graph type (undirected), not as a directed graph.
    """
    # build the graph from the edgelist
    network = nx.from_pandas_edgelist(edgelist, source='source', target='target')

    # degree centrality of every node, as a dataframe ordered from most to least central
    per_node = nx.degree_centrality(network)
    ranked = pd.DataFrame(per_node.items(), columns=['user', 'degree'])
    ranked = ranked.sort_values(by='degree', ascending=False)

    # network-level summary: the average over all nodes
    return ranked['degree'].mean()

In [None]:
# apply function
# each value is the mean degree centrality over all nodes of that network
de_deg = degree_centrality(de_edgelist)
da_deg = degree_centrality(da_edgelist)
pl_deg = degree_centrality(pl_edgelist)

# print the results
print(f'German data: Degree centrality of {de_deg}')
print(f'Danish data: Degree centrality of {da_deg}')
print(f'Polish data: Degree centrality of {pl_deg}')

# Weighted, directed network
First we calculate the edge weights, then we create the directed network based on the edgelist.
  
An edge is drawn if source retweets target, i.e.:
* nodes with high indegree = nodes receiving a lot of retweets
* nodes with high outdegree = nodes retweeting others a lot

In [None]:
# calculating the weights

def graph(edgelist, node_dict):
    """Build the weighted, directed graph for one dataset.

    Parameters
    ----------
    edgelist : pandas.DataFrame
        Edges with a 'source' and a 'target' column; repeated rows are allowed.
    node_dict : dict
        ``{node: {attribute: value}}`` mapping attached to the graph as node attributes.

    Returns
    -------
    (networkx.DiGraph, pandas.DataFrame)
        The graph and the weighted edgelist it was built from (one row per
        distinct edge plus a 'weight' column, heaviest edges first).
    """
    # identical rows collapse into one edge; the number of collapsed rows is its weight
    group_cols = edgelist.columns.tolist()
    edge_counts = pd.DataFrame(edgelist.groupby(group_cols, as_index=False).size())
    weighted_edgelist = (
        edge_counts
        .rename(columns={'size': 'weight'})
        .sort_values('weight', ascending=False)
    )

    # directed graph carrying the weight as edge attribute
    digraph = nx.from_pandas_edgelist(
        weighted_edgelist,
        source='source',
        target='target',
        edge_attr=['weight'],
        create_using=nx.DiGraph,
    )

    # attach the node attributes
    nx.set_node_attributes(digraph, node_dict)

    return digraph, weighted_edgelist

In [None]:
# apply function
# returns the directed graph and the weighted edgelist (with a 'weight' column) it was built from
DiG_de, de_weighted_edgelist = graph(de_edgelist, de_node_dict)
DiG_da, da_weighted_edgelist = graph(da_edgelist, da_node_dict)
DiG_pl, pl_weighted_edgelist = graph(pl_edgelist, pl_node_dict)

In [None]:
# retrieve the standardized indegree and outdegree measures for all nodes and save as dictionaries

def in_out(DiG):
    """Compute in- and out-degree centrality and store both on the nodes of ``DiG``.

    Parameters
    ----------
    DiG : networkx.DiGraph
        Directed graph; it is modified in place (node attributes 'idc' and 'odc').

    Returns
    -------
    (DiG, idc, odc, node_degree_dict)
        DiG : the same graph object, now carrying the 'idc' / 'odc' node attributes.
        idc : dataframe ['user', 'indegree'], sorted descending by indegree.
        odc : dataframe ['user', 'outdegree'], sorted descending by outdegree.
        node_degree_dict : ``{node: {'idc': ..., 'odc': ...}}``.
    """
    # in- and outdegree centrality, each keyed by node
    idc_dict = nx.in_degree_centrality(DiG)
    odc_dict = nx.out_degree_centrality(DiG)

    # dict of dicts to be passed as node attributes; each measure is read from
    # its own dictionary (merging the two dicts first would keep only one
    # value per node and store it under both keys)
    node_degree_dict = {
        node: {'idc': idc_dict[node], 'odc': odc_dict[node]}
        for node in idc_dict
    }

    # set the indegree and outdegree centrality as node attributes
    nx.set_node_attributes(DiG, node_degree_dict)

    # rankings: accounts with the highest indegree / outdegree first
    idc = pd.DataFrame(list(idc_dict.items()), columns=['user', 'indegree']).sort_values(by='indegree', ascending=False)
    odc = pd.DataFrame(list(odc_dict.items()), columns=['user', 'outdegree']).sort_values(by='outdegree', ascending=False)

    return DiG, idc, odc, node_degree_dict

In [None]:
# apply function
# in_out returns the graph it was given, so de_DiG / da_DiG / pl_DiG refer to the same objects as DiG_de / DiG_da / DiG_pl
de_DiG, de_idc, de_odc, de_node_degree_dict = in_out(DiG_de)
da_DiG, da_idc, da_odc, da_node_degree_dict = in_out(DiG_da)
pl_DiG, pl_idc, pl_odc, pl_node_degree_dict = in_out(DiG_pl)

In [None]:
# display the German df's (top 5 rows of each ranking)
display("Users with highest indegree", de_idc.head(5), "Users with highest outdegree", de_odc.head(5))

In [None]:
# display Danish df's (top 5 rows of each ranking)
display("Users with highest indegree", da_idc.head(5), "Users with highest outdegree", da_odc.head(5))

In [None]:
# display Polish df's (top 5 rows of each ranking)
display("Users with highest indegree", pl_idc.head(5), "Users with highest outdegree", pl_odc.head(5))

# Community Detection

The general goal of community detection is to find cohesive subgroups (= communities) within the network. For this to work, we need to treat the network (which is originally both weighted and directed) as a weighted and undirected network.
  
### Modularity
Here we calculate communities based on modularity (Louvain community detection algorithm).

In [None]:
# function to create graph and perform community detection

def louvain_comm(weighted_edgelist, node_degree_dict, node_dict):
    """Build the undirected, weighted graph and tag every node with its Louvain community.

    Parameters
    ----------
    weighted_edgelist : pandas.DataFrame
        Edges with 'source', 'target' and 'weight' columns.
    node_degree_dict : dict
        ``{node: {...}}`` with the in-/out-degree centralities to attach.
    node_dict : dict
        ``{node: {...}}`` with the user labels to attach.

    Returns
    -------
    networkx.Graph
        Undirected graph whose nodes carry the attributes from both dicts
        plus a 'community' attribute holding the partition id.
    """
    # NOTE(review): rows (a, b) and (b, a) map onto the same undirected edge;
    # presumably only one of the two weights is kept rather than their sum - confirm this is intended
    undirected = nx.from_pandas_edgelist(
        weighted_edgelist,
        source='source',
        target='target',
        edge_attr=['weight'],
    )

    # node attributes: labels first, then indegree and outdegree centrality
    for attributes in (node_dict, node_degree_dict):
        nx.set_node_attributes(undirected, attributes)

    # Louvain community detection on the weighted graph
    communities = community_louvain.best_partition(undirected, weight='weight')

    # store the community id of each node as node attribute
    nx.set_node_attributes(undirected, communities, name='community')

    return undirected

In [None]:
# apply function
# one undirected, weighted graph per dataset, with labels, centralities and community ids as node attributes
de_UG = louvain_comm(de_weighted_edgelist, de_node_degree_dict, de_node_dict)
da_UG = louvain_comm(da_weighted_edgelist, da_node_degree_dict, da_node_dict)
pl_UG = louvain_comm(pl_weighted_edgelist, pl_node_degree_dict, pl_node_dict)

In [None]:
# get the number of nodes and edges in each (undirected) graph
print(f'German network: {len(de_UG.nodes)} nodes and {len(de_UG.edges)} edges')
print(f'Danish network: {len(da_UG.nodes)} nodes and {len(da_UG.edges)} edges')
print(f'Polish network: {len(pl_UG.nodes)} nodes and {len(pl_UG.edges)} edges')

In [None]:
# export to Gephi (GEXF format); paths are relative to the working directory set at the top
nx.write_gexf(de_UG, r'network_analysis\automated_network\rt_network_louvain_de2.gexf')
nx.write_gexf(da_UG, r'network_analysis\automated_network\rt_network_louvain_da2.gexf')
nx.write_gexf(pl_UG, r'network_analysis\automated_network\rt_network_louvain_pl2.gexf')

# K-core decomposition

The k-core decomposition technique is often used to distinguish between the core and the periphery of a network. To do so, the algorithm divides the network into layers (also called k-shells). A node belongs to the k-shell if it is part of the k-core (the largest subgraph in which every node has at least k connections within that subgraph) but not of the (k+1)-core. In other words, the nodes in the high k-shells make up the core of the network because they are not only well connected, but they are well connected to other central (= well connected) nodes.

In [None]:
# decompose network

def decompose(UG):
    """Run the k-core decomposition and store each node's core number on the graph.

    Parameters
    ----------
    UG : networkx.Graph
        Undirected graph. It is modified in place: self-loops are removed and
        every node receives a 'kshell' attribute.

    Returns
    -------
    (UG, core_df)
        UG : the same graph object.
        core_df : dataframe ['user', 'shell'], sorted descending by shell.
    """
    # remove self-loops (otherwise networkx can't decompose the network);
    # the list is materialised first so the graph is not mutated while the
    # self-loop generator is still being consumed
    UG.remove_edges_from(list(nx.selfloop_edges(UG)))

    # core number of each node, as a dictionary {node: core number}
    coreness = nx.core_number(UG)

    # add core number as node attribute
    nx.set_node_attributes(UG, coreness, name='kshell')

    # turn the results into a pandas df
    core_df = pd.DataFrame(list(coreness.items()), columns=['user', 'shell']).sort_values(by='shell', ascending=False)

    return UG, core_df

In [None]:
# apply function
# decompose works in place (self-loops removed, 'kshell' attribute added) and returns the same graph object
de_UG, de_core_df = decompose(de_UG)
da_UG, da_core_df = decompose(da_UG)
pl_UG, pl_core_df = decompose(pl_UG)

In [None]:
# what is the highest k-shell?
# the 'shell' column holds the core number of each user
print('German data: Highest k-shell is', max(de_core_df['shell']))
print('Danish data: Highest k-shell is', max(da_core_df['shell']))
print('Polish data: Highest k-shell is', max(pl_core_df['shell']))

In [None]:
# let's have a look at the users in the highest k-shell

def max_shell_users(core_df, top_fraction=0.5):
    """Return the users sitting in the highest k-shells.

    The ``int(max_shell * top_fraction)`` highest shell numbers are selected,
    counting down from the highest shell. At least the highest shell itself
    is always included, so a network whose highest shell is 1 no longer
    yields an empty result.

    Parameters
    ----------
    core_df : pandas.DataFrame
        Must contain the columns 'user' and 'shell' (integer core numbers).
    top_fraction : float, default 0.5
        Share of the highest shell number that determines how many shells
        are taken. The default reproduces the previous hard-coded value.

    Returns
    -------
    set
        User handles in the selected shells; empty if ``core_df`` is empty.
    """
    if core_df.empty:
        return set()

    # number of the highest k-shell
    max_shell = core_df['shell'].max()

    # how many shells to take from the top (never fewer than one)
    n_shells = max(1, int(max_shell * top_fraction))

    # shells max_shell, max_shell - 1, ..., max_shell - n_shells + 1
    in_top_shells = core_df['shell'] > max_shell - n_shells

    return set(core_df.loc[in_top_shells, 'user'])

In [None]:
# apply function
# each result is a set of user handles taken from the highest k-shells
de_core_users = max_shell_users(de_core_df)
da_core_users = max_shell_users(da_core_df)
pl_core_users = max_shell_users(pl_core_df)

In [None]:
# find the anti-vaxx actors in the highest k-shell

def anti_vax_shell(df, core_users):
    """Return the subset of ``core_users`` that is labelled 'anti-vaxx' in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user' and 'labels'. If a user occurs in
        several rows, the first row decides the label.
    core_users : set
        User handles to check.

    Returns
    -------
    set
        The anti-vaxx users among ``core_users``. Users that do not occur in
        ``df`` are skipped instead of raising an IndexError.
    """
    # one row per user, so the whole check is a single vectorised pass
    # instead of one full dataframe scan per core user
    first_rows = df.drop_duplicates(subset='user', keep='first')

    is_core_anti_vaxx = (
        first_rows['user'].isin(core_users)
        & (first_rows['labels'] == 'anti-vaxx')
    )

    return set(first_rows.loc[is_core_anti_vaxx, 'user'])

In [None]:
# apply function
# each result is the set of core users whose label is 'anti-vaxx'
de_anti_vaxx = anti_vax_shell(de, de_core_users)
da_anti_vaxx = anti_vax_shell(da, da_core_users)
pl_anti_vaxx = anti_vax_shell(pl, pl_core_users)

## Visualization as plots: Percentage of anti-vax actors across k-shells

In [None]:
def anti_vax_proportion(df, UG):
    """Count anti-vaxx and other users per k-shell and compute the anti-vaxx share.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled users with the columns 'user' and 'labels'.
    UG : networkx.Graph
        Graph whose nodes carry a 'kshell' attribute.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'kshell' (ascending) with the columns 'num_vaxxer' (every
        user not labelled 'anti-vaxx'), 'num_anti_vaxxer', 'num_total' and
        'pct_anti_vaxxer_nodes' (num_anti_vaxxer / num_total, a value
        between 0 and 1).
    """
    # set of anti-vaxx accounts
    anti_vaxx_users = set(df.loc[df['labels'] == 'anti-vaxx', 'user'])

    # retrieve the 'kshell' node attribute as a dictionary {node: kshell}
    shell = nx.get_node_attributes(UG, name='kshell')
    shell_df = pd.DataFrame(list(shell.items()), columns=['user', 'kshell'])

    # 1 if the user is an anti-vaxx account, else 0; a vectorised flag avoids
    # mixing label-based reads with position-based writes on a re-ordered frame
    shell_df['anti-vaxx'] = shell_df['user'].isin(anti_vaxx_users).astype(int)

    # per shell: the sum of the flag is the number of anti-vaxx users and the
    # count is the shell size, so a shell containing only one of the two
    # groups gets a 0 for the other group instead of a missing value
    flag_per_shell = shell_df.groupby('kshell')['anti-vaxx']
    shell_group_df = pd.DataFrame({
        'num_anti_vaxxer': flag_per_shell.sum(),
        'num_total': flag_per_shell.count(),
    })
    shell_group_df['num_vaxxer'] = shell_group_df['num_total'] - shell_group_df['num_anti_vaxxer']

    # share of anti-vaxx users per shell (every listed shell has at least one node)
    shell_group_df['pct_anti_vaxxer_nodes'] = shell_group_df['num_anti_vaxxer'] / shell_group_df['num_total']

    return shell_group_df[['num_vaxxer', 'num_anti_vaxxer', 'num_total', 'pct_anti_vaxxer_nodes']]

In [None]:
# apply function
# one row per k-shell; the 'pct_anti_vaxxer_nodes' column is what gets plotted below
de_shell_group_df = anti_vax_proportion(de, de_UG)
da_shell_group_df = anti_vax_proportion(da, da_UG)
pl_shell_group_df = anti_vax_proportion(pl, pl_UG)

In [None]:
# make the plots: one panel per dataset, share of anti-vaccine users per k-shell
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(20, 4))

# (axis, data, title) for each panel, left to right
panels = (
    (ax1, da_shell_group_df, 'Danish data: Percentage of anti-vaccine users per k-shell'),
    (ax2, de_shell_group_df, 'German data: Percentage of anti-vaccine users per k-shell'),
    (ax3, pl_shell_group_df, 'Polish data: Percentage of anti-vaccine users per k-shell'),
)

for ax, shell_data, title in panels:
    ax.plot('pct_anti_vaxxer_nodes', '.', color='#1E337F', data=shell_data)

    # y axis: start at 0 and keep this panel's own upper limit, so points are
    # not clipped when a dataset exceeds the range of the Danish panel
    ax.set_ylim(0, ax.get_ylim()[1])

    # axis labels
    ax.set_xlabel('K-shell')
    ax.set_ylabel('Percentage of anti-vaccine users')

    # title
    ax.set_title(title)

In [None]:
# save plot (PNG at 600 dpi; path is relative to the working directory set at the top)
fig.savefig(r'network_analysis\automated_network\antivax_scatter.png',dpi=600)