In [3]:
## Plotting tools
%matplotlib inline
from bokeh.io import output_notebook, show, export_png, output_file
output_notebook()
from bokeh.plotting import figure
from bokeh.layouts import gridplot, column, row
from bokeh.models import Div
from bokeh.models import LinearAxis, Range1d
from bokeh.models import ColumnDataSource, Label, LabelSet, Range1d

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

#Import data processing packages
import pandas as pd
import numpy as np

#Import network packages
import networkx as nx
from scipy.sparse.linalg import eigsh  # sparse eigen-solver, used to choose the Katz attenuation factor

import collections          # for manipulation tuples and zipping objects
import statistics as stats  # for generating summary statistics
import time                 # for measuring computating time
from matplotlib import pyplot as plt  # for outputting nice plots
import seaborn as sns                 # for creating even nicer plots

# Install pygraphviz
!apt-get install graphviz graphviz-dev
!pip install pygraphviz 

zsh:1: command not found: apt-get


In [4]:
# In- and out-degree can only be told apart on a directed graph, so the same
# edge list is parsed twice: once as an undirected graph (G) and once keeping
# edge direction (directed_G). Node labels are parsed as integers.
twitter_data_file = 'twitter_combined.txt'

G, directed_G = (
    nx.read_edgelist(twitter_data_file, create_using=graph_class(), nodetype=int)
    for graph_class in (nx.Graph, nx.DiGraph)
)

In [5]:
#nx.draw(directed_G, with_labels = True)

In [6]:
# Fix one node ordering, then read the in- and out-degree of every node in
# that same order so the two lists line up position by position.
directed_G_nodes = [node for node in directed_G.nodes]
directed_G_in_degree = [degree for _node, degree in directed_G.in_degree(directed_G_nodes)]
directed_G_out_degree = [degree for _node, degree in directed_G.out_degree(directed_G_nodes)]

# print(type(directed_G.in_degree(directed_G_nodes)))
# print(list(dict(directed_G.in_degree(directed_G_nodes)).values()))

In [7]:
# Network metric statistics
def network_metric_statistics(metric_data):
    """Summarise a collection of numeric metric values in one sentence.

    Parameters
    ----------
    metric_data : iterable of numbers
        Values of a network metric (for example one degree per node). Lists,
        tuples and one-shot iterators such as generators are all accepted.

    Returns
    -------
    str
        A sentence reporting the mean, the median and the sample standard
        deviation, each formatted with five decimal places.

    Raises
    ------
    statistics.StatisticsError
        If ``metric_data`` is empty, or holds a single value (the sample
        standard deviation needs at least two data points).
    """
    # Materialise once: the data is traversed by three separate statistics
    # calls, and a generator would be exhausted after the first of them.
    values = list(metric_data)

    avg = stats.mean(values)
    med = stats.median(values)
    std = stats.stdev(values)

    return (
        f"Here is a quick summary of your data: average = {avg:.5f}, "
        f"median = {med:.5f}, standard deviation = {std:.5f}"
    )

In [8]:
# In-degrees of all nodes, largest first (degree sequence for nx v2).
# directed_G_in_degree already holds one value per node, so it is sorted
# directly instead of being re-read element by element through an index range.
in_degree_sequence = sorted(directed_G_in_degree, reverse=True)
network_metric_statistics(in_degree_sequence)

'Here is a quick summary of your data: average = 21.74685, median = 8.00000, standard deviation = 57.96535'

In [9]:
# Out-degrees of all nodes, largest first (degree sequence for nx v2).
# directed_G_out_degree already holds one value per node, so it is sorted
# directly instead of being re-read element by element through an index range.
out_degree_sequence = sorted(directed_G_out_degree, reverse=True)
network_metric_statistics(out_degree_sequence)

'Here is a quick summary of your data: average = 21.74685, median = 9.00000, standard deviation = 36.98985'

In [10]:
# adj = nx.adjacency_matrix(G)
# # Maybe not needed?
# adj_spectrum = nx.adjacency_spectrum(G)
# inv_max = 1/max(np.real(adj_spectrum))
# alpha_candidate = 0.5*inv_max

In [11]:
# Eigenvector centrality of every node, computed on the undirected graph G
# with the library's default settings.
eig = nx.eigenvector_centrality(G)
# katz = nx.katz_centrality(G, alpha = alpha_candidate, beta = inv_max)
# PageRank score of every node, likewise computed on G with default settings.
page_rank = nx.pagerank(G)

In [12]:
# Katz power iteration only converges when alpha < 1 / lambda_max, where
# lambda_max is the largest eigenvalue of the adjacency matrix. The library
# default alpha=0.1 violates that on this graph (lambda_max is at least the
# mean degree, which is well above 10), so the iteration cannot settle.
# Get lambda_max from the sparse adjacency matrix; a dense spectrum is not
# practical at this size. eigsh needs a floating-point matrix, hence astype.
adjacency = nx.adjacency_matrix(G).astype(float)
lambda_max = eigsh(adjacency, k=1, which="LA", return_eigenvectors=False)[0]

# Stay safely inside the convergence region: half of the critical value.
alpha_candidate = 0.5 / lambda_max
katz = nx.katz_centrality(G, alpha=alpha_candidate, max_iter=100)

KeyboardInterrupt: 

In [13]:
# Wrap each {node: score} mapping in a one-column DataFrame indexed by node,
# with the column named after the centrality measure.
# katz_df = pd.DataFrame.from_dict(katz, orient='index', columns = ['K centrality'])
eig_df, pg_df = (
    pd.DataFrame.from_dict(scores, orient='index', columns=[column_label])
    for scores, column_label in ((eig, 'E centrality'), (page_rank, 'PG centrality'))
)

In [14]:
# Replace any missing centrality scores with zero.
eig_df = eig_df.fillna(value=0.)
# katz_df = katz_df.fillna(value=0.)
pg_df = pg_df.fillna(value=0)

In [15]:
# Pull out the score column as a Series ranked from most to least central.
# This cell rebinds eig_df from a DataFrame to a Series; wrapping the current
# value in pd.DataFrame(...) first restores the 'E centrality' column when the
# cell is run a second time, so a re-run no longer raises KeyError.
eig_df = pd.DataFrame(eig_df)['E centrality'].sort_values(ascending=False)

In [17]:
# Show the 30 highest-ranked nodes by eigenvector centrality.
eig_df.head(n=30)


40981798     0.172362
43003845     0.165223
22462180     0.162535
34428380     0.162134
27633075     0.102091
31331740     0.101237
83943787     0.093309
18996905     0.092844
208132323    0.091336
117674417    0.088590
116036694    0.084743
88323281     0.080952
17868918     0.080584
440963134    0.078033
195475105    0.078023
238260874    0.077274
133055665    0.076692
8088112      0.076029
263838766    0.074988
153226312    0.072085
204317520    0.070844
28465635     0.069739
100318079    0.069466
26929220     0.069152
151338729    0.068640
270449528    0.067250
121533789    0.065762
280935165    0.065593
196680777    0.064730
276706356    0.064556
Name: E centrality, dtype: float64

In [None]:
# katz_df = katz_df['K centrality'].sort_values(ascending = False)

In [None]:
# katz_df_30 = katz_df.head(30)

In [18]:
# Pull out the score column as a Series ranked from highest to lowest PageRank.
# This cell rebinds pg_df from a DataFrame to a Series; wrapping the current
# value in pd.DataFrame(...) first restores the 'PG centrality' column when the
# cell is run a second time, so a re-run no longer raises KeyError.
pg_df = pd.DataFrame(pg_df)['PG centrality'].sort_values(ascending=False)

In [19]:
# Show the 30 highest-ranked nodes by PageRank.
pg_df.head(n=30)

115485051    0.001425
813286       0.001317
40981798     0.000867
7861312      0.000753
3359851      0.000701
43003845     0.000667
90420314     0.000623
16303106     0.000611
62581962     0.000590
972651       0.000586
15913        0.000586
1183041      0.000572
17093617     0.000550
22462180     0.000543
34428380     0.000539
14230524     0.000525
79797834     0.000524
10671602     0.000523
59804598     0.000522
15846407     0.000508
31353077     0.000507
14075928     0.000497
25521487     0.000496
63485337     0.000488
15439395     0.000485
428333       0.000472
11348282     0.000470
18927441     0.000450
783214       0.000444
48485771     0.000377
Name: PG centrality, dtype: float64