In [2]:
import pandas as pd
import numpy as np
import networkx as nx

import matplotlib.pyplot as plt
import math as m

from utility import log

# Preprocessing
Before starting with the actual algorithms we need to apply some prepocessing. For example removing data that is not relevant for our research, and extracting all nodes and links.

In [5]:
# Load the datafile
log("Loading data")
file_path = "data/soc-redditHyperlinks-body.tsv"
raw_data = pd.read_csv(file_path, sep="\t") 

[14:32:32] Loading data


In [6]:
# Remove columns we won't be using
log("Dropping")
data = raw_data.drop(columns=['POST_ID', 'PROPERTIES'])

# Create a list of all nodes
log("Creating list of all nodes")
nodes = pd.concat([data['SOURCE_SUBREDDIT'], data['TARGET_SUBREDDIT']], axis=0)
nodes.drop_duplicates(inplace=True)


log(f"Nodes: {nodes.size:,}")
log(f"Links: {data.shape[0]:,}")

[14:33:26] Dropping
[14:33:26] Creating list of all nodes
[14:33:26] Nodes: 35,776
[14:33:26] Links: 286,561


In [7]:
# Reduce the amount of links to just one direct link.
# This can be useful if we want a more compressed view of the data.
links_compressed = data.drop(columns=["TIMESTAMP"]).groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).agg('sum')

log(f"Compressed links: {links_compressed.shape[0]:,}")

[14:33:29] Compressed links: 137,821


In [8]:
log("Preparing nx.MultiDiGraph()...")
mGraph = nx.MultiDiGraph()

log("Adding nodes...")
for node in nodes:
    mGraph.add_node(node)

log("Adding edges...")
for index, edge in data.iterrows():
    mGraph.add_edge(edge['SOURCE_SUBREDDIT'], edge['TARGET_SUBREDDIT'], sentiment=edge['LINK_SENTIMENT'])

[14:33:34] Preparing nx.MultiDiGraph()...
[14:33:34] Adding nodes...
[14:33:34] Adding edges...


# Anaylsis
This is the analysis of the network

In [34]:
log(f"mGraph > Vertices: \t{mGraph.number_of_nodes():,}")
log(f"mGraph > Edges: \t{mGraph.number_of_edges():,}")

in_degree = pd.DataFrame(mGraph.in_degree(), columns=['SOURCE', 'degree'])
out_degree = pd.DataFrame(mGraph.out_degree(), columns=['SOURCE', 'degree'])

in_degree.sort_values('degree', ascending=False, inplace=True)
out_degree.sort_values('degree', ascending=False, inplace=True)

log("In degree: (Most referenced subreddits)")
print(in_degree.head(5))

log("Out degree: (Referencing other subreddits)")
print(out_degree.head(5))

# Calculate (In/Out)-Degree centrality (in and out degree)
total_degree = in_degree.merge(out_degree, on='SOURCE', suffixes=('_in', '_out'))
total_degree['total_degree_centrality'] = total_degree['degree_in'] + total_degree['degree_out']
total_degree_centrality = total_degree.sort_values('total_degree_centrality', ascending=False)

log("Total Degree-centrality:")
print(total_degree.head(5))

# Calculate betweenness centrality using an approximation measure for time reasons
# If we calculate betweenness centrality for k=n then it will take too much (we can use a sample of k nodes)

# TODO: Maybe here calculate multiple betweeness centrality and average the result
betweennes_calculation = nx.betweenness_centrality(mGraph, k = 100)
betweenness_centrality = pd.DataFrame(betweennes_calculation.items(), columns=['SOURCE', 'betweenness_centrality'])

betweenness_centrality = betweenness_centrality.sort_values('betweenness_centrality', ascending=False)

log("Betweenness Centrality: (Influence of node in graph)")
print(betweenness_centrality.head(5))

# TODO: Calculate closeness centrality: This takes too LONG
# closeness_calculation = nx.closeness_centrality(mGraph)
# closeness_centrality = pd.DataFrame(closeness_calculation.items(), columns=['SOURCE', 'closeness_centrality'])
# closeness_centrality = closeness_centrality.sort_values('closeness_centrality', ascending=False)

# print("Closeness Centrality:")
# print(closeness_centrality.head(5))

# Calculate eigenvector centrality. We need a function
# which does that for multiGraph instead of singleGraph
def m_eigenvector_centrality(G, max_iter=100, tol=1e-6):
    nodes = G.nodes()
    n = len(nodes)
    eigenvector = {node: 1 / n for node in nodes}

    # Go until max iterations
    for _ in range(max_iter):
        new_eigenvector = {}

        # Find centrality values
        for node in nodes:
            centrality = sum(eigenvector[v] for v in G.successors(node))
            new_eigenvector[node] = centrality

        # Normalize the eigenvector
        norm = np.linalg.norm(list(new_eigenvector.values()))
        new_eigenvector = {k: v / norm for k, v in new_eigenvector.items()}

        # Check for convergence
        if sum((new_eigenvector[node] - eigenvector[node])**2 for node in nodes) < tol:
            return new_eigenvector

        eigenvector = new_eigenvector
    return eigenvector

eigenvector_calculation = m_eigenvector_centrality(mGraph)
eigenvector_centrality = pd.DataFrame(eigenvector_calculation.items(), columns=['SOURCE', 'eigenvector_centrality'])
eigenvector_centrality = eigenvector_centrality.sort_values('eigenvector_centrality', ascending=False)

log("Top 5 Nodes by Eigenvector Centrality:")
print(eigenvector_centrality.head(5))



[16:14:14] mGraph > Vertices: 	35,776
[16:14:14] mGraph > Edges: 	286,561
[16:14:14] In degree: (Most referenced subreddits)
               SOURCE  degree
34          askreddit    7329
22               iama    3694
7741             pics    2779
42     writingprompts    2490
11294          videos    2446
[16:14:14] Out degree: (Referencing other subreddits)
               SOURCE  degree
70     subredditdrama    4665
16        circlebroke    2358
6400  shitliberalssay    1968
126      outoftheloop    1958
79          copypasta    1824
[16:14:14] Total Degree-centrality:
           SOURCE  degree_in  degree_out  total_degree_centrality
0       askreddit       7329        1338                     8667
1            iama       3694        1181                     4875
2            pics       2779           4                     2783
3  writingprompts       2490        1707                     4197
4          videos       2446           4                     2450
[16:14:25] Betweenness Centra