In [35]:
import pandas as pd
import numpy as np
import networkx as nx

import matplotlib.pyplot as plt
import math as m

from utility import log

# Preprocessing
Before starting with the actual algorithms, we need to apply some preprocessing: for example, removing data that is not relevant to our research and extracting all nodes and links.

In [18]:
# Load the raw hyperlink dataset (tab-separated) into a DataFrame.
log("Loading data")
# A forward slash is used as the separator: the previous literal contained
# "\s", which is an invalid escape sequence (Python warns about it) and only
# resolved as a path on Windows. "/" is accepted on every platform.
file_path = "data/soc-redditHyperlinks-body.tsv"
raw_data = pd.read_csv(file_path, sep="\t")

[09:59:34] Loading data


In [19]:
# Discard the columns that are not used further on.
log("Dropping")
data = raw_data.drop(columns=['POST_ID', 'PROPERTIES'])

# Gather every distinct subreddit name that appears as either the source
# or the target of a link.
log("Creating list of all nodes")
nodes = pd.concat(
    [data['SOURCE_SUBREDDIT'], data['TARGET_SUBREDDIT']],
    axis=0,
).drop_duplicates()

log(f"Nodes: {nodes.size:,}")
log(f"Links: {data.shape[0]:,}")

[09:59:50] Dropping
[09:59:50] Creating list of all nodes
[09:59:50] Nodes: 35,776
[09:59:50] Links: 286,561


In [22]:
# Collapse parallel links into a single row per (source, target) pair; the
# remaining columns are summed within each pair.
# Useful when a more compact view of the data is wanted.
links_compressed = (
    data
    .drop(columns=["TIMESTAMP"])
    .groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])
    .sum()
)

log(f"Compressed links: {links_compressed.shape[0]:,}")

[10:06:25] Compressed links: 137,821


In [23]:
# Build a directed multigraph: one node per entry in `nodes` and one edge per
# row of `data`, so repeated links between the same pair remain separate edges.
log("Preparing nx.MultiDiGraph()...")
mGraph = nx.MultiDiGraph()

log("Adding nodes...")
mGraph.add_nodes_from(nodes)

log("Adding edges...")
# Iterate the three columns in lockstep rather than via DataFrame.iterrows():
# iterrows() constructs a full Series object for every row, which is
# needlessly slow when there is one edge per row of a large frame.
for source, target, sentiment in zip(
    data['SOURCE_SUBREDDIT'],
    data['TARGET_SUBREDDIT'],
    data['LINK_SENTIMENT'],
):
    mGraph.add_edge(source, target, sentiment=sentiment)

[10:17:25] Preparing nx.MultiDiGraph()...
[10:17:25] Adding nodes...
[10:17:25] Adding edges...


# Analysis
This is the analysis of the network

In [69]:
# Report the overall size of the graph.
log(f"mGraph > Vertices: \t{mGraph.number_of_nodes():,}")
log(f"mGraph > Edges: \t{mGraph.number_of_edges():,}")

# Tabulate the in- and out-degree of every node, largest degree first.
in_degree = (
    pd.DataFrame(mGraph.in_degree(), columns=['SOURCE', 'degree'])
    .sort_values('degree', ascending=False)
)
out_degree = (
    pd.DataFrame(mGraph.out_degree(), columns=['SOURCE', 'degree'])
    .sort_values('degree', ascending=False)
)

# Show the five highest-ranked nodes of each table.
log("In degree: (Most referenced subreddits)")
print(in_degree.head(5))

log("Out degree: (Referencing other subreddits)")
print(out_degree.head(5))


[10:54:00] mGraph > Vertices: 	35,776
[10:54:00] mGraph > Edges: 	286,561
[10:54:00] In degree: (Most referenced subreddits)
               SOURCE  degree
34          askreddit    7329
22               iama    3694
7741             pics    2779
42     writingprompts    2490
11294          videos    2446
[10:54:00] Out degree: (Referencing other subreddits)
               SOURCE  degree
70     subredditdrama    4665
16        circlebroke    2358
6400  shitliberalssay    1968
126      outoftheloop    1958
79          copypasta    1824
