In [31]:
import pandas as pd
import numpy as np
import networkx as nx
import praw

import matplotlib.pyplot as plt
import math as m

from utility import log

from networkx.algorithms import approximation
from collections import Counter



# Preprocessing
Before starting with the actual algorithms we need to apply some prepocessing. For example removing data that is not relevant for our research, and extracting all nodes and links.

In [3]:
# Load the datafile
log("Loading data")
file_path = "data/soc-redditHyperlinks-body.tsv"
raw_data = pd.read_csv(file_path, sep="\t") 

[13:22:35] Loading data


In [4]:
# Remove columns we won't be using
log("Dropping")
data = raw_data.drop(columns=['POST_ID', 'PROPERTIES'])

# Create a list of all nodes
log("Creating list of all nodes")
nodes = pd.concat([data['SOURCE_SUBREDDIT'], data['TARGET_SUBREDDIT']], axis=0)
nodes.drop_duplicates(inplace=True)


log(f"Nodes: {nodes.size:,}")
log(f"Links: {data.shape[0]:,}")

[13:22:42] Dropping
[13:22:42] Creating list of all nodes
[13:22:42] Nodes: 35,776
[13:22:42] Links: 286,561


In [5]:
# Reduce the amount of links to just one direct link.
# This can be useful if we want a more compressed view of the data.
links_compressed = data.drop(columns=["TIMESTAMP"]).groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']).agg('sum')

log(f"Compressed links: {links_compressed.shape[0]:,}")

[13:22:45] Compressed links: 137,821


In [6]:
log("Preparing nx.MultiDiGraph()...")
mGraph = nx.MultiDiGraph()

log("Adding nodes...")
for node in nodes:
    mGraph.add_node(node)

log("Adding edges...")
for index, edge in data.iterrows():
    mGraph.add_edge(edge['SOURCE_SUBREDDIT'], edge['TARGET_SUBREDDIT'], sentiment=edge['LINK_SENTIMENT'])

[13:22:48] Preparing nx.MultiDiGraph()...
[13:22:48] Adding nodes...
[13:22:48] Adding edges...


# Anaylsis
This is the analysis of the network

In [7]:
log(f"mGraph > Vertices: \t{mGraph.number_of_nodes():,}")
log(f"mGraph > Edges: \t{mGraph.number_of_edges():,}")

in_degree = pd.DataFrame(mGraph.in_degree(), columns=['SOURCE', 'degree'])
out_degree = pd.DataFrame(mGraph.out_degree(), columns=['SOURCE', 'degree'])

in_degree.sort_values('degree', ascending=False, inplace=True)
out_degree.sort_values('degree', ascending=False, inplace=True)

log("In degree: (Most referenced subreddits)")
print(in_degree.head(5))

log("Out degree: (Referencing other subreddits)")
print(out_degree.head(5))


[09:22:44] mGraph > Vertices: 	35,776
[09:22:44] mGraph > Edges: 	286,561
[09:22:44] In degree: (Most referenced subreddits)
               SOURCE  degree
34          askreddit    7329
22               iama    3694
7741             pics    2779
42     writingprompts    2490
11294          videos    2446
[09:22:44] Out degree: (Referencing other subreddits)
               SOURCE  degree
70     subredditdrama    4665
16        circlebroke    2358
6400  shitliberalssay    1968
126      outoftheloop    1958
79          copypasta    1824


Calculation of centrality indices: Degree Centrality, Betweenness Centrality, Closeness Centrality and Eigenvector Centrality

In [46]:
# Calculate (In/Out)-Degree centrality (in and out degree)
total_degree = in_degree.merge(out_degree, on='SOURCE', suffixes=('_in', '_out'))
total_degree['total_degree_centrality'] = total_degree['degree_in'] + total_degree['degree_out']
total_degree_centrality = total_degree.sort_values('total_degree_centrality', ascending=False)

log("Total Degree-centrality:")
print(total_degree.head(5))

# Calculate betweenness centrality using an approximation measure for time reasons
# If we calculate betweenness centrality for k=n then it will take too much (we can use a sample of k nodes)

# TODO: Maybe here calculate multiple betweeness centrality and average the result
betweennes_calculation = nx.betweenness_centrality(mGraph, k = 100)
betweenness_centrality = pd.DataFrame(betweennes_calculation.items(), columns=['SOURCE', 'betweenness_centrality'])

betweenness_centrality = betweenness_centrality.sort_values('betweenness_centrality', ascending=False)

log("Betweenness Centrality: (Influence of node in graph)")
print(betweenness_centrality.head(5))


# Calculate eigenvector centrality. We need a function
# which does that for multiGraph instead of singleGraph
def m_eigenvector_centrality(G, max_iter=100, tol=1e-6):
    nodes = G.nodes()
    n = len(nodes)
    eigenvector = {node: 1 / n for node in nodes}

    # Go until max iterations
    for _ in range(max_iter):
        new_eigenvector = {}

        # Find centrality values
        for node in nodes:
            centrality = sum(eigenvector[v] for v in G.successors(node))
            new_eigenvector[node] = centrality

        # Normalize the eigenvector
        norm = np.linalg.norm(list(new_eigenvector.values()))
        new_eigenvector = {k: v / norm for k, v in new_eigenvector.items()}

        # Check for convergence
        if sum((new_eigenvector[node] - eigenvector[node])**2 for node in nodes) < tol:
            return new_eigenvector

        eigenvector = new_eigenvector
    return eigenvector

eigenvector_calculation = m_eigenvector_centrality(mGraph)
eigenvector_centrality = pd.DataFrame(eigenvector_calculation.items(), columns=['SOURCE', 'eigenvector_centrality'])
eigenvector_centrality = eigenvector_centrality.sort_values('eigenvector_centrality', ascending=False)

log("Top 5 Nodes by Eigenvector Centrality:")
print(eigenvector_centrality.head(5))

# Use a subset of nodes to calculate closeness centrality for. The subset is chosen as
# the combination of other centrality values
combined_centrality = total_degree_centrality.merge(betweenness_centrality, on='SOURCE')
combined_centrality = combined_centrality.merge(eigenvector_centrality, on='SOURCE')

# Just sum the values
combined_centrality['combined_centrality'] = (
    combined_centrality['total_degree_centrality'] +
    combined_centrality['betweenness_centrality'] +
    combined_centrality['eigenvector_centrality']
)

combined_centrality = combined_centrality.sort_values('combined_centrality', ascending=False)
# Select the top 20
top_nodes = combined_centrality.head(20)
top_nodes_list = top_nodes['SOURCE'].tolist()

closeness_centrality_calc = {}

for node in top_nodes_list:
    closeness_centrality_calc[node] = nx.closeness_centrality(mGraph, u=node)
closeness_centrality = pd.DataFrame(closeness_centrality_calc.items(), columns=['SOURCE', 'closeness_centrality'])
closeness_centrality = closeness_centrality.sort_values('closeness_centrality', ascending=False)

log("Top 5 Nodes by Closeness Centrality:")
print(closeness_centrality.head(5))

[10:55:26] Total Degree-centrality:
           SOURCE  degree_in  degree_out  total_degree_centrality
0       askreddit       7329        1338                     8667
1            iama       3694        1181                     4875
2            pics       2779           4                     2783
3  writingprompts       2490        1707                     4197
4          videos       2446           4                     2450
[10:55:37] Betweenness Centrality: (Influence of node in graph)
             SOURCE  betweenness_centrality
34        askreddit                0.059975
22             iama                0.057724
70   subredditdrama                0.049607
42   writingprompts                0.034253
126    outoftheloop                0.020895
[10:55:39] Top 5 Nodes by Eigenvector Centrality:
                   SOURCE  eigenvector_centrality
70         subredditdrama                0.284837
626                 drama                0.181029
79              copypasta               

Calculation of Clustering Coefficient

In [29]:
def f_clustering_coefficient(G, node):
    # List of nodes connected to it
    neighbors = list(G.neighbors(node))
    l = len(neighbors)
    # If there are less than two neighbors then coefficient = 0
    if l < 2:
        return 0.0
    
    # Sum all edges and divide it by all possible edges
    possible_edges = l * (l - 1) 
    actual_edges = sum(1 for v1 in neighbors for v2 in neighbors if G.has_edge(v1, v2))

    return actual_edges / possible_edges


clustering_coefficient_calc = {}
for node in mGraph.nodes():
    clustering_coefficient_calc[node] = f_clustering_coefficient(mGraph, node)

clustering_coefficient = pd.DataFrame(clustering_coefficient_calc.items(), columns=['SOURCE', 'clustering_coefficient'])
count_of_ones = len(clustering_coefficient[clustering_coefficient['clustering_coefficient'] == 1])

log("Number of nodes with a clustering coefficient = 1:")
print(count_of_ones)

[10:22:19] Number of nodes with a clustering coefficient = 1:
1498


Network Diameter + Number of Connected Components + Size of connected components + Density

In [39]:
strongly_connected_components = list(nx.strongly_connected_components(mGraph))
component_sizes = [len(component) for component in strongly_connected_components]
sorted_sizes = sorted(component_sizes, reverse=True)

log("Number of Strongly Connected Components:")
print(len(strongly_connected_components))
log("Top 5 Strongly Connected Components by size:")
for i, size in enumerate(sorted_sizes[:5], start=1):
    print(f"Component {i}: Size = {size} nodes")

density = nx.density(mGraph)
log("The density of the Graph is")
print(density)

# Calculate diameter only for the biggest SCC
largest_scc = max(strongly_connected_components, key=len)
largest_scc_subgraph = mGraph.subgraph(largest_scc)
# diameter = nx.diameter(largest_scc_subgraph, weight="weight", usebounds=True)
diameter = approximation.diameter(largest_scc_subgraph)

log("Diameter approximation of largest SCC:")
print(diameter)

[10:39:56] Number of Strongly Connected Components:
24071
[10:39:56] Top 5 Strongly Connected Components by size:
Component 1: Size = 11564 nodes
Component 2: Size = 6 nodes
Component 3: Size = 5 nodes
Component 4: Size = 4 nodes
Component 5: Size = 4 nodes
[10:39:56] The density of the Graph is
0.00022389565198911403
[10:39:58] Diameter approximation of largest SCC:
12


Find Cliques (Honestly I don't think this is necessary since afaik Cliques are defined for undirected graphs)

In [9]:
# Convert to undirected graph
undirected_graph = mGraph.to_undirected()

cliques = list(nx.find_cliques(undirected_graph))

sorted_cliques = sorted(cliques, key=lambda x: len(x), reverse=True)

# Print the 5 largest cliques
for i, clique in enumerate(sorted_cliques[:5], start=1):
    print(f"Largest Clique {i}, Length {len(clique)}: {clique}")

Largest Clique 1, Length 34: ['seahawks', 'nfl', 'denverbroncos', 'redskins', 'patriots', 'nyjets', 'eagles', 'saints', 'minnesotavikings', 'oaklandraiders', 'detroitlions', 'losangelesrams', 'tennesseetitans', '49ers', 'nygiants', 'buffalobills', 'jaguars', 'nfl_draft', 'chibears', 'browns', 'buccaneers', 'greenbaypackers', 'texans', 'chargers', 'steelers', 'falcons', 'ravens', 'panthers', 'miamidolphins', 'colts', 'azcardinals', 'cowboys', 'bengals', 'kansascitychiefs']
Largest Clique 2, Length 25: ['seahawks', 'nfl', 'stlouisrams', 'chargers', 'ravens', 'colts', 'bengals', 'greenbaypackers', 'chibears', 'redskins', 'texans', 'nyjets', 'detroitlions', 'panthers', 'azcardinals', 'kansascitychiefs', 'nfl_draft', 'losangelesrams', 'tennesseetitans', 'steelers', 'falcons', 'miamidolphins', '49ers', 'cowboys', 'nygiants']
Largest Clique 3, Length 24: ['seahawks', 'nfl', 'denverbroncos', 'redskins', 'patriots', 'nyjets', 'eagles', 'saints', 'minnesotavikings', 'oaklandraiders', 'detroitlio

Run homophily test with NSFW and SFW subreddits (not done yet)

Important nodes acting as Bridges

In [35]:
# We can convert to unidirected simple graph since it will not change the result of edge removal
simple_graph = mGraph.to_undirected()

# Find bridge edges in the undirected graph
bridge_edges = list(nx.bridges(simple_graph))

bridge_nodes = [node for edge in bridge_edges for node in edge]

# Count the occurrences of each node
top_nodes = Counter(bridge_nodes)

# Get the top 5 nodes with the highest counts
top_nodes = top_nodes.most_common(5)

# Print the top 5 nodes
print("Top 5 Nodes in Bridge Edges:")
for node, count in top_nodes:
    print(f"Node: {node}, Count: {count}")

Top 5 Nodes in Bridge Edges:
Node: askreddit, Count: 291
Node: writingprompts, Count: 188
Node: tipofmypenis, Count: 188
Node: iama, Count: 183
Node: mhoc, Count: 133


Implement Girvan-Newman algorithm (not done yet)