In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import networkx as nx


# Dataload

We load the graph, in which every node carries a "community" attribute. To work with the post text, we drop every post that is empty, as well as every post whose text is \[deleted\] or \[removed\]. These are standard Reddit placeholders: \[deleted\] means the user deleted the post themselves, while \[removed\] means the post was removed by an admin or moderator.

In [54]:
# Read the Reddit graph from a GML file (path is relative to the working
# directory). Later cells read each node's "community" and "posts" attributes.
G = nx.read_gml('reddit-all-posts-with-communities.gml')

In [None]:
# Flatten the graph into one record per usable post: node id, post text and
# the community of the node the post belongs to.

# Placeholder texts that stand in for posts whose content no longer exists.
PLACEHOLDER_POSTS = {'[deleted]', '[removed]'}
ROW_COLUMNS = ["id", "text", "community"]

rows = []

# The node view is only read here, never mutated, so it is iterated directly
# instead of first being copied into a list.
for node, data in G.nodes(data=True):
    community = data.get("community")
    posts = data.get("posts", [])

    # "posts" may come back as a single value instead of a list (presumably
    # for nodes with exactly one post -- TODO confirm); normalise to a list.
    if not isinstance(posts, list):
        posts = [posts]

    for post in posts:
        # Only non-empty strings are usable text. Any other value (e.g. a
        # number parsed from an unquoted GML value) would make len() raise,
        # so it is skipped rather than aborting the whole load.
        if not isinstance(post, str) or not post:
            continue
        if post in PLACEHOLDER_POSTS:
            continue
        rows.append({"id": node, "text": post, "community": community})

# Naming the columns keeps the expected schema even when no post survives the
# filter, so later cells can still select df['community'].
df = pd.DataFrame(rows, columns=ROW_COLUMNS)

In [70]:
# Count how many rows (posts) of df fall into each community.
community_labels = df['community']

# Distinct community labels, in order of first appearance in df.
communities = community_labels.unique().tolist()

# value_counts() tallies every label in one vectorised pass; iterating the
# frame with iterrows() builds a Series per row and is far slower here.
post_counts = community_labels.value_counts(sort=False).to_dict()

# Rebuild the mapping in first-appearance order so the resulting dict does not
# depend on the ordering value_counts() happens to return.
communities_dict = {community: post_counts[community] for community in communities}

In [71]:
# Dump the complete community -> row-count mapping. This prints one entry per
# community, so the output is very long.
print(communities_dict)

{4199: 584179, 7969: 659990, 10441: 157390, 11441: 428489, 4565: 661499, 1: 1, 2: 1, 10894: 323655, 3: 1, 4: 3, 5: 1, 6: 1, 8: 1, 9: 4, 10: 1, 11: 2, 13: 4, 14: 1, 14638: 4, 15: 1, 16: 1, 17: 1, 18: 9, 19: 3, 21: 1, 22: 1, 23: 2, 24: 8, 26: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 2, 32: 1, 34: 3, 35: 1, 36: 1, 37: 3, 39: 5, 40: 1, 41: 1, 42: 2, 43: 1, 45: 1, 46: 1, 47: 1, 48: 1, 49: 1, 51: 1, 52: 1, 53: 1, 54: 1, 55: 3, 56: 1, 58: 1, 59: 1, 60: 5, 61: 1, 62: 1, 63: 1, 64: 1, 65: 14, 6403: 156, 66: 1, 67: 2, 68: 1, 69: 1, 70: 1, 72: 1, 73: 42, 74: 1, 75: 1, 76: 2, 77: 7, 78: 1, 79: 1, 80: 4, 81: 1, 82: 2, 83: 1, 84: 3, 86: 1, 87: 1, 88: 1, 89: 1, 90: 1, 92: 1, 93: 1, 94: 1, 95: 1, 96: 1, 98: 7, 99: 1, 100: 1, 101: 1, 102: 7, 103: 1, 104: 2, 105: 1, 106: 3, 107: 1, 108: 1, 109: 1, 110: 1, 111: 2, 112: 2, 115: 1, 116: 1, 118: 2, 120: 1, 121: 1, 122: 1, 123: 6, 125: 1, 126: 1, 127: 1, 128: 1, 129: 1, 130: 1, 131: 1, 132: 3, 133: 1, 134: 1, 135: 1, 136: 1, 137: 1, 138: 1, 140: 2, 141: 3, 142: 1,