In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import ast
import numpy as np
from community import community_louvain

In [2]:
comments = pd.read_csv('reddit_comments.csv',encoding='utf-8')
posts = pd.read_csv('reddit_posts.csv',encoding='utf-8')

In [3]:
Posts = posts[['author', 'cleaned_tokens']]
Comments = comments[['author', 'cleaned_tokens']]

In [4]:
lPosts = pd.concat([Posts, comments], ignore_index=True)

In [5]:
#str → list
def parse_tokens(tokens):
    if isinstance(tokens, str):
        try:
            return ast.literal_eval(tokens)
        except:
            return []
    return tokens

In [6]:
def posts(dataframe_name):
    dataframe_name['cleaned_tokens'] = dataframe_name['cleaned_tokens'].apply(parse_tokens)
    lPosts = dataframe_name[dataframe_name['author'].notna()]
    lPosts = lPosts[~lPosts['author'].isin(['[deleted]', '[removed]'])]
    edges = []
    for _, row in lPosts.iterrows():
        user = row['author']
        tokens = row['cleaned_tokens']
        if not isinstance(tokens, list):
            continue
        for token in tokens:
            edges.append({'user': user, 'keyword': token})
    edges_df = pd.DataFrame(edges)
    min_freq = 5
    keyword_counts = edges_df['keyword'].value_counts()
    edges_df = edges_df[edges_df['keyword'].isin(keyword_counts[keyword_counts >= min_freq].index)]
    edges_df.to_csv("posts_community_user_keyword.csv", index=False)
    return edges_df

In [7]:
def comments(dataframe_name):
    dataframe_name['cleaned_tokens'] = dataframe_name['cleaned_tokens'].apply(parse_tokens)
    lPosts = dataframe_name[dataframe_name['author'].notna()]
    lPosts = lPosts[~lPosts['author'].isin(['[deleted]', '[removed]'])]
    edges = []
    for _, row in lPosts.iterrows():
        user = row['author']
        tokens = row['cleaned_tokens']
        if not isinstance(tokens, list):
            continue
        for token in tokens:
            edges.append({'user': user, 'keyword': token})
    edges_df = pd.DataFrame(edges)
    min_freq = 5
    keyword_counts = edges_df['keyword'].value_counts()
    edges_df = edges_df[edges_df['keyword'].isin(keyword_counts[keyword_counts >= min_freq].index)]
    edges_df.to_csv("comments_community_user_keyword.csv", index=False)
    return edges_df

In [8]:
posts(Posts)

Unnamed: 0,user,keyword
0,Own_Teacher3433,ever
1,Own_Teacher3433,land
2,Own_Teacher3433,moon
3,Ms_Photon,tell
4,Ms_Photon,think
...,...,...
653085,Mradul_Sharma,financially
653086,Mradul_Sharma,need
653087,Mradul_Sharma,good
653088,Mradul_Sharma,financial


In [9]:
comments(Comments)

Unnamed: 0,user,keyword
0,AutoModerator,###meta
1,AutoModerator,sticky
2,AutoModerator,comment
3,AutoModerator,rule
4,AutoModerator,apply
...,...,...
454666,EnoughNoLibsSpam,utterly
454667,EnoughNoLibsSpam,irrelevant
454668,EnoughNoLibsSpam,point
454669,EnoughNoLibsSpam,earth


In [14]:
def community_analysis1(filename,gephi_name):
    edges = pd.read_csv(filename)
    G = nx.from_pandas_edgelist(edges, source='user', target='keyword', create_using=nx.DiGraph)
    G_undirected = G.to_undirected()
    partition = community_louvain.best_partition(G_undirected, resolution=1.0)
    communities = set(partition.values())
    print(f"Detected {len(communities)} communities")
    nx.set_node_attributes(G, partition, 'community')
    
    num_communities = len(communities)
    cmap = plt.colormaps['tab20'].resampled(num_communities)
    community_colors = {}
    for i, comm in enumerate(sorted(communities)):
        rgb = cmap(i)[:3]
        hex_color = '#%02x%02x%02x' % tuple(int(255*x) for x in rgb)
        community_colors[comm] = hex_color
    node_colors = {node: community_colors[comm] for node, comm in partition.items()}
    nx.set_node_attributes(G, node_colors, 'color')
    
    community_sizes = pd.Series(partition).value_counts()
    print("Community Sizes:\n", community_sizes)
    community_leaders = {}
    for comm in communities:
        nodes_in_comm = [n for n in G.nodes if partition[n] == comm]
        subgraph = G.subgraph(nodes_in_comm)
        degrees = dict(subgraph.degree())
        leader = max(degrees.items(), key=lambda x: x[1])[0]
        community_leaders[comm] = leader
    print("\nCommunity Leaders:\n", community_leaders)  
    pos = {}
    offset = 10_000
    for i, comm in enumerate(sorted(communities)):
        nodes_in_comm = [n for n in G.nodes if partition[n] == comm]
        subgraph = G.subgraph(nodes_in_comm)
        if len(nodes_in_comm) <= 3:
            sub_pos = nx.random_layout(subgraph)
        else:
            sub_pos = nx.spring_layout(subgraph, seed=42, k=0.5, iterations=100)
        dx, dy = (i * offset, i * offset)
        for node, (x, y) in sub_pos.items():
            pos[node] = (x + dx, y + dy)

    nx.set_node_attributes(G, {k: float(v[0]) for k, v in pos.items()}, 'x')
    nx.set_node_attributes(G, {k: float(v[1]) for k, v in pos.items()}, 'y')

    nx.write_graphml(G, f"{gephi_name}.graphml")


In [11]:
def community_analysis2(filename,gephi_name):
    edges = pd.read_csv(filename)
    G = nx.from_pandas_edgelist(edges, source='user', target='keyword', create_using=nx.DiGraph)
    G_undirected = G.to_undirected()
    partition = community_louvain.best_partition(G_undirected, resolution=1.0)
    communities = set(partition.values())
    print(f"Detected {len(communities)} communities")
    nx.set_node_attributes(G, partition, 'community')
    
    num_communities = len(communities)
    cmap = plt.colormaps['tab20'].resampled(num_communities)
    community_colors = {}
    for i, comm in enumerate(sorted(communities)):
        rgb = cmap(i)[:3]
        hex_color = '#%02x%02x%02x' % tuple(int(255*x) for x in rgb)
        community_colors[comm] = hex_color
    node_colors = {node: community_colors[comm] for node, comm in partition.items()}
    nx.set_node_attributes(G, node_colors, 'color')
    
    community_sizes = pd.Series(partition).value_counts()
    print("Community Sizes:\n", community_sizes)
    community_leaders = {}
    for comm in communities:
        nodes_in_comm = [n for n in G.nodes if partition[n] == comm]
        subgraph = G.subgraph(nodes_in_comm)
        degrees = dict(subgraph.degree())
        leader = max(degrees.items(), key=lambda x: x[1])[0]
        community_leaders[comm] = leader
    print("\nCommunity Leaders:\n", community_leaders)
    node_sizes = {node: 10 for node in G.nodes}
    for leader in community_leaders.values():
        if leader in node_sizes:
            node_sizes[leader] = 20
    nx.set_node_attributes(G, node_sizes, 'size')
    pos = {}
    offset = 0
    for comm in communities:
        nodes = [n for n in G.nodes if partition[n] == comm]
        subgraph = G.subgraph(nodes)
        sub_pos = nx.spring_layout(subgraph, seed=42)  
        for n, p in sub_pos.items():
            pos[n] = (p[0] + offset, p[1])  
        offset += 3  

    nx.set_node_attributes(G, {k: float(v[0]) for k, v in pos.items()}, 'x')
    nx.set_node_attributes(G, {k: float(v[1]) for k, v in pos.items()}, 'y')

    nx.write_graphml(G, f"{gephi_name}.graphml")


In [12]:
community_analysis2('posts_community_user_keyword.csv','posts_community_user_keyword')

Detected 11 communities
Community Sizes:
 4     3183
1     2740
2     2709
0     1749
5      634
10     508
7      359
3      319
8      284
6      135
9       99
Name: count, dtype: int64

Community Leaders:
 {0: 'moon', 1: 'nickhintonn333', 2: 'CuteBananaMuffin', 3: 'JmoneyHimself', 4: 'trevorj414', 5: 'clemaneuverers', 6: 'labledcrazy', 7: 'EnoughNoLibsSpam', 8: 'No_Formal_3110', 9: 'djanalbeads', 10: '666isbad'}


In [15]:
community_analysis1('comments_community_user_keyword.csv','comments_community_user_keyword')

Detected 9 communities
Community Sizes:
 1    3367
5    2884
2    2255
0    2124
6    2028
3    1075
7      73
8      11
4       2
Name: count, dtype: int64

Community Leaders:
 {0: 'see', 1: 'Kazeite', 2: 'would', 3: 'like', 4: 'ding', 5: 'moon', 6: 'ConspiracyCornerNews', 7: 'made', 8: 'cool'}
