In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import ast
import numpy as np
from community import community_louvain

In [2]:
comments = pd.read_csv('reddit_comments.csv',encoding='utf-8')
posts = pd.read_csv('reddit_posts.csv',encoding='utf-8')

In [3]:
Posts = posts[['author', 'cleaned_tokens']]
Comments = comments[['author', 'cleaned_tokens']]

In [4]:
lPosts = pd.concat([Posts, Comments], ignore_index=True)

In [5]:
#str → list
def parse_tokens(tokens):
    if isinstance(tokens, str):
        try:
            return ast.literal_eval(tokens)
        except:
            return []
    return tokens
#color
def hex_to_rgb(hex_color):
    hex_color = hex_color.lstrip('#')
    return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))

In [6]:
def Lposts(combined_df, output_prefix="lPosts", min_freq=5):
    clean_df = combined_df[
        combined_df['author'].notna() & 
        ~combined_df['author'].isin(['[deleted]', '[removed]'])
    ].copy()
    clean_df['cleaned_tokens'] = clean_df['cleaned_tokens'].apply(parse_tokens)
    edges = []
    for _, row in clean_df.iterrows():
        if isinstance(row['cleaned_tokens'], list):
            edges.extend([{'user': row['author'], 'keyword': token} 
                         for token in row['cleaned_tokens']])
    
    edges_df = pd.DataFrame(edges)
    keyword_counts = edges_df['keyword'].value_counts()
    edges_df = edges_df[edges_df['keyword'].isin(keyword_counts[keyword_counts >= min_freq].index)]
    edges_df.to_csv(f"{output_prefix}_user_keyword.csv", index=False)
    return edges_df

In [7]:
Lposts(lPosts)

Unnamed: 0,user,keyword
0,Own_Teacher3433,ever
1,Own_Teacher3433,land
2,Own_Teacher3433,moon
3,Ms_Photon,tell
4,Ms_Photon,think
...,...,...
1107756,EnoughNoLibsSpam,utterly
1107757,EnoughNoLibsSpam,irrelevant
1107758,EnoughNoLibsSpam,point
1107759,EnoughNoLibsSpam,earth


In [16]:
def community_analysis(filename, gephi_name, target_communities=None):
    edges = pd.read_csv(filename, usecols=['user', 'keyword'], dtype={'user': 'category', 'keyword': 'category'})
    user_nodes = edges['user'].unique()
    keyword_nodes = edges['keyword'].unique()
    G = nx.DiGraph()
    G.add_nodes_from(user_nodes, bipartite=0)
    G.add_nodes_from(keyword_nodes, bipartite=1)
    edge_tuples = list(zip(edges['user'], edges['keyword']))
    G.add_edges_from(edge_tuples)

    G_undirected = G.to_undirected(as_view=True)
    partition = community_louvain.best_partition(G_undirected, resolution=1.0, random_state=42)
    print(f"Total number of communities: {len(set(partition.values()))}")
    degrees = dict(G.degree())
    all_communities = set(partition.values())
    all_leaders = {
        comm: max((n for n in G.nodes if partition[n] == comm), key=lambda x: degrees[x])
        for comm in all_communities
    }
    for comm in sorted(all_leaders):
        leader = all_leaders[comm]
        print(f"Community {comm}: Leader = '{leader}', Degree = {degrees[leader]}")
    if target_communities is not None:
        filtered_nodes = [n for n in G.nodes if partition[n] in target_communities]
        G = G.subgraph(filtered_nodes).copy()
        partition = {n: partition[n] for n in filtered_nodes}

    communities = set(partition.values())
    num_communities = len(communities)
    cmap = plt.colormaps['tab20'].resampled(num_communities)
    community_colors = {}

    for i, comm in enumerate(sorted(communities)):
        rgb = cmap(i)[:3]
        hex_color = '#%02x%02x%02x' % tuple(int(255*x) for x in rgb)
        community_colors[comm] = hex_color

    node_colors = {node: community_colors[comm] for node, comm in partition.items()}
    nx.set_node_attributes(G, node_colors, 'color')

    for comm, hex_col in community_colors.items():
        rgb = hex_to_rgb(hex_col)
        print(f"Community {comm}: RGB{rgb}")

    community_sizes = pd.Series(partition).value_counts().sort_index()
    print("Community Sizes:\n", community_sizes)

    degrees = dict(G.degree())
    community_leaders = {comm: max((n for n in G.nodes if partition[n] == comm), key=lambda x: degrees[x])
                         for comm in communities}
    print("\nCommunity Leaders:\n", community_leaders)

    node_sizes = {node: 10 for node in G.nodes}
    for leader in community_leaders.values():
        if leader in node_sizes:
            node_sizes[leader] = 20
    nx.set_node_attributes(G, node_sizes, 'size')

    pos = {}
    offset = 10_00
    for i, comm in enumerate(sorted(communities)):
        nodes_in_comm = [n for n in G.nodes if partition[n] == comm]
        subgraph = G.subgraph(nodes_in_comm)

        if len(nodes_in_comm) <= 3:
            sub_pos = nx.random_layout(subgraph)
        else:
            sub_pos = nx.spring_layout(subgraph, seed=42, k=0.2, iterations=50)

        dx, dy = (i * offset, i * offset)
        pos.update({node: (x + dx, y + dy) for node, (x, y) in sub_pos.items()})

    nx.set_node_attributes(G, {k: float(v[0]) for k, v in pos.items()}, 'x')
    nx.set_node_attributes(G, {k: float(v[1]) for k, v in pos.items()}, 'y')

    nx.write_graphml(G, f"{gephi_name}.graphml")

In [17]:
community_analysis("lPosts_user_keyword.csv", "lPosts", target_communities=[0,3,5,4])

Total number of communities: 20
Community 0: Leader = 'moon', Degree = 3120
Community 1: Leader = 'would', Degree = 1705
Community 2: Leader = 'Kazeite', Degree = 3163
Community 3: Leader = 'CuteBananaMuffin', Degree = 5925
Community 4: Leader = 'jonathan_92', Degree = 523
Community 5: Leader = 'Ok_Magician_1194', Degree = 262
Community 6: Leader = 'Buckyhead', Degree = 361
Community 7: Leader = 'iriebeatz', Degree = 9
Community 8: Leader = 'rawkstaugh', Degree = 23
Community 9: Leader = 'Occumsmachete', Degree = 12
Community 10: Leader = 'Claytertot', Degree = 30
Community 11: Leader = 'Vo_Sirisov', Degree = 591
Community 12: Leader = 'remindmeyears', Degree = 6
Community 13: Leader = 'duh', Degree = 12
Community 14: Leader = 'backwards', Degree = 34
Community 15: Leader = 'Gullible-Success-601', Degree = 9
Community 16: Leader = 'An_American1776', Degree = 119
Community 17: Leader = 'SgtSharki', Degree = 15
Community 18: Leader = 'ohhh', Degree = 4
Community 19: Leader = 'Anomalistic

HITS arithmetic

In [10]:
def hits_analysis(filename, top_n=10):
    edges = pd.read_csv(filename)
    G = nx.from_pandas_edgelist(
        edges, 
        source='user', 
        target='keyword', 
        create_using=nx.DiGraph()
    )
    hubs, authorities = nx.hits(G, max_iter=100, tol=1e-08, nstart=None, normalized=True)
    top_hubs = sorted(hubs.items(), key=lambda x: x[1], reverse=True)[:top_n]
    top_authorities = sorted(authorities.items(), key=lambda x: x[1], reverse=True)[:top_n]
    print("\nThe most important hub (users who quote important keywords):")
    for user, score in top_hubs:
        print(f"{user}: {score:.4f}")
    
    print("\nThe most important authoritative keywords (keywords quoted by important users):")
    for keyword, score in top_authorities:
        print(f"{keyword}: {score:.4f}")
    
    return hubs, authorities

In [11]:
hubs, authorities = hits_analysis("lPosts_user_keyword.csv")


The most important hub (users who quote important keywords):
CuteBananaMuffin: 0.0042
Kazeite: 0.0034
EnoughNoLibsSpam: 0.0031
TreyinHada: 0.0025
SgtBrutalisk: 0.0024
ConspiracyCornerNews: 0.0024
CatEyes420: 0.0024
trevorj414: 0.0023
qwertycoder: 0.0023
nickhintonn333: 0.0022

The most important authoritative keywords (keywords quoted by important users):
moon: 0.0024
would: 0.0020
like: 0.0019
one: 0.0019
people: 0.0019
time: 0.0017
landing: 0.0017
think: 0.0017
know: 0.0017
even: 0.0016


In [12]:
def community_hits_analysis(filename, gephi_name, target_communities=None):
    edges = pd.read_csv(filename)
    G = nx.from_pandas_edgelist(edges, source='user', target='keyword', create_using=nx.DiGraph)
    G_undirected = G.to_undirected()
    partition = community_louvain.best_partition(G_undirected, resolution=1.0)
    communities = set(partition.values())
    if target_communities is not None:
        communities = [c for c in communities if c in target_communities]
    
    for comm in sorted(communities):
        print(f"\n=== commmunity {comm} HITS analysis ===")
        nodes_in_comm = [n for n in G.nodes if partition[n] == comm]
        subgraph = G.subgraph(nodes_in_comm)
        
        if len(nodes_in_comm) > 10: 
            try:
                hubs, authorities = nx.hits(subgraph)
                top_hubs = sorted(hubs.items(), key=lambda x: x[1], reverse=True)[:5]
                top_auth = sorted(authorities.items(), key=lambda x: x[1], reverse=True)[:5]
                
                print("\nThe most important hub:")
                for user, score in top_hubs:
                    print(f"  {user}: {score:.4f}")
                
                print("\nThe most important keywords:")
                for keyword, score in top_auth:
                    print(f"  {keyword}: {score:.4f}")
                    
            except nx.PowerIterationFailedConvergence:
                print("HITS algorithm failed to converge with this community")
        else:
            #HITS analysis those with enough node and edge community, if it has lots of public edges it will be small
            print("The community is too small for HITS. Showing top degree nodes instead:")
            degrees = dict(subgraph.degree())
            top_nodes = sorted(degrees.items(), key=lambda x: x[1], reverse=True)[:3]
            for node, deg in top_nodes:
                print(f"  {node}: Degree = {deg}")

In [13]:
community_hits_analysis("lPosts_user_keyword.csv", "lPosts", target_communities=[0,3,5,4])


=== commmunity 0 HITS analysis ===
The community is too small for HITS. Showing top degree nodes instead:
  collecting: Degree = 2
  senatormerkin: Degree = 1
  Jolcski: Degree = 1

=== commmunity 3 HITS analysis ===

The most important hub:
  Kazeite: 0.0251
  EnoughNoLibsSpam: 0.0199
  canadian1987: 0.0125
  4544BeersOnTheWall: 0.0121
  CarbonSlayer72: 0.0121

The most important keywords:
  lunar: 0.0033
  around: 0.0032
  surface: 0.0031
  orbit: 0.0029
  without: 0.0028

=== commmunity 4 HITS analysis ===

The most important hub:
  einzelkind: 0.3659
  layomao: 0.3299
  Squirrelboy85: 0.0705
  ILoveBeerAndFishing: 0.0607
  MycelialArchetype: 0.0581

The most important keywords:
  der: 0.0999
  ich: 0.0933
  auch: 0.0933
  ist: 0.0933
  den: 0.0933

=== commmunity 5 HITS analysis ===
The community is too small for HITS. Showing top degree nodes instead:
  owen: Degree = 3
  Level_Hovercraft_825: Degree = 3
  No-Win-1137: Degree = 3
