In [None]:
import pandas as pd
import math
import os
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Locate the data directory relative to the current working directory and
# print the name of every entry found in it.
wd = os.getcwd()
data_path = f"{wd}/data/"
for entry in os.listdir(data_path):
    print(entry)

In [None]:
# Parse the tab-separated posts dump into `df`, one row per post or comment.
# Column names, in file order; the last column is free text and may itself
# contain tabs, so lines are split at most len(POST_COLUMNS) - 1 times.
POST_COLUMNS = ["ref", "comment_id", "user_id", "parent", "timestamp", "title", "text"]

rows = []
posts_file = os.path.join(data_path, "reddit-IncelExit-posts.anon.txt")
with open(posts_file, "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line or line.startswith(("link\t", "url\t")):
            continue  # skip blank lines and the two header-ish lines
        parts = line.split("\t", len(POST_COLUMNS) - 1)
        # Short lines are padded with empty strings so every row has all columns.
        if len(parts) < len(POST_COLUMNS):
            parts += [""] * (len(POST_COLUMNS) - len(parts))
        rows.append(parts)

df = pd.DataFrame(rows, columns=POST_COLUMNS)

# Turn literal backslash-n sequences into real newlines. `regex=False` is
# required: when the pattern is interpreted as a regular expression (the
# default before pandas 2.0) "\\n" matches an actual newline character and
# the replacement silently does nothing.
df["title"] = df["title"].str.replace("\\n", "\n", regex=False)
df["text"] = df["text"].str.replace("\\n", "\n", regex=False)

# The timestamp column holds strings. Convert to numbers first, because
# passing strings together with `unit=` to `to_datetime` is deprecated.
# Anything that is not a number becomes NaT.
df["timestamp_dt"] = pd.to_datetime(
    pd.to_numeric(df["timestamp"], errors="coerce"),
    unit="s",
    errors="coerce",
)


In [None]:
# Number of rows parsed from the posts file.
len(df)

In [None]:
# Display the parsed frame for a visual sanity check.
df

In [None]:
# Alias the parsed posts and take a private copy to work on, so `df` itself is
# not modified. NOTE(review): the following cell performs the same setup again
# before building the graph, which makes this cell redundant on a
# top-to-bottom run.
incels_reduced = df  # Define this first
incels_reduced_small = incels_reduced.copy()  # Work with a copy
# Replace every missing value in the copy with False, in place.
incels_reduced_small.fillna(False, inplace=True)

In [None]:
def _build_reply_graph(posts):
    """Resolve posts into threads and derive the user-to-user reply edges.

    Rows whose ``parent`` is the empty string are root posts. Every other row
    is attached once its parent has been resolved, walking the threads one
    reply level at a time. Rows are indexed by parent id up front, so each row
    is visited once and there is no limit on thread depth.

    Parameters
    ----------
    posts : pandas.DataFrame
        Must provide the columns ``comment_id``, ``user_id``, ``parent`` and
        ``text``.

    Returns
    -------
    tuple
        ``(chats, edges, nodes, node_attributes)``:

        * ``chats`` maps a resolved post id to ``{'text', 'user_id'}``;
        * ``edges`` lists ``(replying_user, parent_user)`` pairs, one per
          reply, excluding replies to oneself;
        * ``nodes`` is the set of user ids with at least one resolved post;
        * ``node_attributes`` maps a user id to ``{'posts': [texts...]}``.
    """
    chats = {}
    edges = []
    nodes = set()
    node_attributes = {}

    def record(c_id, u_id, text):
        # Register the post and credit its text to the author's node.
        chats[c_id] = {'text': text, 'user_id': u_id}
        node_attributes.setdefault(u_id, {'posts': []})['posts'].append(text)
        nodes.add(u_id)

    # Root posts seed the traversal; every other row waits under its parent id.
    replies_by_parent = {}
    columns = (posts["comment_id"], posts["user_id"], posts["parent"], posts["text"])
    for c_id, u_id, p_id, text in zip(*columns):
        if p_id == '':
            record(c_id, u_id, text)
        else:
            replies_by_parent.setdefault(p_id, []).append((c_id, u_id, text))

    print(f"Root posts found: {len(chats)}")

    # Each iteration resolves the direct replies to the posts resolved in the
    # previous one, so iteration k handles replies at depth k.
    frontier = list(chats)
    iteration = 0
    while frontier:
        resolved = []
        for p_id in frontier:
            parent_user = chats[p_id]['user_id']
            for c_id, u_id, text in replies_by_parent.pop(p_id, ()):
                if c_id in chats:
                    continue  # duplicate id: keep the copy resolved first
                record(c_id, u_id, text)
                if parent_user != u_id:  # Avoid self-loops
                    edges.append((u_id, parent_user))
                resolved.append(c_id)

        print(f"Iteration {iteration + 1}: Processing {len(resolved)} comments")
        if not resolved:
            print("No more comments can be processed - breaking")
            break
        frontier = resolved
        iteration += 1

    return chats, edges, nodes, node_attributes


incels_reduced = df  # Define this first
# Work on a filled copy so `df` itself is never modified.
incels_reduced_small = incels_reduced.fillna(False)

chats, edges, nodes, node_attributes = _build_reply_graph(incels_reduced_small)

# Keep only the rows that could not be attached to any thread (their parent
# never appears as a resolved post), so they remain available for inspection.
incels_reduced_small = incels_reduced_small[
    ~incels_reduced_small["comment_id"].isin(set(chats))
]
print(f"Remaining rows: {len(incels_reduced_small)}")

print("\nFinal results:")
print(f"Total comments processed: {len(chats)}")
print(f"Total edges created: {len(edges)}")


In [None]:
# Collapse the directed reply pairs into one representative per unordered
# user pair; the orientation seen first is the one that is kept.
distinct_edges = set()
for src, dst in edges:
    already_seen = (src, dst) in distinct_edges or (dst, src) in distinct_edges
    if not already_seen:
        distinct_edges.add((src, dst))


In [None]:
# Number of distinct unordered user pairs with at least one reply between them.
len(distinct_edges)

In [None]:
# Count replies per unordered user pair, keyed by the orientation that is
# stored in `distinct_edges`.
weighted_edges = {}
for src, dst in edges:
    if (src, dst) in distinct_edges:
        pair = (src, dst)
    elif (dst, src) in distinct_edges:
        pair = (dst, src)
    else:
        continue  # pair is not among the distinct edges
    weighted_edges[pair] = weighted_edges.get(pair, 0) + 1

In [None]:
# Flatten {(u, v): weight} into (u, v, weight) triples.
weighted_edges_list = [(*pair, weight) for pair, weight in weighted_edges.items()]

In [None]:
# Undirected user graph: one node per user id in `nodes`, one edge per user
# pair, with the reply count stored under the "weight" edge attribute.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_weighted_edges_from(weighted_edges_list, weight="weight")


In [None]:
# Record the post count as 'n_posts' for every user with at least 5 posts;
# users below that threshold get no 'n_posts' entry at all.
for user_id, attrs in node_attributes.items():
    post_count = len(attrs['posts'])
    if post_count >= 5:
        attrs['n_posts'] = post_count

In [None]:
# Copy each user's attribute dict ('posts', plus 'n_posts' where it was set)
# onto the matching node of G.
nx.set_node_attributes(G, node_attributes)

In [None]:
# Keep only users with at least 10 recorded posts and take an independent
# copy of the subgraph they induce.
subgraph_nodes = []
for node, attr in G.nodes(data=True):
    if attr.get('n_posts', 0) >= 10:
        subgraph_nodes.append(node)
subgraph_ = G.subgraph(subgraph_nodes).copy()

In [None]:

# Draw the subgraph: node size grows with degree, node colour encodes the
# number of posts.
undirected_subgraph = subgraph_.to_undirected()

# A seeded spring layout provides the starting positions for Kamada-Kawai,
# so re-running the cell reproduces the same figure.
spring_pos = nx.spring_layout(undirected_subgraph, k=5, seed=42)
pos = nx.kamada_kawai_layout(
    undirected_subgraph,
    pos=spring_pos,
    weight=None,
    scale=20,
)

deg = dict(undirected_subgraph.degree())
node_sizes = [20 + deg.get(node, 0) * 0.5 for node in undirected_subgraph.nodes()]  # base size + scale factor
n_posts = [undirected_subgraph.nodes[node].get('n_posts', 0) for node in undirected_subgraph.nodes()]

cmap = plt.cm.viridis

fig, ax = plt.subplots(figsize=(12, 9))

# Deliberately NOT named `nodes`: that global holds the set of user ids the
# graph is built from, and rebinding it here would break a re-run of the
# graph-construction cell.
node_collection = nx.draw_networkx_nodes(
    undirected_subgraph,
    pos,
    node_size=node_sizes,
    node_color=n_posts,
    cmap=cmap,
    ax=ax,
)
nx.draw_networkx_edges(undirected_subgraph, pos, width=0.1, alpha=0.5, ax=ax)

colorbar = fig.colorbar(node_collection, ax=ax)
colorbar.set_label('Number of posts')
ax.set_axis_off()
fig.tight_layout()
plt.show()

In [None]:
# Top 5 posters, counted by how often each user id occurs in df.
top_from_df = df['user_id'].value_counts().head(5)
print("Top 5 posters (from df):")
print(top_from_df)

# When the graph-building cell has run, also rank users by the number of
# posts collected for them in node_attributes.
if 'node_attributes' in globals():
    post_counts = [
        (uid, len(attrs.get('posts', [])))
        for uid, attrs in node_attributes.items()
    ]
    post_counts.sort(key=lambda pair: pair[1], reverse=True)
    top_from_nodes = post_counts[:5]
    print("\nTop 5 posters (from node_attributes):")
    for rank, (uid, cnt) in enumerate(top_from_nodes, start=1):
        print(f"{rank}. {uid}: {cnt} posts")

In [None]:
# Show the user ids that were selected for the plotted subgraph.
subgraph_nodes