In [1]:
import pandas as pd
import numpy as np
from paths import *
import igraph as ig
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
df = pd.read_csv(OUTPUT_PATH / 'joined_comments.csv')
df['created'] = pd.to_datetime(df['created'], format='%Y-%m-%d %H:%M:%S')

In [3]:
df

Unnamed: 0,author,subreddit,score,created,link,body
0,u/lisztomania_0,AskMiddleEast,0.063841,2023-08-01 03:00:00,https://www.reddit.com/r/AskMiddleEast/comment...,It is an über talent that has become an über t...
1,u/JakeTheSandMan,worldnews,0.063841,2023-08-01 03:00:00,https://www.reddit.com/r/worldnews/comments/15...,“Armada” \n\nI prefer to call them floating hu...
2,u/justnmirrrs,worldnews,0.063841,2023-08-01 03:00:00,https://www.reddit.com/r/worldnews/comments/15...,"70% of France electricity is nuclear, so it co..."
3,u/liquid_cherry,worldnews,0.063841,2023-08-01 03:00:00,https://www.reddit.com/r/worldnews/comments/15...,In the United States you can burn the Bible an...
4,u/Grouchy-Bad-4190,AskMiddleEast,0.063841,2023-08-01 03:00:00,https://www.reddit.com/r/AskMiddleEast/comment...,Good
...,...,...,...,...,...,...
7877793,u/i_thrive_on_apathy,worldnews,0.064655,2024-03-01 01:59:00,https://www.reddit.com/r/worldnews/comments/1b...,American conservatives are also 99% braindead ...
7877794,u/[deleted],worldnews,0.063841,2024-03-01 01:59:00,https://www.reddit.com/r/worldnews/comments/1b...,[removed]
7877795,u/Charming-Engine4430,IsraelPalestine,0.064132,2024-03-01 01:59:00,https://www.reddit.com/r/IsraelPalestine/comme...,But that's not what they're trying to do. They...
7877796,u/Unpleasant_Classic,worldnews,0.063957,2024-03-01 01:59:00,https://www.reddit.com/r/worldnews/comments/1b...,"Nothing to see here, please move along."


In [None]:
df['submission_id'] = df['link'].str.extract(r'/comments/([a-z0-9]+)/')

print(f"Total dataset: {len(df):,} comments")
print(f"Authors: {df['author'].nunique():,}")
print(f"Subreddits: {df['subreddit'].unique()}")
print(f"Submissions: {df['submission_id'].nunique():,}")
print(f"Date range: {df['created'].min()} to {df['created'].max()}")

In [5]:
# Extract hour from 'created_time'
df['hour'] = df['created'].dt.hour

# Group by 'author_name' and calculate the mean hour
df_users = df.groupby('author')['hour'].mean().reset_index()

# Function to estimate timezone from mean hour
def estimate_timezone(hour):
    # Assuming most active time is between 12:00 - 21:00 local time
    local_hour = int (hour - 12)
    return f"UTC{local_hour:+03d}:00"

# Apply the timezone estimation function
df_users['estimated_timezone'] = df_users['hour'].apply(estimate_timezone)

df.drop('hour', axis=1, inplace=True)
# Count the number of users in each estimated timezone
timezone_distribution = df_users['estimated_timezone'].value_counts()
timezone_distribution

estimated_timezone
UTC+00:00    57445
UTC+01:00    36370
UTC+02:00    36178
UTC+03:00    34059
UTC+04:00    30607
UTC-01:00    29722
UTC+05:00    27425
UTC-02:00    25678
UTC+06:00    24179
UTC-03:00    22506
UTC+07:00    21508
UTC-04:00    19045
UTC+08:00    18988
UTC-05:00    16732
UTC+09:00    16009
UTC-06:00    15004
UTC-07:00    13862
UTC+10:00    13703
UTC-08:00    12830
UTC-09:00    12158
UTC-10:00    11967
UTC-11:00    11654
UTC+11:00    11361
UTC-12:00    10742
Name: count, dtype: int64

In [6]:
df['estimated_timezone'] = df_users['estimated_timezone']

As we have 7.9M posts from 530K authors across 8 subreddits + median author has only 2 posts, but top 13% create 80% of content + worldnews is MASSIVE (411K authors) - this dominates the network \ \

We have a problem that full network would have 87.8 BILLION edges - completely impossible \
Even top 80% of authors = 1.8 billion edges = ~5 hours to build

In [8]:
MAX_AUTHORS = 50000
MAX_AUTHORS_PER_SUB = 6000
SAMPLE_VIS = 1000

In [9]:
top_authors = df['author'].value_counts().head(MAX_AUTHORS).index
df = df[df['author'].isin(top_authors)]

print(f"Kept {len(df):,} posts ({len(df)/7877798*100:.1f}% of original)")
print(f"From {df['author'].nunique():,} authors")

Kept 5,871,708 posts (74.5% of original)
From 50,000 authors


In [10]:
edges = defaultdict(int)
total_author_pairs = 0

for sub in df['subreddit'].unique():
    sub_authors = df[df['subreddit'] == sub]['author'].unique()

    # Limit authors per subreddit
    if len(sub_authors) > MAX_AUTHORS_PER_SUB:
        # Keep most active in this subreddit
        sub_df = df[df['subreddit'] == sub]
        top_in_sub = sub_df['author'].value_counts().head(MAX_AUTHORS_PER_SUB).index
        sub_authors = [a for a in sub_authors if a in top_in_sub]

    print(f"  {sub}: {len(sub_authors):,} authors")

    # Create edges
    for i, author1 in enumerate(sub_authors):
        for author2 in sub_authors[i+1:]:
            edge = tuple(sorted([author1, author2]))
            edges[edge] += 1
            total_author_pairs += 1

    if (total_author_pairs % 1000000 == 0):
        print(f"    ...processed {total_author_pairs:,} pairs so far")

print(f"Total unique edges: {len(edges):,}")

  worldnews: 6,000 authors
  AskMiddleEast: 6,000 authors
  Palestine: 6,000 authors
  lebanon: 3,951 authors
  IsraelPalestine: 6,000 authors
  Israel: 6,000 authors
  israelexposed: 2,976 authors
  IsraelCrimes: 1,897 authors
Total unique edges: 95,653,283


In [11]:
g = ig.Graph()
all_authors = df['author'].unique().tolist()
g.add_vertices(all_authors)
g.add_edges(list(edges.keys()))
g.es['weight'] = list(edges.values())

print(f"Initial graph: {g.vcount():,} nodes, {g.ecount():,} edges")

# Keep only main component
print("Extracting largest connected component...")
g = g.connected_components().giant()
print(f"Main component: {g.vcount():,} nodes, {g.ecount():,} edges")

Initial graph: 50,000 nodes, 95,653,283 edges
Extracting largest connected component...
Main component: 25,745 nodes, 95,653,283 edges


In [12]:
degree = g.degree()
pagerank = g.pagerank(weights='weight')

g.vs['degree'] = degree
g.vs['pagerank'] = pagerank