In [62]:
# Environment setup
import sys
from pathlib import Path

# Add src to path
workspace_root = Path().cwd()
sys.path.insert(0, str(workspace_root / 'src'))

print(f"Project root: {workspace_root}")
print(f"Python version: {sys.version}")
print("✓ Environment configured")

Project root: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment
Python version: 3.12.0 (v3.12.0:0fb18b02c8, Oct  2 2023, 09:45:56) [Clang 13.0.0 (clang-1300.0.29.30)]
✓ Environment configured


In [63]:
# Imports
import pandas as pd
import polars as pl
from datetime import datetime
import json

# Thesis pipeline utilities
from thesis_pipeline.io.paths import get_data_path
from thesis_pipeline.io.parquet import read_parquet, write_parquet

print("✓ All imports successful")

✓ All imports successful


## 1. Load Submissions and Comments from Gold Layer

Load monthly submissions and comments, then merge to create thread pseudo-documents.

In [64]:
# Paths
gold_submissions = get_data_path('gold') / 'reddit' / 'submissions'
gold_comments = get_data_path('gold') / 'reddit' / 'comments'
output_path = get_data_path('qa', 'reddit', create=True)

print(f"Gold submissions: {gold_submissions}")
print(f"Gold comments: {gold_comments}")
print(f"Output: {output_path}")

Gold submissions: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit/submissions
Gold comments: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit/comments
Output: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit


In [65]:
# Load submissions (monthly files)
months = ['2016-09', '2016-10']

submissions_dfs = []
for month in months:
    df = read_parquet(gold_submissions / f'{month}.parquet')
    submissions_dfs.append(df)
    print(f"Loaded {month} submissions: {len(df):,}")

df_submissions = pd.concat(submissions_dfs, ignore_index=True)

print(f"\nTotal submissions: {len(df_submissions):,}")
print(f"Columns: {df_submissions.columns.tolist()}")
print(f"\nSample:")
print(df_submissions.head(2))

Loaded 2016-09 submissions: 386,214
Loaded 2016-10 submissions: 537,217

Total submissions: 923,431
Columns: ['submission_id', 'title', 'selftext', 'created_utc', 'subreddit_id', 'subreddit', 'num_comments']

Sample:
  submission_id                                              title selftext  \
0        50kc6b  Third Party Politics To combat Two Party Polit...            
1        50kc7a  Italy told to brace itself for 'September assa...            

   created_utc subreddit_id         subreddit  num_comments  
0   1472688004     t5_2cneq          politics             1  
1   1472688011     t5_3bwj3  abetterworldnews             0  


In [66]:
# Load comments (monthly files)
comments_dfs = []
for month in months:
    df = read_parquet(gold_comments / f'{month}.parquet')
    comments_dfs.append(df)
    print(f"Loaded {month} comments: {len(df):,}")

df_comments = pd.concat(comments_dfs, ignore_index=True)

print(f"\nTotal comments: {len(df_comments):,}")
print(f"Columns: {df_comments.columns.tolist()}")
print(f"\nUnique submissions referenced: {df_comments['submission_id'].nunique():,}")

Loaded 2016-09 comments: 3,766,133
Loaded 2016-10 comments: 4,932,790

Total comments: 8,698,923
Columns: ['comment_id', 'submission_id', 'created_utc', 'subreddit_id', 'subreddit', 'body']

Unique submissions referenced: 501,969


In [84]:
# Filter out bot comments
print("\nFiltering bot comments...")
before_filter = len(df_comments)
print(f"  Before: {before_filter:,}")

bot_signature = "*I am a bot, and this action was performed automatically."
df_comments = df_comments[~df_comments['body'].str.contains(bot_signature, na=False, regex=False)].copy()

after_filter = len(df_comments)
print(f"  After: {after_filter:,}")
print(f"  Removed: {before_filter - after_filter:,} bot comments")


Filtering bot comments...
  Before: 8,698,923
  After: 8,587,661
  Removed: 111,262 bot comments


## 2. Create Thread Pseudo-Documents

Combine submission text with all comments to create thread-level documents for topic modeling.

In [85]:
# Validate data
print("Data validation:")
print(f"  Submissions: {len(df_submissions):,}")
print(f"  Comments: {len(df_comments):,}")
print(f"  Submissions with missing title: {df_submissions['title'].isna().sum()}")
print(f"  Submissions with missing selftext: {df_submissions['selftext'].isna().sum()}")
print(f"  Comments with missing body: {df_comments['body'].isna().sum()}")

# Date ranges
sub_dates = pd.to_datetime(df_submissions['created_utc'], unit='s')
com_dates = pd.to_datetime(df_comments['created_utc'], unit='s')
print(f"\nDate ranges:")
print(f"  Submissions: {sub_dates.min().date()} to {sub_dates.max().date()}")
print(f"  Comments: {com_dates.min().date()} to {com_dates.max().date()}")

Data validation:
  Submissions: 923,431
  Comments: 8,587,661
  Submissions with missing title: 0
  Submissions with missing selftext: 0
  Comments with missing body: 0

Date ranges:
  Submissions: 2016-09-01 to 2016-10-31
  Comments: 2016-09-01 to 2016-10-31


In [86]:
# Group comments by submission
print("Grouping comments by submission...")

comment_groups = df_comments.groupby('submission_id').agg({
    'body': lambda x: ' '.join(x.fillna('').astype(str)),
    'comment_id': 'count'
}).rename(columns={'body': 'all_comments_text', 'comment_id': 'n_comments'}).reset_index()

print(f"Grouped into {len(comment_groups):,} submissions with comments")
print(f"Comments per submission: min={comment_groups['n_comments'].min()}, "
      f"median={comment_groups['n_comments'].median():.0f}, "
      f"max={comment_groups['n_comments'].max()}")

Grouping comments by submission...
Grouped into 472,803 submissions with comments
Comments per submission: min=1, median=3, max=24157


In [87]:
# Merge submissions with comment aggregations
print("Creating thread pseudo-documents...")

# Clean submission text
df_submissions['title'] = df_submissions['title'].fillna('').astype(str)
df_submissions['selftext'] = df_submissions['selftext'].fillna('').astype(str)

# Merge
thread_pseudodocs = df_submissions.merge(
    comment_groups, 
    on='submission_id', 
    how='left'
)

# Fill missing values (submissions with no comments)
thread_pseudodocs['all_comments_text'] = thread_pseudodocs['all_comments_text'].fillna('')
thread_pseudodocs['n_comments'] = thread_pseudodocs['n_comments'].fillna(0).astype(int)

# Create pseudo-document: title + selftext + all comments
thread_pseudodocs['pseudodoc_text'] = (
    thread_pseudodocs['title'] + ' ' + 
    thread_pseudodocs['selftext'] + ' ' + 
    thread_pseudodocs['all_comments_text']
).str.strip()

print(f"\nCreated {len(thread_pseudodocs):,} thread pseudo-documents")
print(f"  With comments: {(thread_pseudodocs['n_comments'] > 0).sum():,}")
print(f"  Without comments: {(thread_pseudodocs['n_comments'] == 0).sum():,}")

Creating thread pseudo-documents...

Created 923,431 thread pseudo-documents
  With comments: 472,803
  Without comments: 450,628


In [88]:
# Filter to only submissions with at least one comment
# (No discussion = cannot measure polarization)
print(f"\nFiltering to submissions with >= 1 comment...")
print(f"  Before: {len(thread_pseudodocs):,}")

thread_pseudodocs = thread_pseudodocs[thread_pseudodocs['n_comments'] > 0].copy()

print(f"  After: {len(thread_pseudodocs):,}")
print(f"  Removed: {len(df_submissions) - len(thread_pseudodocs):,} submissions without replies")

# Compute text statistics
thread_pseudodocs['pseudodoc_length'] = thread_pseudodocs['pseudodoc_text'].str.len()
thread_pseudodocs['pseudodoc_tokens_approx'] = thread_pseudodocs['pseudodoc_text'].str.split().str.len()
thread_pseudodocs['title_length'] = thread_pseudodocs['title'].str.len()
thread_pseudodocs['selftext_length'] = thread_pseudodocs['selftext'].str.len()

# Select and order columns
final_cols = [
    'submission_id',
    'title',
    'selftext',
    'n_comments',
    'pseudodoc_text',
    'pseudodoc_length',
    'pseudodoc_tokens_approx',
    'title_length',
    'selftext_length',
    'created_utc',
    'subreddit',
    'subreddit_id'
]

thread_pseudodocs = thread_pseudodocs[final_cols]

print(f"\n✓ Final thread pseudo-documents: {len(thread_pseudodocs):,}")
print(f"Columns: {thread_pseudodocs.columns.tolist()}")


Filtering to submissions with >= 1 comment...
  Before: 923,431
  After: 472,803
  Removed: 450,628 submissions without replies

✓ Final thread pseudo-documents: 472,803
Columns: ['submission_id', 'title', 'selftext', 'n_comments', 'pseudodoc_text', 'pseudodoc_length', 'pseudodoc_tokens_approx', 'title_length', 'selftext_length', 'created_utc', 'subreddit', 'subreddit_id']


## 3. Quality Checks

In [89]:
# Thread statistics
print("=" * 80)
print("THREAD PSEUDO-DOCUMENT STATISTICS")
print("=" * 80)

print(f"\nTotal threads with comments: {len(thread_pseudodocs):,}")
print(f"Total submissions: {len(df_submissions):,}")
print(f"Total comments: {len(df_comments):,}")

print(f"\n{'Metric':<30} {'Mean':>12} {'Median':>12} {'Min':>12} {'Max':>12}")
print("-" * 80)

THREAD PSEUDO-DOCUMENT STATISTICS

Total threads with comments: 472,803
Total submissions: 923,431
Total comments: 8,587,661

Metric                                 Mean       Median          Min          Max
--------------------------------------------------------------------------------


In [90]:
# Comments per thread
stats = thread_pseudodocs['n_comments'].describe()
print(f"{'Comments/thread':<30} {stats['mean']:>12.1f} {stats['50%']:>12.0f} {stats['min']:>12.0f} {stats['max']:>12.0f}")

# Pseudo-document length
stats = thread_pseudodocs['pseudodoc_length'].describe()
print(f"{'Pseudodoc length (chars)':<30} {stats['mean']:>12.0f} {stats['50%']:>12.0f} {stats['min']:>12.0f} {stats['max']:>12.0f}")

# Tokens (approx)
stats = thread_pseudodocs['pseudodoc_tokens_approx'].describe()
print(f"{'Pseudodoc tokens (approx)':<30} {stats['mean']:>12.0f} {stats['50%']:>12.0f} {stats['min']:>12.0f} {stats['max']:>12.0f}")

# Subreddit distribution
print(f"\nUnique subreddits: {thread_pseudodocs['subreddit'].nunique():,}")
print(f"Top 5 subreddits:")
for sub, count in thread_pseudodocs['subreddit'].value_counts().head(5).items():
    print(f"  {sub}: {count:,}")

Comments/thread                        18.2            3            1        24157
Pseudodoc length (chars)               4036          686           14      3908716
Pseudodoc tokens (approx)               665          100            2       661263

Unique subreddits: 354
Top 5 subreddits:
  The_Donald: 212,355
  politics: 61,121
  EnoughTrumpSpam: 24,646
  willis7737_news: 17,614
  hillaryclinton: 12,124


In [91]:
# Sample pseudo-documents
print("\n" + "=" * 80)
print("SAMPLE PSEUDO-DOCUMENTS")
print("=" * 80)

for i in [0, len(thread_pseudodocs)//2, -1]:
    row = thread_pseudodocs.iloc[i]
    print(f"\nSubmission {row['submission_id']}:")
    print(f"  Title: {row['title'][:100]}")
    print(f"  Comments: {row['n_comments']}")
    print(f"  Subreddit: {row['subreddit']}")
    print(f"  Length: {row['pseudodoc_length']:,} chars, ~{row['pseudodoc_tokens_approx']:,} tokens")
    print(f"  Text preview: {row['pseudodoc_text'][:1500]}...")
    print("-" * 80)


SAMPLE PSEUDO-DOCUMENTS

Submission 50kc92:
  Title: When is the speech?
  Comments: 1
  Subreddit: The_Donald
  Length: 62 chars, ~14 tokens
  Text preview: When is the speech?  I thought it was at 7 pm? like right now!...
--------------------------------------------------------------------------------

Submission 567qm2:
  Title: No Hurricane is stopping my support!!
  Comments: 1
  Subreddit: The_Donald
  Length: 70 chars, ~11 tokens
  Text preview: No Hurricane is stopping my support!! [deleted] #KEK BE SAFE CENTIPEDE...
--------------------------------------------------------------------------------

Submission 5afnem:
  Title: Trump's childhood Halloween costume
  Comments: 1
  Subreddit: EnoughTrumpSpam
  Length: 601 chars, ~26 tokens
  Text preview: Trump's childhood Halloween costume [deleted] 

Snapshots:

1. *This Post* - [Error](https://archive.is/?run=1&amp;url=http%3A%2F%2Fwww.southpark.com.br%2Fwp-content%2Fuploads%2F2011%2F11%2Feric-cartman-hitler.jpg "error auto-archi

## 4. Create Comment-Submission Mapping

Map all comments to their submissions for stance detection.

In [92]:
# Create comment-submission mapping for stance detection
# Only include comments from threads that have discussion (n_comments > 0)

print("Creating comment-submission mapping...")

valid_submissions = set(thread_pseudodocs['submission_id'].values)
print(f"Valid submissions (with >= 1 comment): {len(valid_submissions):,}")

# Filter comments to only those with valid submissions
df_comments_filtered = df_comments[df_comments['submission_id'].isin(valid_submissions)].copy()
print(f"Comments from valid submissions: {len(df_comments_filtered):,}")

# Merge with submission info
submission_info = thread_pseudodocs[['submission_id', 'title', 'selftext', 'subreddit', 'subreddit_id']].copy()

comment_map = df_comments_filtered.merge(submission_info, on='submission_id', how='left', suffixes=('_comment', '_submission'))

# Rename for clarity
comment_map = comment_map.rename(columns={
    'body': 'comment_body',
    'created_utc': 'comment_created_utc',
    'subreddit_comment': 'subreddit',  # Use comment's subreddit (should match submission's)
    'subreddit_id_comment': 'subreddit_id'
})

# Select final columns
final_cols = [
    'comment_id',
    'submission_id',
    'comment_body',
    'comment_created_utc',
    'subreddit',
    'subreddit_id',
    'title',
    'selftext'
]

comment_map = comment_map[final_cols]

print(f"\n✓ Created mapping for {len(comment_map):,} comments")
print(f"Columns: {comment_map.columns.tolist()}")

Creating comment-submission mapping...
Valid submissions (with >= 1 comment): 472,803
Comments from valid submissions: 8,587,661

✓ Created mapping for 8,587,661 comments
Columns: ['comment_id', 'submission_id', 'comment_body', 'comment_created_utc', 'subreddit', 'subreddit_id', 'title', 'selftext']


In [93]:
# Create thread metadata
print("\nCreating thread metadata...")

thread_metadata = thread_pseudodocs[['submission_id', 'title', 'n_comments', 'created_utc', 'subreddit']].copy()

print(f"✓ Created metadata for {len(thread_metadata):,} threads")
print(f"Columns: {thread_metadata.columns.tolist()}")


Creating thread metadata...
✓ Created metadata for 472,803 threads
Columns: ['submission_id', 'title', 'n_comments', 'created_utc', 'subreddit']


In [94]:
# Verify data quality
print("\n" + "=" * 60)
print("DATA QUALITY VERIFICATION")
print("=" * 60)

# Check that submissions and replies are properly separated
print(f"\nComment breakdown:")
print(f"  Total comments: {len(comment_map):,}")
print(f"  Submissions (comment_id == submission_id): {(comment_map['comment_id'] == comment_map['submission_id']).sum():,}")
print(f"  Replies (comment_id != submission_id): {(comment_map['comment_id'] != comment_map['submission_id']).sum():,}")

# Sample a thread to verify structure
sample_thread = thread_pseudodocs.iloc[1000]
print(f"\nSample thread verification (submission_id: {sample_thread['submission_id']}):")
print(f"  n_comments from metadata: {sample_thread['n_comments']}")

print("\n")
# Get all comments for this thread
thread_comments = comment_map[comment_map['submission_id'] == sample_thread['submission_id']].sort_values('comment_created_utc')
print(f"  Total comments in map: {len(thread_comments)}")

# Show first few comments to verify structure
print("\n")
print(f"  Submission title: {sample_thread['title']}")
print(f"  Submission selftext: {sample_thread['selftext'][:200]}...")
print(f"Comments for submission_id {sample_thread['submission_id']}:")
print(f"\nFirst 3 comments from this thread:")
for idx, row in thread_comments.head(3).iterrows():
    print(f"  Comment {row['comment_id']}: {row['comment_body'][:80]}...")


DATA QUALITY VERIFICATION

Comment breakdown:
  Total comments: 8,587,661
  Submissions (comment_id == submission_id): 0
  Replies (comment_id != submission_id): 8,587,661

Sample thread verification (submission_id: 50l8nh):
  n_comments from metadata: 3


  Total comments in map: 3


  Submission title: Broward prosecutors reviewing elections office posting results early
  Submission selftext: ...
Comments for submission_id 50l8nh:

First 3 comments from this thread:
  Comment d74z8bv: When Broward County posted election results online before the polls closed Tuesd...
  Comment d75tpta: "Intent gross negligence"

Further solidifying the Hillary defense. Next time i ...
  Comment d75ttux: Make sure you are wearing a crown....


## 5. Save Outputs

Save three key outputs:
1. **thread_pseudodocs.parquet**: Thread-level pseudo-documents (submission + all comments) for topic modeling
2. **thread_metadata.parquet**: Thread-level statistics (submission_id, title, n_comments, created_utc, subreddit)
3. **comment_thread_map.parquet**: All comments with full context for stance detection

In [95]:
# Save thread pseudo-documents
print("Saving thread pseudo-documents...")
thread_output = output_path / 'thread_pseudodocs.parquet'
write_parquet(thread_pseudodocs, thread_output)
print(f"✓ Saved {len(thread_pseudodocs):,} thread pseudo-documents")
print(f"  Location: {thread_output}")

# Save thread metadata
print("\nSaving thread metadata...")
metadata_output = output_path / 'thread_metadata.parquet'
write_parquet(thread_metadata, metadata_output)
print(f"✓ Saved {len(thread_metadata):,} thread metadata records")
print(f"  Location: {metadata_output}")

Saving thread pseudo-documents...
✓ Wrote 472,803 rows to thread_pseudodocs.parquet (1150.6 MB)
✓ Saved 472,803 thread pseudo-documents
  Location: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/thread_pseudodocs.parquet

Saving thread metadata...
✓ Wrote 472,803 rows to thread_metadata.parquet (30.3 MB)
✓ Saved 472,803 thread metadata records
  Location: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/thread_metadata.parquet


In [96]:
# Save comment-thread mapping with full context
print("\nSaving comment-thread mapping...")
comment_output = output_path / 'comment_thread_map.parquet'
write_parquet(comment_map, comment_output)
print(f"✓ Saved {len(comment_map):,} comments (submissions + replies)")
print(f"  Location: {comment_output}")


Saving comment-thread mapping...
✓ Wrote 8,587,661 rows to comment_thread_map.parquet (2387.7 MB)
✓ Saved 8,587,661 comments (submissions + replies)
  Location: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/comment_thread_map.parquet


## 6. Summary

Generate metadata for this processing run.

In [99]:
# Create summary metadata
summary = {
    'notebook': '14_reddit_corpus_prep_topics',
    'timestamp': datetime.now().isoformat(),
    'inputs': {
        'gold_submissions': str(gold_submissions),
        'gold_comments': str(gold_comments),
        'months': months,
        'n_submissions_raw': len(df_submissions),
        'n_comments_raw': len(df_comments)
    },
    'outputs': {
        'thread_pseudodocs': str(thread_output),
        'comment_thread_map': str(comment_output),
        'thread_metadata': str(metadata_output),
        'n_threads': len(thread_pseudodocs),
        'n_comments_in_map': len(comment_map)
    },
    'statistics': {
        'comments_per_thread': {
            'mean': float(thread_pseudodocs['n_comments'].mean()),
            'median': float(thread_pseudodocs['n_comments'].median()),
            'min': int(thread_pseudodocs['n_comments'].min()),
            'max': int(thread_pseudodocs['n_comments'].max())
        },
        'pseudodoc_length_chars': {
            'mean': float(thread_pseudodocs['pseudodoc_length'].mean()),
            'median': float(thread_pseudodocs['pseudodoc_length'].median()),
            'min': int(thread_pseudodocs['pseudodoc_length'].min()),
            'max': int(thread_pseudodocs['pseudodoc_length'].max())
        },
        'pseudodoc_tokens_approx': {
            'mean': float(thread_pseudodocs['pseudodoc_tokens_approx'].mean()),
            'median': float(thread_pseudodocs['pseudodoc_tokens_approx'].median())
        }
    }
}

# Save summary to file
summary_file = output_path / 'run_metadata.json'
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

# Print summary
print("\n" + "=" * 60)
print("RUN SUMMARY")
print("=" * 60)
print(f"\nNotebook: {summary['notebook']}")
print(f"  Gold submissions: {summary['inputs']['gold_submissions']}")
print(f"  Gold comments: {summary['inputs']['gold_comments']}")
print(f"  Months: {summary['inputs']['months']}")
print(f"  Submissions (raw): {summary['inputs']['n_submissions_raw']:,}")
print(f"  Comments (raw): {summary['inputs']['n_comments_raw']:,}")
print(f"\nOutputs:")
print(f"  Thread pseudo-documents: {summary['outputs']['n_threads']:,}")
print(f"  Comments in mapping: {summary['outputs']['n_comments_in_map']:,}")

print(f"\nSummary saved to: {summary_file}")


RUN SUMMARY

Notebook: 14_reddit_corpus_prep_topics
  Gold submissions: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit/submissions
  Gold comments: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit/comments
  Months: ['2016-09', '2016-10']
  Submissions (raw): 923,431
  Comments (raw): 8,587,661

Outputs:
  Thread pseudo-documents: 472,803
  Comments in mapping: 8,587,661

Summary saved to: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/run_metadata.json
