In [9]:
# Environment setup
import sys
from pathlib import Path

# The current working directory is treated as the project root, so the
# notebook must be started from there for the `src` lookup below to resolve.
workspace_root = Path.cwd()

# Add src to path so project-local packages can be imported.
# Guarded so that re-running this cell does not stack duplicate entries
# on sys.path.
src_dir = str(workspace_root / 'src')
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

print(f"Project root: {workspace_root}")
print(f"Python version: {sys.version}")
print("✓ Environment configured")

Project root: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment
Python version: 3.12.0 (v3.12.0:0fb18b02c8, Oct  2 2023, 09:45:56) [Clang 13.0.0 (clang-1300.0.29.30)]
✓ Environment configured


In [10]:
# Imports
import pandas as pd
import polars as pl  # NOTE(review): `pl` is not referenced in the cells visible here — confirm it is still needed
from datetime import datetime
import json

# Thesis pipeline utilities
# (project-local package; presumably resolved through the `src` directory
# that the environment-setup cell puts on sys.path)
from thesis_pipeline.io.paths import get_data_path
from thesis_pipeline.io.parquet import read_parquet, write_parquet

# Reaching this line means every import above resolved without error
print("✓ All imports successful")

✓ All imports successful


## 1. Load Reddit Gold Layer

Load the thread-structured comments from notebook 12 output.

In [11]:
# Paths
# Input: Reddit gold layer. Output: Reddit QA directory
# (create=True presumably creates the directory when missing — see get_data_path).
gold_path = get_data_path('gold', 'reddit')
output_path = get_data_path('qa', 'reddit', create=True)

# Echo both locations so the run log records where data is read and written
for label, location in (("Gold layer", gold_path), ("Output", output_path)):
    print(f"{label}: {location}")

Gold layer: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit
Output: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit


In [13]:
# Load gold layer (thread-structured comments)
# Gold layer has one parquet file per day: 2016-09-01.parquet, 2016-09-02.parquet, etc.

print("Loading Reddit gold layer...")

# Get all parquet files in gold directory; sorting the paths keeps the load
# order deterministic (lexicographic by path).
gold_files = sorted(gold_path.glob('*.parquet'))
print(f"Found {len(gold_files)} parquet files")

# Fail early with an actionable message: pd.concat raises an opaque
# "No objects to concatenate" ValueError when given an empty list.
if not gold_files:
    raise FileNotFoundError(f"No parquet files found in gold layer directory: {gold_path}")

# Read each daily file, then concatenate once at the end
dfs = []
for file in gold_files:
    df_day = read_parquet(file)
    dfs.append(df_day)
    print(f"  Loaded {file.name}: {len(df_day):,} comments")

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

print(f"\nTotal loaded: {len(df):,} comments")
print(f"Columns: {df.columns.tolist()}")
print(f"\nSample:")
df.head(3)

Loading Reddit gold layer...
Found 61 parquet files
  Loaded 2016-09-01.parquet: 109,184 comments
  Loaded 2016-09-02.parquet: 98,775 comments
  Loaded 2016-09-03.parquet: 74,731 comments
  Loaded 2016-09-04.parquet: 80,990 comments
  Loaded 2016-09-05.parquet: 85,066 comments
  Loaded 2016-09-06.parquet: 110,391 comments
  Loaded 2016-09-07.parquet: 105,031 comments
  Loaded 2016-09-08.parquet: 123,756 comments
  Loaded 2016-09-09.parquet: 98,605 comments
  Loaded 2016-09-10.parquet: 87,648 comments
  Loaded 2016-09-11.parquet: 138,072 comments
  Loaded 2016-09-12.parquet: 153,446 comments
  Loaded 2016-09-13.parquet: 142,864 comments
  Loaded 2016-09-14.parquet: 137,999 comments
  Loaded 2016-09-15.parquet: 132,073 comments
  Loaded 2016-09-16.parquet: 143,633 comments
  Loaded 2016-09-17.parquet: 111,225 comments
  Loaded 2016-09-18.parquet: 110,791 comments
  Loaded 2016-09-19.parquet: 148,111 comments
  Loaded 2016-09-20.parquet: 154,930 comments
  Loaded 2016-09-21.parquet: 138,4

Unnamed: 0,date,created_utc,comment_id,thread_id,is_top_level,author,subreddit,subreddit_id,body,cleaned_body,score,parent_id
0,2016-09-01,1472688001,d74qmz3,50grgt,False,krb7H,politics,t5_2cneq,trump seems to be gaining supporters at an inc...,,3,t1_d74ft4z
1,2016-09-01,1472688001,d74qmze,50juq5,True,OQcjv,politics,t5_2cneq,Hi `alictrmods`. Thank you for participating i...,,1,t3_50juq5
2,2016-09-01,1472688002,d74qn03,50kabh,True,mQu7y,The_Donald,t5_38unr,The Mistakes of the Obama...,,1,t3_50kabh


In [16]:
# Data validation
print("Data validation:")
print(f"  Total comments: {len(df):,}")
print(f"  Unique threads (thread_id): {df['thread_id'].nunique():,}")

# Convert Unix timestamps to datetime for readable display.
# Reduce the raw epoch-second column to its min/max first and convert only
# those two scalars, rather than converting the full column twice.
date_min = pd.to_datetime(df['created_utc'].min(), unit='s')
date_max = pd.to_datetime(df['created_utc'].max(), unit='s')
print(f"  Date range: {date_min.date()} to {date_max.date()}")

# Null counts for the columns every later cell relies on
for column in ('thread_id', 'body'):
    print(f"  Missing {column}: {df[column].isna().sum():,}")

# Check for submission metadata (reported only when the columns are present)
for column in ('submission_title', 'submission_selftext'):
    if column in df.columns:
        print(f"  Missing {column}: {df[column].isna().sum():,}")

Data validation:
  Total comments: 8,785,795
  Unique threads (thread_id): 510,756
  Date range: 2016-09-01 to 2016-10-31
  Missing thread_id: 0
  Missing body: 0


## 2. Separate Submissions and Comments

For each thread:
- Identify the submission (initial post): because the gold layer contains comments only, the chronologically first comment per thread_id is used as a stand-in for the initial post
- Separate regular comments that reply to the submission
- This is crucial for providing context during stance detection

In [43]:
# Ensure required columns exist
required_cols = ['thread_id', 'body', 'created_utc']
missing = list(filter(lambda name: name not in df.columns, required_cols))
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Clean text: nulls become empty strings, everything is cast to str and
# surrounding whitespace is stripped. `assign` returns a new frame, so `df`
# itself is left untouched.
df_clean = df.assign(body=df['body'].fillna('').astype(str).str.strip())

# Filter out empty bodies
df_clean = df_clean.loc[df_clean['body'].ne('')]

print(f"After filtering empty bodies: {len(df_clean):,} rows")
print(f"Columns: {df_clean.columns.tolist()}")

After filtering empty bodies: 8,785,795 rows
Columns: ['date', 'created_utc', 'comment_id', 'thread_id', 'is_top_level', 'author', 'subreddit', 'subreddit_id', 'body', 'cleaned_body', 'score', 'parent_id']


In [44]:
# Identify submissions (first post per thread chronologically)
# NOTE(review): the row picked here is the earliest *comment* present for the
# thread, so it may itself be a reply rather than the original post — confirm
# this proxy is acceptable for downstream stance detection.
print("Identifying submission posts...")

# Index label of the earliest row per thread (idxmin keeps the first
# occurrence when timestamps tie)
first_row_per_thread = df_clean.groupby('thread_id')['created_utc'].idxmin()

# Decide up front which columns to rename and keep; comment_id is optional
rename_map = {'body': 'submission_body', 'created_utc': 'submission_time'}
submission_cols = ['thread_id', 'submission_body', 'submission_time']
if 'comment_id' in df_clean.columns:
    rename_map['comment_id'] = 'submission_id'
    submission_cols.insert(1, 'submission_id')

# Extract, rename and trim in one pass
submissions = (
    df_clean.loc[first_row_per_thread]
    .rename(columns=rename_map)[submission_cols]
    .reset_index(drop=True)
)

print(f"Identified {len(submissions):,} submission posts")
print(f"\nSubmissions columns: {submissions.columns.tolist()}")
submissions.head(3)

Identifying submission posts...
Identified 510,756 submission posts

Submissions columns: ['thread_id', 'submission_id', 'submission_body', 'submission_time']


Unnamed: 0,thread_id,submission_id,submission_body,submission_time
0,1nzpxk,d81de5w,Your submission was automatically removed.\n\n...,1474817641
1,3la1rr,d90ndc8,Your submission was automatically removed.\n\n...,1476987946
2,3o62wk,d7sjcpt,[A reminder for everyone](https://www.reddit.c...,1474246699


In [33]:
# Compute thread-level statistics
print("Computing thread statistics...")

# One row per thread. Named aggregation yields flat, final column names
# directly, so no manual flattening of a column MultiIndex is needed.
thread_stats = (
    df_clean.groupby('thread_id')
    .agg(
        thread_start=('created_utc', 'min'),
        thread_end=('created_utc', 'max'),
        # Row count per thread, including the row treated as the submission
        n_total=('created_utc', 'count'),
        # Concatenate ALL bodies (submission + comments) for topic modeling
        all_text=('body', lambda texts: ' '.join(texts)),
    )
    .reset_index()
    # n_comments = n_total - 1 (excluding submission)
    .assign(n_comments=lambda stats_frame: stats_frame['n_total'] - 1)
)

print(f"Computed stats for {len(thread_stats):,} threads")
print(f"\nThread stats columns: {thread_stats.columns.tolist()}")
thread_stats.head(3)

Computing thread statistics...
Computed stats for 510,756 threads

Thread stats columns: ['thread_id', 'thread_start', 'thread_end', 'n_total', 'all_text', 'n_comments']


Unnamed: 0,thread_id,thread_start,thread_end,n_total,all_text,n_comments
0,1nzpxk,1474817641,1474817641,1,Your submission was automatically removed.\n\n...,0
1,3la1rr,1476987946,1476987946,1,Your submission was automatically removed.\n\n...,0
2,3o62wk,1474246699,1474246699,1,[A reminder for everyone](https://www.reddit.c...,0


In [45]:
# Create thread pseudo-documents by merging submissions + stats
print("Creating thread pseudo-documents...")

# Built as one chain from thread_stats, so re-running the cell always starts
# from the same input:
#   1. attach submission_body (kept separate for later use in stance detection)
#   2. expose the concatenated text as pseudodoc_text and derive its
#      character length and whitespace-token count
#   3. drop the temporary all_text / n_total columns
thread_pseudodocs = (
    thread_stats
    .merge(submissions[['thread_id', 'submission_body']], on='thread_id', how='left')
    .assign(
        pseudodoc_text=lambda frame: frame['all_text'],
        pseudodoc_length=lambda frame: frame['pseudodoc_text'].str.len(),
        pseudodoc_tokens_approx=lambda frame: frame['pseudodoc_text'].str.split().str.len(),
    )
    .drop(columns=['all_text', 'n_total'])
)

n_before = len(thread_pseudodocs)
print(f"Created {n_before:,} thread pseudo-documents")

# Filter out threads with 0 comments (no interaction = no polarization)
print(f"\nFiltering threads with 0 comments (no replies)...")
print(f"  Before: {n_before:,} threads")
has_replies = thread_pseudodocs['n_comments'].gt(0)
thread_pseudodocs = thread_pseudodocs.loc[has_replies].copy()
print(f"  After: {len(thread_pseudodocs):,} threads")
print(f"  Removed: {len(thread_stats) - len(thread_pseudodocs):,} threads with only submissions")

print(f"\n✓ Final thread pseudo-documents: {len(thread_pseudodocs):,}")
print(f"Columns: {thread_pseudodocs.columns.tolist()}")
thread_pseudodocs.head(3)

Creating thread pseudo-documents...
Created 510,756 thread pseudo-documents

Filtering threads with 0 comments (no replies)...
  Before: 510,756 threads
  After: 341,692 threads
  Removed: 169,064 threads with only submissions

✓ Final thread pseudo-documents: 341,692
Columns: ['thread_id', 'thread_start', 'thread_end', 'n_comments', 'submission_body', 'pseudodoc_text', 'pseudodoc_length', 'pseudodoc_tokens_approx']


Unnamed: 0,thread_id,thread_start,thread_end,n_comments,submission_body,pseudodoc_text,pseudodoc_length,pseudodoc_tokens_approx
7,45fp0h,1477636970,1477637393,2,Your post has been removed because /r/Grassroo...,Your post has been removed because /r/Grassroo...,1328,129
9,468nsf,1477638288,1477638396,2,Your post has been removed because /r/Grassroo...,Your post has been removed because /r/Grassroo...,1328,129
11,46vcvs,1474397005,1474397012,2,Your post has been removed because /r/Grassroo...,Your post has been removed because /r/Grassroo...,1328,129


## 3. Quality Checks

In [55]:
# Thread statistics
banner = "=" * 60
print(banner)
print("THREAD STATISTICS")
print(banner)

print(f"\nTotal threads: {len(thread_pseudodocs):,}")
print(f"Total submissions: {len(submissions):,}")

print(f"\n{'Metric':<30} {'Mean':>10} {'Median':>10} {'Min':>10} {'Max':>10}")
print("-" * 70)

# (row label, source column, decimals shown for the mean)
metric_rows = [
    ('Comments per thread', 'n_comments', 1),          # excluding submission
    ('Pseudo-doc characters', 'pseudodoc_length', 0),
    ('Pseudo-doc tokens (approx)', 'pseudodoc_tokens_approx', 0),
]
for label, column, mean_decimals in metric_rows:
    stats = thread_pseudodocs[column].describe()
    print(
        f"{label:<30} {stats['mean']:>10.{mean_decimals}f} {stats['50%']:>10.0f} "
        f"{stats['min']:>10.0f} {stats['max']:>10.0f}"
    )

THREAD STATISTICS

Total threads: 341,692
Total submissions: 510,756

Metric                               Mean     Median        Min        Max
----------------------------------------------------------------------
Comments per thread                  24.2          4          1      24156
Pseudo-doc characters                5472       1059         18    3883085
Pseudo-doc tokens (approx)            907        166          2     660136


In [56]:
# Temporal distribution of submissions
print(f"\n{'=' * 60}")
print("TEMPORAL DISTRIBUTION (SUBMISSIONS)")
print("=" * 60)

# Derive a calendar date from the epoch-second submission time; `assign`
# returns a new frame, so `submissions` is not modified.
submission_dates = pd.to_datetime(submissions['submission_time'], unit='s').dt.date
submissions_temporal = submissions.assign(submission_date=submission_dates)

# Daily counts
daily_counts = submissions_temporal.groupby('submission_date').size()

first_day = daily_counts.index.min()
last_day = daily_counts.index.max()
print(f"\nSubmission dates: {first_day} to {last_day}")
print(f"Mean submissions/day: {daily_counts.mean():.0f}")
print(f"Median submissions/day: {daily_counts.median():.0f}")
print(f"\nDays with most activity:")
print(daily_counts.nlargest(5))


TEMPORAL DISTRIBUTION (SUBMISSIONS)

Submission dates: 2016-09-01 to 2016-10-31
Mean submissions/day: 8373
Median submissions/day: 7874

Days with most activity:
submission_date
2016-10-10    13564
2016-10-20    13523
2016-10-28    12699
2016-10-18    12133
2016-10-31    12075
dtype: int64


In [57]:
# Sample pseudo-documents
rule = "=" * 60
print("\n" + rule)
print("SAMPLE PSEUDO-DOCUMENTS")
print(rule)

# Inspect the first, middle and last thread (positional, not by label)
sample_positions = (0, len(thread_pseudodocs) // 2, -1)
for position in sample_positions:
    sample = thread_pseudodocs.iloc[position]
    char_count = sample['pseudodoc_length']
    token_count = sample['pseudodoc_tokens_approx']
    print(f"\nThread {sample['thread_id']}:")
    print(f"  Comments: {sample['n_comments']}")
    print(f"  Length: {char_count:,} chars, ~{token_count:,} tokens")
    # Previews are truncated to keep the output readable
    print(f"  Submission preview: {sample['submission_body'][:150]}...")
    print(f"  Full text preview: {sample['pseudodoc_text'][:200]}...")
    print("-" * 60)


SAMPLE PSEUDO-DOCUMENTS

Thread 45fp0h:
  Comments: 2
  Length: 1,328 chars, ~129 tokens
  Submission preview: Your post has been removed because /r/GrassrootsSelect has offically moved to /r/Political_Revolution. You can read the announcement post [here](https...
  Full text preview: Your post has been removed because /r/GrassrootsSelect has offically moved to /r/Political_Revolution. You can read the announcement post [here](https://www.reddit.com/r/GrassrootsSelect/comments/4rjj...
------------------------------------------------------------

Thread 566soa:
  Comments: 12
  Length: 1,232 chars, ~206 tokens
  Submission preview: Ok I'll also donate $5...
  Full text preview: Ok I'll also donate $5 Ill donate my brothers kidney. I donated my lake house I knocked the chicken nuggets out of my mom's hands and into Bernie's Get a sticker! And new pins! Sounds like someone's g...
------------------------------------------------------------

Thread 5afyzw:
  Comments: 1
  Length: 1,782 ch

## 4. Create Comment and Thread Metadata

Create two tables:
1. **comment_thread_map**: All comments (submissions + replies) with full context for stance detection

2. **thread_metadata**: Thread-level statistics

Only include comments from threads with n_comments > 0 (threads with actual interaction).


In [51]:
# Create comprehensive comment-to-thread mapping
print("Creating comment-to-thread mapping...")

# Get threads that have interaction (n_comments > 0)
valid_threads = set(thread_pseudodocs['thread_id'].values)
print(f"Valid threads (n_comments > 0): {len(valid_threads):,}")

# Filter df_clean to only include comments from valid threads
df_filtered = df_clean[df_clean['thread_id'].isin(valid_threads)].copy()
print(f"Comments from valid threads: {len(df_filtered):,}")

# Select all available columns (anything absent from the frame is skipped)
comment_cols = ['comment_id', 'thread_id', 'created_utc', 'body', 'is_top_level',
                'author', 'subreddit', 'subreddit_id', 'parent_id']
available_cols = [col for col in comment_cols if col in df_filtered.columns]

comment_map = df_filtered[available_cols].rename(columns={'body': 'comment_body'})

# Merge with submission information.
# validate='many_to_one' makes pandas raise if `submissions` ever contains a
# repeated thread_id, instead of silently duplicating comment rows.
submission_info = submissions[['thread_id', 'submission_id', 'submission_body', 'submission_time']]
comment_map = comment_map.merge(
    submission_info, on='thread_id', how='left', validate='many_to_one'
)

# Reorder columns for better readability
col_order = ['comment_id', 'comment_body', 'created_utc', 'subreddit_id', 'subreddit',
             'author', 'parent_id', 'thread_id', 'is_top_level', 'submission_id',
             'submission_body', 'submission_time']
final_cols = [col for col in col_order if col in comment_map.columns]
comment_map = comment_map[final_cols]

# Membership mask computed once and reused for both counts
is_submission = comment_map['comment_id'].isin(submissions['submission_id'])
n_submission_rows = int(is_submission.sum())

print(f"✓ Created mapping for {len(comment_map):,} comments (submissions + replies)")
print(f"  Submissions: {n_submission_rows:,}")
print(f"  Replies: {len(comment_map) - n_submission_rows:,}")
print(f"\nColumns: {comment_map.columns.tolist()}")
comment_map.head(3)

Creating comment-to-thread mapping...
Valid threads (n_comments > 0): 341,692
Comments from valid threads: 8,616,731
✓ Created mapping for 8,616,731 comments (submissions + replies)
  Submissions: 341,692
  Replies: 8,275,039

Columns: ['comment_id', 'comment_body', 'created_utc', 'subreddit_id', 'subreddit', 'author', 'parent_id', 'thread_id', 'is_top_level', 'submission_id', 'submission_body', 'submission_time']


Unnamed: 0,comment_id,comment_body,created_utc,subreddit_id,subreddit,author,parent_id,thread_id,is_top_level,submission_id,submission_body,submission_time
0,d74qmz3,trump seems to be gaining supporters at an inc...,1472688001,t5_2cneq,politics,krb7H,t1_d74ft4z,50grgt,False,d74qmz3,trump seems to be gaining supporters at an inc...,1472688001
1,d74qmze,Hi `alictrmods`. Thank you for participating i...,1472688001,t5_2cneq,politics,OQcjv,t3_50juq5,50juq5,True,d74qmze,Hi `alictrmods`. Thank you for participating i...,1472688001
2,d74qn03,The Mistakes of the Obama...,1472688002,t5_38unr,The_Donald,mQu7y,t3_50kabh,50kabh,True,d74qn03,The Mistakes of the Obama...,1472688002


In [52]:
# Create thread metadata table
print("\nCreating thread metadata...")

# Thread-level columns only; the text columns stay in thread_pseudodocs.
# The copy keeps the new table independent of its source frame.
metadata_cols = ['thread_id', 'thread_start', 'thread_end', 'n_comments']
thread_metadata = thread_pseudodocs.loc[:, metadata_cols].copy()

print(f"✓ Created metadata for {len(thread_metadata):,} threads")
print(f"\nColumns: {thread_metadata.columns.tolist()}")
thread_metadata.head(3)


Creating thread metadata...
✓ Created metadata for 341,692 threads

Columns: ['thread_id', 'thread_start', 'thread_end', 'n_comments']


Unnamed: 0,thread_id,thread_start,thread_end,n_comments
7,45fp0h,1477636970,1477637393,2
9,468nsf,1477638288,1477638396,2
11,46vcvs,1474397005,1474397012,2


In [60]:
# Verify data quality
SAMPLE_THREAD_POSITION = 1000  # positional index of the thread to spot-check
PREVIEW_CHARS = 150            # max characters shown per text preview

print("\n" + "=" * 60)
print("DATA QUALITY VERIFICATION")
print("=" * 60)

# Check that submissions and replies are properly separated.
# A row counts as the submission when its comment_id equals the submission_id
# attached to its thread; the mask is computed once and reused.
is_submission_row = comment_map['comment_id'] == comment_map['submission_id']
print(f"\nComment breakdown:")
print(f"  Total comments: {len(comment_map):,}")
print(f"  Submissions (comment_id == submission_id): {is_submission_row.sum():,}")
print(f"  Replies (comment_id != submission_id): {(~is_submission_row).sum():,}")

# Sample a thread to verify structure
if thread_pseudodocs.empty:
    print("\nSample thread verification skipped: no threads available")
else:
    # Clamp the position so corpora with fewer threads do not raise IndexError
    sample_position = min(SAMPLE_THREAD_POSITION, len(thread_pseudodocs) - 1)
    sample_thread = thread_pseudodocs.iloc[sample_position]
    print(f"\nSample thread verification (thread_id: {sample_thread['thread_id']}):")
    print(f"  n_comments from metadata: {sample_thread['n_comments']}")

    # Get all comments for this thread, oldest first
    thread_comments = comment_map[comment_map['thread_id'] == sample_thread['thread_id']].sort_values('created_utc')
    sample_is_submission = thread_comments['comment_id'] == thread_comments['submission_id']
    print(f"  Total comments in map: {len(thread_comments)}")
    print(f"  Submission: {sample_is_submission.sum()}")
    print(f"  Replies: {(~sample_is_submission).sum()}")

    # Show first few comments to verify structure; bodies are truncated so the
    # trailing "..." really marks an abbreviated preview
    print(f"\nFirst 3 comments from this thread:")
    for _, row in thread_comments.head(3).iterrows():
        is_submission = row['comment_id'] == row['submission_id']
        tag = '[SUBMISSION]' if is_submission else '[REPLY]'
        # str() guards against a missing submission_body after the left merge
        print(f"  {tag} {row['comment_id']}: {str(row['comment_body'])[:PREVIEW_CHARS]}...")
        print(f"    Submission context: {str(row['submission_body'])[:PREVIEW_CHARS]}...")


DATA QUALITY VERIFICATION

Comment breakdown:
  Total comments: 8,616,731
  Submissions (comment_id == submission_id): 341,692
  Replies (comment_id != submission_id): 8,275,039

Sample thread verification (thread_id: 4yfa7i):
  n_comments from metadata: 1
  Total comments in map: 2
  Submission: 1
  Replies: 1

First 3 comments from this thread:
  [SUBMISSION] d76wu8m: But you're talking apples and oranges.

The Canadian banks discussed there are consumer deposit banks -- that's where the 7% capital reserve applies.

The banks in the US that got hit in the crisis were investment banks, for the most part. If you're looking at TD Bank, you should be comparing WF, Citi, Chase, BoA, etc. but when you say the "banks that crashed the economy", you're talking about Lehman, Bear, Goldman, etc. They are completely different types of institutions.

We don't have nearly as many investment banks, but regulation is very limited....
    Submission context: But you're talking apples and oranges.

T

## 5. Save Outputs

Save three key outputs:
1. **thread_pseudodocs.parquet**: Thread-level pseudo-documents (submission + all comments) for topic modeling
2. **thread_metadata.parquet**: Thread-level statistics (thread_id, start, end, n_comments)
3. **comment_thread_map.parquet**: All comments with full context for stance detection

## 5. Save Outputs

Save three key outputs:
1. **thread_pseudodocs.parquet**: Thread-level pseudo-documents (submission + all comments) for topic modeling
2. **thread_metadata.parquet**: Thread-level statistics (thread_id, start, end, n_comments)
3. **comment_thread_map.parquet**: All comments with full context for stance detection

In [58]:
# Save thread pseudo-documents and thread metadata
thread_output = output_path / 'thread_pseudodocs.parquet'
metadata_output = output_path / 'thread_metadata.parquet'

# (heading printed first, frame to write, destination, noun for the log line)
save_jobs = (
    ("Saving thread pseudo-documents...", thread_pseudodocs, thread_output, "thread pseudo-documents"),
    ("\nSaving thread metadata...", thread_metadata, metadata_output, "thread metadata records"),
)
for heading, frame, destination, noun in save_jobs:
    print(heading)
    write_parquet(frame, destination)
    print(f"✓ Saved {len(frame):,} {noun}")
    print(f"  Location: {destination}")

Saving thread pseudo-documents...
✓ Wrote 341,692 rows to thread_pseudodocs.parquet (1091.7 MB)
✓ Saved 341,692 thread pseudo-documents
  Location: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/thread_pseudodocs.parquet

Saving thread metadata...
✓ Wrote 341,692 rows to thread_metadata.parquet (5.7 MB)
✓ Saved 341,692 thread metadata records
  Location: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/thread_metadata.parquet


In [59]:
# Save comment-thread mapping with full context
comment_output = output_path / 'comment_thread_map.parquet'

print("\nSaving comment-thread mapping...")
write_parquet(comment_map, comment_output)

# Row count reported for the log; the frame includes submissions and replies
n_saved = len(comment_map)
print(f"✓ Saved {n_saved:,} comments (submissions + replies)")
print(f"  Location: {comment_output}")


Saving comment-thread mapping...
✓ Wrote 8,616,731 rows to comment_thread_map.parquet (2147.3 MB)
✓ Saved 8,616,731 comments (submissions + replies)
  Location: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/comment_thread_map.parquet


## 6. Summary

Generate metadata for this processing run.

In [61]:
# Create summary metadata
def _describe_column(series, include_range=True):
    """Return JSON-serialisable summary statistics for a numeric column.

    Args:
        series: pandas Series of numeric values.
        include_range: when True, also report integer 'min' and 'max'.

    Returns:
        dict with 'mean' and 'median' as floats, plus 'min' and 'max' as ints
        when include_range is True.
    """
    column_stats = {
        'mean': float(series.mean()),
        'median': float(series.median()),
    }
    if include_range:
        column_stats['min'] = int(series.min())
        column_stats['max'] = int(series.max())
    return column_stats


summary = {
    'notebook': '14_reddit_corpus_prep_topics',
    'timestamp': datetime.now().isoformat(),
    'inputs': {
        'gold_layer': str(gold_path),
        'n_files': len(gold_files),
        'n_comments_raw': len(df)
    },
    'outputs': {
        'thread_pseudodocs': str(thread_output),
        # thread_metadata is written alongside the other outputs, so record it too
        'thread_metadata': str(metadata_output),
        'comment_thread_map': str(comment_output),
        'n_threads': len(thread_pseudodocs),
        # NOTE(review): `submissions` has one row per thread found in the
        # cleaned input, including threads later dropped for having no
        # replies, so this can exceed n_threads — confirm that is intended.
        'n_submissions': len(submissions),
        'n_comments_total': len(comment_map)
    },
    'statistics': {
        'comments_per_thread': _describe_column(thread_pseudodocs['n_comments']),
        'pseudodoc_length_chars': _describe_column(thread_pseudodocs['pseudodoc_length']),
        # Only mean/median are recorded for the approximate token count
        'pseudodoc_tokens_approx': _describe_column(
            thread_pseudodocs['pseudodoc_tokens_approx'], include_range=False
        )
    }
}

# Save summary to file (explicit encoding so the result does not depend on
# the platform default)
summary_file = output_path / 'run_metadata.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

# Print summary
print("\n" + "=" * 60)
print("RUN SUMMARY")
print("=" * 60)
print(f"\nNotebook: {summary['notebook']}")
print(f"Timestamp: {summary['timestamp']}")
print(f"\nInputs:")
print(f"  Gold layer: {summary['inputs']['gold_layer']}")
print(f"  Files: {summary['inputs']['n_files']}")
print(f"  Comments (raw): {summary['inputs']['n_comments_raw']:,}")
print(f"\nOutputs:")
print(f"  Threads: {summary['outputs']['n_threads']:,}")
print(f"  Submissions: {summary['outputs']['n_submissions']:,}")
print(f"  Comments (total): {summary['outputs']['n_comments_total']:,}")
print(f"\nSummary saved to: {summary_file}")


RUN SUMMARY

Notebook: 14_reddit_corpus_prep_topics
Timestamp: 2025-12-19T14:50:01.648488

Inputs:
  Gold layer: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/02_gold/reddit
  Files: 61
  Comments (raw): 8,785,795

Outputs:
  Threads: 341,692
  Submissions: 510,756
  Comments (total): 8,616,731

Summary saved to: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/data/01_corpus/03_qa/reddit/run_metadata.json
