In [1]:
import sys
from pathlib import Path
from datetime import datetime
import json
import pandas as pd
import csv
from tqdm import tqdm

# The current working directory is used as the workspace root. This assumes the
# kernel was started in the project root (the directory that holds `src/`,
# `configs/` and `data/`) — confirm before running from another location.
workspace_root = Path.cwd()

# Make the project's `src` directory importable. The membership check keeps the
# cell idempotent: re-running it no longer prepends a duplicate sys.path entry.
src_path = str(workspace_root / 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Project-local import; it must come after the sys.path adjustment above.
from thesis_pipeline.io.config import load_all_configs

print(f"Thread pseudodocuments started: {datetime.now().isoformat()}")
print(f"Workspace: {workspace_root}")

Thread pseudodocuments started: 2025-12-18T15:02:57.531437
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment


## 1. Load Configuration and Setup Paths

In [None]:
# Load the project configuration and keep handles on the two sections
# referenced in this notebook.
configs = load_all_configs(workspace_root.joinpath('configs'))
reddit_cfg = configs['reddit']
global_cfg = configs['global']

# Silver layer is the input; the gold layer directory is created if missing.
# NOTE(review): the saved outputs of this notebook show 'data/03_gold/reddit',
# while this code targets 'data/02_gold/reddit' — re-run the cell and confirm
# which directory is intended.
silver_reddit_dir = workspace_root.joinpath('data', '01_silver', 'reddit')
gold_reddit_dir = workspace_root.joinpath('data', '02_gold', 'reddit')
gold_reddit_dir.mkdir(parents=True, exist_ok=True)

# Report the locations relative to the workspace, plus the configured
# thread-context policy.
print(f"Silver input: {silver_reddit_dir.relative_to(workspace_root)}")
print(f"Gold output: {gold_reddit_dir.relative_to(workspace_root)}")
print(f"Thread context usage: {reddit_cfg['processing']['thread_context_usage']}")

Silver input: data/01_silver/reddit
Gold output: data/03_gold/reddit
Thread context usage: inference_only


## 2. Process Daily Files with Thread Context

In [8]:
# Collect the daily silver-layer parquet files, in file-name order.
silver_files = sorted(silver_reddit_dir.glob('2016-*.parquet'))

print(f"Processing {len(silver_files)} daily files...")
print("=" * 80)

# Preferred gold-layer column order (timestamp-related fields first). Defined
# once outside the loop because it does not depend on the file being processed.
preferred_columns = [
    'date',
    'created_utc',
    'comment_id',
    'thread_id',
    'is_top_level',
    'author',
    'subreddit',
    'subreddit_id',
    'body',
    'cleaned_body',
    'score',
    'parent_id'
]

total_comments = 0
total_with_thread = 0
files_written = []

for silver_file in tqdm(silver_files, desc="Processing daily files"):
    # Read silver layer
    df = pd.read_parquet(silver_file)
    total_comments += len(df)

    # Use link_id as thread_id (already present in data), without the 't3_' marker
    df['thread_id'] = df['link_id'].str.replace('t3_', '', regex=False)

    # Determine if comment is top-level (parent is thread) or reply (parent is comment)
    df['is_top_level'] = df['parent_id'].str.startswith('t3_')

    # Count rows that carry a thread id; cast to a native int so the running
    # total stays a plain Python integer.
    has_thread = int(df['thread_id'].notna().sum())
    total_with_thread += has_thread

    # Keep only the preferred columns that this file actually contains, in order
    column_order = [col for col in preferred_columns if col in df.columns]
    df = df[column_order]

    # Write to gold layer under the same file name (same date, with thread context)
    output_file = gold_reddit_dir / silver_file.name
    df.to_parquet(output_file, compression='snappy', index=False)
    files_written.append(output_file)

# Guard the ratio: with no input files (or only empty ones) total_comments is 0
# and the unguarded division would raise ZeroDivisionError.
thread_pct = 100 * total_with_thread / total_comments if total_comments > 0 else 0.0

print("\n" + "=" * 80)
print(f"Total comments: {total_comments:,}")
print(f"With thread context: {total_with_thread:,} ({thread_pct:.1f}%)")
print(f"Files written: {len(files_written)}")
print("=" * 80)

Processing 61 daily files...


Processing daily files: 100%|██████████| 61/61 [00:18<00:00,  3.27it/s]


Total comments: 8,785,795
With thread context: 8,785,795 (100.0%)
Files written: 61





## 3. Verify Timestamp Integrity

**Critical check:** Ensure timestamps remain comment-based, not thread-based: every `date` value in a sample gold file must match the date in that file's name.

In [9]:
# Spot-check one gold file: every row's `date` must equal the date encoded in
# the file name.
if files_written:
    # Take the middle file of the run instead of a fixed index, so the check
    # also works when fewer than 31 files were written. For the 61-file run
    # this is still index 30, i.e. 2016-10-01.
    sample_file = files_written[len(files_written) // 2]

    print(f"Timestamp verification: {sample_file.name}")
    print("=" * 80)

    df_gold = pd.read_parquet(sample_file)

    # Check that all dates match filename (file stem is the date string)
    expected_date = sample_file.stem
    actual_dates = df_gold['date'].unique()

    print(f"Expected date: {expected_date}")
    print(f"Actual dates in file: {actual_dates}")

    if len(actual_dates) == 1 and actual_dates[0] == expected_date:
        print("✓ Timestamp integrity verified: all comments match file date")
    else:
        print("✗ WARNING: Date mismatch detected!")

    # Show a few example rows with their thread context
    print("\nSample with thread context (first 3):")
    for _, row in df_gold.head(3).iterrows():
        print(f"\n  Comment date: {row['date']}")
        print(f"  Thread ID: {row['thread_id']}")
        print(f"  Top-level: {row['is_top_level']}")
        print(f"  Comment: {row['body'][:80]}...")

    print("\n" + "=" * 80)
else:
    print("⚠ No files to verify")

Timestamp verification: 2016-10-01.parquet
Expected date: 2016-10-01
Actual dates in file: ['2016-10-01']
✓ Timestamp integrity verified: all comments match file date

Sample with thread context (first 3):

  Comment date: 2016-10-01
  Thread ID: 55aq8o
  Top-level: True
  Comment: *Someone* gets it ..

...

  Comment date: 2016-10-01
  Thread ID: 557ste
  Top-level: False
  Comment: &gt;Dollar coins finally catch on thanks to triumphant RenFaire nerds. 

Alright...

  Comment date: 2016-10-01
  Thread ID: 55ao69
  Top-level: False
  Comment: Something something socialism only works until......



## 4. Thread Context Coverage Analysis

In [10]:
# Analyze thread-context coverage for a sample of the written gold files
# (the first `n_sample_days` files, not the whole run).
n_sample_days = 10

print("Thread context coverage by date:")
print("=" * 80)

coverage_columns = [
    'date',
    'total_comments',
    'with_thread_id',
    'top_level_comments',
    'thread_pct',
    'top_level_pct'
]
coverage_stats = []

for gold_file in files_written[:n_sample_days]:
    # Only the two columns used below are loaded; the row count is unaffected.
    df = pd.read_parquet(gold_file, columns=['thread_id', 'is_top_level'])

    total = len(df)
    with_thread = df['thread_id'].notna().sum()
    top_level = df['is_top_level'].sum()

    coverage_stats.append({
        'date': gold_file.stem,
        'total_comments': total,
        'with_thread_id': with_thread,
        'top_level_comments': top_level,
        'thread_pct': 100 * with_thread / total if total > 0 else 0,
        'top_level_pct': 100 * top_level / total if total > 0 else 0
    })

# Passing the column list keeps the frame well-formed even when no files were
# written; without it an empty frame has no 'thread_pct' column and the
# averages below would raise KeyError.
df_coverage = pd.DataFrame(coverage_stats, columns=coverage_columns)

if df_coverage.empty:
    print("⚠ No files to analyze")
else:
    print(df_coverage.to_string(index=False))

    # Unweighted means over the sampled days
    print("\n" + "=" * 80)
    print(f"Average thread ID coverage: {df_coverage['thread_pct'].mean():.1f}%")
    print(f"Average top-level comments: {df_coverage['top_level_pct'].mean():.1f}%")
    print("=" * 80)

Thread context coverage by date:
      date  total_comments  with_thread_id  top_level_comments  thread_pct  top_level_pct
2016-09-01          109184          109184               31339       100.0      28.702924
2016-09-02           98775           98775               26888       100.0      27.221463
2016-09-03           74731           74731               20825       100.0      27.866615
2016-09-04           80990           80990               21031       100.0      25.967403
2016-09-05           85066           85066               23980       100.0      28.189876
2016-09-06          110391          110391               29266       100.0      26.511219
2016-09-07          105031          105031               30600       100.0      29.134256
2016-09-08          123756          123756               36093       100.0      29.164647
2016-09-09           98605           98605               28615       100.0      29.019827
2016-09-10           87648           87648               25401     

## 5. Save Processing Metadata

In [12]:
# Describe this gold-layer run (inputs, thread-context provenance, counts,
# outputs, timestamp policy) and store the description as JSON next to the
# gold files. Counts are cast to native ints before serialization.
run_timestamp = datetime.now().isoformat()

input_section = {
    'silver_directory': str(silver_reddit_dir.relative_to(workspace_root)),
    'files_processed': int(len(silver_files)),
}

thread_context_section = {
    'source': 'link_id field from comments (already present)',
    'thread_titles_available': False,
    'note': 'Politosphere dataset does not include submission/thread titles',
}

# Coverage falls back to 0 when nothing was processed, avoiding a division by zero.
if total_comments > 0:
    coverage_pct = round(100 * total_with_thread / total_comments, 2)
else:
    coverage_pct = 0

processing_section = {
    'total_comments': int(total_comments),
    'with_thread_context': int(total_with_thread),
    'thread_coverage_pct': coverage_pct,
}

output_section = {
    'directory': str(gold_reddit_dir.relative_to(workspace_root)),
    'files_written': int(len(files_written)),
    'dates_covered': sorted(written.stem for written in files_written),
}

timestamp_rule_section = {
    'indexing_field': reddit_cfg['processing']['time_index_field'],
    'thread_context_usage': reddit_cfg['processing']['thread_context_usage'],
    'note': 'Thread context (thread_id, parent) is for inference only. All timestamps remain comment.created_utc',
}

# Key order here is the order the sections appear in the JSON file.
gold_metadata = {
    'timestamp': run_timestamp,
    'input': input_section,
    'thread_context': thread_context_section,
    'processing': processing_section,
    'output': output_section,
    'timestamp_rule': timestamp_rule_section,
}

metadata_file = gold_reddit_dir / 'gold_metadata.json'
with open(metadata_file, 'w') as fh:
    json.dump(gold_metadata, fh, indent=2)

print(f"✓ Gold layer metadata saved: {metadata_file.relative_to(workspace_root)}")

✓ Gold layer metadata saved: data/03_gold/reddit/gold_metadata.json
