In [1]:
import sys
from pathlib import Path
from datetime import datetime
import json
import subprocess

# The kernel's current working directory is treated as the workspace root.
workspace_root = Path.cwd()

# Put <workspace_root>/src at the front of sys.path exactly once. Rebuilding
# the list (instead of a plain insert) keeps the cell idempotent: re-running
# it no longer accumulates duplicate entries, and src/ is still searched first.
src_path = str(workspace_root / 'src')
sys.path[:] = [src_path] + [entry for entry in sys.path if entry != src_path]

# Project-local helper used later to read the pipeline configs.
# NOTE(review): presumably importable only because <workspace_root>/src was
# put on sys.path beforehand — confirm the package is not installed separately.
from thesis_pipeline.io.config import load_all_configs

# Echo the initialization timestamp and the resolved workspace path.
print(f"Validation plan initialized: {datetime.now().isoformat()}")
print(f"Workspace: {workspace_root}")

Validation plan initialized: 2025-12-18T14:05:49.621369
Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment


## 1. Load Configurations

In [2]:
# Read the configs found under <workspace_root>/configs and keep one handle
# for each section this notebook refers to.
configs = load_all_configs(workspace_root / 'configs')
global_cfg, reddit_cfg, news_cfg = (
    configs[section] for section in ('global', 'reddit', 'news')
)

# Validation-run parameters: the date window and the seed string.
validation_run = global_cfg['validation_run']
period_start = validation_run['period_start']
period_end = validation_run['period_end']
seed = validation_run['seed_string']

# Short recap of the values that drive the rest of the plan.
for summary_line in (
    f"Period: {period_start} to {period_end}",
    f"Seed: {seed}",
    f"Daily news limit: {news_cfg['sampling']['daily_limit_n']:,} articles/day",
):
    print(summary_line)

Period: 2016-09-01 to 2016-10-31
Seed: thesis_sep_oct_2016_v1
Daily news limit: 10,000 articles/day


## 2. Calculate Time Span

In [3]:
from datetime import datetime, timedelta

DATE_FORMAT = '%Y-%m-%d'

# Parse the configured period bounds, which are given as 'YYYY-MM-DD' strings.
start_date = datetime.strptime(period_start, DATE_FORMAT)
end_date = datetime.strptime(period_end, DATE_FORMAT)

# Fail fast on a reversed period: otherwise num_days would be <= 0, the date
# list would be empty, and every estimate derived from num_days would be
# silently wrong.
if end_date < start_date:
    raise ValueError(
        f"period_end ({period_end}) is before period_start ({period_start})"
    )

# Both endpoints are part of the period, hence the +1.
num_days = (end_date - start_date).days + 1

print(f"Total days: {num_days}")
print(f"Date range: {start_date.strftime(DATE_FORMAT)} to {end_date.strftime(DATE_FORMAT)}")

# One datetime (at midnight) per calendar day of the period, in order.
all_dates = [start_date + timedelta(days=offset) for offset in range(num_days)]
print(f"\nFirst 5 dates: {[d.strftime(DATE_FORMAT) for d in all_dates[:5]]}")
print(f"Last 5 dates: {[d.strftime(DATE_FORMAT) for d in all_dates[-5:]]}")

Total days: 61
Date range: 2016-09-01 to 2016-10-31

First 5 dates: ['2016-09-01', '2016-09-02', '2016-09-03', '2016-09-04', '2016-09-05']
Last 5 dates: ['2016-10-27', '2016-10-28', '2016-10-29', '2016-10-30', '2016-10-31']


## 3. Estimated Data Volumes

In [4]:
# --- News: upper bound on the article count and a rough storage figure ---
news_daily_limit = news_cfg['sampling']['daily_limit_n']
estimated_news_articles = num_days * news_daily_limit

# Conservative size assumption for compressed Parquet:
# ~5 KB per article (text + metadata).
avg_article_kb = 5
kb_per_gb = 1024 * 1024
estimated_news_gb = (estimated_news_articles * avg_article_kb) / kb_per_gb

# The Reddit and total-storage figures below are fixed text, not computed.
volume_report = [
    "=== News (CC-NEWS) ===",
    f"Daily limit: {news_daily_limit:,} articles/day",
    f"Total articles (max): {estimated_news_articles:,}",
    f"Estimated storage: ~{estimated_news_gb:.1f} GB (silver layer)",
    "\n=== Reddit (Politosphere) ===",
    "Unknown volume until download completes",
    "Period covers Sep-Oct 2016 US election season (high activity expected)",
    "\n=== Total Estimated Storage ===",
    "Raw + Silver + Gold: ~20-40 GB (conservative estimate)",
    "QA artifacts + reports: ~1-2 GB",
]
for report_line in volume_report:
    print(report_line)

=== News (CC-NEWS) ===
Daily limit: 10,000 articles/day
Total articles (max): 610,000
Estimated storage: ~2.9 GB (silver layer)

=== Reddit (Politosphere) ===
Unknown volume until download completes
Period covers Sep-Oct 2016 US election season (high activity expected)

=== Total Estimated Storage ===
Raw + Silver + Gold: ~20-40 GB (conservative estimate)
QA artifacts + reports: ~1-2 GB


## 4. Execution Plan Overview

In [5]:
# Ordered phases of the validation run. Each entry names the notebooks to
# execute and the artefacts they are expected to leave behind.
execution_plan = [
    dict(
        phase='Reddit Processing',
        notebooks=[
            '10_reddit_download_sep_oct_2016.ipynb',
            # NOTE(review): 'redit' looks like a typo for 'reddit' — confirm
            # against the actual notebook filename before changing it.
            '11_redit_extract_filter_silver.ipynb',
            '12_reddit_thread_pseudodocs_gold.ipynb',
            '13_reddit_dataset_qc_report.ipynb',
        ],
        outputs=[
            'data/00_raw/reddit/politosphere_2016-09_2016-10/ (Zenodo download)',
            'data/01_silver/reddit/YYYY-MM-DD.parquet (daily files)',
            'data/03_gold/reddit/YYYY-MM-DD.parquet (pseudodocs)',
            'reports/data_validation/2016-09_2016-10/reddit/',
        ],
    ),
    dict(
        phase='News Processing',
        notebooks=[
            '20_news_hf_stream_sep_oct_2016.ipynb',
            '21_news_filter_dedup_sample_silver.ipynb',
            '22_news_dataset_qc_report.ipynb',
        ],
        outputs=[
            'data/00_raw/news/ccnews_2016-09_2016-10/ (HF download)',
            'data/01_silver/news/YYYY-MM-DD.parquet (daily files, max 10k/day)',
            'reports/data_validation/2016-09_2016-10/news/',
        ],
    ),
    dict(
        phase='Cross-Domain Validation',
        notebooks=[
            '90_shared_schema_checks.ipynb',
            '91_shared_determinism_checks.ipynb',
        ],
        outputs=[
            'data/04_qa/manifests/ (hash manifests)',
            'data/04_qa/snapshots/ (determinism comparison)',
        ],
    ),
]

banner = "=" * 70
print(banner)
print("EXECUTION PLAN: Sep-Oct 2016 Validation Run")
print(banner)

# Each phase is printed as a numbered heading followed by two bullet lists.
for phase_number, phase_info in enumerate(execution_plan, start=1):
    print(f"\n{phase_number}. {phase_info['phase']}")
    for heading, key in (("Notebooks", 'notebooks'), ("Expected Outputs", 'outputs')):
        print(f"   {heading}:")
        for entry in phase_info[key]:
            print(f"     - {entry}")

print("\n" + banner)

EXECUTION PLAN: Sep-Oct 2016 Validation Run

1. Reddit Processing
   Notebooks:
     - 10_reddit_download_sep_oct_2016.ipynb
     - 11_redit_extract_filter_silver.ipynb
     - 12_reddit_thread_pseudodocs_gold.ipynb
     - 13_reddit_dataset_qc_report.ipynb
   Expected Outputs:
     - data/00_raw/reddit/politosphere_2016-09_2016-10/ (Zenodo download)
     - data/01_silver/reddit/YYYY-MM-DD.parquet (daily files)
     - data/03_gold/reddit/YYYY-MM-DD.parquet (pseudodocs)
     - reports/data_validation/2016-09_2016-10/reddit/

2. News Processing
   Notebooks:
     - 20_news_hf_stream_sep_oct_2016.ipynb
     - 21_news_filter_dedup_sample_silver.ipynb
     - 22_news_dataset_qc_report.ipynb
   Expected Outputs:
     - data/00_raw/news/ccnews_2016-09_2016-10/ (HF download)
     - data/01_silver/news/YYYY-MM-DD.parquet (daily files, max 10k/day)
     - reports/data_validation/2016-09_2016-10/news/

3. Cross-Domain Validation
   Notebooks:
     - 90_shared_schema_checks.ipynb
     - 91_shared_det

## 5. Key Design Decisions Summary

In [6]:
# Topic -> one-line description of each key design decision, in display order.
# The news limit and the seed are interpolated from the values loaded earlier.
design_decisions = dict([
    ('Platform Separation', 'Reddit and News processed independently, no cross-platform timestamping'),
    ('Reddit Time Indexing', 'Daily buckets use comment.created_utc only; thread context for inference only'),
    ('News Sampling', f'Deterministic hash-based: top-{news_daily_limit:,} by smallest hash(seed||url||date) per day'),
    ('Deduplication', 'Reddit: exact (author+body+time), News: canonical_url within day'),
    ('Partitioning', 'Daily Parquet files for both domains'),
    ('Data Access', 'Reddit: Zenodo download, News: HuggingFace download then process'),
    ('Leakage Prevention', 'Strict date filtering, no future information, Sep-Oct 2016 only'),
    ('Determinism', f'Fixed seed: "{seed}", reproducible sampling via hash ordering'),
])

print("=== Key Design Decisions ===")
# Blank line, then the topic, then its indented description.
for topic, rationale in design_decisions.items():
    print(f"\n{topic}:\n  {rationale}")

=== Key Design Decisions ===

Platform Separation:
  Reddit and News processed independently, no cross-platform timestamping

Reddit Time Indexing:
  Daily buckets use comment.created_utc only; thread context for inference only

News Sampling:
  Deterministic hash-based: top-10,000 by smallest hash(seed||url||date) per day

Deduplication:
  Reddit: exact (author+body+time), News: canonical_url within day

Partitioning:
  Daily Parquet files for both domains

Data Access:
  Reddit: Zenodo download, News: HuggingFace download then process

Leakage Prevention:
  Strict date filtering, no future information, Sep-Oct 2016 only

Determinism:
  Fixed seed: "thesis_sep_oct_2016_v1", reproducible sampling via hash ordering


## 6. Capture Git Commit and Environment

In [7]:
def _run_git(*args):
    """Run ``git <args>`` inside the workspace and return its stripped stdout.

    stderr is discarded for every call so a non-repository workspace does not
    spill git's error text into the notebook output.

    Raises:
        subprocess.CalledProcessError: git exited with a non-zero status.
        OSError: the git executable could not be started (e.g. not installed).
    """
    return subprocess.check_output(
        ['git', *args],
        cwd=workspace_root,
        stderr=subprocess.DEVNULL,
    ).decode().strip()


try:
    git_commit = _run_git('rev-parse', 'HEAD')
    git_branch = _run_git('rev-parse', '--abbrev-ref', 'HEAD')
    # --porcelain prints one line per changed path; empty output means clean.
    git_status = _run_git('status', '--porcelain')
    has_uncommitted = bool(git_status)
except (subprocess.CalledProcessError, OSError):
    # Best-effort capture: fall back to placeholders when the workspace is not
    # a git repository (CalledProcessError) or git itself is unavailable
    # (FileNotFoundError, a subclass of OSError).
    # NOTE(review): has_uncommitted=False here means "unknown", not "clean".
    git_commit = 'unknown'
    git_branch = 'unknown'
    has_uncommitted = False

print(f"Git branch: {git_branch}")
print(f"Git commit: {git_commit}")
print(f"Uncommitted changes: {has_uncommitted}")

Git branch: main
Git commit: 175129e32cc0cbe6b9e9c1cbcfdf7fa8f0313d45
Uncommitted changes: True


## 7. Create Run Metadata JSON

In [None]:
# Take a single timestamp for the whole record so that run_id, created_at and
# last_updated always describe the same instant (separate datetime.now() calls
# could straddle a second boundary and disagree).
run_started_at = datetime.now()
run_started_iso = run_started_at.isoformat()

run_metadata = {
    'run_id': f'2016-09_2016-10_{run_started_at.strftime("%Y%m%d_%H%M%S")}',
    'created_at': run_started_iso,
    # Where and from which code state the run was started.
    'environment': {
        'python_version': sys.version,
        'workspace_root': str(workspace_root),
        'git_commit': git_commit,
        'git_branch': git_branch,
        'has_uncommitted_changes': has_uncommitted
    },
    # Parameters taken from the loaded configs.
    'configuration': {
        'period_start': period_start,
        'period_end': period_end,
        'num_days': num_days,
        'seed_string': seed,
        'news_daily_limit': news_daily_limit,
        'hash_algorithm': global_cfg['determinism']['hash_algorithm']
    },
    # Both figures are derived from the news estimate only.
    'estimates': {
        'max_news_articles': estimated_news_articles,
        'estimated_storage_gb': round(estimated_news_gb, 1)
    },
    # Initial progress state written by this notebook.
    'status': {
        'phase': 'initialization',
        'notebooks_completed': [],
        'last_updated': run_started_iso
    }
}

# Save metadata under the configured artefacts directory, creating it if needed.
metadata_path = workspace_root / global_cfg['paths']['artefacts'] / '2016-09_2016-10_run.json'
metadata_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding keeps the file independent of the platform default.
with open(metadata_path, 'w', encoding='utf-8') as f:
    json.dump(run_metadata, f, indent=2)

print(f"✓ Run metadata saved to: {metadata_path.relative_to(workspace_root)}")
print(f"\nRun ID: {run_metadata['run_id']}")

## 8. Initialize Manifest Tracking Structure

In [None]:
# Create manifest tracking structure for determinism checks
manifest_structure = {
    'run_id': run_metadata['run_id'],
    'seed': seed,
    'created_at': datetime.now().isoformat(),
    'domains': {
        'reddit': {
            'daily_files': {},  # Will be populated: 'YYYY-MM-DD': {'hash': 'xxx', 'row_count': N, 'size_bytes': M}
            'total_rows': None,
            'total_size_bytes': None
        },
        'news': {
            'daily_files': {},
            'total_rows': None,
            'total_size_bytes': None
        }
    },
    'determinism_runs': []  # Will store hashes from each rerun
}

manifest_path = workspace_root / 'data/04_qa/manifests' / 'run_manifest.json'
manifest_path.parent.mkdir(parents=True, exist_ok=True)

with open(manifest_path, 'w') as f:
    json.dump(manifest_structure, f, indent=2)

print(f"✓ Manifest structure initialized: {manifest_path.relative_to(workspace_root)}")

## 9. Summary and Next Steps

In [None]:
# Final recap of the run parameters plus a pointer to the next notebook.
banner = "=" * 70

print(banner)
print("DATA VALIDATION PLAN SUMMARY")
print(banner)
print(f"Run ID: {run_metadata['run_id']}")
print(f"Period: {period_start} to {period_end} ({num_days} days)")
# Abbreviated commit hash: first 8 characters only.
print(f"Git commit: {git_commit[:8]}")
print(f"Seed: {seed}")
print(f"News daily limit: {news_daily_limit:,} articles/day")
print(f"Max news articles: {estimated_news_articles:,}")
print(f"Estimated storage: ~{estimated_news_gb:.1f} GB (news only)")
print("\n✓ Run metadata saved")
print("✓ Manifest tracking initialized")
# Blank line followed by ONE 70-character rule. The previous "\n=" * 70
# repeated the two-character string and printed 70 lines of a single "=".
print("\n" + banner)
print("NEXT STEP: Begin Reddit processing")
print("  → Open: notebooks/reddit/10_reddit_download_sep_oct_2016.ipynb")
print(banner)