In [1]:
import sys
import platform
from pathlib import Path
import shutil
from datetime import datetime

print(f"Validation run started: {datetime.now().isoformat()}")
print(f"Platform: {platform.platform()}")
print(f"Machine: {platform.machine()}")

Validation run started: 2025-12-18T14:02:02.018358
Platform: macOS-15.7.2-arm64-arm-64bit-Mach-O
Machine: arm64


## 1. Python Version Check

In [2]:
# Expected: Python 3.13.5
python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
print(f"Python version: {python_version}")
print(f"Executable: {sys.executable}")

expected_version = "3.13.5"
if python_version == expected_version:
    print(f"✓ Python version matches expected: {expected_version}")
else:
    print(f"⚠ Warning: Expected {expected_version}, got {python_version}")

Python version: 3.13.5
Executable: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment/venv/bin/python
✓ Python version matches expected: 3.13.5


## 2. Dependency Check

In [3]:
# Import all required packages
dependencies = {
    'pandas': None,
    'pyarrow': None,
    'numpy': None,
    'xxhash': None,
    'yaml': 'pyyaml',
    'datasets': None,
    'tqdm': None,
    'matplotlib': None,
    'seaborn': None,
    'jsonschema': None
}

import_errors = []

for module_name, package_name in dependencies.items():
    try:
        module = __import__(module_name)
        version = getattr(module, '__version__', 'unknown')
        print(f"✓ {package_name or module_name}: {version}")
    except ImportError as e:
        import_errors.append((package_name or module_name, str(e)))
        print(f"✗ {package_name or module_name}: NOT INSTALLED")

if import_errors:
    print(f"\n⚠ {len(import_errors)} package(s) missing. Install with: pip install -r requirements.txt")
else:
    print("\n✓ All dependencies installed successfully")

✓ pandas: 2.2.3
✓ pyarrow: 18.1.0
✓ numpy: 2.2.1
✓ xxhash: unknown
✓ pyyaml: 6.0.2
✓ datasets: 3.2.0
✓ tqdm: 4.67.1
✓ matplotlib: 3.9.3
✓ seaborn: 0.13.2
✓ jsonschema: 4.23.0

✓ All dependencies installed successfully


  version = getattr(module, '__version__', 'unknown')


## 3. Disk Space Check

In [4]:
# Check available disk space
workspace_root = Path.cwd()
disk_usage = shutil.disk_usage(workspace_root)

gb = 1024**3
total_gb = disk_usage.total / gb
used_gb = disk_usage.used / gb
free_gb = disk_usage.free / gb

print(f"Workspace: {workspace_root}")
print(f"Total disk: {total_gb:.1f} GB")
print(f"Used: {used_gb:.1f} GB ({100 * disk_usage.used / disk_usage.total:.1f}%)")
print(f"Free: {free_gb:.1f} GB")

# Estimate: ~600k news articles + ~? Reddit comments
# Conservative estimate: 50 GB needed for raw + processed data
required_gb = 50
if free_gb >= required_gb:
    print(f"\n✓ Sufficient disk space ({free_gb:.1f} GB available, ~{required_gb} GB estimated needed)")
else:
    print(f"\n⚠ Warning: Low disk space ({free_gb:.1f} GB available, ~{required_gb} GB estimated needed)")

Workspace: /Users/stahlma/Desktop/01_Studium/11_Thesis/Data_Experiment
Total disk: 926.4 GB
Used: 540.8 GB (58.4%)
Free: 385.5 GB

✓ Sufficient disk space (385.5 GB available, ~50 GB estimated needed)


## 4. Configuration Files Check

In [5]:
# Add src to path for imports
sys.path.insert(0, str(workspace_root / 'src'))

from thesis_pipeline.io.config import load_all_configs

try:
    configs = load_all_configs(workspace_root / 'configs')
    print("✓ All configuration files loaded successfully\n")
    
    # Display key parameters
    global_cfg = configs['global']
    reddit_cfg = configs['reddit']
    news_cfg = configs['news']
    
    print("=== Global Config ===")
    print(f"Period: {global_cfg['validation_run']['period_start']} to {global_cfg['validation_run']['period_end']}")
    print(f"Seed: {global_cfg['validation_run']['seed_string']}")
    print(f"Python: {global_cfg['validation_run']['python_version']}")
    
    print("\n=== Reddit Config ===")
    print(f"Source: {reddit_cfg['source']['name']}")
    print(f"Time index: {reddit_cfg['processing']['time_index_field']}")
    print(f"Partitioning: {reddit_cfg['output']['partitioning']}")
    
    print("\n=== News Config ===")
    print(f"Source: {news_cfg['source']['hf_dataset']}")
    print(f"Daily limit: {news_cfg['sampling']['daily_limit_n']:,} articles/day")
    print(f"Access method: {news_cfg['source']['access_method']}")
    print(f"Partitioning: {news_cfg['output']['partitioning']}")
    
except Exception as e:
    print(f"✗ Error loading configurations: {e}")

✓ All configuration files loaded successfully

=== Global Config ===
Period: 2016-09-01 to 2016-10-31
Seed: thesis_sep_oct_2016_v1
Python: 3.13.2

=== Reddit Config ===
Source: Politosphere
Time index: created_utc
Partitioning: daily

=== News Config ===
Source: stanford-oval/ccnews
Daily limit: 10,000 articles/day
Access method: download
Partitioning: daily


## 5. Directory Structure Check

In [6]:
# Verify all required directories exist
required_dirs = [
    'data/00_raw/reddit',
    'data/00_raw/news',
    'data/01_silver/reddit',
    'data/01_silver/news',
    'data/03_gold/reddit',
    'data/03_gold/news',
    'data/04_qa/manifests',
    'data/04_qa/snapshots',
    'reports/data_validation/2016-09_2016-10/reddit/figures',
    'reports/data_validation/2016-09_2016-10/reddit/tables',
    'reports/data_validation/2016-09_2016-10/news/figures',
    'reports/data_validation/2016-09_2016-10/news/tables',
    'logs/notebooks',
    'logs/runs',
    'artefacts/run_metadata'
]

missing_dirs = []
for dir_path in required_dirs:
    full_path = workspace_root / dir_path
    if not full_path.exists():
        missing_dirs.append(dir_path)

if missing_dirs:
    print(f"Creating {len(missing_dirs)} missing directories...")
    for dir_path in missing_dirs:
        (workspace_root / dir_path).mkdir(parents=True, exist_ok=True)
        print(f"  Created: {dir_path}")
else:
    print("✓ All required directories exist")

print("\n✓ Directory structure ready")

✓ All required directories exist

✓ Directory structure ready


## 6. Git Status Check

In [7]:
import subprocess

try:
    # Get current git commit
    git_commit = subprocess.check_output(
        ['git', 'rev-parse', 'HEAD'],
        cwd=workspace_root,
        stderr=subprocess.DEVNULL
    ).decode().strip()
    
    git_branch = subprocess.check_output(
        ['git', 'rev-parse', '--abbrev-ref', 'HEAD'],
        cwd=workspace_root,
        stderr=subprocess.DEVNULL
    ).decode().strip()
    
    print(f"Git branch: {git_branch}")
    print(f"Git commit: {git_commit[:8]}...")
    
    # Check for uncommitted changes
    git_status = subprocess.check_output(
        ['git', 'status', '--porcelain'],
        cwd=workspace_root
    ).decode().strip()
    
    if git_status:
        print("\n⚠ Uncommitted changes detected:")
        print(git_status[:500])  # First 500 chars
    else:
        print("\n✓ Working directory clean")
    
except subprocess.CalledProcessError:
    print("⚠ Not a git repository or git not available")
    git_commit = None

Git branch: main
Git commit: 175129e3...

⚠ Uncommitted changes detected:
M notebooks/00_environment_check.ipynb


## 7. Summary & Readiness

In [8]:
print("=" * 60)
print("ENVIRONMENT CHECK SUMMARY")
print("=" * 60)
print(f"✓ Python: {python_version}")
print(f"✓ Dependencies: All installed" if not import_errors else f"✗ Dependencies: {len(import_errors)} missing")
print(f"✓ Disk space: {free_gb:.1f} GB available")
print(f"✓ Configurations: Loaded")
print(f"✓ Directory structure: Ready")
print(f"✓ Git commit: {git_commit[:8] if git_commit else 'N/A'}")
print("\nStatus: READY TO PROCEED" if not import_errors else "Status: INSTALL DEPENDENCIES FIRST")
print("=" * 60)

ENVIRONMENT CHECK SUMMARY
✓ Python: 3.13.5
✓ Dependencies: All installed
✓ Disk space: 385.5 GB available
✓ Configurations: Loaded
✓ Directory structure: Ready
✓ Git commit: 175129e3

Status: READY TO PROCEED
