# 00 — Environment & Schema Audit

Freeze the execution environment, confirm required packages, document data schemas, and hash the `src/` tree. Run this notebook from the project root (all paths are resolved from the current working directory) before executing any analysis, so that reviewers can verify the setup.

In [None]:
from pathlib import Path
import sys
import json
import platform
import subprocess
import hashlib

# All project locations are derived from the directory the kernel is running in.
PROJECT_ROOT = Path.cwd()
SRC_DIR = PROJECT_ROOT.joinpath('src')
REPORTS_DIR = PROJECT_ROOT.joinpath('reports')
# Make sure the reports directory is there; an already-existing one is fine.
REPORTS_DIR.mkdir(exist_ok=True)


In [None]:
# Verify that every core analysis dependency can be imported by this kernel.
def _is_importable(module_name):
    """Return True when ``module_name`` imports without raising ImportError."""
    try:
        __import__(module_name)
    except ImportError:
        return False
    return True


required = ['pandas', 'numpy', 'matplotlib', 'seaborn']
missing = [module_name for module_name in required if not _is_importable(module_name)]
if missing:
    # Stop here: later notebooks cannot run without these packages.
    raise ModuleNotFoundError(f'Missing packages: {missing}')

print('All required packages import successfully.')
# Record interpreter and OS details alongside the package check.
print('Python version:', sys.version)
print('Platform:', platform.platform())


In [None]:
# Capture an environment snapshot (pip freeze).
# pip is invoked through the kernel's own interpreter so the listing reflects
# the environment this notebook is actually running in. check=True raises
# CalledProcessError if pip exits with a non-zero status.
result = subprocess.run([sys.executable, '-m', 'pip', 'freeze'], capture_output=True, text=True, check=True)
env_path = REPORTS_DIR / 'environment.txt'
# Explicit encoding keeps the written file independent of the platform locale.
env_path.write_text(result.stdout, encoding='utf-8')
print(f'Pinned environment written to {env_path.relative_to(PROJECT_ROOT)}')
# Preview the first ten pinned requirements; splitlines() avoids the trailing
# empty entry that split('\n') produces for newline-terminated output.
print('\n'.join(result.stdout.splitlines()[:10]))


In [None]:
# Summarise dataset schemas.
# Each schema.json is printed as: dataset id, then one line per table with its
# path, column count, approximate row count and up to six sample column names.
data_dir = PROJECT_ROOT / 'data'
schema_paths = [
    data_dir / 'initial_calibration_dad' / 'schema.json',
    data_dir / 'reflectance_canonical' / 'schema.json',
    data_dir / 'mamba_validation_panel' / 'schema.json',
]
# Optional dataset: only audited when its schema file is present.
optional_schema = data_dir / 'supplements_dose_schedule' / 'schema.json'
if optional_schema.exists():
    schema_paths.append(optional_schema)

for path in schema_paths:
    rel_path = path.relative_to(PROJECT_ROOT)
    if not path.exists():
        # Report the gap instead of skipping silently, so a reviewer can see
        # that an expected schema file was not found.
        print(f"\nSchema: {rel_path} (MISSING)")
        continue
    print(f"\nSchema: {rel_path}")
    schema = json.loads(path.read_text(encoding='utf-8'))
    print('  dataset_id:', schema.get('dataset_id'))
    for table in schema.get('tables', []):
        cols = ', '.join(col['name'] for col in table.get('columns', [])[:6])
        # Table metadata is read with .get(), like the other lookups here, so a
        # table entry lacking one field prints None rather than aborting the audit.
        print(f"  - {table.get('path')} | columns={table.get('column_count')} rows≈{table.get('row_count')} | sample columns: {cols}")


In [None]:
# Reflectance regions of interest, each stored as a two-element tuple.
# NOTE(review): presumably (lower, upper) wavelength bounds — units are not
# stated in this notebook; confirm against the analysis code.
ROI_BROAD = (320, 480)
ROI_NARROW = (360, 410)
for roi_label, roi_bounds in (('broad', ROI_BROAD), ('narrow', ROI_NARROW)):
    print(f'Reflectance ROI ({roi_label}):', roi_bounds)


In [None]:
# Hash src/ tree to freeze code state.
# Files are visited in sorted order; for each one, its POSIX-style path
# relative to src/ and then its raw bytes are fed into a single SHA-256.
if not SRC_DIR.is_dir():
    # Without this guard a missing src/ would silently yield the digest of
    # empty input, which looks like a valid hash but verifies nothing.
    raise FileNotFoundError(f'Source directory not found: {SRC_DIR}')

hasher = hashlib.sha256()
for path in sorted(SRC_DIR.rglob('*')):
    rel_path = path.relative_to(SRC_DIR)
    # Skip bytecode caches: they are created as a side effect of importing
    # from src/, so including them would change the digest without any
    # change to the source files.
    if not path.is_file() or '__pycache__' in rel_path.parts or path.suffix == '.pyc':
        continue
    hasher.update(rel_path.as_posix().encode())
    hasher.update(path.read_bytes())
hash_hex = hasher.hexdigest()
# REPORTS_DIR is the same location as SRC_DIR.parent / 'reports'.
(REPORTS_DIR / 'src_hash.txt').write_text(hash_hex + '\n')
print('src/ sha256:', hash_hex)


## Summary

- Environment snapshot written to `reports/environment.txt`
- Source hash written to `reports/src_hash.txt`
- Dataset schemas printed above for quick inspection
- Reflectance ROIs documented for reproducibility