# 01 — Test data pipeline (download → raw → processed)

This notebook validates your two scripts:
- `load_existing_dataset.py` → downloads GSM8K and writes **raw** JSON into `gsm8k-distillation/data/raw`
- `import_data.py` → reads raw JSON and writes **processed** JSON into `gsm8k-distillation/data/processed`

Run it from the repo root: every path below is resolved relative to the current working directory.


In [None]:
import os
from pathlib import Path

# All locations hang off the current working directory, which is expected to
# be the repo root. Change REPO_ROOT if the notebook is launched elsewhere.
REPO_ROOT = Path('.').resolve()
RAW_DIR = REPO_ROOT.joinpath('gsm8k-distillation', 'data', 'raw')
PROCESSED_DIR = REPO_ROOT.joinpath('gsm8k-distillation', 'data', 'processed')

# Echo the resolved locations so a wrong working directory is obvious.
for _label, _location in (
    ('Repo root:', REPO_ROOT),
    ('RAW_DIR:', RAW_DIR),
    ('PROCESSED_DIR:', PROCESSED_DIR),
):
    print(_label, _location)

# Create both data directories up front; existing directories are left as-is.
for _data_dir in (RAW_DIR, PROCESSED_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# The two pipeline scripts are imported as the `data` package, which is
# expected to live under <repo root>/src. Put that folder at the front of
# sys.path (once) so the imports below resolve.
import sys

src_path = (REPO_ROOT / 'src').as_posix()
if all(entry != src_path for entry in sys.path):
    sys.path.insert(0, src_path)

from data.load_existing_dataset import GSM8KDatasetProcessor as RawProcessor
from data.import_data import GSM8KDatasetLoader as ProcessedLoader

print('Imports OK')


In [None]:
# Processor from the raw-data script, pointed at the raw directory.
processor = RawProcessor(base_path=RAW_DIR.as_posix())

# Raw files this notebook looks for in RAW_DIR (presumably the names that
# download_and_prepare_gsm8k() writes; confirm against the script).
raw_train = RAW_DIR / 'gsm8k_cot_train.json'
raw_test = RAW_DIR / 'gsm8k_cot_test.json'

# Only hit the network when at least one of the two files is missing.
if not (raw_train.exists() and raw_test.exists()):
    print('Downloading GSM8K → raw JSON...')
    processor.download_and_prepare_gsm8k()
    print('Done')
else:
    print('Raw files already exist — skipping download.')

# Report presence and byte size; the size is None when a file is absent.
raw_train_size = raw_train.stat().st_size if raw_train.exists() else None
raw_test_size = raw_test.stat().st_size if raw_test.exists() else None
print('Raw train exists:', raw_train.exists(), 'size:', raw_train_size)
print('Raw test  exists:', raw_test.exists(),  'size:', raw_test_size)


In [None]:
# Quick sanity check: load a few raw records
import json

def peek_json(path, n=3):
    """Print the row count and a short preview of the first ``n`` records.

    Args:
        path: Path to a UTF-8 JSON file whose top-level value is a list of
            dict records.
        n: Number of leading records to preview.

    Returns:
        None. Everything is written to stdout.

    For each previewed record the ``question`` and ``reasoning`` fields are
    shown truncated to 120 characters and ``answer`` is shown as-is. A field
    that is missing or is an explicit JSON ``null`` is shown as an empty
    string; non-string values are converted with ``str()`` before truncation.
    """
    def _preview(value, limit=120):
        # dict.get's default only covers a missing key; an explicit null
        # arrives as None and cannot be sliced, so normalise it here.
        if value is None:
            return ''
        if not isinstance(value, str):
            value = str(value)
        return value[:limit]

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print('Rows:', len(data))
    for i, ex in enumerate(data[:n]):
        print('\n--- Example', i, '---')
        print('question:', _preview(ex.get('question')))
        print('reasoning:', _preview(ex.get('reasoning')))
        print('answer:', ex.get('answer'))

# Preview the raw training split only when its file is actually on disk.
if raw_train.exists():
    peek_json(path=raw_train, n=2)


In [None]:
# Process: raw → processed

# One loader per raw split, built from the raw JSON paths defined earlier.
train_loader = ProcessedLoader(raw_train)
test_loader  = ProcessedLoader(raw_test)

train_loader.print_statistics()
test_loader.print_statistics()

# Output locations for the processed splits.
processed_train = PROCESSED_DIR / 'gsm8k_train_processed.json'
processed_test  = PROCESSED_DIR / 'gsm8k_test_processed.json'

train_loader.save_processed_dataset(train_loader.examples, processed_train)
test_loader.save_processed_dataset(test_loader.examples, processed_test)

# Report existence and byte size. The size lookup is guarded, as in the raw
# download cell, so a missing output is reported as `False None` instead of
# raising FileNotFoundError before anything is printed.
print('Processed train:', processed_train, processed_train.exists(), processed_train.stat().st_size if processed_train.exists() else None)
print('Processed test :', processed_test,  processed_test.exists(),  processed_test.stat().st_size  if processed_test.exists()  else None)


In [None]:
# Final validation: print the row count and the first two processed training
# records so their fields can be checked by eye.
peek_json(path=processed_train, n=2)
