End-to-end demo (toy data)

This notebook demonstrates the full pipeline on toy data:

1. Generate a toy raw JSONL dataset.

2. Prepare parquet feature snapshots (incremental feature families).

3. Run a small feature sweep (RF + XGB, very small trials).

4. Run a group-ensemble experiment on the largest feature set.

5. Run the analysis and display the error vs. number-of-features figure.

Note: this notebook is designed to run locally. Before running it, make sure your working directory is the repo root (the directory that contains `src/`) and that the dependencies listed in `requirements.txt` are installed.

In [None]:
# Cell 1: generate toy JSONL using the helper script
from src.data.make_dataset import generate_toy_jsonl

# Demo sizing: a small sample with a fixed seed.
toy_n_samples = 300
toy_seed = 0
out_jsonl = 'data/raw/toy_demo.jsonl'

generate_toy_jsonl(out_jsonl, n_samples=toy_n_samples, seed=toy_seed)
print('Wrote toy dataset to', out_jsonl)

In [None]:
# Cell 2: prepare parquet features for incremental family sets
from src.experiments.prepare_parquet import prepare_parquets

# Input JSONL and the directory that receives the parquet snapshots.
raw_jsonl = 'data/raw/toy_demo.jsonl'
features_dir = 'data/features_demo'

manifest = prepare_parquets(raw_jsonl, out_dir=features_dir, pca_components=2)
print('Manifest:', manifest)

In [None]:
# Cell 3: run a small sweep with RF and XGB (no tuning) -- this calls the trainer
import subprocess
import sys

# Launch the script with the kernel's own interpreter so it runs in the same
# environment (and sees the same installed dependencies) as this notebook.
sweep_cmd = [
    sys.executable, 'src/experiments/run_feature_sweep.py',
    '--out', 'results/demo_sweep',
    '--models', 'rf', 'xgb',
    '--pca', '2',
]
# check=True raises CalledProcessError on a non-zero exit code, so a failed
# sweep stops the notebook here instead of surfacing later as a missing file.
subprocess.run(sweep_cmd, check=True)

# After completion, results are in results/demo_sweep

In [None]:
# Cell 4: pick the largest features parquet produced earlier and run group ensemble
import os
import json
import subprocess  # imported here so this cell does not rely on Cell 3 having run
import sys

with open('data/features_demo/manifest.json', 'r') as f:
    manifest = json.load(f)

# Fail with a clear message instead of a bare IndexError on an empty manifest.
if not manifest:
    raise ValueError(
        'data/features_demo/manifest.json lists no feature files; re-run Cell 2'
    )

# choose the last entry (assumed to be the largest feature set -- the manifest
# ordering is decided by prepare_parquets; TODO confirm)
entry = manifest[-1]
features_file = entry['file']
print('Using features file:', features_file)

# Same conventions as the sweep: kernel interpreter, and check=True so a
# non-zero exit raises CalledProcessError rather than being ignored.
subprocess.run([
    sys.executable, 'src/experiments/run_group_ensemble.py',
    '--in', features_file,
    '--out', 'results/demo_group_ens',
    '--strategy', 'by_family',
    '--aggregation', 'weighted',
], check=True)

In [None]:
# Cell 5: run analysis and display the figure inline
from src.analysis.run_analysis import plot_error_vs_features, best_model_per_bucket
import pandas as pd
import matplotlib.pyplot as plt

# Sweep results CSV (read from the Cell 3 output directory) and the path the
# rendered figure is written to.
sweep_csv = 'results/demo_sweep/sweep_results_final.csv'
fig_png = 'results/demo_fig_error_vs_features.png'

df = pd.read_csv(sweep_csv)
plot_error_vs_features(df, out_path=fig_png, metric_col='test_rmse')

# Read the saved PNG back and show it on an axis-free canvas.
img = plt.imread(fig_png)
fig, ax = plt.subplots(figsize=(8, 5))
ax.imshow(img)
ax.axis('off')
plt.show()

print('Best model per bucket:')
print(best_model_per_bucket(df))