# ASTR-113: Real Data Loading Smoke Test

This notebook validates the real-data collection and loading pipeline:
- Collect validated detections into a `TrainingDataset`
- Persist `TrainingSample` rows
- Load the dataset and create train/val/test splits

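Run this notebook before integrating with `notebooks/training/model_training.ipynb`. It expects the API to be reachable at the URL given by the `ASTRID_API_BASE` environment variable (default `http://localhost:8000`).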


In [1]:
# Setup: imports, and make the project root importable from this notebook.
import os
import sys
from pathlib import Path

# Assumes the kernel's working directory sits two levels below the repository
# root, so the root is the working directory's grandparent.
project_root = Path.cwd().parent.parent

# Re-running this cell must not pile up duplicate sys.path entries: drop any
# existing copies first, then put the root at the front so it is searched first.
root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Project root added: {project_root}")


Project root added: /home/chris/github/AstrID


In [4]:
# Base URL of the API; override it with the ASTRID_API_BASE environment variable.
# Trailing slashes are stripped because URLs are built as f"{API_BASE}/...",
# which would otherwise contain a double slash.
API_BASE = os.environ.get("ASTRID_API_BASE", "http://localhost:8000").rstrip("/")
API_BASE


'http://localhost:8000'

In [5]:
# Collect a small dataset via API
import requests
from datetime import datetime

# JSON body sent to the collect endpoint.
params = {
    "survey_ids": ["hst"],
    "start": "2024-01-01T00:00:00",
    "end": "2024-12-31T23:59:59",
    "confidence_threshold": 0.7,
    "max_samples": 50,
    "name": "smoketest_hst_2024"
}

r = requests.post(f"{API_BASE}/training/datasets/collect", json=params, timeout=60)
r.raise_for_status()
resp = r.json()

# A 2xx response with zero collected samples still passes raise_for_status(),
# but an empty dataset gives the loading steps nothing to check, so make that
# visible without aborting the run.
collected = resp.get("data") if isinstance(resp, dict) else None
if isinstance(collected, dict) and not collected.get("total"):
    print(
        "WARNING: collect request succeeded but the dataset contains no samples "
        f"(total={collected.get('total')!r})."
    )

resp


{'status': 'success',
 'data': {'dataset_id': 'b407e1c8-df17-4d03-9e22-df5d404200c0',
  'name': 'smoketest_hst_2024',
  'total': 0,
  'quality': {'anomaly_ratio': 0.0, 'quality_score': 0.0}},
 'meta': {},
 'error': None}

In [6]:
# Verify dataset is listed.
# The timeout keeps the cell from hanging indefinitely if the API stops
# responding; 60 s matches the budget used for the collect request.
r = requests.get(f"{API_BASE}/training/datasets", timeout=60)
r.raise_for_status()
datasets = r.json()["data"]

print(f"Datasets: {len(datasets)}")
# Show the three most recent. `or ""` keeps the sort working when created_at is
# present but null, which would otherwise raise TypeError on None-vs-str comparison.
sorted(datasets, key=lambda d: d.get("created_at") or "", reverse=True)[:3]


HTTPError: 500 Server Error: Internal Server Error for url: http://localhost:8000/training/datasets

In [None]:
# Inspect dataset via API (no direct DB access needed in notebook).
# Resolve dataset_id from the collect response when one is available; otherwise
# fall back to the most recently created dataset reported by the API.
try:
    collect_data = resp.get("data") if isinstance(resp, dict) else None
except NameError:
    # `resp` was never assigned (the collect cell did not run or failed early),
    # so go straight to the fallback instead of crashing here.
    collect_data = None

# `data` may be null in the response, so only read from it when it is a dict.
dataset_id = collect_data.get("dataset_id") if isinstance(collect_data, dict) else None

if not dataset_id:
    r = requests.get(f"{API_BASE}/training/datasets", timeout=60)
    r.raise_for_status()
    datasets = r.json().get("data") or []
    if not datasets:
        raise RuntimeError("No training datasets available")
    # Pick the most recent; `or ""` tolerates a null created_at when sorting.
    datasets_sorted = sorted(datasets, key=lambda d: d.get("created_at") or "", reverse=True)
    dataset_id = str(datasets_sorted[0]["id"])  # ensure string

print("Dataset ID:", dataset_id)

# Get dataset details (timeout so an unresponsive API cannot hang the cell).
detail = requests.get(f"{API_BASE}/training/datasets/{dataset_id}", timeout=60)
detail.raise_for_status()
detail.json()


In [None]:
# Placeholder for an optional preview of sample image paths.
# No sample listing is fetched here yet (possible future enhancement: have the
# API return sample listings), so this cell only prints a reminder.
print(
    "For now, verify counts above. Integration with training notebook will consume by dataset_id."
)
