# ASTR-113: Real Data Loading Smoke Test

This notebook validates the real-data collection and loading pipeline:
- Collect validated detections into a `TrainingDataset`
- Persist `TrainingSample` rows
- Load the dataset and create train/val/test splits

Run this before integrating with `notebooks/training/model_training.ipynb`. 


In [1]:
# Setup imports and environment
import os, sys
from pathlib import Path

project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

print(f"Project root added: {project_root}")


Project root added: /home/chris/github/AstrID


In [2]:
API_BASE = os.environ.get("ASTRID_API_BASE", "http://127.0.0.1:8000")
API_BASE


'http://127.0.0.1:8000'

In [4]:
from src.core.constants import TRAINING_PIPELINE_API_KEY

global AUTH_HEADERS
AUTH_HEADERS = {
    "X-API-Key": TRAINING_PIPELINE_API_KEY,
    "Content-Type": "application/json",
}

In [5]:
# Collect a small dataset via API
import requests
from datetime import datetime

params = {
    "survey_ids": ["hst"],
    "start": "2024-01-01T00:00:00",
    "end": "2024-12-31T23:59:59",
    "confidence_threshold": 0.7,
    "max_samples": 50,
    "name": "smoketest_hst_2024"
}

r = requests.post(f"{API_BASE}/training/datasets/collect", json=params, headers=AUTH_HEADERS, timeout=60)
r.raise_for_status()
resp = r.json()
resp


{'status': 'success',
 'data': {'dataset_id': '732d7e5d-f75d-42d1-8e30-0f1b4c5fd46a',
  'name': 'smoketest_hst_2024',
  'total': 0,
  'quality': {'anomaly_ratio': 0.0, 'quality_score': 0.0}},
 'meta': {},
 'error': None}

In [7]:
# Verify dataset is listed
r = requests.get(f"{API_BASE}/training/datasets", headers=AUTH_HEADERS)
r.raise_for_status()
datasets = r.json()["data"]

print(f"Datasets: {len(datasets)}")
# Show the most recent one
sorted(datasets, key=lambda d: d.get("created_at", ""), reverse=True)[:3]


Datasets: 9


[{'id': '732d7e5d-f75d-42d1-8e30-0f1b4c5fd46a',
  'name': 'smoketest_hst_2024',
  'total_samples': 0,
  'quality_score': 0.0,
  'status': 'active',
  'created_at': '2025-09-23T04:52:04.801897+00:00'},
 {'id': 'e7b7abb1-febe-4f29-83a9-a887f4c4aec1',
  'name': 'test_f1eaed84-6366-4e22-8a0e-4cda1b699aa8_20250922_215005',
  'total_samples': 0,
  'quality_score': 0.0,
  'status': 'active',
  'created_at': '2025-09-23T04:50:06.751772+00:00'},
 {'id': '0a0dc4f1-f318-4287-a9a7-6fa1537d425b',
  'name': 'test_hst_20250922_172345',
  'total_samples': 0,
  'quality_score': 0.0,
  'status': 'active',
  'created_at': '2025-09-23T00:23:45.452931+00:00'}]

In [9]:
# Inspect dataset via API (no direct DB access needed in notebook)
# Resolve dataset_id from prior response or fallback to latest dataset via API
if isinstance(resp, dict) and "data" in resp and resp["data"].get("dataset_id"):
    dataset_id = resp["data"]["dataset_id"]
else:
    r = requests.get(f"{API_BASE}/training/datasets", headers=AUTH_HEADERS)
    r.raise_for_status()
    datasets = r.json().get("data", [])
    if not datasets:
        raise RuntimeError("No training datasets available")
    # pick the most recent
    datasets_sorted = sorted(datasets, key=lambda d: d.get("created_at", ""), reverse=True)
    dataset_id = str(datasets_sorted[0]["id"])  # ensure string

print("Dataset ID:", dataset_id)

# Get dataset details
detail = requests.get(f"{API_BASE}/training/datasets/{dataset_id}", headers=AUTH_HEADERS)
detail.raise_for_status()
detail.json()


Dataset ID: 732d7e5d-f75d-42d1-8e30-0f1b4c5fd46a


{'status': 'success',
 'data': {'id': '732d7e5d-f75d-42d1-8e30-0f1b4c5fd46a',
  'name': 'smoketest_hst_2024',
  'total_samples': 0,
  'quality_score': 0.0,
  'status': 'active',
  'samples': 0},
 'meta': {},
 'error': None}

In [10]:
# Optional: preview a few sample image paths from DB via API
# (Future enhancement: API could return sample listings)
print("For now, verify counts above. Integration with training notebook will consume by dataset_id.")


For now, verify counts above. Integration with training notebook will consume by dataset_id.
