## Setup and Configuration


In [None]:
# Setup
import sys
import os
import json
import requests
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional

# Add project root to Python path so project-local packages (e.g. `src`) import.
# The notebook is expected to be started from a direct subdirectory of the root.
project_root = Path.cwd().parent

# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path (cells are routinely executed more than once per kernel).
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print("✅ Path setup complete")


In [None]:
# Configuration
# --- Service endpoints ---
API_BASE = "http://127.0.0.1:8000"
MLFLOW_UI = "http://localhost:5000"
PREFECT_UI = "http://localhost:4200"

# --- Authentication state (filled in by the authentication cell) ---
AUTH_TOKEN: Optional[str] = None
AUTH_HEADERS: Dict[str, str] = dict()

# --- Test parameters ---
# The survey ID stays None until the survey setup cell assigns it.
TEST_SURVEY_ID: Optional[str] = None
TEST_COUNT = 5  # Start small for testing
TEST_DATE_RANGE = dict(
    start="2024-01-01T00:00:00",
    end="2024-01-02T23:59:59",
)

# Echo the effective settings, one per line.
print(
    "\n".join(
        [
            f"API Base: {API_BASE}",
            f"Test Survey: {TEST_SURVEY_ID} (will be set after survey creation)",
            f"Test Count: {TEST_COUNT}",
            "✅ Configuration complete",
        ]
    )
)


## 🔐 Authentication Cell (Copy to Other Notebooks)

**Copy the cell below to any notebook that needs API authentication:**


In [None]:
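# NOTE: Bearer-token authentication is currently disabled in this notebook.
# The next cell authenticates with an API key (X-API-Key header) instead.
# Uncomment the lines below to use token-based authentication.
# from auth_helper import authenticate_user
# from src.core.constants import ADMIN_EMAIL, ADMIN_PASSWORD

# # Authenticate and build auth headers
# AUTH_TOKEN = authenticate_user(ADMIN_EMAIL, ADMIN_PASSWORD)
# AUTH_HEADERS = {
#     "Authorization": f"Bearer {AUTH_TOKEN}" if AUTH_TOKEN else "",
#     "Content-Type": "application/json",
# }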


In [None]:
from src.core.constants import TRAINING_PIPELINE_API_KEY

# API-key authentication: the request helpers below send these headers.
# No `global` statement is needed here: cell code already runs at module
# scope, where plain assignment rebinds the module-level name.
if not TRAINING_PIPELINE_API_KEY:
    # Surface a missing key now instead of as an opaque HTTP error later.
    print("⚠️  TRAINING_PIPELINE_API_KEY is empty or unset; check your configuration.")

AUTH_HEADERS = {
    "X-API-Key": TRAINING_PIPELINE_API_KEY,
    "Content-Type": "application/json",
}

In [None]:
# Confirm the credentials are loaded WITHOUT echoing the secret: cell outputs
# are saved inside the notebook file and would leak the key when the notebook
# is shared or committed.
_SENSITIVE_HEADERS = {"x-api-key", "authorization"}

display("TRAINING_PIPELINE_API_KEY: " + ("set" if TRAINING_PIPELINE_API_KEY else "NOT SET"))
display(
    {
        header: "***redacted***" if header.lower() in _SENSITIVE_HEADERS else value
        for header, value in AUTH_HEADERS.items()
    }
)

## Survey Setup


In [None]:
import uuid

def create_test_survey() -> Optional[str]:
    """Generate a survey ID to use for the pipeline test run.

    No API or database call is made here: the function only generates a
    random UUID4 string and prints it.

    Returns:
        The generated UUID as a string, or ``None`` when ``AUTH_HEADERS`` is
        empty (i.e. the authentication cell has not been run).
    """
    # `Optional[str]` (instead of `str | None`) matches the annotations used
    # elsewhere in this notebook and also works on Python versions before 3.10,
    # where `str | None` fails when the function is defined.
    if not AUTH_HEADERS:
        print("❌ Not authenticated. Please run the authentication cell first.")
        return None

    # Create a test survey using a direct database approach
    # Since there's no direct API endpoint, we'll use a known UUID
    # that should work with the system

    # Generate a proper UUID for the survey
    survey_id = str(uuid.uuid4())

    print(f"🔧 Creating test survey with ID: {survey_id}")
    print("   Note: This survey will be used for testing the pipeline")
    print("   In production, surveys should be created through the admin interface")

    return survey_id

# Generate the survey ID and publish it for the cells that follow
TEST_SURVEY_ID = create_test_survey()

# Handle the failure case first; a falsy ID means no survey ID was produced.
if not TEST_SURVEY_ID:
    print("❌ Could not set up survey ID")
else:
    print(f"🎯 Ready to use survey ID: {TEST_SURVEY_ID}")
    print("   This is a valid UUID format that the API will accept")


## Step 1: Health Checks


In [None]:
def check_service_health(url: str, service_name: str) -> bool:
    """Probe a service with a GET request and report whether it answered 200.

    Args:
        url: Address to request (5 second timeout).
        service_name: Label used in the printed status line.

    Returns:
        True only for an HTTP 200 response; False for any other status code
        or for any exception raised while probing.
    """
    try:
        status = requests.get(url, timeout=5).status_code
        healthy = status == 200
        print(
            f"✅ {service_name}: Healthy"
            if healthy
            else f"❌ {service_name}: HTTP {status}"
        )
        return healthy
    except Exception as exc:
        # Best-effort probe: any failure is reported as "not healthy".
        print(f"❌ {service_name}: {str(exc)}")
        return False

# Map each service label to the URL that is probed for it
services = dict(
    [
        ("API", f"{API_BASE}/health"),
        ("MLflow", MLFLOW_UI),
        ("Prefect", PREFECT_UI),
    ]
)

# Probe every service (no short-circuit, so each one prints its own status)
all_healthy = True
for name, url in services.items():
    # The check is the left operand, so it always runs even after a failure.
    all_healthy = check_service_health(url, name) and all_healthy

if not all_healthy:
    print("\n⚠️  Some services are not healthy. Please check docker-compose.")
    print("Run: docker-compose up -d api worker prefect mlflow redis")
else:
    print("\n🎉 All services are healthy!")


In [None]:
# Consume real_data_manifest.json and print available samples.
# (Restored from a single-line cell whose literal "\n" sequences had turned
# every statement into part of one comment, so nothing executed.)
# `json` and `Path` come from the setup cell at the top of the notebook.

# Manifest sits next to the real_data_loading notebook
man_path = Path.cwd().parent / "ml_training_data" / "real_data_manifest.json"
if not man_path.exists():
    raise FileNotFoundError(f"Manifest not found: {man_path}")
manifest = json.loads(man_path.read_text())
print(f"Found {len(manifest)} entries in manifest")
print(manifest[:3])

# TODO: Hand off to training service or create TrainingDataset via API from manifest entries


In [3]:
# Build simple training list from manifest entries.
# `json`, `Path`, `List` and `Dict` come from the setup cell at the top of the
# notebook, so they are not re-imported here.

# Path of the notebook that sits beside the manifest.
# NOTE(review): not used by any cell visible here — confirm and remove.
manifest_path = Path.cwd().parent / "ml_training_data" / "real_data_loading.ipynb"  # notebook path
man_path = Path.cwd().parent / "ml_training_data" / "real_data_manifest.json"
if not man_path.exists():
    raise FileNotFoundError(f"Manifest not found: {man_path}")
# Read as UTF-8 explicitly: read_text() otherwise falls back to the locale's
# encoding, which can mis-decode the file on non-UTF-8 systems.
manifest = json.loads(man_path.read_text(encoding="utf-8"))

# naive split 80/20 (positional, no shuffling); max(1, ...) keeps at least one
# entry in the training slice whenever the manifest is non-empty
cut = max(1, int(0.8 * len(manifest)))
train_items: List[Dict] = manifest[:cut]
val_items: List[Dict] = manifest[cut:]

print(f"Train: {len(train_items)} | Val: {len(val_items)}")

# Preview a few items with full R2 URL for convenience
# Trailing slashes are stripped so an endpoint such as "https://host/" does not
# produce "https://host//bucket/key".
endpoint = os.getenv("MLFLOW_S3_ENDPOINT_URL", "").rstrip("/")


def to_url(item: Dict) -> str:
    """Build a displayable URL for one manifest entry.

    Args:
        item: Mapping that provides the ``"bucket"`` and ``"key"`` entries.

    Returns:
        ``"<endpoint>/<bucket>/<key>"`` when ``MLFLOW_S3_ENDPOINT_URL`` is set,
        otherwise ``"s3://<bucket>/<key>"``.

    Raises:
        KeyError: If ``item`` lacks ``"bucket"`` or ``"key"``.
    """
    if endpoint:
        return f"{endpoint}/{item['bucket']}/{item['key']}"
    return f"s3://{item['bucket']}/{item['key']}"

# Show the URLs of up to three training samples, one per line
for sample in map(to_url, train_items[:3]):
    print(sample)

# Assemble a small dataset payload from the manifest for a dry run
from datetime import datetime

items = manifest  # manifest is already loaded above

dataset_payload = dict(
    name="skyview_manifest_" + datetime.now().strftime("%Y%m%d_%H%M%S"),
    description="SkyView cutouts uploaded to R2 for training",
    total=len(items),   # size of the full manifest, not of the truncated list
    items=items[:10],   # keep it small for a dry run
)

# Print the payload with a single item, capped at 800 characters of JSON text
print("Dataset payload preview (first 1 item):")
print(json.dumps(dict(dataset_payload, items=dataset_payload["items"][:1]), indent=2)[:800])


FileNotFoundError: Manifest not found: /home/chris/github/AstrID/ml_training_data/real_data_manifest.json

## Step 2: Ingest Test Observations


In [None]:
def ingest_test_observations() -> Dict[str, Any]:
    """Ingest a batch of random test observations through the API.

    POSTs ``TEST_COUNT`` and ``TEST_SURVEY_ID`` to
    ``{API_BASE}/observations/ingest/batch-random`` using ``AUTH_HEADERS``.

    Returns:
        On success: ``{"success": True, "count": int, "observation_ids": list,
        "observations": list}``.
        On failure (headers missing or request error):
        ``{"success": False, "error": str}``.
    """
    # AUTH_HEADERS is populated by the authentication cell earlier in the notebook
    if not AUTH_HEADERS:
        print("❌ AUTH_HEADERS not set. Please run the authentication cell first.")
        return {"success": False, "error": "AUTH_HEADERS not set. Please run the authentication cell first."}

    url = f"{API_BASE}/observations/ingest/batch-random"
    payload = {
        "count": TEST_COUNT,
        "survey_id": TEST_SURVEY_ID
    }

    print(f"Ingesting {TEST_COUNT} test observations for survey {TEST_SURVEY_ID}...")

    try:
        response = requests.post(url, json=payload, headers=AUTH_HEADERS, timeout=30)
        response.raise_for_status()

        result = response.json()
        # `or []` also covers a body where "data" is present but null
        observations = result.get("data") or []

        print(f"✅ Successfully ingested {len(observations)} observations")

        # Store observation IDs for later use
        observation_ids = [obs["id"] for obs in observations]

        return {
            "success": True,
            "count": len(observations),
            "observation_ids": observation_ids,
            "observations": observations
        }

    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to ingest observations: {str(e)}")
        error_response = getattr(e, "response", None)
        if error_response is not None:
            try:
                error_detail = error_response.json()
                print(f"   Error details: {error_detail}")
            except ValueError:
                # Body is not JSON: show it raw. Catching ValueError rather than
                # using a bare `except:` keeps KeyboardInterrupt/SystemExit from
                # being swallowed.
                print(f"   Response: {error_response.text}")
        return {"success": False, "error": str(e)}

# Execute the ingestion step and keep its outcome for the following cells
ingestion_result = ingest_test_observations()

# `observation_ids` is always defined afterwards: empty when ingestion failed.
if not ingestion_result["success"]:
    observation_ids = []
    print("\n❌ Cannot proceed without observations")
else:
    observation_ids = ingestion_result["observation_ids"]
    print(f"\nObservation IDs: {observation_ids}")


In [None]:
def create_training_dataset() -> Dict[str, Any]:
    """Create a training dataset from the processed detections.

    POSTs the survey ID, ``TEST_DATE_RANGE`` and collection settings to
    ``{API_BASE}/training/datasets/collect`` using ``AUTH_HEADERS``.

    Returns:
        On success: ``{"success": True, "dataset_id": ..., "dataset_info": dict}``.
        On failure (headers missing or request error):
        ``{"success": False, "error": str}``.
    """
    # Same precondition as ingest_test_observations: fail with a clear message
    # instead of sending an unauthenticated request.
    if not AUTH_HEADERS:
        print("❌ AUTH_HEADERS not set. Please run the authentication cell first.")
        return {"success": False, "error": "AUTH_HEADERS not set. Please run the authentication cell first."}

    url = f"{API_BASE}/training/datasets/collect"
    payload = {
        "survey_ids": [TEST_SURVEY_ID],
        "start": TEST_DATE_RANGE["start"],
        "end": TEST_DATE_RANGE["end"],
        "confidence_threshold": 0.3,  # Lower threshold for testing
        "max_samples": 100,
        "name": f"test_{TEST_SURVEY_ID}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    }

    print(f"Creating training dataset for survey {TEST_SURVEY_ID}...")
    print(f"Date range: {payload['start']} to {payload['end']}")
    print(f"Confidence threshold: {payload['confidence_threshold']}")

    try:
        response = requests.post(url, json=payload, headers=AUTH_HEADERS, timeout=60)
        response.raise_for_status()

        result = response.json()
        # `or` fallbacks cover keys that are present but null, which would
        # otherwise break the chained .get() and the ":.3f" format below.
        dataset_info = result.get("data") or {}
        quality = dataset_info.get("quality") or {}
        quality_score = quality.get("quality_score") or 0

        print("✅ Successfully created training dataset")
        print(f"   - Dataset ID: {dataset_info.get('dataset_id')}")
        print(f"   - Name: {dataset_info.get('name')}")
        print(f"   - Total samples: {dataset_info.get('total')}")
        print(f"   - Quality score: {quality_score:.3f}")

        return {
            "success": True,
            "dataset_id": dataset_info.get("dataset_id"),
            "dataset_info": dataset_info
        }

    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to create training dataset: {str(e)}")
        return {"success": False, "error": str(e)}

# Run dataset creation and expose the resulting ID to later cells
dataset_result = create_training_dataset()

# `dataset_id` is always defined afterwards: None when creation failed.
if not dataset_result["success"]:
    dataset_id = None
    print(f"\n❌ Training dataset creation failed: {dataset_result['error']}")
else:
    dataset_id = dataset_result["dataset_id"]
    print("\n🎉 Training dataset created successfully!")
    print(f"Dataset ID: {dataset_id}")


In [None]:
def list_training_datasets() -> Dict[str, Any]:
    """List all available training datasets and print a summary of each.

    GETs ``{API_BASE}/training/datasets`` using ``AUTH_HEADERS``.

    Returns:
        On success: ``{"success": True, "datasets": list}``.
        On request failure: ``{"success": False, "error": str}``.
    """
    url = f"{API_BASE}/training/datasets"

    try:
        response = requests.get(url, headers=AUTH_HEADERS, timeout=30)
        response.raise_for_status()

        result = response.json()
        # `or []` also covers a body where "data" is present but null
        datasets = result.get("data") or []

        print(f"✅ Found {len(datasets)} training datasets")

        for i, dataset in enumerate(datasets, 1):
            # A null quality_score would make the ":.3f" format raise
            # TypeError, so fall back to 0 for display.
            quality_score = dataset.get("quality_score") or 0
            print(f"\n{i}. {dataset.get('name')}")
            print(f"   - ID: {dataset.get('id')}")
            print(f"   - Samples: {dataset.get('total_samples')}")
            print(f"   - Quality: {quality_score:.3f}")
            print(f"   - Status: {dataset.get('status')}")
            print(f"   - Created: {dataset.get('created_at')}")

        return {
            "success": True,
            "datasets": datasets
        }

    except requests.exceptions.RequestException as e:
        print(f"❌ Failed to list datasets: {str(e)}")
        return {"success": False, "error": str(e)}

# List all datasets
list_result = list_training_datasets()

if list_result["success"]:
    datasets = list_result["datasets"]

    # Find datasets with samples. `or 0` treats a null total_samples like a
    # missing one; comparing None > 0 would raise TypeError.
    datasets_with_samples = [d for d in datasets if (d.get("total_samples") or 0) > 0]

    if datasets_with_samples:
        print(f"\n🎉 Found {len(datasets_with_samples)} datasets with samples!")
        print("\nReady for training:")
        for dataset in datasets_with_samples:
            # .get() for name/id so a missing field does not abort the summary
            print(f"- {dataset.get('name')} (ID: {dataset.get('id')}) - {dataset['total_samples']} samples")
    else:
        print("\n⚠️  No datasets have samples yet.")
        print("This suggests the data pipeline needs to be run first.")
else:
    print(f"\n❌ Failed to list datasets: {list_result['error']}")
