## Setup and Configuration


In [1]:
# Setup
import sys
import os
import json
import requests
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print("‚úÖ Path setup complete")


Project root: /home/chris/github/AstrID
‚úÖ Path setup complete


In [None]:
# Configuration - UPDATED FOR REAL DATA
API_BASE = "http://127.0.0.1:8000"
MLFLOW_UI = "http://localhost:9003"
PREFECT_UI = "http://localhost:9004"

# Survey selection - USE DSS2 (has real observations)
TEST_SURVEY_NAME: Optional[str] = os.getenv("ASTRID_SURVEY_NAME", "dss2").lower()
ENV_MAP = {
    "hst": os.getenv("ASTRID_HST_SURVEY_ID", "05e6090c-bac5-4b78-8d7d-ae15a7dde50f"),
    "jwst": os.getenv("ASTRID_JWST_SURVEY_ID", "3ae172d0-c51a-4dad-8033-9813792ce503"),
    "dss2": os.getenv("ASTRID_DSS2_SURVEY_ID", "2127bdee-056c-4266-b1b3-20eb879cd543"),
    "tess": os.getenv("ASTRID_TESS_SURVEY_ID", "49e8d057-184a-4239-9bff-9be72fbcfd02"),
}
TEST_SURVEY_ID: Optional[str] = os.getenv("ASTRID_DSS2_SURVEY_ID", "")

# Training dataset configuration
TEST_COUNT = 10  # Use more observations for training
TEST_DATE_RANGE = {
    "start": "2025-01-01T00:00:00",  # Updated to 2025 (when your real data was created)
    "end": "2025-12-31T23:59:59"
}

# Authentication
AUTH_TOKEN: Optional[str] = None
AUTH_HEADERS: Dict[str, str] = {}

print(f"API Base: {API_BASE}")
print(f"Survey: name={TEST_SURVEY_NAME}, id={TEST_SURVEY_ID}")
print(f"Test Count: {TEST_COUNT}")
print(f"Date Range: {TEST_DATE_RANGE['start']} to {TEST_DATE_RANGE['end']}")
print("‚úÖ Configuration complete - Ready for REAL DATA training pipeline")


API Base: http://127.0.0.1:8000
Survey: name=dss2, id=2127bdee-056c-4266-b1b3-20eb879cd543
Test Count: 10
Date Range: 2025-01-01T00:00:00 to 2025-12-31T23:59:59
‚úÖ Configuration complete - Ready for REAL DATA training pipeline


## üîê Authentication Cell (Copy to Other Notebooks)

**Copy the cell below to any notebook that needs API authentication:**


In [3]:
from src.core.constants import TRAINING_PIPELINE_API_KEY

# Set up authentication for training pipeline
global AUTH_HEADERS
AUTH_HEADERS = {
    "X-API-Key": TRAINING_PIPELINE_API_KEY,
    "Content-Type": "application/json",
}

print("üîê Authentication configured for training pipeline")
print(f"API Key: {TRAINING_PIPELINE_API_KEY[:20]}...")
print("‚úÖ Ready to create training datasets from real observations")

üîê Authentication configured for training pipeline
API Key: astrid__v2B8H-b6qRBf...
‚úÖ Ready to create training datasets from real observations


## Survey Setup


In [4]:
def resolve_test_survey() -> str | None:
    """Resolve existing survey UUID from env; do not fabricate UUIDs."""
    if not AUTH_HEADERS:
        print("‚ùå Not authenticated. Please run the API key cell.")
        return None
    if not TEST_SURVEY_ID:
        print("‚ùå No survey UUID configured. Set ASTRID_SURVEY_ID or ASTRID_<NAME>_SURVEY_ID.")
        return None
    print(f"üéØ Using survey: name={TEST_SURVEY_NAME}, id={TEST_SURVEY_ID}")
    return TEST_SURVEY_ID

# Set the survey ID
TEST_SURVEY_ID = resolve_test_survey()


üéØ Using survey: name=dss2, id=2127bdee-056c-4266-b1b3-20eb879cd543


## Step 1: Health Checks


In [5]:
def check_service_health(url: str, service_name: str) -> bool:
    """Check if a service is healthy."""
    try:
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            print(f"‚úÖ {service_name}: Healthy")
            return True
        else:
            print(f"‚ùå {service_name}: HTTP {response.status_code}")
            return False
    except Exception as e:
        print(f"‚ùå {service_name}: {str(e)}")
        return False

# Check all services
services = {
    "API": f"{API_BASE}/health",
    "MLflow": MLFLOW_UI,
    "Prefect": PREFECT_UI
}

all_healthy = True
for name, url in services.items():
    if not check_service_health(url, name):
        all_healthy = False

if all_healthy:
    print("\nüéâ All services are healthy!")
else:
    print("\n‚ö†Ô∏è  Some services are not healthy. Please check docker-compose.")
    print("Run: docker-compose up -d api worker prefect mlflow redis")


‚úÖ API: Healthy
‚úÖ MLflow: Healthy
‚úÖ Prefect: Healthy

üéâ All services are healthy!


In [6]:
# CREATE TRAINING DATASET FROM REAL OBSERVATIONS
def create_training_dataset_from_real_observations() -> Dict[str, Any]:
    """Create training dataset from our real observations (not mock data)."""
    url = f"{API_BASE}/training/datasets/collect"
    payload = {
        "survey_ids": [TEST_SURVEY_NAME],  # Use DSS2 survey with real data
        "start": TEST_DATE_RANGE["start"],
        "end": TEST_DATE_RANGE["end"],
        "confidence_threshold": 0.1,  # Very low threshold to capture all real observations
        "max_samples": 50,  # Start with reasonable number
        "name": f"real_data_{TEST_SURVEY_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    }
    
    print(f"üéØ Creating training dataset from REAL observations...")
    print(f"Survey: {TEST_SURVEY_NAME} ({TEST_SURVEY_ID})")
    print(f"Date range: {payload['start']} to {payload['end']}")
    print(f"Confidence threshold: {payload['confidence_threshold']}")
    print(f"Max samples: {payload['max_samples']}")
    
    try:
        response = requests.post(url, json=payload, headers=AUTH_HEADERS, timeout=60)
        response.raise_for_status()
        
        result = response.json()
        dataset_info = result.get("data", {})
        
        print(f"‚úÖ Successfully created training dataset from real observations!")
        print(f"   - Dataset ID: {dataset_info.get('dataset_id')}")
        print(f"   - Name: {dataset_info.get('name')}")
        print(f"   - Total samples: {dataset_info.get('total')}")
        print(f"   - Quality score: {dataset_info.get('quality', {}).get('quality_score', 0):.3f}")
        
        return {
            "success": True,
            "dataset_id": dataset_info.get("dataset_id"),
            "dataset_info": dataset_info
        }
        
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Failed to create training dataset: {str(e)}")
        if hasattr(e, 'response') and e.response is not None:
            try:
                error_detail = e.response.json()
                print(f"   Error details: {error_detail}")
            except:
                print(f"   Response: {e.response.text}")
        return {"success": False, "error": str(e)}

# Create training dataset from real observations
print("üöÄ Creating training dataset from real observations...")
dataset_result = create_training_dataset_from_real_observations()

if dataset_result["success"]:
    dataset_id = dataset_result["dataset_id"]
    print(f"\nüéâ Training dataset created successfully!")
    print(f"Dataset ID: {dataset_id}")
    print(f"\nNext steps:")
    print(f"1. ‚úÖ Real observations collected into training dataset")
    print(f"2. üîÑ Run preprocessing pipeline on observations")
    print(f"3. üîÑ Run differencing pipeline to create difference images")
    print(f"4. üîÑ Run detection pipeline with U-Net model")
    print(f"5. üéØ Train ML models on processed data")
else:
    print(f"\n‚ùå Training dataset creation failed: {dataset_result['error']}")
    print(f"This might mean:")
    print(f"- No observations found in the specified date range")
    print(f"- API endpoint not available")
    print(f"- Authentication issues")
    dataset_id = None


üöÄ Creating training dataset from real observations...
üéØ Creating training dataset from REAL observations...
Survey: dss2 (2127bdee-056c-4266-b1b3-20eb879cd543)
Date range: 2025-01-01T00:00:00 to 2025-12-31T23:59:59
Confidence threshold: 0.1
Max samples: 50
‚ùå Failed to create training dataset: 404 Client Error: Not Found for url: http://127.0.0.1:8000/training/datasets/collect
   Error details: {'detail': 'No training samples found with specified criteria'}

‚ùå Training dataset creation failed: 404 Client Error: Not Found for url: http://127.0.0.1:8000/training/datasets/collect
This might mean:
- No observations found in the specified date range
- API endpoint not available
- Authentication issues


In [7]:
# Consume real_data_manifest.json and print available samples\nimport json\nfrom pathlib import Path\n\nmanifest_path = Path.cwd().parent / "ml_training_data" / "real_data_loading.ipynb"  # notebook path\n# Manifest sits next to the real_data_loading notebook\nman_path = (Path.cwd().parent / "ml_training_data" / "real_data_manifest.json")\nif not man_path.exists():\n    raise FileNotFoundError(f"Manifest not found: {man_path}")\nmanifest = json.loads(man_path.read_text())\nprint(f"Found {len(manifest)} entries in manifest")\nprint(manifest[:3])\n\n# TODO: Hand off to training service or create TrainingDataset via API from manifest entries


In [8]:
# Build simple training list from manifest entries
from typing import List, Dict
from pathlib import Path

train_items: List[Dict] = []
val_items: List[Dict] = []

manifest_path = Path.cwd().parent / "ml_training_data" / "real_data_loading.ipynb"  # notebook path
man_path = (Path.cwd().parent / "ml_training_data" / "real_data_manifest.json")
if not man_path.exists():
    raise FileNotFoundError(f"Manifest not found: {man_path}")
manifest = json.loads(man_path.read_text())

# naive split 80/20
cut = max(1, int(0.8 * len(manifest)))
train_items = manifest[:cut]
val_items = manifest[cut:]

print(f"Train: {len(train_items)} | Val: {len(val_items)}")

# Preview a few items with full R2 URL for convenience
endpoint = os.getenv("MLFLOW_S3_ENDPOINT_URL", "")
def to_url(item: Dict) -> str:
    if endpoint:
        return f"{endpoint}/{item['bucket']}/{item['key']}"
    return f"s3://{item['bucket']}/{item['key']}"

# Preview a few training samples with their R2 URLs
for sample in train_items[:3]:
    print(to_url(sample))

# Prepare minimal dataset payload from manifest entries
from datetime import datetime

items = manifest  # manifest is already loaded above

dataset_payload = {
    "name": f"skyview_manifest_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    "description": "SkyView cutouts uploaded to R2 for training",
    "total": len(items),
    "items": items[:10],  # keep it small for a dry run
}

print("Dataset payload preview (first 1 item):")
print(json.dumps({**dataset_payload, "items": dataset_payload["items"][:1]}, indent=2)[:800])


FileNotFoundError: Manifest not found: /home/chris/github/AstrID/ml_training_data/real_data_manifest.json

## Step 2: Ingest Test Observations


In [None]:
def ingest_test_observations() -> Dict[str, Any]:
    """Ingest test observations using the API."""
    # AUTH_HEADERS should already be set globally in cell 6
    if not AUTH_HEADERS:
        print("‚ùå AUTH_HEADERS not set. Please run the authentication cell first.")
        return {"success": False, "error": "AUTH_HEADERS not set. Please run the authentication cell first."}
    
    url = f"{API_BASE}/observations/ingest/batch-random"
    payload = {
        "count": TEST_COUNT,
        "survey_id": TEST_SURVEY_ID
    }
    
    print(f"Ingesting {TEST_COUNT} test observations for survey {TEST_SURVEY_ID}...")
    
    try:
        response = requests.post(url, json=payload, headers=AUTH_HEADERS, timeout=30)
        response.raise_for_status()
        
        result = response.json()
        observations = result.get("data", [])
        
        print(f"‚úÖ Successfully ingested {len(observations)} observations")
        
        # Store observation IDs for later use
        observation_ids = [obs["id"] for obs in observations]
        
        return {
            "success": True,
            "count": len(observations),
            "observation_ids": observation_ids,
            "observations": observations
        }
        
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Failed to ingest observations: {str(e)}")
        if hasattr(e, 'response') and e.response is not None:
            try:
                error_detail = e.response.json()
                print(f"   Error details: {error_detail}")
            except:
                print(f"   Response: {e.response.text}")
        return {"success": False, "error": str(e)}

# Run ingestion
ingestion_result = ingest_test_observations()

if ingestion_result["success"]:
    print(f"\nObservation IDs: {ingestion_result['observation_ids']}")
    # Store for later steps
    observation_ids = ingestion_result["observation_ids"]
else:
    print("\n‚ùå Cannot proceed without observations")
    observation_ids = []


In [None]:
def create_training_dataset() -> Dict[str, Any]:
    """Create training dataset from the processed detections."""
    url = f"{API_BASE}/training/datasets/collect"
    payload = {
        "survey_ids": [TEST_SURVEY_ID],
        "start": TEST_DATE_RANGE["start"],
        "end": TEST_DATE_RANGE["end"],
        "confidence_threshold": 0.3,  # Lower threshold for testing
        "max_samples": 100,
        "name": f"test_{TEST_SURVEY_ID}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    }
    
    print(f"Creating training dataset for survey {TEST_SURVEY_ID}...")
    print(f"Date range: {payload['start']} to {payload['end']}")
    print(f"Confidence threshold: {payload['confidence_threshold']}")
    
    try:
        response = requests.post(url, json=payload, headers=AUTH_HEADERS, timeout=60)
        response.raise_for_status()
        
        result = response.json()
        dataset_info = result.get("data", {})
        
        print(f"‚úÖ Successfully created training dataset")
        print(f"   - Dataset ID: {dataset_info.get('dataset_id')}")
        print(f"   - Name: {dataset_info.get('name')}")
        print(f"   - Total samples: {dataset_info.get('total')}")
        print(f"   - Quality score: {dataset_info.get('quality', {}).get('quality_score', 0):.3f}")
        
        return {
            "success": True,
            "dataset_id": dataset_info.get("dataset_id"),
            "dataset_info": dataset_info
        }
        
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Failed to create training dataset: {str(e)}")
        return {"success": False, "error": str(e)}

# Create training dataset
dataset_result = create_training_dataset()

if dataset_result["success"]:
    dataset_id = dataset_result["dataset_id"]
    print(f"\nüéâ Training dataset created successfully!")
    print(f"Dataset ID: {dataset_id}")
else:
    print(f"\n‚ùå Training dataset creation failed: {dataset_result['error']}")
    dataset_id = None


In [None]:
def list_training_datasets() -> Dict[str, Any]:
    """List all available training datasets."""
    url = f"{API_BASE}/training/datasets"
    
    try:
        response = requests.get(url, headers=AUTH_HEADERS, timeout=30)
        response.raise_for_status()
        
        result = response.json()
        datasets = result.get("data", [])
        
        print(f"‚úÖ Found {len(datasets)} training datasets")
        
        for i, dataset in enumerate(datasets, 1):
            print(f"\n{i}. {dataset.get('name')}")
            print(f"   - ID: {dataset.get('id')}")
            print(f"   - Samples: {dataset.get('total_samples')}")
            print(f"   - Quality: {dataset.get('quality_score', 0):.3f}")
            print(f"   - Status: {dataset.get('status')}")
            print(f"   - Created: {dataset.get('created_at')}")
        
        return {
            "success": True,
            "datasets": datasets
        }
        
    except requests.exceptions.RequestException as e:
        print(f"‚ùå Failed to list datasets: {str(e)}")
        return {"success": False, "error": str(e)}

# List all datasets
list_result = list_training_datasets()

if list_result["success"]:
    datasets = list_result["datasets"]
    
    # Find datasets with samples
    datasets_with_samples = [d for d in datasets if d.get("total_samples", 0) > 0]
    
    if datasets_with_samples:
        print(f"\nüéâ Found {len(datasets_with_samples)} datasets with samples!")
        print(f"\nReady for training:")
        for dataset in datasets_with_samples:
            print(f"- {dataset['name']} (ID: {dataset['id']}) - {dataset['total_samples']} samples")
    else:
        print(f"\n‚ö†Ô∏è  No datasets have samples yet.")
        print(f"This suggests the data pipeline needs to be run first.")
else:
    print(f"\n‚ùå Failed to list datasets: {list_result['error']}")
