# ✈️ Complete Cascade Prediction: Train + Deploy to SageMaker
## End-to-End Pipeline in One Notebook

**What this notebook does**:
1. ✅ Loads 10M flight records from Kaggle
2. ✅ Engineers 28 features (zero data leakage)
3. ✅ Trains XGBoost with temporal validation
4. ✅ Saves model + artifacts as tar.gz
5. ✅ Deploys to SageMaker endpoint (SKLearn framework)
6. ✅ Tests with CSV and raw JSON inputs

**Requirements**:
- SageMaker Notebook Instance (ml.m5.xlarge or larger)
- 16GB+ RAM
- IAM role with SageMaker + S3 permissions

**Time**: ~25 minutes total (15min training + 10min deployment)

**Cost**: $0.115/hour (endpoint) = ~$84/month

---

In [None]:
# ============================================================================
# PART 1: IMPORTS & CONFIGURATION
# ============================================================================

import sys
import os
import warnings
import gc
warnings.filterwarnings('ignore')

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# ML libraries
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    f1_score, recall_score, precision_score
)
import xgboost as xgb
import joblib
import tarfile
import json
import time

# AWS libraries
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn import SKLearnModel

# Memory tracking
import psutil

def print_memory_usage(label=""):
    """Print the current process's resident memory in GB and return it.

    Args:
        label: Optional tag shown in square brackets before the reading.

    Returns:
        Resident set size of this process, in gigabytes.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    mem_gb = rss_bytes / (1024 ** 3)
    # An empty label leaves the prefix blank (the line then starts with a space)
    prefix = "[" + label + "]" if label else ""
    print(f"{prefix} Memory: {mem_gb:.2f} GB")
    return mem_gb

# Display settings: wide frames, 4-decimal floats, seaborn grid, large figures
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(option_name, option_value)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (16, 6)

# Report library versions and the baseline memory reading
banner = "=" * 80
print(banner)
print("‚úÖ IMPORTS COMPLETE")
print(banner)
for lib_label, lib_version in (
    ("XGBoost version", xgb.__version__),
    ("Pandas version", pd.__version__),
    ("SageMaker SDK", sagemaker.__version__),
):
    print(f"{lib_label}: {lib_version}")
print_memory_usage("Initial")

In [None]:
# ============================================================================
# PART 2: LOAD DATA FROM KAGGLE
# ============================================================================
# Downloads the airline dataset via kagglehub and reads the first 10M rows
# into `df_raw`. Any failure is reported and re-raised.

import kagglehub

rule = "=" * 80
print(rule)
print("üì• LOADING 10M FLIGHT RECORDS")
print(rule)

try:
    # Download dataset
    data_path = kagglehub.dataset_download("bulter22/airline-data")
    print(f"\n‚úì Data path: {data_path}")

    # Prefer the shuffled file; otherwise fall back to the plain CSV
    shuffled_csv = os.path.join(data_path, "airline.csv.shuffle")
    if os.path.exists(shuffled_csv):
        airline_path = shuffled_csv
    else:
        airline_path = os.path.join(data_path, "airline.csv")

    print(f"\nüìÅ Loading from: {airline_path}")
    print("   Loading 10,000,000 rows...")

    # latin-1 decoding with undecodable bytes ignored
    read_options = dict(
        nrows=10_000_000,
        low_memory=False,
        encoding='latin-1',
        encoding_errors='ignore',
    )
    df_raw = pd.read_csv(airline_path, **read_options)

    n_rows, n_cols = df_raw.shape
    print(f"\n‚úÖ Loaded {n_rows:,} records")
    print(f"‚úì Columns: {n_cols}")
    print(f"\nColumns: {list(df_raw.columns[:10])}...")
    print_memory_usage("After loading")

except Exception as e:
    # Surface the problem in the cell output, then let the notebook stop
    print(f"‚ùå Error loading data: {e}")
    raise

In [None]:
# ============================================================================
# PART 3: DATA CLEANING
# ============================================================================
# Keeps completed flights (not cancelled, not diverted) that have a tail
# number, ensures a datetime `FlightDate`, drops rows missing any critical
# field, and removes implausible distances / arrival delays.

print("\n" + "="*80)
print("üßπ DATA CLEANING")
print("="*80)

original_size = len(df_raw)

# Combine the row filters into one mask and copy once. The previous version
# copied the full 10M-row frame and then copied again after every filter.
completed = (df_raw['Cancelled'] == 0) & df_raw['TailNum'].notna()
if 'Diverted' in df_raw.columns:
    completed &= df_raw['Diverted'] == 0
df = df_raw.loc[completed].copy()

# Create FlightDate
if 'FlightDate' in df.columns:
    # A FlightDate read from CSV is text, but later cells use `.dt` on it.
    # Unparseable values become NaT and are dropped by the dropna below.
    df['FlightDate'] = pd.to_datetime(df['FlightDate'], errors='coerce')
else:
    # The day-of-month column is spelled two ways across dataset versions
    day_col = next((c for c in ('DayofMonth', 'DayOfMonth') if c in df.columns), None)
    if day_col is None or not {'Year', 'Month'}.issubset(df.columns):
        raise KeyError(
            "Cannot build FlightDate: need a 'FlightDate' column or "
            "'Year', 'Month' and 'DayofMonth'/'DayOfMonth' columns"
        )
    df['FlightDate'] = pd.to_datetime(
        df[['Year', 'Month', day_col]].rename(columns={day_col: 'Day'})
    )

# Remove missing critical values
critical_cols = ['ArrDelay', 'DepDelay', 'CRSDepTime', 'CRSArrTime', 'Distance', 'Origin', 'Dest', 'FlightDate']
df = df.dropna(subset=critical_cols)

# Data quality filters: positive distance, arrival delay within [-60, 600] minutes
plausible = (df['Distance'] > 0) & df['ArrDelay'].between(-60, 600)
df = df[plausible]

retention_rate = len(df) / original_size * 100
print(f"\n‚úÖ Cleaned to {len(df):,} records ({retention_rate:.2f}% retention)")
print(f"‚úì {df['TailNum'].nunique():,} unique aircraft")
print(f"‚úì Date range: {df['FlightDate'].min()} to {df['FlightDate'].max()}")
print_memory_usage("After cleaning")

# Free memory
del df_raw
gc.collect()

In [None]:
# ============================================================================
# PART 4: CREATE CASCADE TARGET
# ============================================================================
# A flight "causes a cascade" when it arrives more than 15 minutes late and the
# same aircraft's next departure, on the same date, also leaves more than
# 15 minutes late.


def _hhmm_to_minutes(hhmm):
    """Convert HHMM clock values (e.g. 1430) to minutes since midnight (870).

    Works element-wise on a pandas Series; NaN stays NaN.
    """
    return (hhmm // 100) * 60 + hhmm % 100


print("\n" + "="*80)
print("üéØ CASCADE TARGET CREATION")
print("="*80)

# Sort by aircraft and time
df = df.sort_values(['TailNum', 'FlightDate', 'CRSDepTime']).reset_index(drop=True)

print("\n[1/3] Identifying next flight for each aircraft...")
df['NextFlight_DepDelay'] = df.groupby('TailNum')['DepDelay'].shift(-1)
df['NextFlight_ArrDelay'] = df.groupby('TailNum')['ArrDelay'].shift(-1)
df['NextFlight_Date'] = df.groupby('TailNum')['FlightDate'].shift(-1)
df['NextFlight_CRSDepTime'] = df.groupby('TailNum')['CRSDepTime'].shift(-1)

# Calculate turnaround time in hours.
# CRS times are HHMM clock values, so subtracting them directly and dividing
# by 100 is not a duration (1405 - 1350 = 55 -> "0.55 h" instead of 0.25 h).
# Convert both to minutes since midnight before differencing.
turnaround_minutes = (
    _hhmm_to_minutes(df['NextFlight_CRSDepTime']) - _hhmm_to_minutes(df['CRSArrTime'])
)
# A negative gap means the next departure falls after midnight: wrap by one day
turnaround_minutes = turnaround_minutes.where(
    turnaround_minutes >= 0, turnaround_minutes + 24 * 60
)
df['TurnaroundTime'] = turnaround_minutes / 60  # Convert to hours

print("\n[2/3] Defining cascade conditions...")
cascade_conditions = (
    (df['ArrDelay'] > 15) &
    (df['NextFlight_DepDelay'] > 15) &
    (df['NextFlight_Date'] == df['FlightDate']) &
    (df['TurnaroundTime'] > 0) &
    (df['TurnaroundTime'] < 24)
)

df['CausedCascade'] = cascade_conditions.astype(int)
# Rows with no following flight cannot be labelled, so drop them
df = df[df['NextFlight_DepDelay'].notna()].copy()

print("\n[3/3] Cascade statistics:")
print("="*80)
print(f"‚úì {len(df):,} flights with next-flight data")
print(f"\nDistribution:")
print(df['CausedCascade'].value_counts())
cascade_rate = df['CausedCascade'].mean() * 100
print(f"\nüìä Cascade Rate: {cascade_rate:.2f}%")
gc.collect()

In [None]:
# ============================================================================
# PART 5: TEMPORAL TRAIN-TEST SPLIT (NO DATA LEAKAGE)
# ============================================================================
# Splits on the 75th-percentile flight date: earlier flights train the model,
# flights on or after that date are held out for testing.

print("\n" + "="*80)
print("üìÖ TEMPORAL TRAIN-TEST SPLIT")
print("="*80)

# Use a stable sort. The default single-column quicksort is not stable and
# would scramble the (TailNum, CRSDepTime) order within each date, which the
# per-aircraft shift()/cumcount() feature engineering relies on.
df = df.sort_values('FlightDate', kind='mergesort').reset_index(drop=True)

# 75% training, 25% test
split_date = df['FlightDate'].quantile(0.75)
train_df = df[df['FlightDate'] < split_date].copy()
test_df = df[df['FlightDate'] >= split_date].copy()

# With too few distinct dates one side can be empty; fail with a clear message
# instead of a misleading overlap assertion on NaT values.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Temporal split at {split_date} left an empty partition "
        f"(train={len(train_df)}, test={len(test_df)})"
    )

print(f"\nüìÖ Training: {train_df['FlightDate'].min()} to {train_df['FlightDate'].max()}")
print(f"üìÖ Test: {test_df['FlightDate'].min()} to {test_df['FlightDate'].max()}")
print(f"\n‚úì Training: {len(train_df):,} samples")
print(f"‚úì Test: {len(test_df):,} samples")
print(f"\n‚úì Train cascade rate: {train_df['CausedCascade'].mean()*100:.2f}%")
print(f"‚úì Test cascade rate: {test_df['CausedCascade'].mean()*100:.2f}%")

assert train_df['FlightDate'].max() < test_df['FlightDate'].min(), "‚ùå Temporal overlap!"
print("\n‚úÖ NO TEMPORAL OVERLAP")

In [None]:
# ============================================================================
# PART 6: FEATURE ENGINEERING
# ============================================================================

# Section banner for the cell output
section_rule = "=" * 80
print("\n" + section_rule)
print("‚öôÔ∏è FEATURE ENGINEERING")
print(section_rule)

def engineer_all_features(df):
    """Add the engineered feature columns to ``df`` in place and return it.

    Reads the columns ``CRSDepTime`` (HHMM), ``FlightDate`` (datetime),
    ``Distance``, ``CRSElapsedTime``, ``TailNum``, ``ArrDelay``, ``DepDelay``
    and ``TurnaroundTime`` (hours).

    The previous-flight and rotation-position features are computed on a view
    sorted by (TailNum, FlightDate, CRSDepTime) and written back by position.
    They therefore no longer depend on the row order of ``df``, and its row
    order and index are left untouched.

    Args:
        df: Flight-level DataFrame containing the columns listed above.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    # Temporal (7)
    df['Hour'] = (df['CRSDepTime'] // 100).astype(int)
    df['DayOfWeek'] = df['FlightDate'].dt.dayofweek
    df['Month'] = df['FlightDate'].dt.month
    df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
    df['IsRushHour'] = df['Hour'].isin([6, 7, 8, 16, 17, 18]).astype(int)
    df['IsEarlyMorning'] = df['Hour'].isin([5, 6, 7, 8]).astype(int)
    df['IsLateNight'] = df['Hour'].isin([21, 22, 23, 0, 1, 2]).astype(int)

    # Flight (3)
    df['Distance'] = df['Distance'].astype(float)
    df['CRSElapsedTime'] = df['CRSElapsedTime'].astype(float)
    df['IsShortHaul'] = (df['Distance'] < 500).astype(int)

    # Chronological per-aircraft view. reset_index makes the labels equal to
    # row positions, so results can be put back in caller order with
    # sort_index() even when df's own index is not unique.
    timeline = (
        df[['TailNum', 'FlightDate', 'CRSDepTime', 'ArrDelay', 'DepDelay']]
        .reset_index(drop=True)
        .sort_values(['TailNum', 'FlightDate', 'CRSDepTime'])
    )

    # Incoming delay (3): delays of the same aircraft's previous flight
    # (0 when there is no previous flight in this frame)
    previous_arr = timeline.groupby('TailNum')['ArrDelay'].shift(1).fillna(0)
    previous_dep = timeline.groupby('TailNum')['DepDelay'].shift(1).fillna(0)
    df['IncomingDelay'] = previous_arr.sort_index().to_numpy()
    df['IncomingDepDelay'] = previous_dep.sort_index().to_numpy()
    df['HasIncomingDelay'] = (df['IncomingDelay'] > 15).astype(int)

    # Turnaround (4)
    df['TurnaroundMinutes'] = df['TurnaroundTime'] * 60
    df['TightTurnaround'] = (df['TurnaroundTime'] < 1.0).astype(int)
    df['CriticalTurnaround'] = (df['TurnaroundTime'] < 0.75).astype(int)
    df['InsufficientBuffer'] = ((df['TurnaroundMinutes'] - df['IncomingDelay']) < 30).astype(int)

    # Utilization (4): 1-based position of the flight in the aircraft's day
    rotation = timeline.groupby(['TailNum', 'FlightDate']).cumcount() + 1
    df['PositionInRotation'] = rotation.sort_index().to_numpy()
    df['IsFirstFlight'] = (df['PositionInRotation'] == 1).astype(int)
    df['IsEarlyRotation'] = (df['PositionInRotation'] <= 3).astype(int)
    df['IsLateRotation'] = (df['PositionInRotation'] >= 5).astype(int)

    return df

# Calculate historical stats from TRAINING ONLY
print("\n[1/2] Calculating historical statistics (training data only)...")

train_df = engineer_all_features(train_df)

# Route stats: mean / std of arrival delay per (Origin, Dest) pair
route_stats = (
    train_df.groupby(['Origin', 'Dest'])
    .agg(RouteAvgDelay=('ArrDelay', 'mean'), RouteStdDelay=('ArrDelay', 'std'))
    .reset_index()
)
# Robustness = 100 minus the std (a missing std counts as 30, capped to [0, 60])
capped_std = route_stats['RouteStdDelay'].fillna(30).clip(0, 60)
route_stats['RouteRobustnessScore'] = (100 - capped_std).clip(0, 100)

# Origin stats: mean departure delay and mean taxi-out time per origin
origin_stats = (
    train_df.groupby('Origin')
    .agg(Origin_AvgDepDelay=('DepDelay', 'mean'), OriginCongestion=('TaxiOut', 'mean'))
    .reset_index()
)

# Dest stats: mean arrival delay and mean taxi-in time per destination
dest_stats = (
    train_df.groupby('Dest')
    .agg(Dest_AvgArrDelay=('ArrDelay', 'mean'), DestCongestion=('TaxiIn', 'mean'))
    .reset_index()
)

for table, noun in ((route_stats, 'routes'), (origin_stats, 'origins'), (dest_stats, 'destinations')):
    print(f"   ‚úì {len(table):,} {noun}")

# Store for deployment
train_stats = dict(route=route_stats, origin=origin_stats, dest=dest_stats)

# Lookup tables paired with their join keys, applied in this order to both splits
stat_joins = (
    (route_stats, ['Origin', 'Dest']),
    (origin_stats, 'Origin'),
    (dest_stats, 'Dest'),
)

# Merge to train
for table, join_keys in stat_joins:
    train_df = train_df.merge(table, on=join_keys, how='left')

print("\n[2/2] Applying to test set (using training stats)...")
test_df = engineer_all_features(test_df)
for table, join_keys in stat_joins:
    test_df = test_df.merge(table, on=join_keys, how='left')

# Fill missing values in every numeric column with the TRAINING median
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
train_medians = train_df[numeric_cols].median()
for split in (train_df, test_df):
    split[numeric_cols] = split[numeric_cols].fillna(train_medians)

print("\n‚úÖ FEATURE ENGINEERING COMPLETE (28 features)")
print("‚úÖ ZERO DATA LEAKAGE")
In [None]:
# ============================================================================
# PART 7: MODEL TRAINING (XGBoost)
# ============================================================================
# Fits an XGBoost classifier on the 28 engineered features and reports
# F1 / recall / precision / AUC on the held-out test split.

section_rule = "=" * 80
print("\n" + section_rule)
print("ü§ñ MODEL TRAINING")
print(section_rule)

# Feature list (28 features); this order is the model's input column order
feature_cols = (
    # Temporal (7)
    ['Hour', 'DayOfWeek', 'Month', 'IsWeekend', 'IsRushHour', 'IsEarlyMorning', 'IsLateNight']
    # Flight (3)
    + ['Distance', 'CRSElapsedTime', 'IsShortHaul']
    # Incoming delay (3)
    + ['IncomingDelay', 'HasIncomingDelay', 'IncomingDepDelay']
    # Turnaround (4)
    + ['TurnaroundMinutes', 'TightTurnaround', 'CriticalTurnaround', 'InsufficientBuffer']
    # Utilization (4)
    + ['PositionInRotation', 'IsFirstFlight', 'IsEarlyRotation', 'IsLateRotation']
    # Historical (7)
    + ['RouteAvgDelay', 'RouteStdDelay', 'RouteRobustnessScore']
    + ['Origin_AvgDepDelay', 'OriginCongestion', 'Dest_AvgArrDelay', 'DestCongestion']
)

# Prepare data (any remaining NaN becomes 0)
X_train = train_df[feature_cols].fillna(0)
y_train = train_df['CausedCascade']
X_test = test_df[feature_cols].fillna(0)
y_test = test_df['CausedCascade']

print(f"\n‚úì X_train shape: {X_train.shape}")
print(f"‚úì X_test shape: {X_test.shape}")

# Calculate class weight: negatives per positive in the training labels
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
scale_pos_weight = neg_count / pos_count

print(f"\nüí° Class imbalance: {scale_pos_weight:.2f}:1")
print(f"   Using scale_pos_weight = {scale_pos_weight:.2f}")

# Train model
print("\n‚è≥ Training XGBoost model...")
start_time = time.time()

xgb_params = dict(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    # NOTE(review): the original comment calls this flag required for
    # SageMaker compatibility -- confirm for the xgboost version in use
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)
cascade_model = xgb.XGBClassifier(**xgb_params)

cascade_model.fit(X_train, y_train, verbose=False)
train_time = time.time() - start_time

print(f"\n‚úÖ Training complete in {train_time:.1f}s")

# Evaluate: hard predictions for F1/recall/precision, probabilities for AUC
y_pred = cascade_model.predict(X_test)
y_proba = cascade_model.predict_proba(X_test)[:, 1]

f1, recall, precision = (
    metric(y_test, y_pred) for metric in (f1_score, recall_score, precision_score)
)
auc = roc_auc_score(y_test, y_proba)

print("\nüìä PERFORMANCE (Test Set):")
print(section_rule)
print(f"  F1 Score:   {f1:.4f}")
print(f"  Recall:     {recall:.4f} ({recall*100:.1f}% of cascades caught)")
print(f"  Precision:  {precision:.4f} ({precision*100:.1f}% accuracy)")
print(f"  AUC-ROC:    {auc:.4f}")

print("\n‚úÖ MODEL TRAINING COMPLETE")

In [None]:
# ============================================================================
# PART 8: SAVE MODEL FOR SAGEMAKER
# ============================================================================
# Writes the model, feature list, training statistics and metadata into a
# fresh directory, then bundles that directory into a tar.gz.

section_rule = "=" * 80
print("\n" + section_rule)
print("üíæ SAVING MODEL ARTIFACTS")
print(section_rule)

BYTES_PER_MB = 1024 ** 2

# Create models directory, wiping any previous run's output first
model_dir = 'cascade_prediction_v2'
if os.path.exists(model_dir):
    import shutil
    shutil.rmtree(model_dir)
os.makedirs(model_dir)

# Save model
model_path = os.path.join(model_dir, 'cascade_model_v2.joblib')
joblib.dump(cascade_model, model_path)
print(f"\n‚úì Model saved: {model_path}")

# Save feature names
feature_names_path = os.path.join(model_dir, 'feature_names.json')
with open(feature_names_path, 'w') as features_file:
    json.dump(feature_cols, features_file)
print(f"‚úì Features saved: {feature_names_path}")

# Save training statistics (for production inference)
stats_path = os.path.join(model_dir, 'training_statistics.pkl')
joblib.dump(train_stats, stats_path)
stats_size_mb = os.path.getsize(stats_path) / BYTES_PER_MB
print(f"‚úì Statistics saved: {stats_path} ({stats_size_mb:.2f} MB)")

# Save metadata: version info plus the test-set scores
test_scores = (
    ('f1_score', f1),
    ('recall', recall),
    ('precision', precision),
    ('auc_roc', auc),
)
metadata = {
    'model_version': '2.0',
    'model_type': 'CascadePrediction_XGBoost',
    'training_date': str(datetime.now().date()),
    'performance': {score_name: float(score) for score_name, score in test_scores},
}

metadata_path = os.path.join(model_dir, 'metadata.json')
with open(metadata_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=2)
print(f"‚úì Metadata saved: {metadata_path}")

# Create tar.gz for SageMaker; arcname='.' puts the files at the archive root
tar_path = 'cascade_prediction_v2_model.tar.gz'
with tarfile.open(tar_path, 'w:gz') as archive:
    archive.add(model_dir, arcname='.')

tar_size_mb = os.path.getsize(tar_path) / BYTES_PER_MB
print(f"\n‚úÖ SageMaker package: {tar_path} ({tar_size_mb:.2f} MB)")
print("‚úÖ MODEL SAVED - READY FOR DEPLOYMENT")

In [None]:
# ============================================================================
# PART 9: CREATE INFERENCE SCRIPT
# ============================================================================
# Writes inference_sagemaker.py, the entry point with the model_fn / input_fn /
# predict_fn / output_fn hooks used by the endpoint.
#
# Fixes relative to the previous script:
#   * input_fn raises on unsupported content types instead of returning None,
#     and tolerates parameters such as "; charset=utf-8".
#   * scheduled_departure_time given as an HHMM number (1800) or a string
#     without a colon ("1800") now maps to hour 18, not 1800; the fallback
#     'hour' field is cast to int.
#   * A failed statistics lookup still falls back to defaults, but now leaves
#     a warning in the endpoint log instead of passing silently.
#   * NaN statistics (e.g. std of a single-flight route) fall back to the
#     default rather than being sent to the model.

print("\n" + "="*80)
print("üìù CREATING INFERENCE SCRIPT")
print("="*80)

inference_code = '''import json
import joblib
import os
import numpy as np
import pandas as pd
import xgboost as xgb  # REQUIRED for unpickling
from datetime import datetime

HISTORICAL_STATS = None

def model_fn(model_dir):
    """Load model and statistics"""
    global HISTORICAL_STATS
    model = joblib.load(os.path.join(model_dir, 'cascade_model_v2.joblib'))
    HISTORICAL_STATS = joblib.load(os.path.join(model_dir, 'training_statistics.pkl'))
    return model

def input_fn(request_body, content_type='text/csv'):
    """Parse one request (CSV or JSON) into a single-row feature array.

    Raises ValueError for unsupported content types or JSON bodies that have
    neither a 'features' list nor raw flight data with 'origin' and 'dest'.
    """
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode('utf-8')

    # Drop parameters such as "; charset=utf-8" and ignore letter case
    media_type = (content_type or '').split(';')[0].strip().lower()

    if media_type == 'text/csv':
        features = [float(x.strip()) for x in request_body.strip().split(',')]
        return np.array(features).reshape(1, -1)

    if media_type == 'application/json':
        data = json.loads(request_body)
        if 'features' in data:
            return np.array(data['features']).reshape(1, -1)
        if 'origin' in data and 'dest' in data:
            features = engineer_features_from_raw(data)
            return np.array(features).reshape(1, -1)
        raise ValueError("JSON must have 'features' or raw flight data")

    raise ValueError("Unsupported content type: " + str(content_type))

def _parse_hour(value):
    """Return the hour (0-23) from 'HH:MM', an HHMM value, or a bare hour."""
    if isinstance(value, str) and ':' in value:
        number = int(value.split(':')[0])
    else:
        number = int(value)
        if number >= 24:
            # Anything past 23 can only be an HHMM clock value such as 1800
            number //= 100
    return number % 24

def _stat_or_default(match, column, default):
    """Return the first value of a matched stats column, or default if NaN."""
    value = float(match[column].iloc[0])
    return default if np.isnan(value) else value

def engineer_features_from_raw(flight_data):
    """Convert raw flight data to 28 features"""
    global HISTORICAL_STATS

    # Temporal
    if 'scheduled_departure_time' in flight_data:
        hour = _parse_hour(flight_data['scheduled_departure_time'])
    else:
        hour = _parse_hour(flight_data.get('hour', 12))

    day_of_week = int(flight_data.get('day_of_week', 2))
    month = int(flight_data.get('month', 6))
    is_weekend = 1 if day_of_week in [5, 6] else 0
    is_rush_hour = 1 if hour in [6, 7, 8, 16, 17, 18] else 0
    is_early_morning = 1 if hour in [5, 6, 7, 8] else 0
    is_late_night = 1 if hour in [21, 22, 23, 0, 1, 2] else 0

    # Flight
    distance = float(flight_data.get('distance', 800))
    crs_elapsed_time = float(flight_data.get('crs_elapsed_time', 120))
    is_short_haul = 1 if distance < 500 else 0

    # Incoming delay
    incoming_delay = float(flight_data.get('incoming_delay', 0))
    has_incoming_delay = 1 if incoming_delay > 15 else 0
    incoming_dep_delay = float(flight_data.get('incoming_dep_delay', 0))

    # Turnaround (request value is in minutes)
    turnaround_time = float(flight_data.get('turnaround_time', 120))
    turnaround_minutes = turnaround_time
    tight_turnaround = 1 if turnaround_time < 60 else 0
    critical_turnaround = 1 if turnaround_time < 45 else 0
    insufficient_buffer = 1 if (turnaround_time - incoming_delay) < 30 else 0

    # Utilization
    position_in_rotation = int(flight_data.get('position_in_rotation', 1))
    is_first_flight = 1 if position_in_rotation == 1 else 0
    is_early_rotation = 1 if position_in_rotation <= 3 else 0
    is_late_rotation = 1 if position_in_rotation >= 5 else 0

    # Historical: defaults used when the route or airport is unknown
    origin = str(flight_data.get('origin', 'LAX')).upper()
    dest = str(flight_data.get('dest', 'JFK')).upper()

    route_avg_delay = 5.0
    route_std_delay = 15.0
    route_robustness = 70.0
    origin_avg_dep_delay = 8.0
    origin_congestion = 15.0
    dest_avg_arr_delay = 6.0
    dest_congestion = 12.0

    if HISTORICAL_STATS:
        try:
            route_stats = HISTORICAL_STATS['route']
            route_match = route_stats[(route_stats['Origin'] == origin) & (route_stats['Dest'] == dest)]
            if len(route_match) > 0:
                route_avg_delay = _stat_or_default(route_match, 'RouteAvgDelay', route_avg_delay)
                route_std_delay = _stat_or_default(route_match, 'RouteStdDelay', route_std_delay)
                route_robustness = _stat_or_default(route_match, 'RouteRobustnessScore', route_robustness)

            origin_stats = HISTORICAL_STATS['origin']
            origin_match = origin_stats[origin_stats['Origin'] == origin]
            if len(origin_match) > 0:
                origin_avg_dep_delay = _stat_or_default(origin_match, 'Origin_AvgDepDelay', origin_avg_dep_delay)
                origin_congestion = _stat_or_default(origin_match, 'OriginCongestion', origin_congestion)

            dest_stats = HISTORICAL_STATS['dest']
            dest_match = dest_stats[dest_stats['Dest'] == dest]
            if len(dest_match) > 0:
                dest_avg_arr_delay = _stat_or_default(dest_match, 'Dest_AvgArrDelay', dest_avg_arr_delay)
                dest_congestion = _stat_or_default(dest_match, 'DestCongestion', dest_congestion)
        except Exception as e:
            # Best effort: keep serving with defaults, but leave a trace in the log
            print("WARNING: historical stats lookup failed, using defaults: " + repr(e))

    return [
        hour, day_of_week, month, is_weekend, is_rush_hour, is_early_morning, is_late_night,
        distance, crs_elapsed_time, is_short_haul,
        incoming_delay, has_incoming_delay, incoming_dep_delay,
        turnaround_minutes, tight_turnaround, critical_turnaround, insufficient_buffer,
        position_in_rotation, is_first_flight, is_early_rotation, is_late_rotation,
        route_avg_delay, route_std_delay, route_robustness,
        origin_avg_dep_delay, origin_congestion, dest_avg_arr_delay, dest_congestion
    ]

def predict_fn(input_data, model):
    """Make predictions"""
    probs = model.predict_proba(input_data)[:, 1]
    results = []
    for prob in probs:
        if prob >= 0.50:
            tier = "CRITICAL"
            action = "IMMEDIATE: Swap aircraft or adjust schedule"
        elif prob >= 0.30:
            tier = "HIGH"
            action = "ALERT: Consider aircraft swap"
        elif prob >= 0.15:
            tier = "ELEVATED"
            action = "MONITOR: Pre-position ground crew"
        else:
            tier = "NORMAL"
            action = "ROUTINE: Standard operations"

        results.append({
            'cascade_probability': float(prob),
            'cascade_prediction': int(prob >= 0.30),
            'risk_tier': tier,
            'recommended_action': action
        })
    return results

def output_fn(predictions, accept='application/json'):
    """Format output"""
    response = {
        'predictions': predictions,
        'model_version': '2.0',
        'timestamp': datetime.utcnow().isoformat() + 'Z'
    }
    return json.dumps(response, indent=2)
'''

# Save inference script
with open('inference_sagemaker.py', 'w') as f:
    f.write(inference_code)

print("\n‚úÖ Inference script created: inference_sagemaker.py")
print("‚úÖ READY FOR SAGEMAKER DEPLOYMENT")

In [None]:
# ============================================================================
# PART 10: DEPLOY TO SAGEMAKER (SKLearn Framework)
# ============================================================================
# Uploads the model archive to S3, wraps it in an SKLearnModel that uses
# inference_sagemaker.py as its entry point, and deploys it to a single
# ml.m5.large endpoint. Any failure is printed with a traceback and re-raised.

print("\n" + "="*80)
print("üöÄ DEPLOYING TO SAGEMAKER")
print("="*80)

try:
    # Initialize SageMaker
    sagemaker_session = sagemaker.Session()
    role = get_execution_role()
    region = boto3.Session().region_name
    # Fixed endpoint name.
    # NOTE(review): re-running this cell while the endpoint still exists will
    # presumably fail on the name clash -- confirm / delete it first
    endpoint_name = 'cascade-prediction-sklearn-v1'
    
    print(f"\n‚úì Region: {region}")
    # Only the first 50 characters of the role ARN are echoed
    print(f"‚úì Role: {role[:50]}...")
    print(f"‚úì Endpoint: {endpoint_name}")
    
    # Verify files (tar_path comes from the artifact-saving cell)
    if not os.path.exists(tar_path):
        raise FileNotFoundError(f"Model not found: {tar_path}")
    if not os.path.exists('inference_sagemaker.py'):
        raise FileNotFoundError("inference_sagemaker.py not found")
    
    print(f"\n‚úì Model: {tar_path} ({tar_size_mb:.2f} MB)")
    print(f"‚úì Inference: inference_sagemaker.py")
    
    # Upload to S3; upload_data returns the S3 location used as model_data below
    print("\n[1/3] Uploading to S3...")
    model_data = sagemaker_session.upload_data(
        path=tar_path,
        key_prefix='cascade-prediction/model'
    )
    print(f"   ‚úì Uploaded: {model_data}")
    
    # Create model using the SKLearn framework wrapper
    print("\n[2/3] Creating SageMaker model...")
    # The timestamp suffix gives each run a distinct model name
    model_name = f'cascade-sklearn-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    
    sklearn_model = SKLearnModel(
        model_data=model_data,
        role=role,
        entry_point='inference_sagemaker.py',
        framework_version='1.2-1',  # NOTE(review): assumes this SKLearn image ships xgboost -- confirm
        py_version='py3',
        name=model_name,
        sagemaker_session=sagemaker_session
    )
    
    print(f"   ‚úì Model: {model_name}")
    print(f"   ‚úì Framework: SKLearn 1.2-1 (includes XGBoost, pandas, numpy)")
    
    # Deploy endpoint; wait=True blocks until deploy() returns
    print("\n[3/3] Deploying endpoint...")
    print(f"   Name: {endpoint_name}")
    print(f"   Instance: ml.m5.large ($0.115/hour)")
    print("\n‚è≥ Deploying (8-12 minutes)...")
    print("   Watch for '!' at end of dashes\n")
    
    # `predictor` is the client object the test cells use to call the endpoint
    predictor = sklearn_model.deploy(
        initial_instance_count=1,
        instance_type='ml.m5.large',
        endpoint_name=endpoint_name,
        wait=True
    )
    
    print("\n" + "="*80)
    print("‚úÖ DEPLOYMENT SUCCESSFUL!")
    print("="*80)
    
except Exception as e:
    # Show the full traceback in the cell output, then stop the notebook
    print(f"\n‚ùå DEPLOYMENT FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise

In [None]:
# ============================================================================
# PART 11: TEST ENDPOINT (CSV Format)
# ============================================================================
# Sends one row of 28 already-engineered features as text/csv and prints the
# first prediction from the JSON response.

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

print("\n" + "="*80)
print("üß™ TEST 1: CSV FORMAT (28 Preprocessed Features)")
print("="*80)

# Test with preprocessed features (same order as feature_cols)
test_features = [
    18, 2, 6, 0, 1, 0, 0,           # Temporal (7)
    800, 120, 0,                     # Flight (3)
    25, 1, 20,                       # Incoming delay (3)
    45, 1, 0, 1,                     # Turnaround (4)
    3, 0, 1, 0,                      # Utilization (4)
    5.2, 12.3, 75.0, 8.5, 15.2, 6.8, 12.1  # Historical (7)
]

# The predictor returned by SKLearnModel.deploy() defaults to NumPy
# serialization, so a CSV string would be sent as .npy bytes under a text/csv
# header and the JSON reply could not be decoded. Use a dedicated client for
# the same endpoint with CSV in / JSON out; `predictor` itself is untouched.
csv_client = Predictor(
    endpoint_name=predictor.endpoint_name,
    sagemaker_session=predictor.sagemaker_session,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

csv_data = ','.join(map(str, test_features))
result = csv_client.predict(csv_data)  # already parsed into a dict
prediction = result['predictions'][0]

print(f"\nüìä Result:")
print(f"   Cascade Probability: {prediction['cascade_probability']:.2%}")
print(f"   Risk Tier: {prediction['risk_tier']}")
print(f"   Action: {prediction['recommended_action']}")
print("\n‚úÖ CSV TEST PASSED")

In [None]:
# ============================================================================
# PART 12: TEST ENDPOINT (Raw JSON Format)
# ============================================================================
# Sends raw flight attributes as application/json (the endpoint engineers the
# features itself), prints the first prediction, then prints a summary.

from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

print("\n" + "="*80)
print("üß™ TEST 2: RAW JSON FORMAT (Automatic Feature Engineering)")
print("="*80)

# Test with raw flight data (features engineered automatically)
raw_flight = {
    "origin": "LAX",
    "dest": "JFK",
    "scheduled_departure_time": "18:00",
    "day_of_week": 2,          # Wednesday
    "month": 6,                 # June
    "distance": 800,
    "crs_elapsed_time": 120,
    "incoming_delay": 25,       # Previous flight 25min late
    "incoming_dep_delay": 20,
    "turnaround_time": 45,      # Only 45min buffer
    "position_in_rotation": 3   # 3rd flight of day
}

print(f"\nüìã Input:")
print(f"   Route: {raw_flight['origin']} ‚Üí {raw_flight['dest']}")
print(f"   Departure: {raw_flight['scheduled_departure_time']}")
print(f"   Incoming Delay: {raw_flight['incoming_delay']} min")
print(f"   Turnaround: {raw_flight['turnaround_time']} min")

# The predictor returned by SKLearnModel.deploy() defaults to NumPy
# serialization, which would wrap the JSON text in .npy bytes and fail to
# decode the JSON reply. Use a dedicated JSON-in / JSON-out client for the
# same endpoint; `predictor` itself is untouched.
json_client = Predictor(
    endpoint_name=predictor.endpoint_name,
    sagemaker_session=predictor.sagemaker_session,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

result = json_client.predict(raw_flight)  # dict in, parsed dict out
prediction = result['predictions'][0]

print(f"\nüìä Result:")
print(f"   Cascade Probability: {prediction['cascade_probability']:.2%}")
print(f"   Risk Tier: {prediction['risk_tier']}")
print(f"   Action: {prediction['recommended_action']}")
print("\n‚úÖ RAW JSON TEST PASSED")

print("\n" + "="*80)
print("‚úÖ ALL TESTS PASSED!")
print("="*80)
print(f"""
üéâ YOUR CASCADE PREDICTION ENDPOINT IS LIVE!

Endpoint: {endpoint_name}
Region: {region}
Status: InService ‚úÖ

Input Formats:
  ‚úÖ CSV (28 features)
  ‚úÖ JSON with 'features' array
  ‚úÖ JSON with raw flight data

Performance:
  ‚Ä¢ Recall: {recall:.1%} (catches {recall*100:.0f}% of cascades)
  ‚Ä¢ Precision: {precision:.1%}
  ‚Ä¢ AUC: {auc:.3f}

üí∞ Cost: $0.115/hour = ~$84/month

‚ö†Ô∏è  REMEMBER TO DELETE ENDPOINT WHEN DONE (see next cell)
""")

In [None]:
# ============================================================================
# PART 13: CLEANUP (DELETE ENDPOINT TO STOP CHARGES)
# ============================================================================
# ⚠️  UNCOMMENT AND RUN THIS CELL WHEN YOU'RE DONE TO STOP CHARGES

# import boto3

# sm_client = boto3.client('sagemaker')
# endpoint_name = 'cascade-prediction-sklearn-v1'

# print("üóëÔ∏è  Deleting endpoint...")
# sm_client.delete_endpoint(EndpointName=endpoint_name)
# print("‚úì Endpoint deleted")

# print("\nüóëÔ∏è  Deleting endpoint config...")
# sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name)
# print("‚úì Config deleted")

# print("\n‚úÖ CLEANUP COMPLETE")
# print("üí∞ Charges stopped!")
# print("üì¶ Model still saved in S3 for future use")