# 🚀 NASA Space Apps Challenge 2025 - Exoplanet Classifier
## A World Away: Hunting for Exoplanets with AI

### Project Overview
This notebook provides a comprehensive solution for classifying exoplanets using NASA's KOI (Kepler Objects of Interest), K2, and TESS (Transiting Exoplanet Survey Satellite) datasets. Our goal is to create a high-performance, GPU-accelerated machine learning system capable of identifying confirmed exoplanets, candidates, and false positives.

### Challenge Goals
- ✅ **High Accuracy**: Achieve >68% classification accuracy
- ✅ **GPU Optimization**: Leverage RTX 4060 for accelerated training
- ✅ **Real-time Predictions**: Single input and batch processing
- ✅ **Explainable AI**: SHAP analysis for model interpretability
- ✅ **Production Ready**: Streamlit app with standardized API
- ✅ **Memory Optimized**: Efficient data processing pipeline

### Dataset Information
- **KOI Dataset**: Kepler Objects of Interest with stellar and planetary parameters
- **K2 Dataset**: Extended Kepler mission observations
- **TESS Dataset**: Transiting Exoplanet Survey Satellite discoveries
- **Total Samples**: 21,000+ labeled examples
- **Classes**: CONFIRMED, CANDIDATE, FALSE_POSITIVE

## 🔧 Section 1: Environment Setup and GPU Configuration

Setting up the optimal environment for exoplanet classification with GPU acceleration.

In [None]:
# Import Required Libraries
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Core Data Science Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# GPU and Deep Learning Libraries
import tensorflow as tf
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    PYTORCH_AVAILABLE = True
except ImportError:
    PYTORCH_AVAILABLE = False
    print("⚠️ PyTorch not available")

# Explainability and Visualization
try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False
    print("⚠️ SHAP not available - install with: pip install shap")

# Memory optimization
import gc
import psutil
import joblib
from datetime import datetime

# Environment banner: interpreter / framework versions and free system RAM.
print("🚀 NASA Space Apps 2025 - Exoplanet Classifier")
print("=" * 60)
print(f"Python Version: {sys.version}")
print(f"TensorFlow Version: {tf.__version__}")
if PYTORCH_AVAILABLE:
    # torch is only bound when the guarded import above succeeded.
    print(f"PyTorch Version: {torch.__version__}")
print(
    f"Memory Available: {psutil.virtual_memory().available / (1024**3):.1f} GB"
)

In [None]:
# GPU Configuration and Detection
def configure_gpu():
    """Print the GPU status of TensorFlow and PyTorch and pick a torch device.

    Memory growth is switched on for every GPU TensorFlow lists. For PyTorch
    the CUDA device count, name and total memory of device 0 are printed.

    Returns:
        ``torch.device('cuda')`` when CUDA is available, ``torch.device('cpu')``
        when PyTorch is installed without CUDA, and ``None`` when PyTorch is
        not installed.
    """
    print("🔍 GPU Configuration Status")
    print("=" * 40)

    # ---- TensorFlow ----
    print("📊 TensorFlow GPU Status:")
    tf_gpus = tf.config.list_physical_devices('GPU')
    if not tf_gpus:
        print("❌ No TensorFlow GPUs found")
    else:
        try:
            for tf_gpu in tf_gpus:
                tf.config.experimental.set_memory_growth(tf_gpu, True)
            print(f"✅ Found {len(tf_gpus)} GPU(s)")
            for idx, tf_gpu in enumerate(tf_gpus):
                print(f"   GPU {idx}: {tf_gpu}")
        except RuntimeError as err:
            print(f"⚠️ GPU configuration error: {err}")

    # ---- PyTorch ----
    if not PYTORCH_AVAILABLE:
        print("❌ PyTorch not installed")
        return None

    print(f"\n📊 PyTorch GPU Status:")
    if not torch.cuda.is_available():
        print("❌ CUDA not available for PyTorch")
        return torch.device('cpu')

    print(f"✅ CUDA Available: {torch.cuda.device_count()} device(s)")
    print(f"   Current Device: {torch.cuda.get_device_name()}")
    print(f"   Memory Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Set device
    selected = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"   Using Device: {selected}")
    return selected

# Configure GPU
device = configure_gpu()

# Memory optimization function
def optimize_memory():
    """Run a garbage collection, empty the CUDA cache if usable, report RAM use."""
    gc.collect()
    cuda_ready = PYTORCH_AVAILABLE and torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
    print(f"💾 Memory Usage: {psutil.virtual_memory().percent:.1f}%")

optimize_memory()

## 📊 Section 2: Data Loading and Exploration

Loading and exploring the NASA exoplanet datasets with memory optimization.

In [None]:
# Memory-Optimized Data Loading
class OptimizedDataLoader:
    """Memory-efficient data loader for NASA exoplanet datasets.

    Each CSV is read with pandas and then shrunk in place: integer columns
    are narrowed to the smallest integer dtype that holds their value range,
    64-bit float columns are narrowed to float32, and string columns with
    few distinct values are converted to ``category``.

    Attributes are documented inline in ``__init__``.
    """

    # Candidate integer dtypes, smallest first.
    _SIGNED_INTS = (np.int8, np.int16, np.int32)
    _UNSIGNED_INTS = (np.uint8, np.uint16, np.uint32)

    def __init__(self, data_dir="data/raw/"):
        # Directory that holds the raw CSV files.
        self.data_dir = data_dir
        # dataset_name -> loaded (optimized) DataFrame.
        self.datasets = {}
        # Reserved for a combined frame; not populated by this class.
        self.combined_data = None

    @staticmethod
    def _smallest_int_dtype(col_min, col_max, candidates):
        """Return the first dtype in *candidates* whose range holds [col_min, col_max]."""
        for candidate in candidates:
            bounds = np.iinfo(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if bounds.min <= col_min and col_max <= bounds.max:
                return candidate
        return None

    def _downcast_numeric(self, df):
        """Narrow numeric columns of *df* in place where no value range is lost."""
        for col in df.select_dtypes(include=[np.number]).columns:
            dtype = df[col].dtype
            if not isinstance(dtype, np.dtype):
                # Extension dtypes (e.g. nullable integers) are left untouched.
                continue

            col_min = df[col].min()
            col_max = df[col].max()

            if dtype.kind in 'iu':
                candidates = self._SIGNED_INTS if dtype.kind == 'i' else self._UNSIGNED_INTS
                target = self._smallest_int_dtype(col_min, col_max, candidates)
                if target is not None and np.dtype(target).itemsize < dtype.itemsize:
                    df[col] = df[col].astype(target)
            elif dtype.kind == 'f' and dtype.itemsize > 4:
                limits = np.finfo(np.float32)
                # All-NaN columns fail both comparisons and are left as-is.
                # NOTE: float32 keeps ~7 significant digits; precision beyond
                # that is lost by this cast.
                if col_min > limits.min and col_max < limits.max:
                    df[col] = df[col].astype(np.float32)

    @staticmethod
    def _categorize_strings(df):
        """Convert object columns with <50% distinct values to ``category`` in place."""
        total = len(df)
        if total == 0:
            # Nothing to measure; also avoids dividing by zero below.
            return
        for col in df.select_dtypes(include=['object']).columns:
            if df[col].nunique(dropna=False) / total < 0.5:
                df[col] = df[col].astype('category')

    def load_dataset(self, filename, dataset_name):
        """Load one CSV from ``data_dir``, optimize its dtypes and register it.

        Args:
            filename: CSV file name relative to ``self.data_dir``.
            dataset_name: Key under which the frame is stored in
                ``self.datasets``; also written to a ``dataset_source`` column.

        Returns:
            The optimized DataFrame, or ``None`` when the file does not exist.
        """
        filepath = os.path.join(self.data_dir, filename)

        if not os.path.exists(filepath):
            print(f"❌ File not found: {filepath}")
            return None

        print(f"📥 Loading {dataset_name}...")

        df = pd.read_csv(filepath, low_memory=False)

        original_memory = df.memory_usage(deep=True).sum() / 1024**2

        if not df.empty:
            self._downcast_numeric(df)
            self._categorize_strings(df)

        optimized_memory = df.memory_usage(deep=True).sum() / 1024**2
        # Guard the percentage against a zero-byte frame.
        reduction = (
            (original_memory - optimized_memory) / original_memory * 100
            if original_memory > 0 else 0.0
        )
        print(f"   📊 Shape: {df.shape}")
        print(f"   💾 Memory: {original_memory:.1f}MB → {optimized_memory:.1f}MB "
              f"({reduction:.1f}% reduction)")

        # Add dataset source
        df['dataset_source'] = dataset_name

        self.datasets[dataset_name] = df
        return df

    def load_all_datasets(self):
        """Load ``koi.csv``, ``k2.csv`` and ``toi.csv`` from ``data_dir``.

        Returns:
            A ``(koi, k2, tess)`` tuple; an entry is ``None`` when its file
            is missing.
        """
        print("🚀 Loading NASA Exoplanet Datasets")
        print("=" * 40)

        koi_data = self.load_dataset('koi.csv', 'KOI')
        k2_data = self.load_dataset('k2.csv', 'K2')
        tess_data = self.load_dataset('toi.csv', 'TESS')  # Note: TESS data in toi.csv

        return koi_data, k2_data, tess_data

# Build the loader and read the three mission tables.
data_loader = OptimizedDataLoader()
koi_df, k2_df, tess_df = data_loader.load_all_datasets()

# Summarize whichever tables were actually found on disk.
print(f"\n📊 Dataset Summary:")
summary_lines = []
if koi_df is not None:
    summary_lines.append(f"   KOI Dataset: {koi_df.shape}")
if k2_df is not None:
    summary_lines.append(f"   K2 Dataset: {k2_df.shape}")
if tess_df is not None:
    summary_lines.append(f"   TESS Dataset: {tess_df.shape}")
for summary_line in summary_lines:
    print(summary_line)

optimize_memory()

In [None]:
# Explore dataset structures and find common columns
def explore_datasets(datasets):
    """Explore dataset structures and identify common features.

    Args:
        datasets: Mapping of dataset name to DataFrame. ``None`` values
            (datasets that failed to load) are skipped.

    Returns:
        ``(dataset_info, common_columns)`` where ``dataset_info`` maps each
        loaded dataset name to a dict with its ``shape``, ``columns``,
        ``label_column`` (or ``None``), ``numeric_columns`` and
        ``categorical_columns``, and ``common_columns`` is the set of column
        names present in every loaded dataset (empty when none was loaded).
    """
    print("🔍 Dataset Structure Analysis")
    print("=" * 40)

    # Checked in order; the first one present is taken as the label column.
    possible_labels = ('koi_disposition', 'av_training_set', 'disposition',
                       'tfopwg_disposition', 'Disposition')

    dataset_info = {}

    for name, df in datasets.items():
        if df is None:
            continue

        print(f"\n📊 {name} Dataset:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {df.shape[1]}")

        label_col = next((col for col in possible_labels if col in df.columns), None)
        if label_col is None:
            print("   ⚠️ No label column found")
        else:
            print(f"   Label Column: {label_col}")
            # Non-numeric covers both plain strings and the 'category' dtype
            # that the loader assigns to low-cardinality string columns.
            if not pd.api.types.is_numeric_dtype(df[label_col]):
                print(f"   Unique Labels: {df[label_col].unique()}")
                print(f"   Label Counts:\n{df[label_col].value_counts()}")

        dataset_info[name] = {
            'shape': df.shape,
            'columns': list(df.columns),
            'label_column': label_col,
            'numeric_columns': list(df.select_dtypes(include=[np.number]).columns),
            'categorical_columns': list(df.select_dtypes(include=['object', 'category']).columns)
        }

    # Intersect the column sets; with no loaded dataset there is nothing in common.
    column_sets = [set(info['columns']) for info in dataset_info.values()]
    common_columns = set.intersection(*column_sets) if column_sets else set()

    print(f"\n🔗 Common Columns Across Datasets: {len(common_columns)}")
    print(f"   {sorted(common_columns)}")

    return dataset_info, common_columns

# Run the structure analysis over whatever was loaded.
loaded_frames = {'KOI': koi_df, 'K2': k2_df, 'TESS': tess_df}
dataset_info, common_columns = explore_datasets(loaded_frames)

# Show the first two rows of each available table.
print("\n📋 Sample Data Preview:")
for name, df in loaded_frames.items():
    if df is None:
        continue
    print(f"\n{name} Dataset Sample:")
    print(df.head(2).to_string())

## 🔄 Section 3: Data Preprocessing and Harmonization

Standardizing and harmonizing data across all three NASA datasets.

In [None]:
# Advanced Data Preprocessing and Harmonization System
class NASADataHarmonizer:
    """Harmonize and preprocess NASA exoplanet datasets.

    ``harmonize_datasets`` renames the known KOI feature columns to
    mission-neutral names, derives a common ``label`` column and stacks the
    per-mission frames. ``prepare_features_and_labels`` then scales the
    features and integer-encodes the labels.

    NOTE(review): only KOI column names appear in the feature map. A dataset
    that exposes its features under other names contributes only the columns
    it already shares (e.g. ``ra``/``dec``), and its rows are then removed by
    the final ``dropna()`` because they lack the remaining features.
    """

    # KOI column name -> harmonized feature name.
    _FEATURE_MAP = {
        'koi_period': 'period',
        'koi_prad': 'radius',
        'koi_teq': 'temperature',
        'koi_insol': 'insolation',
        'koi_depth': 'depth',
        'ra': 'ra',
        'dec': 'dec'
    }
    # Harmonized features kept for modelling, in output column order.
    _CORE_FEATURES = ['period', 'radius', 'temperature', 'insolation', 'depth', 'ra', 'dec']
    # KOI dispositions that are kept; anything else becomes NaN and is dropped.
    _KOI_LABELS = {
        'CONFIRMED': 'CONFIRMED',
        'CANDIDATE': 'CANDIDATE',
        'FALSE POSITIVE': 'FALSE_POSITIVE'
    }
    # Spelling variants folded into the canonical class names for the
    # non-KOI label columns, so all sources share one FALSE_POSITIVE class.
    _LABEL_ALIASES = {'FALSE POSITIVE': 'FALSE_POSITIVE'}

    def __init__(self):
        self.feature_mapping = {}
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        # Last combined frame produced by harmonize_datasets().
        self.processed_data = None

    def _attach_label(self, frame):
        """Add a normalized ``label`` column to *frame*; return the source column or None."""
        if 'koi_disposition' in frame.columns:
            label_col = 'koi_disposition'
            # astype(object) so a 'category' column maps like plain strings.
            frame['label'] = frame[label_col].astype(object).map(self._KOI_LABELS)
            return label_col

        for label_col in ('disposition', 'Disposition'):
            if label_col in frame.columns:
                aliases = self._LABEL_ALIASES
                frame['label'] = frame[label_col].astype(object).map(
                    lambda value: aliases.get(value, value)
                )
                return label_col

        return None

    def harmonize_datasets(self, koi_df, k2_df, tess_df):
        """Harmonize column names and labels across datasets and combine them.

        Args:
            koi_df, k2_df, tess_df: Per-mission DataFrames, or ``None`` for a
                dataset that is unavailable. The inputs are not modified.

        Returns:
            A DataFrame with the available core features plus ``label`` and
            ``dataset_source``, containing only rows without missing values,
            or ``None`` when no dataset had both a label column and at least
            one core feature. The result is also stored in
            ``self.processed_data``.
        """
        print("🔄 Harmonizing NASA Datasets")
        print("=" * 40)

        harmonized_datasets = []

        for name, df in [('KOI', koi_df), ('K2', k2_df), ('TESS', tess_df)]:
            if df is None:
                continue

            print(f"\n📊 Processing {name} Dataset...")
            # rename() returns a new frame, so the caller's data stays untouched.
            processed_df = df.rename(columns=self._FEATURE_MAP)
            if 'dataset_source' not in processed_df.columns:
                # Frames that did not come through the loader lack this column.
                processed_df['dataset_source'] = name

            label_col = self._attach_label(processed_df)
            available_features = [f for f in self._CORE_FEATURES if f in processed_df.columns]

            if label_col is None or not available_features:
                print(f"   ⚠️ Insufficient features or no labels found")
                continue

            # Keep only essential columns and rows that carry a label.
            keep_columns = available_features + ['label', 'dataset_source']
            final_df = processed_df[keep_columns].dropna(subset=['label'])

            print(f"   ✅ Features: {available_features}")
            print(f"   ✅ Labels: {final_df['label'].unique()}")
            print(f"   ✅ Samples: {len(final_df)}")

            harmonized_datasets.append(final_df)

        if not harmonized_datasets:
            print("❌ No datasets could be harmonized")
            return None

        combined_df = pd.concat(harmonized_datasets, ignore_index=True, sort=False)

        # Drop every row with a missing value in any column (see class note).
        combined_df = combined_df.dropna()

        print(f"\n🔗 Combined Dataset:")
        print(f"   Shape: {combined_df.shape}")
        print(f"   Features: {[col for col in combined_df.columns if col not in ['label', 'dataset_source']]}")
        print(f"   Labels: {combined_df['label'].unique()}")
        print(f"   Label Distribution:\n{combined_df['label'].value_counts()}")

        self.processed_data = combined_df
        return combined_df

    def prepare_features_and_labels(self, data):
        """Prepare features and labels for machine learning.

        Args:
            data: Combined frame from ``harmonize_datasets`` (or ``None``).

        Returns:
            ``(X_scaled, y_encoded, feature_cols)``: the standardized feature
            DataFrame, the integer-encoded label array and the feature names.
            ``(None, None, None)`` when *data* is ``None``, so callers can
            always unpack three values.
        """
        if data is None:
            return None, None, None

        print("\n🎯 Preparing Features and Labels")
        print("=" * 30)

        feature_cols = [col for col in data.columns if col not in ['label', 'dataset_source']]
        X = data[feature_cols].copy()
        y = data['label'].copy()

        # Handle any remaining missing values
        X = X.fillna(X.median())

        # NOTE(review): the scaler is fitted on all rows, before any
        # train/test split is made downstream.
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X),
            columns=X.columns,
            index=X.index
        )

        y_encoded = self.label_encoder.fit_transform(y)

        print(f"✅ Features shape: {X_scaled.shape}")
        print(f"✅ Labels shape: {y_encoded.shape}")
        print(f"✅ Label mapping: {dict(zip(self.label_encoder.classes_, range(len(self.label_encoder.classes_))))}")

        return X_scaled, y_encoded, feature_cols

# Initialize harmonizer and process data
harmonizer = NASADataHarmonizer()
combined_data = harmonizer.harmonize_datasets(koi_df, k2_df, tess_df)

# Bind the outputs up front so later cells can compare them against None
# even when harmonization fails (they would otherwise raise NameError).
X = y = feature_names = None

if combined_data is not None:
    X, y, feature_names = harmonizer.prepare_features_and_labels(combined_data)
    print(f"\n🎉 Data Harmonization Complete!")
    print(f"   Total Samples: {len(X) if X is not None else 0}")
    print(f"   Features: {len(feature_names) if feature_names else 0}")
else:
    print("❌ Data harmonization failed")

optimize_memory()

## 🔬 Section 4: Feature Engineering and Selection

Creating advanced features for improved exoplanet classification.

In [None]:
# Advanced Feature Engineering for Exoplanet Classification
class ExoplanetFeatureEngineer:
    """Create derived and interaction features for exoplanet classification.

    Both ``create_*`` methods return a new DataFrame and leave their input
    untouched. ``engineered_features`` lists the name of every column this
    instance has added, each name exactly once.
    """

    def __init__(self):
        # Names of all columns added so far (no duplicates).
        self.engineered_features = []

    @staticmethod
    def _add_feature(frame, name, values, created):
        """Write column *name*; record it in *created* only if it did not exist yet."""
        is_new = name not in frame.columns
        frame[name] = values
        if is_new:
            created.append(name)

    def _register(self, created):
        """Append newly created names to ``engineered_features`` without duplicates."""
        for name in created:
            if name not in self.engineered_features:
                self.engineered_features.append(name)

    def create_physics_features(self, df):
        """Add ratio/product features and log transforms of the core columns.

        Features are only created when all of their source columns exist in
        *df*. A small epsilon (1e-8) in each denominator avoids division by
        zero.

        Args:
            df: Frame with some of ``period``, ``radius``, ``temperature``,
                ``insolation`` and ``depth``.

        Returns:
            A copy of *df* with the new columns appended.
        """
        print("🔬 Creating Physics-Based Features")
        print("=" * 35)

        enhanced_df = df.copy()
        new_features = []

        if all(col in df.columns for col in ['period', 'radius']):
            # 2*pi*radius / period. NOTE(review): this uses the planet radius,
            # not an orbital distance, so it is a proxy rather than a velocity.
            self._add_feature(
                enhanced_df, 'orbital_velocity',
                2 * np.pi * enhanced_df['radius'] / (enhanced_df['period'] + 1e-8),
                new_features,
            )
            # Orbital period relative to planet radius.
            self._add_feature(
                enhanced_df, 'period_radius_ratio',
                enhanced_df['period'] / (enhanced_df['radius'] + 1e-8),
                new_features,
            )

        if all(col in df.columns for col in ['temperature', 'insolation']):
            # Habitability proxy
            self._add_feature(
                enhanced_df, 'habitability_index',
                enhanced_df['temperature'] / np.sqrt(enhanced_df['insolation'] + 1e-8),
                new_features,
            )
            # Stellar energy
            self._add_feature(
                enhanced_df, 'stellar_energy',
                enhanced_df['temperature'] * enhanced_df['insolation'],
                new_features,
            )

        if all(col in df.columns for col in ['depth', 'radius']):
            # Transit signal strength
            self._add_feature(
                enhanced_df, 'transit_signal',
                enhanced_df['depth'] * enhanced_df['radius'],
                new_features,
            )

        # log1p compresses the long right tail of these features; values
        # below -1 would yield NaN.
        for feature in ['period', 'radius', 'insolation', 'depth']:
            if feature in df.columns:
                self._add_feature(
                    enhanced_df, f'{feature}_log',
                    np.log1p(enhanced_df[feature]),
                    new_features,
                )

        print(f"✅ Created {len(new_features)} physics-based features")
        self._register(new_features)

        return enhanced_df

    def create_statistical_features(self, df):
        """Add pairwise product and ratio features for selected column pairs.

        A column that already exists under the same name (for example
        ``period_radius_ratio`` from ``create_physics_features``) is
        recomputed but neither counted nor registered a second time.

        Args:
            df: Frame whose numeric columns are candidates for the pairs.

        Returns:
            A copy of *df* with the new columns appended.
        """
        print("\n📊 Creating Statistical Features")
        print("=" * 30)

        enhanced_df = df.copy()
        new_features = []

        numeric_cols = {
            col for col in df.select_dtypes(include=[np.number]).columns
            if col != 'label'
        }

        # Interaction features (top combinations)
        important_pairs = [
            ('period', 'radius'),
            ('temperature', 'insolation'),
            ('radius', 'depth')
        ]

        for col1, col2 in important_pairs:
            if col1 not in numeric_cols or col2 not in numeric_cols:
                continue
            self._add_feature(
                enhanced_df, f'{col1}_{col2}_product',
                enhanced_df[col1] * enhanced_df[col2],
                new_features,
            )
            self._add_feature(
                enhanced_df, f'{col1}_{col2}_ratio',
                enhanced_df[col1] / (enhanced_df[col2] + 1e-8),
                new_features,
            )

        print(f"✅ Created {len(new_features)} statistical features")
        self._register(new_features)

        return enhanced_df

# Run the feature-engineering stage when harmonized data exists.
if X is None or combined_data is None:
    print("⚠️ No data available for feature engineering")
    X_final = X
else:
    print("🔧 Engineering Advanced Features")
    print("=" * 40)

    feature_engineer = ExoplanetFeatureEngineer()

    # Physics features first, then the interaction features on top of them.
    enhanced_data = feature_engineer.create_physics_features(combined_data.copy())
    enhanced_data = feature_engineer.create_statistical_features(enhanced_data)

    # Everything except the target and the provenance column is a feature.
    non_feature_cols = ['label', 'dataset_source']
    feature_cols = [col for col in enhanced_data.columns if col not in non_feature_cols]
    X_enhanced = enhanced_data[feature_cols].copy()

    # Median-impute anything the engineered columns left missing.
    X_enhanced = X_enhanced.fillna(X_enhanced.median())

    # Standardize; the result carries a fresh positional index.
    scaler_enhanced = StandardScaler()
    scaled_values = scaler_enhanced.fit_transform(X_enhanced)
    X_final = pd.DataFrame(scaled_values, columns=X_enhanced.columns)

    print(f"\n🎯 Final Feature Set:")
    print(f"   Original Features: {len([f for f in feature_names if f in X_enhanced.columns])}")
    print(f"   Engineered Features: {len(feature_engineer.engineered_features)}")
    print(f"   Total Features: {X_final.shape[1]}")
    print(f"   Samples: {X_final.shape[0]}")

    # Keep the fitted scaler next to the harmonizer for later use.
    harmonizer.scaler_enhanced = scaler_enhanced

optimize_memory()

## 🚀 Section 5: GPU-Accelerated Model Training

Ensemble and neural-network training, optimized for an RTX 4060 GPU.

In [None]:
# Advanced GPU-Optimized Model Training System
class ExoplanetModelTrainer:
    """Train and score several classifiers for exoplanet classification.

    Every ``train_*`` method fits its models on the training split, scores
    them by accuracy on ``X_test``/``y_test`` and stores the results in
    ``self.models`` and ``self.model_scores`` (keyed by model name).

    NOTE(review): ``X_test``/``y_test`` is also passed to the models as the
    evaluation/validation set (including Keras early stopping), so the
    reported accuracies are optimistic. Consider carving a validation split
    out of the training data instead.
    """

    def __init__(self):
        self.models = {}
        self.model_scores = {}
        self.training_history = {}
        # Module-level device returned by configure_gpu().
        self.device = device

    @staticmethod
    def _cuda_available():
        """Return True only when PyTorch was imported and reports a CUDA device."""
        return bool(PYTORCH_AVAILABLE and torch.cuda.is_available())

    def _record(self, name, model, score):
        """Store a fitted model with its accuracy and print the score."""
        self.models[name] = model
        self.model_scores[name] = score
        print(f"   {name.replace('_', ' ')} Accuracy: {score:.4f}")

    def _train_xgboost(self, X_train, X_test, y_train, y_test, use_gpu, n_classes):
        """Fit and score an XGBoost classifier; skipped when xgboost is missing."""
        try:
            from xgboost import XGBClassifier
        except ImportError:
            print("   ⚠️ XGBoost not installed - skipping")
            return

        xgb_params = {
            'n_estimators': 500,
            'max_depth': 8,
            'learning_rate': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1,
            # 'logloss' is the binary metric; multi-class needs 'mlogloss'.
            'eval_metric': 'mlogloss' if n_classes > 2 else 'logloss'
        }

        if use_gpu:
            # NOTE(review): newer XGBoost releases prefer device='cuda' with
            # tree_method='hist' - confirm against the installed version.
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['gpu_id'] = 0
            print("🚀 Using GPU acceleration for XGBoost")

        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        xgb_score = accuracy_score(y_test, xgb_model.predict(X_test))
        self._record('XGBoost', xgb_model, xgb_score)

    def _train_lightgbm(self, X_train, X_test, y_train, y_test, use_gpu):
        """Fit and score a LightGBM classifier, falling back to CPU if GPU fitting fails."""
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            print("   ⚠️ LightGBM not installed - skipping")
            return

        lgb_params = {
            'n_estimators': 500,
            'max_depth': 8,
            'learning_rate': 0.1,
            'num_leaves': 31,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }

        def fit(params):
            model = LGBMClassifier(**params)
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='logloss')
            return model

        lgb_model = None
        if use_gpu:
            gpu_params = dict(lgb_params, device='gpu', gpu_platform_id=0, gpu_device_id=0)
            print("🚀 Using GPU acceleration for LightGBM")
            try:
                # A GPU problem only surfaces at fit time, so the fit itself
                # is what has to be guarded.
                lgb_model = fit(gpu_params)
            except Exception as err:
                print(f"⚠️ GPU not available for LightGBM, using CPU ({err})")

        if lgb_model is None:
            lgb_model = fit(lgb_params)

        lgb_score = accuracy_score(y_test, lgb_model.predict(X_test))
        self._record('LightGBM', lgb_model, lgb_score)

    def _train_catboost(self, X_train, X_test, y_train, y_test, use_gpu):
        """Fit and score a CatBoost classifier; failures are reported, not raised."""
        try:
            from catboost import CatBoostClassifier
        except ImportError:
            print("   ⚠️ CatBoost not installed - skipping")
            return

        cat_params = {
            'iterations': 500,
            'depth': 8,
            'learning_rate': 0.1,
            'l2_leaf_reg': 3,
            'random_seed': 42,
            'verbose': False,
            'allow_writing_files': False
        }

        if use_gpu:
            cat_params['task_type'] = 'GPU'
            cat_params['devices'] = '0'
            print("🚀 Using GPU acceleration for CatBoost")

        try:
            cat_model = CatBoostClassifier(**cat_params)
            cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

            # ravel() so a column-vector prediction is scored like a 1-D one.
            cat_pred = np.ravel(cat_model.predict(X_test))
            cat_score = accuracy_score(y_test, cat_pred)
        except Exception as e:
            print(f"   CatBoost training failed: {str(e)}")
        else:
            self._record('CatBoost', cat_model, cat_score)

    def train_gradient_boosting_models(self, X_train, X_test, y_train, y_test):
        """Train XGBoost, LightGBM and CatBoost and record their accuracies.

        A library that is not installed is skipped with a warning instead of
        aborting the run. GPU options are only enabled when PyTorch is
        available and reports a CUDA device.

        Args:
            X_train, y_train: Training features and integer-encoded labels.
            X_test, y_test: Split used both as eval set and for scoring.
        """
        print("🌟 Training Gradient Boosting Models")
        print("=" * 35)

        use_gpu = self._cuda_available()
        n_classes = len(np.unique(y_train))

        self._train_xgboost(X_train, X_test, y_train, y_test, use_gpu, n_classes)
        self._train_lightgbm(X_train, X_test, y_train, y_test, use_gpu)
        self._train_catboost(X_train, X_test, y_train, y_test, use_gpu)

    def train_neural_networks(self, X_train, X_test, y_train, y_test):
        """Train a dense Keras network and record its accuracy.

        Args:
            X_train, X_test: Feature DataFrames (``.values`` is read).
            y_train, y_test: Integer-encoded label arrays.
        """
        print("\n🧠 Training Neural Networks")
        print("=" * 25)

        # Convert to TensorFlow format
        X_train_tf = tf.constant(X_train.values.astype(np.float32))
        X_test_tf = tf.constant(X_test.values.astype(np.float32))
        y_train_tf = tf.constant(y_train.astype(np.int32))
        y_test_tf = tf.constant(y_test.astype(np.int32))

        with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):
            nn_model = tf.keras.Sequential([
                tf.keras.layers.Input(shape=(X_train.shape[1],)),
                tf.keras.layers.Dense(256, activation='relu'),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.3),
                tf.keras.layers.Dense(128, activation='relu'),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.3),
                tf.keras.layers.Dense(64, activation='relu'),
                tf.keras.layers.BatchNormalization(),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(32, activation='relu'),
                tf.keras.layers.Dropout(0.1),
                tf.keras.layers.Dense(len(np.unique(y_train)), activation='softmax')
            ])

            # A plain float learning rate: ReduceLROnPlateau has to overwrite
            # the optimizer's learning rate, which is not settable when the
            # optimizer was built with a LearningRateSchedule.
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

            nn_model.compile(
                optimizer=optimizer,
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy']
            )

            callbacks = [
                tf.keras.callbacks.EarlyStopping(
                    monitor='val_accuracy',
                    patience=20,
                    restore_best_weights=True
                ),
                tf.keras.callbacks.ReduceLROnPlateau(
                    monitor='val_loss',
                    factor=0.5,
                    patience=10,
                    min_lr=1e-7
                )
            ]

            history = nn_model.fit(
                X_train_tf, y_train_tf,
                validation_data=(X_test_tf, y_test_tf),
                epochs=200,
                batch_size=128,
                callbacks=callbacks,
                verbose=0
            )

            nn_pred = np.argmax(nn_model.predict(X_test_tf, verbose=0), axis=1)
            nn_score = accuracy_score(y_test, nn_pred)

            self.training_history['Neural_Network'] = history
            self._record('Neural_Network', nn_model, nn_score)

    def train_ensemble_models(self, X_train, X_test, y_train, y_test):
        """Train a hard-voting ensemble of the stored models and a random forest.

        The voting ensemble is built only when at least two models are
        already stored; the Keras network and a previously built ensemble are
        excluded from the voters. A failing ensemble is reported and skipped
        so the random forest is still trained.
        """
        print("\n🎯 Training Ensemble Models")
        print("=" * 25)

        if len(self.models) >= 2:
            excluded = ('Neural_Network', 'Voting_Ensemble')
            voting_models = [(name, model) for name, model in self.models.items()
                             if name not in excluded]

            if voting_models:
                try:
                    # VotingClassifier refits clones of the given estimators.
                    voting_clf = VotingClassifier(estimators=voting_models, voting='hard')
                    voting_clf.fit(X_train, y_train)
                    voting_score = accuracy_score(y_test, voting_clf.predict(X_test))
                except Exception as e:
                    # Mirrors the CatBoost handling: report and carry on.
                    print(f"   Voting Ensemble training failed: {str(e)}")
                else:
                    self._record('Voting_Ensemble', voting_clf, voting_score)

        # Random Forest (as additional ensemble member)
        rf_model = RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        )
        rf_model.fit(X_train, y_train)

        rf_score = accuracy_score(y_test, rf_model.predict(X_test))
        self._record('Random_Forest', rf_model, rf_score)

    def get_best_model(self):
        """Return the highest-scoring model.

        Returns:
            ``(model, score, name)`` for the best entry in
            ``self.model_scores``, or ``(None, 0, None)`` when no model has
            been scored, so callers can always unpack three values.
        """
        if not self.model_scores:
            return None, 0, None

        best_model_name = max(self.model_scores, key=self.model_scores.get)
        best_score = self.model_scores[best_model_name]
        best_model = self.models[best_model_name]

        return best_model, best_score, best_model_name

# Train every model family when features and labels are available.
if X_final is None or y is None:
    print("⚠️ No data available for model training")
    trainer = None
else:
    print("🎯 Starting Comprehensive Model Training")
    print("=" * 45)

    # Stratified 80/20 split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X_final,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    trainer = ExoplanetModelTrainer()

    # Order matters: the ensemble step votes over the models trained before it.
    trainer.train_gradient_boosting_models(X_train, X_test, y_train, y_test)
    trainer.train_neural_networks(X_train, X_test, y_train, y_test)
    trainer.train_ensemble_models(X_train, X_test, y_train, y_test)

    best_model, best_score, best_model_name = trainer.get_best_model()

    print(f"\n🏆 Best Model Results")
    print("=" * 20)
    print(f"Best Model: {best_model_name}")
    print(f"Best Accuracy: {best_score:.4f}")

    # Leaderboard, best accuracy first.
    print(f"\n📊 All Model Scores:")
    ranked_scores = sorted(
        trainer.model_scores.items(), key=lambda item: item[1], reverse=True
    )
    for name, score in ranked_scores:
        print(f"   {name}: {score:.4f}")

    optimize_memory()

## 📊 Section 6: Model Evaluation and Explainability

Comprehensive evaluation metrics and SHAP explainability for model interpretation.

In [None]:
# Comprehensive Model Evaluation and Explainability System
class ExoplanetModelEvaluator:
    """Evaluate, visualise and explain the models held by a trainer.

    Fitted models are read from ``trainer.models`` and scored on the
    ``X_test`` / ``y_test`` split passed to the constructor. Per-model
    results are kept in ``evaluation_results`` and reused by the plotting
    and summary methods.
    """

    def __init__(self, trainer, X_test, y_test):
        """Store the trainer and the evaluation split; nothing is computed yet."""
        self.trainer = trainer
        self.X_test = X_test
        self.y_test = y_test
        # model name -> {'accuracy', 'precision', 'recall', 'f1_score',
        #                'predictions', 'probabilities'}
        self.evaluation_results = {}

    @staticmethod
    def _grid_shape(n_plots, max_cols=3):
        """Return ``(rows, cols)`` of the smallest grid, at most ``max_cols`` wide, that fits ``n_plots``."""
        cols = max(1, min(max_cols, n_plots))
        rows = max(1, -(-n_plots // cols))  # ceiling division
        return rows, cols

    @staticmethod
    def _extract_importance(model_name, model):
        """Return a per-feature importance vector for ``model``, or ``None`` if it exposes none."""
        if hasattr(model, 'feature_importances_'):
            # Tree-based models
            return model.feature_importances_
        if hasattr(model, 'coef_'):
            # Linear models: average |coef| over all coefficient rows so a
            # multi-class model is not judged by its first class only.
            return np.mean(np.abs(np.atleast_2d(model.coef_)), axis=0)
        if model_name == 'Voting_Ensemble' and hasattr(model, 'estimators_'):
            # Voting ensemble - average importance of the members that expose one
            member_importances = [
                estimator.feature_importances_
                for estimator in model.estimators_
                if hasattr(estimator, 'feature_importances_')
            ]
            if member_importances:
                return np.mean(member_importances, axis=0)
        return None

    def _best_model(self):
        """Return ``(model, name)`` of the trainer's best model, or ``(None, None)``.

        ``get_best_model()`` can return fewer than three items when no score
        has been recorded, so the result is normalised here.
        """
        best = self.trainer.get_best_model()
        if len(best) < 3:
            return None, None
        return best[0], best[2]

    def comprehensive_evaluation(self):
        """Score every model in ``trainer.models`` on the test split.

        Accuracy plus weighted precision, recall and F1 are printed and
        stored in ``self.evaluation_results`` together with the predictions
        and, when the model provides them, the class probabilities.
        """
        # Local import: these metrics are not among the names imported in the
        # notebook header, so do not rely on another cell having loaded them.
        from sklearn.metrics import precision_score, recall_score, f1_score

        print("📊 Comprehensive Model Evaluation")
        print("=" * 35)

        for model_name, model in self.trainer.models.items():
            print(f"\n🔍 Evaluating {model_name}")

            if model_name == 'Neural_Network':
                # The network's predict() output is used as per-class scores;
                # run inference once and derive the labels from it.
                probabilities = model.predict(self.X_test, verbose=0)
                predictions = np.argmax(probabilities, axis=1)
            else:
                predictions = model.predict(self.X_test)
                probabilities = (
                    model.predict_proba(self.X_test)
                    if hasattr(model, 'predict_proba')
                    else None
                )

            # Calculate metrics
            accuracy = accuracy_score(self.y_test, predictions)
            precision = precision_score(self.y_test, predictions, average='weighted', zero_division=0)
            recall = recall_score(self.y_test, predictions, average='weighted', zero_division=0)
            f1 = f1_score(self.y_test, predictions, average='weighted', zero_division=0)

            # Store results
            self.evaluation_results[model_name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1,
                'predictions': predictions,
                'probabilities': probabilities
            }

            print(f"   Accuracy:  {accuracy:.4f}")
            print(f"   Precision: {precision:.4f}")
            print(f"   Recall:    {recall:.4f}")
            print(f"   F1-Score:  {f1:.4f}")

    def plot_confusion_matrices(self):
        """Draw one confusion-matrix heatmap per evaluated model.

        Requires ``comprehensive_evaluation()`` to have populated
        ``self.evaluation_results``; prints a notice and returns otherwise.
        """
        print(f"\n📈 Creating Confusion Matrices")

        n_models = len(self.evaluation_results)
        if n_models == 0:
            print("   No models to evaluate")
            return

        rows, cols = self._grid_shape(n_models)
        # squeeze=False always yields a 2-D array of Axes, so flattening is
        # valid for a single model as well as for many.
        _, axes = plt.subplots(rows, cols, figsize=(6 * cols, 6 * rows), squeeze=False)
        axes = axes.flatten()

        for ax, (model_name, results) in zip(axes, self.evaluation_results.items()):
            cm = confusion_matrix(self.y_test, results['predictions'])

            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
            ax.set_title(f'{model_name}\nAccuracy: {results["accuracy"]:.4f}')
            ax.set_xlabel('Predicted')
            ax.set_ylabel('Actual')

        # Hide unused subplots
        for ax in axes[n_models:]:
            ax.axis('off')

        plt.tight_layout()
        plt.show()

    def feature_importance_analysis(self):
        """Plot the 20 most important features of the best model.

        Importances are collected for every model that exposes them; the
        plot is drawn only when the best model is among those.
        """
        print(f"\n🔍 Feature Importance Analysis")
        print("=" * 30)

        feature_names = self.X_test.columns.tolist()
        importance_data = {}

        for model_name, model in self.trainer.models.items():
            try:
                importance = self._extract_importance(model_name, model)
            except Exception as e:
                print(f"   Could not extract importance for {model_name}: {str(e)}")
                continue
            if importance is not None:
                importance_data[model_name] = importance

        if not importance_data:
            print("   No feature importance data available")
            return

        _, best_model_name = self._best_model()
        if best_model_name not in importance_data:
            print("   No feature importance available for best model")
            return

        # Sort ascending so the largest bar ends up at the top of the barh plot
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance_data[best_model_name]
        }).sort_values('importance', ascending=True)

        # Plot top 20 features
        top_features = feature_importance_df.tail(20)

        # The figure is created only once we know there is something to draw
        plt.subplots(figsize=(12, 8))
        plt.barh(range(len(top_features)), top_features['importance'])
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Feature Importance')
        plt.title(f'Top 20 Features - {best_model_name}')
        plt.tight_layout()
        plt.show()

        print(f"✅ Feature importance plotted for {best_model_name}")

    def shap_explainability(self):
        """Draw a SHAP summary plot for the best model on up to 100 test rows.

        Failures (including a missing ``shap`` package) are reported with a
        printed message rather than raised.
        """
        print(f"\n🔬 SHAP Explainability Analysis")
        print("=" * 30)

        try:
            import shap
            shap.initjs()

            best_model, best_model_name = self._best_model()
            if best_model is None:
                print("   No trained model available for SHAP analysis")
                return

            if best_model_name == 'Neural_Network':
                print("   SHAP analysis not supported for Neural Networks in this implementation")
                return

            # Use a subset for performance; the same rows serve as background
            # data and as the rows being explained.
            sample_size = min(100, len(self.X_test))
            sample = self.X_test.iloc[:sample_size]

            # Create SHAP explainer
            if best_model_name in ['XGBoost', 'LightGBM', 'Random_Forest']:
                explainer = shap.TreeExplainer(best_model)
            elif hasattr(best_model, 'predict_proba'):
                explainer = shap.Explainer(best_model.predict_proba, sample)
            else:
                print(f"   SHAP not supported for {best_model_name}")
                return

            shap_values = explainer.shap_values(sample)

            # Per-class output: plot the class at index 1 (clamped for
            # single-output results). Depending on the SHAP version the
            # per-class values arrive as a list of 2-D arrays or as one
            # 3-D array (samples, features, classes) - TODO confirm against
            # the installed SHAP release.
            if isinstance(shap_values, list):
                shap_values = shap_values[min(1, len(shap_values) - 1)]
            elif getattr(shap_values, 'ndim', 0) == 3:
                shap_values = shap_values[:, :, min(1, shap_values.shape[2] - 1)]

            plt.figure(figsize=(12, 8))
            shap.summary_plot(shap_values, sample, show=False)
            plt.title(f'SHAP Summary Plot - {best_model_name}')
            plt.tight_layout()
            plt.show()

            print(f"✅ SHAP analysis completed for {best_model_name}")

        except ImportError:
            print("   SHAP not installed. Install with: pip install shap")
        except Exception as e:
            print(f"   SHAP analysis failed: {str(e)}")

    def create_evaluation_summary(self):
        """Print and return a metrics table sorted by accuracy (best first).

        Returns:
            pandas.DataFrame with columns Model, Accuracy, Precision, Recall
            and F1-Score, or ``None`` when nothing has been evaluated.
        """
        print(f"\n📋 Evaluation Summary")
        print("=" * 20)

        if not self.evaluation_results:
            print("   No evaluation results available")
            return None

        summary_df = pd.DataFrame([
            {
                'Model': model_name,
                'Accuracy': results['accuracy'],
                'Precision': results['precision'],
                'Recall': results['recall'],
                'F1-Score': results['f1_score']
            }
            for model_name, results in self.evaluation_results.items()
        ]).sort_values('Accuracy', ascending=False)

        print(summary_df.to_string(index=False, float_format='{:.4f}'.format))

        return summary_df

# Evaluate the trained models, or record that there was nothing to evaluate
if trainer is None or not trainer.models:
    print("⚠️ No trained models available for evaluation")
    evaluator = None
    evaluation_summary = None
else:
    print("🎯 Starting Comprehensive Model Evaluation")
    print("=" * 45)

    evaluator = ExoplanetModelEvaluator(trainer, X_test, y_test)

    # Metrics come first; the confusion matrices and the summary read the
    # results stored by comprehensive_evaluation()
    evaluator.comprehensive_evaluation()
    evaluator.plot_confusion_matrices()
    evaluator.feature_importance_analysis()
    evaluator.shap_explainability()

    # Tabular overview of all models
    evaluation_summary = evaluator.create_evaluation_summary()

    # Release memory held by evaluation intermediates
    optimize_memory()

## 💾 Section 7: Model Serialization and Deployment

Save trained models and create prediction APIs for deployment.

In [None]:
# Model Serialization and Deployment System
import joblib
import json
import pickle
from datetime import datetime

class ExoplanetModelDeployment:
    """Serialize trained models and generate a standalone prediction module.

    Models are read from ``trainer.models`` and written below ``model_dir``
    (``"models"``, created on construction).
    """

    def __init__(self, trainer, harmonizer, feature_engineer=None, scaler=None):
        """Store the pipeline components and make sure the output directory exists.

        Args:
            trainer: object exposing ``models``, ``model_scores`` and
                ``get_best_model()``.
            harmonizer: preprocessing object saved with the best-model pipeline.
            feature_engineer: optional feature-engineering object, saved with
                the pipeline when given.
            scaler: optional fitted scaler, saved with the pipeline when given.
        """
        self.trainer = trainer
        self.harmonizer = harmonizer
        self.feature_engineer = feature_engineer
        self.scaler = scaler
        self.model_dir = "models"

        # Create models directory
        os.makedirs(self.model_dir, exist_ok=True)

    def save_all_models(self):
        """Save every trained model plus a small JSON metadata file each.

        The ``Neural_Network`` entry is written with the model's own
        ``save()`` as ``.h5``; every other model goes through ``joblib``.
        A failure for one model is reported and does not stop the others.
        """
        print("💾 Saving All Trained Models")
        print("=" * 28)

        if not self.trainer or not self.trainer.models:
            print("   No models to save")
            return

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        for model_name, model in self.trainer.models.items():
            try:
                model_path = os.path.join(self.model_dir, f"{model_name}_{timestamp}")

                if model_name == 'Neural_Network':
                    # Save TensorFlow model
                    model.save(f"{model_path}.h5")
                    print(f"   ✅ Saved {model_name} to {model_path}.h5")
                else:
                    # Save scikit-learn compatible models
                    joblib.dump(model, f"{model_path}.joblib")
                    print(f"   ✅ Saved {model_name} to {model_path}.joblib")

                # Save model metadata
                metadata = {
                    'model_name': model_name,
                    # float() keeps the value JSON-serializable even when the
                    # score is a NumPy scalar
                    'accuracy': float(self.trainer.model_scores.get(model_name, 0)),
                    'timestamp': timestamp,
                    'model_type': type(model).__name__
                }

                with open(f"{model_path}_metadata.json", 'w', encoding='utf-8') as f:
                    json.dump(metadata, f, indent=2)

            except Exception as e:
                print(f"   ❌ Failed to save {model_name}: {str(e)}")

    def save_best_model(self):
        """Save the best model together with its preprocessing objects.

        Returns:
            The pipeline file stem (without extension) on success, ``None``
            when there is no best model or saving failed.
        """
        print(f"\n🏆 Saving Best Model Pipeline")
        print("=" * 30)

        # Without recorded scores get_best_model() has nothing to rank and may
        # return fewer than three items, so bail out before unpacking.
        if not self.trainer or not getattr(self.trainer, 'model_scores', None):
            print("   No best model available")
            return None

        best_model, best_score, best_model_name = self.trainer.get_best_model()

        if best_model is None:
            print("   No best model available")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        pipeline_name = f"exoplanet_classifier_pipeline_{timestamp}"

        # Dataset-level facts come from notebook globals; look them up
        # defensively so a missing variable yields a default, not a NameError.
        notebook_vars = globals()
        feature_frame = notebook_vars.get('X_final')
        all_samples = notebook_vars.get('combined_data')
        labels = notebook_vars.get('y')

        # NOTE(review): harmonizer / feature_engineer are pickled by joblib;
        # presumably their classes are defined in this notebook, in which case
        # the loading process must be able to import them - confirm.
        pipeline_data = {
            'model': best_model,
            'model_name': best_model_name,
            'accuracy': best_score,
            'harmonizer': self.harmonizer,
            'feature_engineer': self.feature_engineer,
            'scaler': self.scaler,
            'feature_names': list(feature_frame.columns) if feature_frame is not None else None,
            'timestamp': timestamp,
            'metadata': {
                'dataset_info': {
                    'total_samples': len(all_samples) if all_samples is not None else 0,
                    'features': feature_frame.shape[1] if feature_frame is not None else 0,
                    'classes': len(np.unique(labels)) if labels is not None else 0
                },
                'training_info': {
                    'test_accuracy': best_score,
                    'model_type': type(best_model).__name__
                }
            }
        }

        try:
            pipeline_path = os.path.join(self.model_dir, f"{pipeline_name}.joblib")

            if best_model_name == 'Neural_Network':
                # The network is stored in its own .h5 file and left out of
                # the joblib payload.
                best_model.save(os.path.join(self.model_dir, f"{pipeline_name}_model.h5"))
                pipeline_data['model'] = None
                joblib.dump(pipeline_data, pipeline_path)
                print(f"   ✅ Neural network saved separately as {pipeline_name}_model.h5")
            else:
                joblib.dump(pipeline_data, pipeline_path)

            print(f"   ✅ Best model pipeline saved: {pipeline_name}.joblib")
            print(f"   📊 Model: {best_model_name}")
            print(f"   🎯 Accuracy: {best_score:.4f}")

            return pipeline_name

        except Exception as e:
            print(f"   ❌ Failed to save pipeline: {str(e)}")
            return None

    def create_prediction_interface(self):
        """Write ``exoplanet_predictor.py`` and return its source code.

        The generated module defines ``ExoplanetPredictor``, which loads a
        pipeline saved by ``save_best_model()``. TensorFlow is imported by the
        generated code only when a neural-network model is actually loaded.
        """
        print(f"\n🔧 Creating Prediction Interface")
        print("=" * 30)

        interface_code = '''
import joblib
import pandas as pd
import numpy as np
from typing import Dict, List, Optional

class ExoplanetPredictor:
    """Production-ready exoplanet classification interface"""

    def __init__(self, pipeline_path: str, neural_model_path: Optional[str] = None):
        """Initialize the predictor with saved pipeline"""
        # joblib.load unpickles arbitrary objects: only load pipelines you created yourself
        self.pipeline = joblib.load(pipeline_path)
        self.model = self.pipeline['model']
        self.model_name = self.pipeline['model_name']
        self.harmonizer = self.pipeline['harmonizer']
        self.scaler = self.pipeline['scaler']
        self.feature_names = self.pipeline['feature_names']

        # Pipelines saved without a 'feature_engineer' entry fall back to the harmonizer attribute
        self.feature_engineer = self.pipeline.get('feature_engineer')
        if self.feature_engineer is None:
            self.feature_engineer = getattr(self.harmonizer, 'feature_engineer', None)

        # Load neural network separately if needed
        if self.model_name == 'Neural_Network':
            if neural_model_path:
                import tensorflow as tf  # only required for the neural network
                self.model = tf.keras.models.load_model(neural_model_path)
            elif self.model is None:
                raise ValueError(
                    "neural_model_path is required: the Neural_Network model "
                    "is stored in a separate .h5 file"
                )

    def predict_single(self, sample_data: Dict) -> Dict:
        """Predict a single exoplanet sample"""
        # Convert to DataFrame
        df = pd.DataFrame([sample_data])

        # Apply harmonization and preprocessing
        processed_df = self.harmonizer.harmonize_datasets(df, 'user_input')

        # Feature engineering if available
        if self.feature_engineer is not None:
            processed_df = self.feature_engineer.create_physics_features(processed_df)
            processed_df = self.feature_engineer.create_statistical_features(processed_df)

        # Select and scale features
        X = processed_df[self.feature_names]
        if self.scaler is not None:
            X = pd.DataFrame(
                self.scaler.transform(X),
                columns=X.columns
            )

        # Make prediction
        if self.model_name == 'Neural_Network':
            probabilities = self.model.predict(X.values, verbose=0)[0]
            prediction = np.argmax(probabilities)
        else:
            prediction = self.model.predict(X)[0]
            probabilities = self.model.predict_proba(X)[0] if hasattr(self.model, 'predict_proba') else None

        return {
            'prediction': int(prediction),
            'confidence': float(np.max(probabilities)) if probabilities is not None else None,
            'probabilities': probabilities.tolist() if probabilities is not None else None,
            'model_used': self.model_name
        }

    def predict_batch(self, samples: List[Dict]) -> List[Dict]:
        """Predict multiple samples"""
        return [self.predict_single(sample) for sample in samples]

    def get_model_info(self) -> Dict:
        """Get model information"""
        return {
            'model_name': self.model_name,
            'accuracy': self.pipeline['accuracy'],
            'features_count': len(self.feature_names),
            'timestamp': self.pipeline['timestamp'],
            'metadata': self.pipeline['metadata']
        }

# Usage example:
# predictor = ExoplanetPredictor('path_to_pipeline.joblib')
# result = predictor.predict_single({'period': 3.5, 'radius': 1.2, 'temperature': 5800, ...})
'''

        # Save the interface code
        with open('exoplanet_predictor.py', 'w', encoding='utf-8') as f:
            f.write(interface_code)

        print("   ✅ Prediction interface created: exoplanet_predictor.py")

        return interface_code

# Persist the models and generate the predictor module when training produced models
if trainer is None or not trainer.models:
    print("⚠️ No trained models available for deployment")
    deployment = None
else:
    print("🚀 Preparing Model Deployment")
    print("=" * 30)

    # The scaler is optional: pass it only if an earlier cell defined it
    deployment = ExoplanetModelDeployment(
        trainer=trainer,
        harmonizer=harmonizer,
        scaler=locals().get('scaler_enhanced'),
    )

    # Every model individually, then the best one bundled with preprocessing
    deployment.save_all_models()
    pipeline_name = deployment.save_best_model()

    # Standalone module exposing the saved pipeline
    prediction_interface = deployment.create_prediction_interface()

    print(f"\n✅ Deployment preparation completed!")
    print(f"   📁 Models saved in: {deployment.model_dir}/")
    print(f"   🏆 Best pipeline: {pipeline_name}")
    print(f"   🔧 Prediction interface: exoplanet_predictor.py")

    # Release memory held by serialization intermediates
    optimize_memory()