<a href="https://colab.research.google.com/github/Khaled-Wael/NASA_Project/blob/main/Nasa%20code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# EXOPLANET DETECTION PIPELINE
!pip install lightkurve
!pip install tensorflow
!pip install keras_tuner

# Library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as pltn
import seaborn as sns
from scipy.signal import savgol_filter
from scipy.interpolate import interp1d
from scipy import stats
import lightkurve as lk
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from tensorflow.keras.losses import BinaryFocalCrossentropy
import keras_tuner as kt
import warnings
import os
import pickle
warnings.filterwarnings('ignore')

# Pin the TensorFlow and NumPy random generators so repeated runs are comparable
tf.random.set_seed(42)
np.random.seed(42)

# Report the library versions this run is executing against
print(
    "✅ Libraries imported successfully",
    f"TensorFlow version: {tf.__version__}",
    f"Lightkurve version: {lk.__version__}",
    sep="\n",
)

# Data Loading
# 1. DATA LOADING AND PARSING
# ============================================
def load_and_parse_datasets():
    """Load the K2, ExoMiner, TOI and Cumulative CSV tables from ``/content/sample_data``.

    The K2 file is scanned for the first line containing both ``pl_name`` and
    ``hostname``; that line is used as the CSV header. If no such line exists the
    file is read as a single ``raw_data`` column starting after row 97.

    When the TOI table loads and is non-empty, it is left-joined onto the
    ExoMiner table (``TIC ID`` == ``tid``) and the TOI stellar columns
    ``st_teff``/``st_rad``/``st_logg`` are renamed to the long column names used
    by the rest of the pipeline.

    Returns:
        dict: ``{'K2', 'ExoMiner', 'TOI', 'Cumulative'}`` -> ``pandas.DataFrame``.
        ``TOI`` and ``Cumulative`` are empty frames if their file cannot be read.

    Raises:
        OSError: if the K2 or ExoMiner file cannot be opened (these two loads
            are not best-effort).
    """
    k2_path = '/content/sample_data/k2pandc_2025.09.25_19.49.42.csv'
    exominer_path = ('/content/sample_data/'
                     'exominer_vetting_tess-spoc-2-min-s1s67_dashtable_dvm-url_scoregt0.1 (1).csv')
    toi_path = '/content/sample_data/TOI_2025.09.25_19.49.24.csv'
    cumulative_path = '/content/sample_data/cumulative_2025.09.25_19.49.05.csv'

    datasets = {}

    # Load K2
    print("Loading K2 dataset...")
    header_line = None
    with open(k2_path, 'r') as f:
        # Stream the file: only the position of the header row is needed
        for i, line in enumerate(f):
            if 'pl_name' in line and 'hostname' in line:
                header_line = i
                break
    # `is not None` matters: a header on the very first line has index 0, which is falsy
    if header_line is not None:
        k2_data = pd.read_csv(k2_path, skiprows=header_line, sep=',', engine='python')
    else:
        k2_data = pd.read_csv(k2_path, skiprows=97, header=None, names=['raw_data'])
    datasets['K2'] = k2_data

    # Load ExoMiner
    print("Loading ExoMiner dataset...")
    exominer_data = pd.read_csv(exominer_path)
    datasets['ExoMiner'] = exominer_data

    # Load TOI (best effort: the pipeline continues without it)
    print("Loading TOI dataset...")
    try:
        toi_data = pd.read_csv(toi_path, comment='#', engine='python')
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: pandas parser and decoding errors
        print(f"⚠ Could not load TOI dataset: {e}")
        toi_data = pd.DataFrame()
    datasets['TOI'] = toi_data

    # Merge ExoMiner with TOI
    if not toi_data.empty:
        print("Merging ExoMiner with TOI...")
        exominer_data = exominer_data.merge(toi_data, left_on='TIC ID', right_on='tid', how='left')
        exominer_data = exominer_data.rename(columns={
            'st_teff': 'Stellar Effective Temperature [K]',
            'st_rad': 'Stellar Radius [Solar Radii]',
            'st_logg': 'Stellar Surface Gravity [log10(cm/s*2)]'
        })
        datasets['ExoMiner'] = exominer_data

    # Load Cumulative (best effort)
    print("Loading Cumulative dataset...")
    try:
        cumulative_data = pd.read_csv(cumulative_path, comment='#', engine='python')
    except (OSError, ValueError) as e:
        print(f"⚠ Could not load Cumulative dataset: {e}")
        cumulative_data = pd.DataFrame()
    datasets['Cumulative'] = cumulative_data

    return datasets

# Data Preprocessing and Feature Engineering
class ExoplanetDataPreprocessor:
    """Label the ExoMiner table and turn it into a scaled feature matrix.

    Two ``StandardScaler`` objects are fitted and pickled to the working
    directory: ``scaler.pkl`` (all features) and ``stellar_scaler.pkl``
    (the three stellar columns only).
    """
    def __init__(self):
        self.scaler = StandardScaler()  # For all features
        self.stellar_scaler = StandardScaler()  # For stellar features only
        self.feature_columns = []
        self.scaler_file = 'scaler.pkl'
        self.stellar_scaler_file = 'stellar_scaler.pkl'

    def create_labels(self, datasets):
        """Add a binary ``label`` column to ``datasets['ExoMiner']`` in place.

        A row is labelled 1 when its ``ExoMiner Score`` is strictly above 0.7,
        otherwise 0. Returns the same ``datasets`` dict.
        """
        exominer_data = datasets['ExoMiner']
        exominer_data['label'] = (exominer_data['ExoMiner Score'] > 0.7).astype(int)
        print("✅ Labels created")
        print(f"ExoMiner - Planets: {exominer_data['label'].sum()}, "
              f"Non-planets: {len(exominer_data) - exominer_data['label'].sum()}")
        return datasets

    def engineer_features(self, datasets):
        """Build the scaled feature table and store it as ``datasets['ExoMiner_Features']``.

        Steps: select the transit and stellar columns, fill missing values with
        the column mean, drop IQR outliers (threshold 2.0), add two ratio
        features, fit and pickle both scalers, and standardise every column.

        Returns:
            tuple: ``(datasets, cleaned_index)`` where ``cleaned_index`` is the
            index of the rows that survived outlier removal.

        Raises:
            KeyError: if a required column is missing from the ExoMiner table.
            ValueError: if no row survives outlier removal.
        """
        exominer_data = datasets['ExoMiner']

        # Define feature columns
        transit_columns = [
            'Orbital Period [day]',
            'Transit Duration [hour]',
            'Transit Depth [ppm]',
            'Planet Radius [Earth Radii]',
            'MES',
            'Transit Model SNR',
            'Number of transits observed'
        ]
        stellar_columns = [
            'Stellar Effective Temperature [K]',
            'Stellar Radius [Solar Radii]',
            'Stellar Surface Gravity [log10(cm/s*2)]'
        ]
        feature_columns = transit_columns + stellar_columns
        features = exominer_data[feature_columns].copy()

        # Impute missing values with the column mean
        features = features.fillna(features.mean())
        original_index = features.index

        # Remove outliers; the surviving index identifies the usable rows
        features_cleaned = self.remove_outliers_iqr(features, threshold=2.0)
        cleaned_index = features_cleaned.index
        if features_cleaned.empty:
            raise ValueError("No samples left after outlier removal; cannot fit scalers.")

        # Derived features
        derived_columns = ['depth_duration_ratio', 'period_snr_ratio']
        features_cleaned['depth_duration_ratio'] = (
            features_cleaned['Transit Depth [ppm]'] / features_cleaned['Transit Duration [hour]']
        )
        features_cleaned['period_snr_ratio'] = (
            features_cleaned['Orbital Period [day]'] / features_cleaned['Transit Model SNR']
        )
        # A zero denominator gives +/-inf, which StandardScaler refuses to fit;
        # treat those ratios as missing and impute them like the base columns.
        ratios = features_cleaned[derived_columns].replace([np.inf, -np.inf], np.nan)
        features_cleaned[derived_columns] = ratios.fillna(ratios.mean())

        # Fit and persist the stellar-only scaler (re-used when building model inputs)
        self.stellar_scaler.fit(features_cleaned[stellar_columns])
        with open(self.stellar_scaler_file, 'wb') as f:
            pickle.dump(self.stellar_scaler, f)

        # Scale all features
        feature_names = features_cleaned.columns.tolist()
        features_scaled = self.scaler.fit_transform(features_cleaned)
        with open(self.scaler_file, 'wb') as f:
            pickle.dump(self.scaler, f)

        features_df = pd.DataFrame(features_scaled, columns=feature_names, index=cleaned_index)
        self.feature_columns = feature_names

        datasets['ExoMiner_Features'] = features_df
        print("✅ Features engineered")
        print(f"Original samples: {len(original_index)}, After cleaning: {len(cleaned_index)}")
        print(f"Feature columns: {self.feature_columns}")
        return datasets, cleaned_index

    def remove_outliers_iqr(self, df, threshold=2.0):
        """Return a copy of ``df`` without rows that are IQR outliers in any numeric column.

        For every numeric (non-boolean) column the bounds
        ``[Q1 - threshold*IQR, Q3 + threshold*IQR]`` are computed on the full
        input; a row is kept only if it lies inside the bounds of every column.
        Rows holding NaN in a filtered column are dropped. Columns whose
        quartiles are undefined (e.g. all-NaN) are skipped rather than wiping
        out every row.
        """
        keep = np.ones(len(df), dtype=bool)
        for column in df.columns:
            values = df[column]
            if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
                continue
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            if not np.isfinite(iqr):
                # NaN bounds would make every comparison False and drop the whole table
                continue
            lower_bound = q1 - threshold * iqr
            upper_bound = q3 + threshold * iqr
            keep &= values.between(lower_bound, upper_bound).to_numpy()
        return df.loc[keep].copy()

## LIGHT CURVE PROCESSING
class LightCurveProcessor:
    """Download TESS light curves and turn them into fixed-length global/local views.

    Attributes:
        global_length: number of samples in the global view.
        local_length: number of samples in the local view.
    """
    def __init__(self, global_length=2001, local_length=201):
        self.global_length = global_length
        self.local_length = local_length

    def download_single_lightcurve(self, tic_id):
        """Download TESS light curve for a given TIC ID (with caching).

        Results are cached as ``lightcurve_<tic_id>.pkl`` in the working
        directory; an unreadable cache file is deleted and re-downloaded.

        Returns:
            dict with keys ``tic_id``, ``time``, ``flux``, ``flux_err``
            (float arrays), or ``None`` when nothing could be downloaded.
        """
        cache_file = f"lightcurve_{tic_id}.pkl"
        if os.path.exists(cache_file):
            try:
                # NOTE(review): pickle executes code on load; only safe for cache
                # files this pipeline wrote itself.
                return pd.read_pickle(cache_file)
            except Exception as e:
                print(f"⚠ Cache read error for TIC {tic_id}: {e}")
                os.remove(cache_file)

        try:
            search_result = lk.search_lightcurve(f"TIC {tic_id}", mission='TESS', author='SPOC')
            if len(search_result) == 0:
                print(f"❌ No light curves found for TIC {tic_id}")
                return None

            lc_collection = search_result.download_all()
            if lc_collection is None or len(lc_collection) == 0:
                print(f"❌ No data downloaded for TIC {tic_id}")
                return None

            # Join all downloaded segments into one normalised series
            lc = lc_collection.stitch().remove_nans().normalize()

            time = np.array(lc.time.value, dtype=float)
            flux = np.array(lc.flux.value, dtype=float)
            flux_err = np.array(lc.flux_err.value, dtype=float) if lc.flux_err is not None else np.full_like(flux, np.nan)

            lc_data = {'tic_id': tic_id, 'time': time, 'flux': flux, 'flux_err': flux_err}
            pd.to_pickle(lc_data, cache_file)

            print(f"✅ Downloaded TIC {tic_id} with {len(time)} points")
            return lc_data

        except Exception as e:
            # Best effort: a failed download skips this target instead of aborting the run
            print(f"❌ Download error for TIC {tic_id}: {e}")
            return None

    def preprocess_lightcurve(self, time, flux, period=None, detrend=True, normalize=True, augment=False):
        """Clean one light curve and build its global and local views.

        Args:
            time, flux: equally long sequences; samples where either is
                non-finite are discarded.
            period: accepted for interface compatibility; currently unused
                (no phase folding is performed).
            detrend: divide by a Savitzky-Golay trend (window 101, order 2)
                when more than 101 samples remain.
            normalize: divide the flux by its median.
            augment: add Gaussian noise with sigma = 1% of the flux std.

        Returns:
            ``(global_view, local_view)`` arrays of length ``global_length`` and
            ``local_length``, or ``(None, None)`` when fewer than 100 finite
            samples remain or the median flux is zero.
        """
        # Accept lists and integer arrays; all arithmetic below is floating point
        time = np.asarray(time, dtype=float)
        flux = np.asarray(flux, dtype=float)
        mask = np.isfinite(time) & np.isfinite(flux)
        flux_clean = flux[mask]
        if len(flux_clean) < 100:
            return None, None

        if normalize:
            median = np.median(flux_clean)
            if median == 0:
                # Dividing by zero would yield an all-inf/NaN curve
                return None, None
            flux_clean = flux_clean / median

        if detrend and len(flux_clean) > 101:
            try:
                window_length = min(101, len(flux_clean) - 1)
                if window_length % 2 == 0:
                    window_length -= 1  # savgol_filter requires an odd window
                trend = savgol_filter(flux_clean, window_length, 2)
                flux_clean = flux_clean / trend
            except Exception as e:
                print(f"⚠ Detrending failed: {e}")

        if augment:
            flux_clean = flux_clean + np.random.normal(0, 0.01 * np.std(flux_clean), len(flux_clean))

        # Create global view (full light curve)
        global_view = self.resample_to_fixed_length(flux_clean, self.global_length)

        # Create local view: exactly `local_length` samples around the middle
        # of the series (fewer only if the series is shorter than that).
        mid_idx = len(flux_clean) // 2
        start = max(0, mid_idx - self.local_length // 2)
        end = min(len(flux_clean), start + self.local_length)
        local_view = self.resample_to_fixed_length(flux_clean[start:end], self.local_length)

        return global_view, local_view

    def resample_to_fixed_length(self, flux, length):
        """Resample flux to a fixed sequence length by linear interpolation.

        Input of exactly ``length`` samples is returned unchanged; a single
        sample is repeated ``length`` times.

        Raises:
            ValueError: if ``flux`` is empty.
        """
        flux = np.asarray(flux, dtype=float)
        n_samples = len(flux)
        if n_samples == 0:
            raise ValueError("Cannot resample an empty flux array.")
        if n_samples == length:
            return flux
        if n_samples == 1:
            return np.full(length, flux[0])
        # Map both series onto [0, 1] so the end points coincide
        x_old = np.linspace(0, 1, n_samples)
        x_new = np.linspace(0, 1, length)
        return np.interp(x_new, x_old, flux)
def preprocess_lightcurve(self, time, flux, period=None, detrend=True, normalize=True, augment=False):
    """Module-level alias of ``LightCurveProcessor.preprocess_lightcurve``.

    This function was a pasted copy of the method body (followed by an
    unreachable nested ``resample_to_fixed_length``). It is kept only so the
    module-level name stays importable and now delegates to the method so
    there is a single implementation.

    Args:
        self: a ``LightCurveProcessor`` (or any object providing
            ``global_length``, ``local_length`` and ``resample_to_fixed_length``).
        time, flux, period, detrend, normalize, augment: forwarded unchanged.

    Returns:
        ``(global_view, local_view)`` or ``(None, None)``, as returned by the method.
    """
    return LightCurveProcessor.preprocess_lightcurve(
        self, time, flux, period=period, detrend=detrend, normalize=normalize, augment=augment
    )

# Model definition (AstroNet-style CNN; built from scratch, no pretrained weights are loaded)
class ExoplanetTransferLearningModel:
    """AstroNet-like Keras classifier with three inputs.

    Inputs are a global light-curve view, a local light-curve view and a
    vector of stellar parameters; the output is a single sigmoid probability.

    Attributes:
        global_shape, local_shape, stellar_shape: per-sample input shapes.
        num_classes: stored for reference; the output layer is always one unit.
        model: the built ``keras.Model``, or ``None`` before ``create_model``.
    """
    def __init__(self, global_shape=(2001, 1), local_shape=(201, 1), stellar_shape=(3,), num_classes=1):
        self.global_shape = global_shape
        self.local_shape = local_shape
        self.stellar_shape = stellar_shape
        self.num_classes = num_classes
        self.model = None

    def create_model(self):
        """Build the network, store it in ``self.model`` and return it."""
        global_input = layers.Input(shape=self.global_shape, name='global_lc')
        local_input = layers.Input(shape=self.local_shape, name='local_lc')
        stellar_input = layers.Input(shape=self.stellar_shape, name='stellar_params')

        def cnn_branch(input_layer, filters=(16, 32, 64, 128, 256)):
            # One Conv1D + MaxPooling1D stage per filter count, then flatten.
            # `filters` is a tuple so the default cannot be mutated between calls.
            x = input_layer
            for f in filters:
                x = layers.Conv1D(f, 5, activation='relu', padding='same')(x)
                x = layers.MaxPooling1D(5, strides=2, padding='same')(x)
            x = layers.Flatten()(x)
            return x

        x_global = cnn_branch(global_input)
        x_local = cnn_branch(local_input)

        # Fuse both light-curve branches, with one residual dense connection
        x = layers.Concatenate()([x_global, x_local])
        x = layers.Dense(256, activation='relu')(x)
        residual = layers.Dense(256, activation='relu')(x)
        x = layers.Add()([x, residual])
        x = layers.BatchNormalization()(x)
        x = layers.Dropout(0.3)(x)

        # Stellar parameters join after their own small dense embedding
        s = layers.Dense(32, activation='relu')(stellar_input)
        x = layers.Concatenate()([x, s])
        x = layers.Dense(128, activation='relu')(x)
        x = layers.Dropout(0.5)(x)

        outputs = layers.Dense(1, activation='sigmoid')(x)

        self.model = keras.Model(inputs=[global_input, local_input, stellar_input], outputs=outputs)
        return self.model

    def compile_model(self, learning_rate=0.001):
        """Compile ``self.model`` with Adam and binary focal loss.

        Builds the model first if ``create_model`` has not been called yet.
        Tracked metrics: accuracy, precision, recall and AUC (named ``auc``,
        which the callbacks monitor as ``val_auc``).
        """
        if self.model is None:
            self.create_model()
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate),
            loss=BinaryFocalCrossentropy(alpha=0.25, gamma=2.0),
            metrics=['accuracy', keras.metrics.Precision(name='precision'),
                     keras.metrics.Recall(name='recall'), keras.metrics.AUC(name='auc')]
        )
        print("✅ Model compiled with focal loss")

    def setup_callbacks(self):
        """Return the training callbacks, all driven by validation AUC.

        Early stopping (patience 20, restores best weights), learning-rate
        halving on plateau (patience 10) and a best-only checkpoint written
        to ``best_model.h5`` in the working directory.
        """
        return [
            callbacks.EarlyStopping(monitor='val_auc', patience=20, mode='max', restore_best_weights=True),
            callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=10, min_lr=1e-6, mode='max'),
            callbacks.ModelCheckpoint('best_model.h5', monitor='val_auc', save_best_only=True, mode='max')
        ]

#Model Training and Evaluation
class ModelTrainer:
    """Build model inputs from light curves, then train, evaluate and cross-validate.

    Attributes:
        history: the object returned by the most recent ``fit`` call, or ``None``.
    """

    # ExoMiner-table columns fed to the model's stellar-parameter input
    STELLAR_COLUMNS = [
        'Stellar Effective Temperature [K]',
        'Stellar Radius [Solar Radii]',
        'Stellar Surface Gravity [log10(cm/s*2)]',
    ]

    def __init__(self):
        self.history = None

    def _load_stellar_scaler(self, preprocessor):
        """Unpickle the stellar scaler once; return ``None`` (with a warning) on failure."""
        try:
            with open(preprocessor.stellar_scaler_file, 'rb') as f:
                return pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError, AttributeError, ImportError) as e:
            print(f"Error loading stellar scaler: {e}. Using raw stellar features.")
            return None

    @staticmethod
    def _scale_stellar(stellar, scaler):
        """Standardise an ``(n, 3)`` stellar array; fall back to raw values without a scaler."""
        if scaler is None:
            return np.nan_to_num(stellar)
        # Missing values are passed through the scaler as NaN and then set to 0,
        # i.e. the scaler's mean, instead of being standardised as a literal 0.
        masked = np.where(np.isfinite(stellar), stellar, np.nan)
        try:
            scaled = scaler.transform(masked)
        except ValueError as e:
            print(f"Error applying stellar scaler: {e}. Using raw stellar features.")
            return np.nan_to_num(stellar)
        return np.nan_to_num(scaled, nan=0.0)

    def _collect_samples(self, exominer_data, index, processor, scaler):
        """Download and preprocess every row in ``index``.

        Rows whose light curve cannot be downloaded or preprocessed are skipped.
        Returns ``(X_global, X_local, X_stellar, y)`` or ``None`` if no row
        produced a usable light curve.
        """
        global_views, local_views, stellar_rows, labels = [], [], [], []
        for idx in index:
            tic_id = exominer_data.loc[idx, 'TIC ID']
            lc_data = processor.download_single_lightcurve(tic_id)
            if lc_data is None:
                continue
            g_flux, l_flux = processor.preprocess_lightcurve(
                lc_data['time'], lc_data['flux'],
                period=exominer_data.loc[idx, 'Orbital Period [day]'],
            )
            if g_flux is None or l_flux is None:
                continue
            global_views.append(g_flux)
            local_views.append(l_flux)
            stellar_rows.append(exominer_data.loc[idx, self.STELLAR_COLUMNS].to_numpy(dtype=float))
            labels.append(exominer_data.loc[idx, 'label'])

        if not global_views:
            return None

        X_global = np.array(global_views).reshape(-1, processor.global_length, 1)
        X_local = np.array(local_views).reshape(-1, processor.local_length, 1)
        X_stellar = self._scale_stellar(np.array(stellar_rows, dtype=float), scaler)
        return X_global, X_local, X_stellar, np.array(labels)

    def prepare_lc_datasets(self, datasets, cleaned_index, processor, preprocessor):
        """Build stratified train/validation/test arrays for the rows in ``cleaned_index``.

        20% of the samples go to the test set and 20% of the remainder to the
        validation set (both splits use ``random_state=42``).

        Returns:
            ``(X_train_g, X_val_g, X_test_g, X_train_l, X_val_l, X_test_l,
            X_train_s, X_val_s, X_test_s, y_train, y_val, y_test)``.

        Raises:
            ValueError: if no light curve could be processed.
        """
        scaler = self._load_stellar_scaler(preprocessor)
        samples = self._collect_samples(datasets['ExoMiner'], cleaned_index, processor, scaler)
        if samples is None:
            raise ValueError("No valid light curves processed.")
        global_lc, local_lc, stellar, labels = samples

        X_train_g, X_test_g, X_train_l, X_test_l, X_train_s, X_test_s, y_train, y_test = train_test_split(
            global_lc, local_lc, stellar, labels, test_size=0.2, random_state=42, stratify=labels
        )
        X_train_g, X_val_g, X_train_l, X_val_l, X_train_s, X_val_s, y_train, y_val = train_test_split(
            X_train_g, X_train_l, X_train_s, y_train, test_size=0.2, random_state=42, stratify=y_train
        )

        return X_train_g, X_val_g, X_test_g, X_train_l, X_val_l, X_test_l, X_train_s, X_val_s, X_test_s, y_train, y_val, y_test

    def compute_class_weights(self, y_train):
        """Return ``{class_label: weight}`` using scikit-learn's 'balanced' heuristic.

        Keys are the actual labels present in ``y_train`` (not their position),
        so a training set containing only class 1 yields ``{1: ...}``.
        """
        classes = np.unique(y_train)
        class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
        weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}
        print(f"📊 Class weights: {weight_dict}")
        return weight_dict

    def train_model(self, model, X_train_g, X_train_l, X_train_s, y_train, X_val_g, X_val_l, X_val_s, y_val,
                    epochs=200, batch_size=64, class_weight=None):
        """Fit ``model.model`` with the model's own callbacks and return the fit history."""
        fit_callbacks = model.setup_callbacks()
        self.history = model.model.fit(
            [X_train_g, X_train_l, X_train_s], y_train,
            validation_data=([X_val_g, X_val_l, X_val_s], y_val),
            epochs=epochs,
            batch_size=batch_size,
            class_weight=class_weight,
            callbacks=fit_callbacks,
            verbose=1
        )
        return self.history

    def evaluate_model(self, model, X_test_g, X_test_l, X_test_s, y_test):
        """Reload ``best_model.h5`` into ``model.model`` and report test metrics.

        Returns:
            ``(y_pred, y_pred_proba)``: thresholded (0.5) labels and raw probabilities.
        """
        model.model = keras.models.load_model('best_model.h5')
        y_pred_proba = model.model.predict([X_test_g, X_test_l, X_test_s], verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int).flatten()
        test_accuracy = accuracy_score(y_test, y_pred)
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)
        report = classification_report(y_test, y_pred, output_dict=True)
        # The positive class is absent from the report when it appears in neither
        # y_test nor y_pred; report NaN instead of raising KeyError.
        positive = report.get('1', {})
        print(f"\n📈 TEST RESULTS:")
        print(f" Accuracy: {test_accuracy:.4f}")
        print(f" ROC AUC: {roc_auc:.4f}")
        print(f" Precision: {positive.get('precision', float('nan')):.4f}")
        print(f" Recall: {positive.get('recall', float('nan')):.4f}")
        return y_pred, y_pred_proba

    def cross_validate(self, model_class, datasets, cleaned_index, processor, preprocessor, n_splits=5):
        """Run stratified k-fold cross-validation and return the per-fold ROC AUC list.

        A fresh ``model_class()`` is built, compiled and trained for 50 epochs in
        every fold. Training goes through ``train_model``, so each fold also
        overwrites the ``best_model.h5`` checkpoint.
        """
        auc_scores = []
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        # Positional fold indices are applied below, so a plain list must become an array
        cleaned_index = np.asarray(cleaned_index)
        labels = datasets['ExoMiner']['label'].loc[cleaned_index]

        for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(labels)), labels)):
            print(f"Cross-validation Fold {fold+1}/{n_splits}")
            train_cleaned = cleaned_index[train_idx]
            val_cleaned = cleaned_index[val_idx]
            X_train_g, X_val_g, X_train_l, X_val_l, X_train_s, X_val_s, y_train, y_val = \
                self.prepare_lc_datasets_for_cv(datasets, train_cleaned, val_cleaned, processor, preprocessor)

            model = model_class()
            model.create_model()
            model.compile_model()
            self.train_model(model, X_train_g, X_train_l, X_train_s, y_train,
                           X_val_g, X_val_l, X_val_s, y_val, epochs=50)
            y_pred_proba = model.model.predict([X_val_g, X_val_l, X_val_s], verbose=0)
            fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
            auc_scores.append(auc(fpr, tpr))

        print(f"\nCross-validation AUC: {np.mean(auc_scores):.3f} ± {np.std(auc_scores):.3f}")
        return auc_scores

    def prepare_lc_datasets_for_cv(self, datasets, train_index, val_index, processor, preprocessor):
        """Build the train and validation arrays for one cross-validation fold.

        Returns:
            ``(X_train_g, X_val_g, X_train_l, X_val_l, X_train_s, X_val_s, y_train, y_val)``.

        Raises:
            ValueError: if either side of the fold has no usable light curve.
        """
        exominer_data = datasets['ExoMiner']
        scaler = self._load_stellar_scaler(preprocessor)

        train = self._collect_samples(exominer_data, train_index, processor, scaler)
        val = self._collect_samples(exominer_data, val_index, processor, scaler)
        if train is None or val is None:
            raise ValueError("No valid light curves processed for cross-validation.")

        X_train_g, X_train_l, X_train_s, y_train = train
        X_val_g, X_val_l, X_val_s, y_val = val
        return X_train_g, X_val_g, X_train_l, X_val_l, X_train_s, X_val_s, y_train, y_val

# Hyperparameter tuning
def tune_hyperparameters(X_train_g, X_train_l, X_train_s, y_train, X_val_g, X_val_l, X_val_s, y_val):
    """Tune the learning rate with Keras Tuner's Hyperband search.

    The only hyperparameter searched is ``lr`` (log-uniform in [1e-5, 1e-3]).
    Trial state is kept under ``tuner_dir/exoplanet``.

    Returns:
        tuple: ``(best_model, best_hyperparameters)``.
    """
    def build_model(hp):
        model = ExoplanetTransferLearningModel()
        model.create_model()
        model.compile_model(learning_rate=hp.Float('lr', 1e-5, 1e-3, sampling='log'))
        return model.model

    tuner = kt.Hyperband(
        build_model,
        # State the direction explicitly instead of relying on name-based inference
        objective=kt.Objective('val_auc', direction='max'),
        max_epochs=200,
        factor=3,
        directory='tuner_dir',
        project_name='exoplanet'
    )

    # No `epochs` argument: Hyperband assigns the epoch budget of every trial itself
    # (`tuner/epochs`), so a value passed here would be overridden.
    tuner.search([X_train_g, X_train_l, X_train_s], y_train,
                 validation_data=([X_val_g, X_val_l, X_val_s], y_val))

    best_hps = tuner.get_best_hyperparameters()[0]
    print(f"Best learning rate: {best_hps.get('lr')}")
    return tuner.get_best_models(1)[0], best_hps

# ============================================
# 7. VISUALIZATION AND ANALYSIS
# ============================================
def visualize_results(history, y_test, y_pred, y_pred_proba):
    """Plot the training history, the ROC curve and the confusion matrix.

    Args:
        history: fit history whose ``.history`` dict holds ``accuracy``,
            ``val_accuracy``, ``auc`` and ``val_auc``.
        y_test: true binary labels.
        y_pred: predicted binary labels.
        y_pred_proba: predicted probabilities (used for the ROC curve).
    """
    # The file-level import binds pyplot to `pltn`, so `plt` is imported here
    # to keep this function self-sufficient.
    import matplotlib.pyplot as plt

    # Plot training history
    plt.figure(figsize=(12, 6))
    for metric in ('accuracy', 'val_accuracy', 'auc', 'val_auc'):
        plt.plot(history.history[metric], label=metric)
    plt.xlabel('Epoch')
    plt.ylabel('Metric Value')
    plt.title('Training History')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Not Exoplanet', 'Exoplanet'],
                yticklabels=['Not Exoplanet', 'Exoplanet'])
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.show()

def test_with_real_lightcurves(model, datasets, preprocessor, num_samples=10):
    """Print model predictions for randomly chosen rows outside the feature table.

    Candidates are the ExoMiner rows whose index is not in
    ``datasets['ExoMiner_Features']`` (i.e. rows removed during feature
    cleaning). Nothing is returned; one line is printed per target.

    Args:
        model: wrapper exposing a trained Keras model as ``model.model``.
        datasets: dict holding ``'ExoMiner'`` and ``'ExoMiner_Features'``.
        preprocessor: object providing ``stellar_scaler_file``.
        num_samples: number of targets to draw without replacement.
    """
    print(f"\n🔬 Testing with {num_samples} real light curves:")
    exominer_data = datasets['ExoMiner']
    all_indices = exominer_data.index
    cleaned_index = datasets['ExoMiner_Features'].index
    unseen_indices = all_indices.difference(cleaned_index)

    if len(unseen_indices) < num_samples:
        print("Not enough unseen samples to test.")
        return

    test_indices = np.random.choice(unseen_indices, num_samples, replace=False)
    processor = LightCurveProcessor()

    try:
        with open(preprocessor.stellar_scaler_file, 'rb') as f:
            scaler = pickle.load(f)
    except Exception as e:
        print(f"Error loading stellar scaler: {e}. Using raw stellar features.")
        scaler = None

    stellar_columns = ['Stellar Effective Temperature [K]',
                       'Stellar Radius [Solar Radii]',
                       'Stellar Surface Gravity [log10(cm/s*2)]']

    for idx in test_indices:
        tic_id = exominer_data.loc[idx, 'TIC ID']
        period = exominer_data.loc[idx, 'Orbital Period [day]']
        true_label = exominer_data.loc[idx, 'label']
        lc_data = processor.download_single_lightcurve(tic_id)
        if lc_data is None:
            print(f" TIC ID: {tic_id} - Light curve download failed.")
            continue

        g_flux, l_flux = processor.preprocess_lightcurve(lc_data['time'], lc_data['flux'], period=period)
        if g_flux is None or l_flux is None:
            print(f" TIC ID: {tic_id} - Preprocessing failed.")
            continue

        # Keep the stellar input 2-D, shape (1, 3): it must share the batch
        # dimension of the two light-curve inputs passed to predict().
        stellar_row = exominer_data.loc[idx, stellar_columns].to_numpy(dtype=float).reshape(1, -1)
        if scaler is not None:
            # Missing values become 0 after scaling (the scaler's mean)
            # rather than being standardised as a literal 0.
            masked = np.where(np.isfinite(stellar_row), stellar_row, np.nan)
            stellar_row = np.nan_to_num(scaler.transform(masked), nan=0.0)
        else:
            stellar_row = np.nan_to_num(stellar_row)

        g_flux = np.array(g_flux).reshape(1, processor.global_length, 1)
        l_flux = np.array(l_flux).reshape(1, processor.local_length, 1)
        y_pred_proba = model.model.predict([g_flux, l_flux, stellar_row], verbose=0)
        predicted_label = (y_pred_proba > 0.5).astype(int).flatten()[0]
        print(f" TIC ID: {tic_id}, True Label: {true_label}, "
              f"Predicted Probability: {y_pred_proba[0][0]:.4f}, Predicted Label: {predicted_label}")

def analyze_feature_importance(datasets):
    """Rank the engineered features with a RandomForest and plot the top 10.

    A 100-tree forest is fitted on a stratified 80% split of
    ``datasets['ExoMiner_Features']`` against the matching ``label`` values.
    Failures during fitting or plotting are reported and swallowed so the
    surrounding pipeline keeps running. Returns ``None``.
    """
    # The file-level import binds pyplot to `pltn`, so `plt` is imported here;
    # without it the plotting step below raised NameError inside the try block.
    import matplotlib.pyplot as plt

    print("\n📊 Analyzing Feature Importance:")
    exominer_features = datasets['ExoMiner_Features']
    labels = datasets['ExoMiner'].loc[exominer_features.index, 'label']

    if len(exominer_features) == 0 or len(labels) == 0:
        print("Not enough data to analyze feature importance.")
        return

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            exominer_features, labels, test_size=0.2, random_state=42, stratify=labels
        )
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        feature_importance_df = pd.DataFrame({
            'feature': exominer_features.columns,
            'importance': rf_model.feature_importances_,
        }).sort_values('importance', ascending=False)
        print("Top 10 Feature Importance:")
        print(feature_importance_df.head(10))

        plt.figure(figsize=(10, 6))
        sns.barplot(x='importance', y='feature', data=feature_importance_df.head(10), palette='viridis')
        plt.title('Top 10 Feature Importance from RandomForest')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.show()
    except Exception as e:
        print(f"Error during feature importance analysis: {e}")

# Pipeline execution
def main():
    """Execute the exoplanet detection pipeline with a limit of 50 light curves.

    Steps: load the tables, label and engineer features, sample at most 40
    targets, build train/validation/test arrays, tune the learning rate,
    train the final model, evaluate it on the held-out test split, then run
    the visualisation, spot-check, feature-importance and cross-validation
    reports.

    Returns:
        tuple: ``(model, history)`` for the final trained model.
    """
    datasets = load_and_parse_datasets()
    preprocessor = ExoplanetDataPreprocessor()
    datasets = preprocessor.create_labels(datasets)
    datasets, cleaned_index = preprocessor.engineer_features(datasets)

    # Limit cleaned_index to 40 TIC IDs to stay within 50 total light curves
    if len(cleaned_index) > 40:
        cleaned_index = np.random.choice(cleaned_index, size=40, replace=False)
        print(f"✅ Limited cleaned_index to 40 TIC IDs for light curve downloads")

    processor = LightCurveProcessor()
    trainer = ModelTrainer()

    X_train_g, X_val_g, X_test_g, X_train_l, X_val_l, X_test_l, X_train_s, X_val_s, X_test_s, y_train, y_val, y_test = \
        trainer.prepare_lc_datasets(datasets, cleaned_index, processor, preprocessor)

    # Only the tuned hyperparameters are needed; the final model is rebuilt below
    _, best_hps = tune_hyperparameters(
        X_train_g, X_train_l, X_train_s, y_train, X_val_g, X_val_l, X_val_s, y_val
    )

    # Re-create and train the final model
    model = ExoplanetTransferLearningModel()
    model.create_model()
    model.compile_model(learning_rate=best_hps.get('lr'))

    # Early stopping and checkpoint selection monitor the validation split.
    # The test split stays untouched until evaluate_model, so the reported
    # test metrics are not biased by model selection.
    class_weights = trainer.compute_class_weights(y_train)
    history = trainer.train_model(
        model, X_train_g, X_train_l, X_train_s, y_train,
        X_val_g, X_val_l, X_val_s, y_val, epochs=200, batch_size=64, class_weight=class_weights
    )

    y_pred, y_pred_proba = trainer.evaluate_model(model, X_test_g, X_test_l, X_test_s, y_test)

    visualize_results(history, y_test, y_pred, y_pred_proba)
    test_with_real_lightcurves(model, datasets, preprocessor, num_samples=10)  # 10 test light curves
    analyze_feature_importance(datasets)
    auc_scores = trainer.cross_validate(ExoplanetTransferLearningModel, datasets, cleaned_index, processor, preprocessor)

    print(f"\n🎉 PIPELINE COMPLETED! Total light curves downloaded: ~50 (40 from cleaned_index + 10 test)")
    return model, history
# Run the pipeline
if __name__ == "__main__":
    import traceback

    # Top-level boundary: report any failure with its traceback instead of
    # letting the exception propagate out of the script/notebook cell.
    try:
        model, history = main()
    except Exception as exc:
        print(f"❌ Error: {exc}")
        traceback.print_exc()
    else:
        print("\n🎉 PIPELINE COMPLETED!")

Trial 73 Complete [00h 00m 18s]
val_auc: 0.5

Best val_auc So Far: 0.5
Total elapsed time: 00h 19m 22s

Search: Running Trial #74

Value             |Best Value So Far |Hyperparameter
1.1946e-05        |0.00070477        |lr
3                 |3                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
4                 |4                 |tuner/bracket
0                 |0                 |tuner/round

✅ Model compiled with focal loss
Epoch 1/3
