<a href="https://colab.research.google.com/github/Moe-phantom/Moe-phantom/blob/main/gemini_Approch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transitleastsquares
!pip install lightkurve

Collecting transitleastsquares
  Downloading transitleastsquares-1.32-py3-none-any.whl.metadata (5.3 kB)
Collecting astroquery>=0.3.9 (from transitleastsquares)
  Downloading astroquery-0.4.11-py3-none-any.whl.metadata (6.5 kB)
Collecting batman-package (from transitleastsquares)
  Downloading batman_package-2.5.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (728 bytes)
Collecting configparser (from transitleastsquares)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting pyvo>=1.5 (from astroquery>=0.3.9->transitleastsquares)
  Downloading pyvo-1.7-py3-none-any.whl.metadata (4.7 kB)
Downloading transitleastsquares-1.32-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.5/47.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astroquery-0.4.11-py3-none-any.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0

In [None]:
# =============================================================================
# 🚀 HACKATHON SCRIPT: FAST & FURIOUS PLANET VETTER
# =============================================================================
# Description:
# This script automates the vetting of TESS Objects of Interest (TOIs) by:
# 1. Loading the official TESS TOI catalog from a local CSV export.
# 2. Downloading and stitching all available high-quality light curve data.
# 3. Using the powerful Transit Least Squares (TLS) algorithm to find the most
#    likely planet transit and extract its physical properties.
# 4. Training a robust RandomForest model on these high-quality features.
# 5. Generating a submission-ready file with planet probabilities and a
#    professional plot of the top candidate.
#
# Author: Gemini (Your AI Teammate)
# Date: October 3, 2025
# =============================================================================


# =============================================================================
# STEP 0: SETUP - IMPORTS AND CONFIGURATION
# =============================================================================
print("STEP 0: Initializing setup...")

# Core Libraries
import pandas as pd
import numpy as np
import lightkurve as lk
from tqdm import tqdm
import warnings
import joblib
import os
import requests
import matplotlib.pyplot as plt

# The star of the show for transit detection
from transitleastsquares import transitleastsquares

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Silence every warning category so library chatter does not clutter the output
warnings.filterwarnings(action='ignore')

# --- Configuration ---
# Run-wide settings shared by every step below.
CONFIG = dict(
    data_dir='./hackathon_output',   # where the model, CSV and plot are written
    max_targets_to_process=150,      # upper bound on catalog rows sampled in STEP 1
    period_min=0.5,                  # shortest orbital period (days) searched by TLS
    period_max=15,                   # longest orbital period (days) searched by TLS
    random_state=42,                 # seed reused for sampling, splitting and the model
)

# Make sure the output directory is there before anything tries to write to it
os.makedirs(CONFIG['data_dir'], exist_ok=True)

print("✅ Setup complete.")


# =============================================================================
# STEP 1: LOAD THE OFFICIAL TESS TOI CATALOG
# =============================================================================
print("\nSTEP 1: Loading TESS TOI Catalog from local file...")

# --- IMPORTANT ---
# Path to the TOI catalog CSV that STEP 1 loads. Update this to point at your
# own CSV export before running; the value below is a Colab-style path under
# /content.
LOCAL_CATALOG_FILE = '/content/TOI_2025.09.28_05.51.22.csv'

def load_local_toi_catalog(filepath):
    """
    Load the TOI catalog from a local CSV file.

    Args:
        filepath: Path to the catalog CSV. Lines starting with '#' are
            treated as comments and skipped while parsing.

    Returns:
        A pandas DataFrame with the catalog rows, or None when the file does
        not exist or cannot be parsed. If any column used by the later steps
        is missing, a warning is printed but the frame is still returned.
    """
    if not os.path.exists(filepath):
        print(f"❌ ERROR: The file was not found at the specified path: {filepath}")
        print("   Please make sure the CSV file is in the correct directory and the filename is correct.")
        return None

    try:
        # Read the path that was actually passed in, so the existence check
        # above and the success message below refer to the file being loaded.
        df = pd.read_csv(filepath, comment='#')
    except Exception as e:
        # Best-effort loader: report any parse/IO failure and signal it with None
        print(f"❌ ERROR: Failed to read the CSV file. Error: {e}")
        return None

    print(f"✅ Successfully loaded {len(df)} TOIs from {filepath}.")

    # --- Data Validation ---
    # These columns are the ones the filtering and modeling steps rely on
    required_cols = ['tid', 'tfopwg_disp', 'pl_orbper', 'st_rad']
    if any(col not in df.columns for col in required_cols):
        print(f"⚠️ WARNING: Your CSV is missing one or more required columns: {required_cols}")
        print("   The script might fail. Please ensure your CSV has the necessary data.")
    return df

# Load the catalog from your local file
toi_catalog = load_local_toi_catalog(LOCAL_CATALOG_FILE)

# Nothing below can run without a catalog, so stop right away on a failed load.
if toi_catalog is None:
    print("\nStopping script due to data loading failure.")
    # Raise SystemExit directly instead of calling exit(): that helper is added
    # by the `site` module for interactive use, and a non-zero status marks
    # the run as failed.
    raise SystemExit(1)

# Remove rows with missing data in key columns, then rows with a non-positive period
toi_catalog = toi_catalog.dropna(subset=['tid', 'tfopwg_disp', 'pl_orbper', 'st_rad'])
toi_catalog = toi_catalog[toi_catalog['pl_orbper'] > 0]

# Split the catalog by disposition: planet-like (CP/KP/PC) vs. false positive/alarm (FP/FA)
toi_planets_candidates = toi_catalog[toi_catalog['tfopwg_disp'].isin(['CP', 'KP', 'PC'])]
toi_false_positives = toi_catalog[toi_catalog['tfopwg_disp'].isin(['FP', 'FA'])]

# Aim for half of the processing budget from each group
n_half = CONFIG['max_targets_to_process'] // 2

# Use min() so sampling never asks for more rows than a group contains
sample_planets = min(n_half, len(toi_planets_candidates))
sample_fps = min(n_half, len(toi_false_positives))

# Same seed for both draws keeps the selection reproducible between runs
targets_to_process = pd.concat([
    toi_planets_candidates.sample(n=sample_planets, random_state=CONFIG['random_state']),
    toi_false_positives.sample(n=sample_fps, random_state=CONFIG['random_state'])
]).reset_index(drop=True)

print(f"\n✅ Selected {len(targets_to_process)} targets for processing with distribution:")
print(targets_to_process['tfopwg_disp'].value_counts())


STEP 0: Initializing setup...




✅ Setup complete.

STEP 1: Loading TESS TOI Catalog from local file...
✅ Successfully loaded 7699 TOIs from /content/TOI_2025.09.28_05.51.22.csv.

✅ Selected 150 targets for processing with distribution:
tfopwg_disp
FP    68
PC    57
KP    10
CP     8
FA     7
Name: count, dtype: int64


In [None]:
# =============================================================================
# STEP 2: DEFINE THE FEATURE EXTRACTION FUNCTION
# =============================================================================
print("\nSTEP 2: Defining the TLS feature extraction function...")

def get_tls_features(tic_id):
    """
    Build the TLS feature record for a single TIC ID.

    Searches for SPOC 2-minute light curves of the target, downloads and
    stitches every search result, strips NaNs and 5-sigma outliers, and runs
    a Transit Least Squares search over the period range given in CONFIG.

    Returns:
        On success, a dict with the keys 'tic_id', 'tls_period', 'tls_depth',
        'tls_snr', 'tls_sde', 'n_transits' and 'rms'.
        Otherwise a dict with the single key 'error' holding a message: when
        the search is empty, when the download yields nothing, or when any
        exception is raised along the way.
    """
    try:
        # Restrict the search to SPOC pipeline products at 120 s exposure time
        found = lk.search_lightcurve(f"TIC {tic_id}", author="SPOC", exptime=120)
        if not len(found):
            return {'error': 'No SPOC 2-min data found'}

        # Fetch every matching light curve
        collection = found.download_all(quality_bitmask='default')
        if collection is None or not len(collection):
            return {'error': 'Download failed'}

        # Join the pieces into one curve, then clean it
        stitched = collection.stitch()
        lc = stitched.remove_nans().remove_outliers(sigma=5)

        # Transit Least Squares search on the raw time/flux arrays
        tls = transitleastsquares(lc.time.value, lc.flux.value)
        results = tls.power(
            period_min=CONFIG['period_min'],
            period_max=CONFIG['period_max'],
            oversampling_factor=5,
            duration_grid_step=1.05,
        )

        # Collect the scalar outputs of the search into one flat record
        record = {'tic_id': tic_id}
        record['tls_period'] = results.period
        record['tls_depth'] = results.depth
        record['tls_snr'] = results.snr
        record['tls_sde'] = results.SDE  # Signal Detection Efficiency
        record['n_transits'] = results.transit_count
        # NOTE(review): confirm the TLS results object exposes `residuals`;
        # if it does not, the lookup raises and this target becomes an error record
        record['rms'] = np.std(results.residuals)
        return record
    except Exception as e:
        # Any failure is reported as an error record rather than raised
        return {'error': str(e)}

print("✅ Feature extraction function is ready.")





STEP 2: Defining the TLS feature extraction function...
✅ Feature extraction function is ready.


In [None]:
# =============================================================================
# STEP 3: RUN THE PIPELINE - PROCESS ALL TARGETS
# =============================================================================
# One pipeline run per unique TIC ID (the catalog sample can list a TIC more than once)
tic_list = targets_to_process['tid'].unique().tolist()

print(f"\nSTEP 3: Running pipeline for {len(tic_list)} targets...")
print("   (This is the longest step, please be patient)")

all_features = []      # feature dicts for targets that were processed successfully
failure_reasons = []   # one message per skipped target, for the summary below

for tic in tqdm(tic_list, desc="Processing TICs"):
    features = get_tls_features(tic)
    if features and 'error' not in features:
        all_features.append(features)
    elif features:
        failure_reasons.append(features['error'])
    else:
        failure_reasons.append('no result returned')

features_df = pd.DataFrame(all_features)
print(f"\n✅ Successfully extracted features for {len(features_df)} out of {len(tic_list)} targets.")

# Surface why targets were skipped instead of dropping them silently
if failure_reasons:
    print(f"⚠️ Skipped {len(failure_reasons)} targets. Most common reasons:")
    for reason, count in pd.Series(failure_reasons).value_counts().head(5).items():
        print(f"   {count} x {reason}")



STEP 3: Running pipeline for 150 targets...
   (This is the longest step, please be patient)


Processing TICs:   0%|          | 0/150 [00:00<?, ?it/s]

Transit Least Squares TLS 1.32 (5 Apr 2024)
Creating model cache for 72 durations
Searching 17626 data points, 3873 periods from 0.602 to 12.876 days
Using all 2 CPU threads



  0%|          | 0/3873 periods | 00:00<?[A
  0%|          | 1/3873 periods | 00:03<4:11:42[A
  0%|          | 5/3873 periods | 00:04<39:13  [A
  0%|          | 9/3873 periods | 00:04<18:48[A
  0%|          | 13/3873 periods | 00:04<11:24[A
  0%|          | 17/3873 periods | 00:04<07:47[A
  1%|          | 22/3873 periods | 00:04<05:14[A
  1%|          | 26/3873 periods | 00:04<04:06[A
  1%|          | 30/3873 periods | 00:04<03:29[A
  1%|          | 34/3873 periods | 00:04<03:08[A
  1%|          | 38/3873 periods | 00:05<02:46[A
  1%|          | 42/3873 periods | 00:05<02:37[A
  1%|          | 46/3873 periods | 00:05<02:31[A
  1%|▏         | 49/3873 periods | 00:05<02:27[A
  1%|▏         | 53/3873 periods | 00:05<02:15[A
  1%|▏         | 57/3873 periods | 00:05<02:14[A
  2%|▏         | 61/3873 periods | 00:05<02:13[A
  2%|▏         | 65/3873 periods | 00:05<02:14[A
  2%|▏         | 69/3873 periods | 00:06<02:08[A
  2%|▏         | 73/3873 periods | 00:06<02:16[A
  2

In [None]:
# =============================================================================
# STEP 4: PREPARE DATA FOR MODELING
# =============================================================================
print("\nSTEP 4: Preparing final dataset for modeling...")

# Attach the catalog columns (disposition, stellar parameters, ...) to each
# feature row by matching the feature 'tic_id' against the catalog 'tid'.
# The inner join keeps only targets present in both frames.
final_df = features_df.merge(
    targets_to_process,
    how='inner',
    left_on='tic_id',
    right_on='tid',
)

# Create a simple binary label: 1 for Planet (Confirmed, Known, Candidate), 0 for Not Planet
def map_label(disposition):
    """
    Convert a 'tfopwg_disp' disposition code into a binary class label.

    Returns 1 for 'CP', 'KP' or 'PC', 0 for 'FP' or 'FA', and NaN for
    anything else so the caller can drop rows that cannot be labeled.
    """
    planet_codes = ('CP', 'KP', 'PC')
    not_planet_codes = ('FP', 'FA')

    if disposition in planet_codes:
        return 1
    return 0 if disposition in not_planet_codes else np.nan

final_df['label'] = final_df['tfopwg_disp'].apply(map_label)
# Drop rows that could not be labeled. The explicit copy makes the filtered
# frame independent, so the column assignments that follow write to it
# directly rather than to a possible view of the unfiltered frame.
final_df = final_df.dropna(subset=['label']).copy()
final_df['label'] = final_df['label'].astype(int)

# Define our feature set, including the stellar radius for context!
feature_names = ['tls_period', 'tls_depth', 'tls_snr', 'tls_sde', 'n_transits', 'rms', 'st_rad']
X = final_df[feature_names]
y = final_df['label']

# Treat non-finite values as missing so they get imputed too; fillna alone
# leaves +/-inf in place, which the classifier's input validation rejects.
X = X.replace([np.inf, -np.inf], np.nan)
# Fill the remaining gaps with each column's median
X = X.fillna(X.median())

print(f"✅ Final model-ready dataset created with {len(X)} samples.")
print("   Features being used:", feature_names)


# =============================================================================
# STEP 5: TRAIN & EVALUATE THE MODEL
# =============================================================================
print("\nSTEP 5: Training and evaluating the RandomForest model...")

# Hold back 20% of the samples, stratified by label, as the final test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=CONFIG['random_state'],
)

# RandomForest with class weighting and limited tree depth / leaf size
model = RandomForestClassifier(
    n_estimators=150,
    max_depth=7,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=CONFIG['random_state'],
)

# 5-fold cross-validation on the training portion only, scored by ROC AUC
cv_scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=5)
print(f"\n📈 Cross-Validated AUC Score: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")

# Fit on the whole training portion
model.fit(X_train, y_train)
print("✅ Final model trained.")

# Score the held-out samples: positive-class probabilities plus hard predictions
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
holdout_auc = roc_auc_score(y_test, y_pred_proba)
holdout_report = classification_report(y_test, y_pred, target_names=['Not Planet', 'Planet'])

print("\n📋 Performance on Hold-Out Test Set:")
print(f"   AUC Score: {holdout_auc:.3f}")
print("\nClassification Report:")
print(holdout_report)

# Persist the fitted model inside the output directory
model_filename = os.path.join(CONFIG['data_dir'], 'exoplanet_vetter_model.joblib')
joblib.dump(model, model_filename)
print(f"\n💾 Model saved to: {model_filename}")


# =============================================================================
# STEP 6: GENERATE SUBMISSION FILES
# =============================================================================
print("\nSTEP 6: Generating submission files...")

# Score every processed target with the positive-class probability
final_df['planet_probability'] = model.predict_proba(X)[:, 1]

# Keep the identifying columns and rank from most to least planet-like
submission_columns = ['tic_id', 'planet_probability', 'tfopwg_disp']
submission_df = final_df[submission_columns].sort_values(
    'planet_probability',
    ascending=False,
)

# Write the ranking to the output directory without the index column
submission_csv = os.path.join(CONFIG['data_dir'], 'planet_probabilities.csv')
submission_df.to_csv(submission_csv, index=False)
print(f"✅ Submission CSV saved to: {submission_csv}")

# --- Generate a plot of the top candidate ---
# Best-effort step: any failure is reported and the script carries on, since
# the model and the CSV have already been written at this point.
try:
    # submission_df is sorted by probability, so the first row is the top candidate
    top_candidate_row = submission_df.iloc[0]
    top_tic = int(top_candidate_row['tic_id'])
    top_prob = top_candidate_row['planet_probability']

    print(f"\nGenerating plot for top candidate: TIC {top_tic} (Prob: {top_prob:.2f})...")

    # Re-download its light curve for plotting, using the same search, quality
    # mask and outlier clipping as the feature extraction step
    search = lk.search_lightcurve(f"TIC {top_tic}", author="SPOC", exptime=120)
    lcs = search.download_all(quality_bitmask='default')
    if lcs is None or len(lcs) == 0:
        # Fail with a readable message instead of an AttributeError on None
        raise RuntimeError(f"no light curve data could be downloaded for TIC {top_tic}")
    lc = lcs.stitch().remove_nans().remove_outliers(sigma=5)

    # Get the period from our results to fold the light curve
    top_period = final_df.loc[final_df['tic_id'] == top_tic, 'tls_period'].values[0]

    # Fold on the detected period, then bin to make the transit shape visible
    folded_lc = lc.fold(period=top_period)
    binned_lc = folded_lc.bin(time_bin_size=0.01)

    # Create the plot
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    folded_lc.scatter(ax=ax, s=1, alpha=0.3, label='All Data Points')
    binned_lc.plot(ax=ax, marker='o', linestyle='none', markersize=5, color='royalblue', label='Binned Data')
    ax.set_title(f"Top Candidate: TIC {top_tic}\nPlanet Probability: {top_prob:.2%} | Folded at {top_period:.3f} days", fontsize=16)
    ax.set_xlabel("Phase")
    ax.set_ylabel("Normalized Flux")
    ax.legend()
    # Zoom in on the transit. nanmin ignores NaN entries (bins that received no
    # data points may be NaN), which would otherwise turn the axis limit into NaN.
    ax.set_ylim(bottom=np.nanmin(binned_lc.flux.value) - 0.001)

    plot_filename = os.path.join(CONFIG['data_dir'], 'top_candidate_plot.png')
    plt.savefig(plot_filename, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"✅ Top candidate plot saved to: {plot_filename}")

except Exception as e:
    print(f"⚠️ Could not generate plot for top candidate: {e}")

print("\n" + "="*80)
print("🎉 HACKATHON SCRIPT COMPLETE! YOU ARE READY TO GO! 🎉")
print("="*80)