In [None]:
# Import necessary libraries
import warnings

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
warnings.filterwarnings('ignore')


In [None]:
# Load and prepare the data
# Lines in the CSV that start with '#' are skipped as comments.
df = pd.read_csv('../data/exoplanet.csv', comment='#')

# Columns fed to the habitability model, one per line for easy editing
features = [
    'pl_rade',
    'pl_radj',
    'pl_bmasse',
    'pl_bmassj',
    'pl_orbper',
    'pl_orbsmax',
    'pl_eqt',
    'st_teff',
    'st_rad',
    'st_mass',
    'sy_dist',
]

# Work on an independent copy so later steps never write back into df
X = df.loc[:, features].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Features selected: {features}")

# Summary statistics of the selected columns
display(X.describe())


In [None]:
# Create habitability target variable based on scientific criteria
def create_habitability_target(df, criteria=None):
    """
    Build a binary habitability label (1 = meets every criterion, 0 = otherwise).

    Default criteria (inclusive ranges):
    - Planet radius `pl_rade`: 0.5 to 2.0 Earth radii (potentially rocky)
    - Equilibrium temperature `pl_eqt`: 200K to 320K (liquid water possible)
    - Stellar temperature `st_teff`: 3000K to 7000K (stable main sequence)

    Orbital period is NOT used as a criterion.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Criteria whose column is absent from `df` are skipped.
    criteria : dict, optional
        Mapping of column name -> (lower, upper) inclusive bounds. When omitted,
        the three default ranges listed above are used.

    Returns
    -------
    pandas.Series
        Integer series of 0/1 indexed exactly like `df`. A missing value in any
        checked column yields 0 for that row, because a range test on NaN is
        False. If none of the criteria columns exist, every row is 0.
    """
    if criteria is None:
        criteria = {
            'pl_rade': (0.5, 2.0),      # Earth-like size
            'pl_eqt': (200, 320),       # habitable temperature range
            'st_teff': (3000, 7000),    # main sequence stars
        }

    # One boolean mask per criterion whose column is actually present
    masks = [
        df[column].between(lower, upper)
        for column, (lower, upper) in criteria.items()
        if column in df.columns
    ]

    if not masks:
        # Nothing to test: label every row 0, keeping df's own index so the
        # result can still be aligned with (and selected by) df's labels.
        return pd.Series(False, index=df.index).astype(int)

    # A row qualifies only if it passes every mask
    habitability = masks[0]
    for mask in masks[1:]:
        habitability = habitability & mask

    return habitability.astype(int)

# Label every row of the catalogue (1 = habitable candidate, 0 = not)
y = create_habitability_target(df)

# Class balance: raw counts first, then the share of positive labels
print("Habitability distribution:")
print(y.value_counts())
print(f"\nHabitable percentage: {(y.sum() / len(y)) * 100:.2f}%")


In [None]:
# Data preprocessing and cleaning
print("DATA PREPROCESSING")
print("=" * 30)

# A row is kept as long as at least one of the three target-defining columns
# is present; only rows missing all three are discarded.
X_clean = X.dropna(how='all', subset=['pl_rade', 'pl_eqt', 'st_teff'])
# Select the matching labels by index so features and target stay aligned
y_clean = y.loc[X_clean.index]

print(f"Original dataset size: {X.shape[0]:,}")
print(f"After removing rows with missing key features: {X_clean.shape[0]:,}")

# Fill the remaining gaps with each column's median.
# NOTE(review): the imputer is fitted on all remaining rows, i.e. before the
# train/test split done in a later cell, so test rows influence the medians.
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    data=imputer.fit_transform(X_clean),
    index=X_clean.index,
    columns=X_clean.columns,
)

print(f"Features after imputation: {X_imputed.shape[1]}")
print(f"Remaining samples: {X_imputed.shape[0]:,}")

# Class balance among the rows that survived cleaning
print("\nTarget distribution after cleaning:")
print(y_clean.value_counts())
print(f"Habitable percentage: {(y_clean.sum() / len(y_clean)) * 100:.2f}%")


In [None]:
# Train-test split and feature scaling
# 80/20 split with a fixed seed; stratifying on the label keeps the class
# ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed,
    y_clean,
    test_size=0.2,
    random_state=42,
    stratify=y_clean,
)

print(f"Training set size: {X_train.shape[0]:,}")
print(f"Test set size: {X_test.shape[0]:,}")

# Standardize features: statistics are learned from the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set habitable percentage: {(y_train.sum() / len(y_train)) * 100:.2f}%")
print(f"Test set habitable percentage: {(y_test.sum() / len(y_test)) * 100:.2f}%")


In [None]:
# Model training and evaluation
#
# Each candidate classifier is fitted on the scaled training split, scored with
# 5-fold cross-validated F1 on that same split, and then scored once on the
# held-out test split (F1 and ROC AUC).
#
# NOTE(review): the target is produced by thresholding pl_rade, pl_eqt and
# st_teff, and those same columns are part of the feature matrix, so the
# scores below largely measure how well each model recovers those thresholds.
print("MODEL TRAINING AND EVALUATION")
print("=" * 40)

# Initialize models (fixed seeds for reproducibility)
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss'),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}

# Per-model metrics and the fitted estimators, both keyed by display name
model_scores = {}
trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the full training split; this fitted instance is the one kept
    model.fit(X_train_scaled, y_train)
    trained_models[name] = model

    # Cross-validated F1, computed on the training split only
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='f1')

    # Hard predictions for F1, positive-class probabilities for ROC AUC
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate metrics. f1_score is called through its directly imported name:
    # the notebook only uses `from sklearn... import ...`, which never binds the
    # bare name `sklearn`, so `sklearn.metrics.f1_score` would raise NameError.
    test_f1 = f1_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_pred_proba)

    model_scores[name] = {
        'CV_F1_mean': cv_scores.mean(),
        'CV_F1_std': cv_scores.std(),
        'Test_F1': test_f1,
        'Test_AUC': test_auc
    }

    # The +/- figure is two standard deviations of the fold scores
    print(f"Cross-validation F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"Test F1 Score: {test_f1:.4f}")
    print(f"Test AUC Score: {test_auc:.4f}")

# Display results summary (one row per model)
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)
results_df = pd.DataFrame(model_scores).T
display(results_df.round(4))
