In [9]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Read the three survey exports; lines starting with '#' are skipped as comments.
kepler, k2, tess = (
    pd.read_csv(csv_path, comment="#")
    for csv_path in ("kepler_data.csv", "k2_data.csv", "tess_data.csv")
)

In [8]:
# Row counts of the raw tables, one line per survey
print(
    "Original dataset sizes:",
    f"Kepler: {len(kepler)} rows",
    f"K2: {len(k2)} rows",
    f"TESS: {len(tess)} rows",
    sep="\n",
)

# Per-survey lookup tables: source column name -> unified column name.
# Every table targets the same ten unified names, listed in the same order.
kepler_mapping = dict(
    koi_period='orbital_period',
    koi_prad='planet_radius',
    koi_duration='transit_duration',
    koi_depth='transit_depth',
    koi_teq='equilibrium_temp',
    koi_insol='insolation_flux',
    koi_steff='stellar_teff',
    koi_srad='stellar_radius',
    koi_slogg='stellar_logg',
    koi_disposition='disposition',
)

k2_mapping = dict(
    pl_orbper='orbital_period',
    pl_rade='planet_radius',
    pl_trandur='transit_duration',
    pl_trandep='transit_depth',
    pl_eqt='equilibrium_temp',
    pl_insol='insolation_flux',
    st_teff='stellar_teff',
    st_rad='stellar_radius',
    st_logg='stellar_logg',
    disposition='disposition',
)

# Differs from K2 only in the transit-duration and disposition source columns
tess_mapping = dict(
    pl_orbper='orbital_period',
    pl_rade='planet_radius',
    pl_trandurh='transit_duration',
    pl_trandep='transit_depth',
    pl_eqt='equilibrium_temp',
    pl_insol='insolation_flux',
    st_teff='stellar_teff',
    st_rad='stellar_radius',
    st_logg='stellar_logg',
    tfopwg_disp='disposition',
)

def preprocess_dataset(df, column_mapping, survey_name):
    """
    Project one raw survey table onto the unified column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw survey table; it is not modified.
    column_mapping : dict
        Source column name -> unified column name.
    survey_name : str
        Label written to the 'survey' column and shown in the warning.

    Returns
    -------
    pandas.DataFrame
        Copy holding the mapped columns that exist in `df`, renamed, plus a
        constant 'survey' column. Mapped columns absent from `df` are
        skipped after a warning is printed.
    """
    # Keep only the mapping entries whose source column is really present
    present = {}
    for source_name, unified_name in column_mapping.items():
        if source_name in df.columns:
            present[source_name] = unified_name

    absent = set(column_mapping) - set(present)
    if absent:
        print(f"\nWarning ({survey_name}): Missing columns: {absent}")

    # Slice, detach from the source frame, then switch to unified names
    df_processed = df[list(present)].copy().rename(columns=present)

    # Tag every row with the survey it came from
    df_processed['survey'] = survey_name
    return df_processed

# Run every survey through the same select-and-rename step
kepler_processed, k2_processed, tess_processed = (
    preprocess_dataset(raw_frame, mapping, label)
    for raw_frame, mapping, label in (
        (kepler, kepler_mapping, 'Kepler'),
        (k2, k2_mapping, 'K2'),
        (tess, tess_mapping, 'TESS'),
    )
)

# CRITICAL: K2 reports transit depth in percent while the other surveys use
# ppm (1% = 10,000 ppm), so K2 is rescaled here to match them.
if 'transit_depth' in k2_processed:
    print("\nConverting K2 transit depth from % to ppm...")
    k2_processed['transit_depth'] = k2_processed['transit_depth'].mul(10000)

# Standardize disposition labels
def standardize_disposition(disposition):
    """
    Collapse a survey-specific disposition value onto a shared label.

    Missing values come back as NaN. Otherwise the value is upper-cased and
    stripped, then mapped to 'CONFIRMED', 'CANDIDATE' or 'FALSE POSITIVE';
    anything unrecognised is returned in its upper-cased, stripped form.
    """
    if pd.isna(disposition):
        return np.nan

    label = str(disposition).upper().strip()

    # Short codes must match exactly (KP = known planet)
    exact_codes = {
        'CP': 'CONFIRMED',
        'KP': 'CONFIRMED',
        'PC': 'CANDIDATE',
        'FP': 'FALSE POSITIVE',
    }
    if label in exact_codes:
        return exact_codes[label]

    # Keyword rules are tried in order; the first hit wins
    keyword_rules = (
        ('CONFIRMED', 'CONFIRMED'),
        ('CANDIDATE', 'CANDIDATE'),
        ('FALSE', 'FALSE POSITIVE'),
        ('REFUTED', 'FALSE POSITIVE'),  # Treat refuted as false positive
    )
    for keyword, standard_label in keyword_rules:
        if keyword in label:
            return standard_label

    return label

# Apply disposition standardization to each survey frame in turn (in place)
for df in (kepler_processed, k2_processed, tess_processed):
    if 'disposition' not in df.columns:
        continue  # no label column in this frame, nothing to standardize
    df['disposition'] = df['disposition'].apply(standardize_disposition)

# Stack the three surveys into one table with a fresh 0..n-1 index
combined_data = pd.concat(
    (kepler_processed, k2_processed, tess_processed),
    ignore_index=True,
)

print(
    f"\nCombined dataset size: {len(combined_data)} rows",
    f"\nColumn list: {list(combined_data.columns)}",
    sep="\n",
)

# Missing values: per-column NaN count and its share of all rows (in %)
print("\n=== Missing Values Summary ===")
missing_summary = combined_data.isna().sum()
missing_pct = missing_summary.div(len(combined_data)).mul(100).round(2)
missing_df = pd.DataFrame({
    'Missing Count': missing_summary,
    'Percentage': missing_pct,
})
# Only columns that actually have gaps are shown
print(missing_df.loc[missing_df['Missing Count'].gt(0)])

# Disposition distribution (NaN labels are counted too)
print(
    "\n=== Disposition Distribution ===",
    combined_data['disposition'].value_counts(dropna=False),
    sep="\n",
)

print(
    "\n=== Disposition by Survey ===",
    pd.crosstab(
        index=combined_data['survey'],
        columns=combined_data['disposition'],
        margins=True,
        dropna=False,
    ),
    sep="\n",
)

# Numeric feature columns shared by every survey after renaming
numeric_cols = [
    'orbital_period',
    'planet_radius',
    'transit_duration',
    'transit_depth',
    'equilibrium_temp',
    'insolation_flux',
    'stellar_teff',
    'stellar_radius',
    'stellar_logg',
]

# Two filtered views are derived up front and reported on below:
#   clean    - rows that have a label (can't train without labels)
#   complete - clean rows with no missing value in any numeric feature
combined_data_clean = combined_data.dropna(subset=['disposition'])
combined_data_complete = combined_data_clean.dropna(subset=numeric_cols)

print(
    f"\nRows before removing missing disposition: {len(combined_data)}",
    f"Rows after removing missing disposition: {len(combined_data_clean)}",
    "\n=== Feature Statistics ===",
    combined_data_clean[numeric_cols].describe(),
    sep="\n",
)

# Save processed data
combined_data_clean.to_csv("combined_exoplanet_data.csv", index=False)
print("\n✓ Saved processed data to 'combined_exoplanet_data.csv'")

# Optional: the version with no missing values in features
print(f"\nComplete cases (no missing features): {len(combined_data_complete)} rows")
combined_data_complete.to_csv("combined_exoplanet_data_complete.csv", index=False)
print("✓ Saved complete cases to 'combined_exoplanet_data_complete.csv'")

# Filter to only valid classes for training; any other label is dropped
valid_dispositions = ['CANDIDATE', 'FALSE POSITIVE', 'CONFIRMED']
combined_data_final = combined_data_complete.loc[
    combined_data_complete['disposition'].isin(valid_dispositions)
].copy()

# Class balance of the complete-case table, before the class filter
print(
    "\n=== Class Balance in Complete Dataset ===",
    combined_data_complete['disposition'].value_counts(),
    "\nProportions:",
    combined_data_complete['disposition'].value_counts(normalize=True).round(3),
    sep="\n",
)

# The same summary for the filtered training table, plus a per-survey breakdown
print(
    f"\n=== Final Training Dataset ===",
    f"Total rows: {len(combined_data_final)}",
    "\nClass Distribution:",
    combined_data_final['disposition'].value_counts(),
    "\nClass Proportions:",
    combined_data_final['disposition'].value_counts(normalize=True).round(3),
    "\nBy Survey:",
    pd.crosstab(
        index=combined_data_final['survey'],
        columns=combined_data_final['disposition'],
        margins=True,
    ),
    sep="\n",
)

# Save final training dataset
combined_data_final.to_csv("exoplanet_training_data.csv", index=False)
print("\n✓ Saved final training data to 'exoplanet_training_data.csv'")

Original dataset sizes:
Kepler: 9564 rows
K2: 4004 rows
TESS: 7703 rows


Combined dataset size: 21271 rows

Column list: ['orbital_period', 'planet_radius', 'transit_duration', 'transit_depth', 'equilibrium_temp', 'insolation_flux', 'stellar_teff', 'stellar_radius', 'stellar_logg', 'disposition', 'survey']

=== Missing Values Summary ===
                  Missing Count  Percentage
orbital_period              174        0.82
planet_radius              1714        8.06
transit_duration           4004       18.82
transit_depth              4367       20.53
equilibrium_temp           3833       18.02
insolation_flux            3872       18.20
stellar_teff               1651        7.76
stellar_radius             1018        4.79
stellar_logg               2876       13.52

=== Disposition Distribution ===
disposition
CANDIDATE         8032
FALSE POSITIVE    6351
CONFIRMED         6328
APC                462
FA                  98
Name: count, dtype: int64

=== Disposition by Survey ===
d

In [12]:
# Load the complete-case table (no missing feature values)
data = pd.read_csv("combined_exoplanet_data_complete.csv")

# LABELED: confirmed exoplanets and false positives, whose ground truth is
#          known; these are used for training and testing.
# UNLABELED: candidates (the "lost exoplanets" we want to identify); these
#            are held back for prediction.
labeled_classes = ['CONFIRMED', 'FALSE POSITIVE']
labeled_data = data.loc[data['disposition'].isin(labeled_classes)].copy()
candidate_data = data.loc[data['disposition'].eq('CANDIDATE')].copy()

print(
    "=== Original Dataset ===",
    f"Total samples: {len(data)}",
    "\nDisposition distribution:",
    data['disposition'].value_counts(),
    sep="\n",
)

print(
    "\n=== Data Split ===",
    f"Labeled data (for training/testing): {len(labeled_data)}",
    f"  - CONFIRMED: {(labeled_data['disposition'] == 'CONFIRMED').sum()}",
    f"  - FALSE POSITIVE: {(labeled_data['disposition'] == 'FALSE POSITIVE').sum()}",
    f"\nCandidate data (for prediction): {len(candidate_data)}",
    sep="\n",
)

print(
    "\n=== Labeled Data by Survey ===",
    pd.crosstab(
        index=labeled_data['survey'],
        columns=labeled_data['disposition'],
        margins=True,
    ),
    sep="\n",
)

print(
    "\n=== Candidate Data by Survey ===",
    candidate_data['survey'].value_counts(),
    sep="\n",
)

# Define features: the numeric columns used as model inputs (X)
feature_cols = [
    'orbital_period', 'planet_radius', 'transit_duration',
    'transit_depth', 'equilibrium_temp', 'insolation_flux',
    'stellar_teff', 'stellar_radius', 'stellar_logg',
]

# === LABELED DATA: features, raw label and survey tag ===
X_labeled = labeled_data[feature_cols]
y_labeled = labeled_data['disposition']
survey_labeled = labeled_data['survey']

# Binary target: 1 = CONFIRMED (planet), 0 = FALSE POSITIVE (not a planet)
y_binary = y_labeled.eq('CONFIRMED').astype(int)

print(
    "\n=== Binary Classification Task ===",
    "Target: 1 = CONFIRMED planet, 0 = FALSE POSITIVE",
    f"Class balance: {y_binary.mean():.3f} positive (CONFIRMED)",
    sep="\n",
)

# Stratify on the disposition/survey combination so that both the class mix
# and the survey mix are kept proportional in the train and test parts
labeled_data['strata'] = labeled_data['disposition'] + '_' + labeled_data['survey']

X_train, X_test, y_train, y_test, survey_train, survey_test = train_test_split(
    X_labeled,
    y_binary,
    survey_labeled,
    test_size=0.25,
    random_state=67,
    stratify=labeled_data['strata'],
)

print(
    "\n=== Train/Test Split ===",
    f"Training set: {len(X_train)} samples ({len(X_train)/len(labeled_data)*100:.1f}%)",
    f"  - CONFIRMED: {y_train.sum()} ({y_train.mean()*100:.1f}%)",
    f"  - FALSE POSITIVE: {(1-y_train).sum()} ({(1-y_train.mean())*100:.1f}%)",
    sep="\n",
)

print(
    f"\nTest set: {len(X_test)} samples ({len(X_test)/len(labeled_data)*100:.1f}%)",
    f"  - CONFIRMED: {y_test.sum()} ({y_test.mean()*100:.1f}%)",
    f"  - FALSE POSITIVE: {(1-y_test).sum()} ({(1-y_test.mean())*100:.1f}%)",
    sep="\n",
)

# === CANDIDATE DATA: same feature columns, no label ===
X_candidates = candidate_data[feature_cols]

print(
    "\n=== Candidate Pool (Lost Exoplanets) ===",
    f"Total candidates to classify: {len(X_candidates)}",
    "These will be used for final predictions after model is trained",
    sep="\n",
)

# Standardize features: the scaler is fitted on the training split only and
# then applied unchanged to train, test and candidate rows. Each result is
# wrapped back into a DataFrame that keeps the column names and row index.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, X_candidates_scaled = (
    pd.DataFrame(
        scaler.transform(unscaled),
        columns=feature_cols,
        index=unscaled.index,
    )
    for unscaled in (X_train, X_test, X_candidates)
)

# Save datasets
# Every output table is a feature frame plus bookkeeping columns.
# `DataFrame.assign` returns a new frame, so the X_* inputs stay untouched.
# `.values` strips the index so the extra columns are attached by position.

# 1. Training data (unscaled)
train_df = X_train.assign(label=y_train.values, survey=survey_train.values)
train_df.to_csv("train_labeled.csv", index=False)

# 2. Test data (unscaled)
test_df = X_test.assign(label=y_test.values, survey=survey_test.values)
test_df.to_csv("test_labeled.csv", index=False)

# 3. Training data (scaled)
train_df_scaled = X_train_scaled.assign(label=y_train.values, survey=survey_train.values)
train_df_scaled.to_csv("train_labeled_scaled.csv", index=False)

# 4. Test data (scaled)
test_df_scaled = X_test_scaled.assign(label=y_test.values, survey=survey_test.values)
test_df_scaled.to_csv("test_labeled_scaled.csv", index=False)

# 5. Candidate data (unscaled)
# 'original_index' records each row's index label in `candidate_data`
# so that predictions can be traced back later.
candidates_df = X_candidates.assign(
    survey=candidate_data['survey'].values,
    original_index=candidate_data.index,
)
candidates_df.to_csv("candidates_unlabeled.csv", index=False)

# 6. Candidate data (scaled)
candidates_df_scaled = X_candidates_scaled.assign(
    survey=candidate_data['survey'].values,
    original_index=candidate_data.index,
)
candidates_df_scaled.to_csv("candidates_unlabeled_scaled.csv", index=False)

# 7. Save scaler for later use (joblib is imported in the notebook's import cell)
joblib.dump(scaler, 'feature_scaler.pkl')

# Inventory of everything written to disk by this cell
print(
    "\n=== Files Saved ===",
    "Training/Testing (use these to build your model):",
    "  ✓ train_labeled.csv / train_labeled_scaled.csv",
    "  ✓ test_labeled.csv / test_labeled_scaled.csv",
    "\nPrediction (use these to find lost exoplanets):",
    "  ✓ candidates_unlabeled.csv / candidates_unlabeled_scaled.csv",
    "\nScaler:",
    "  ✓ feature_scaler.pkl",
    sep="\n",
)

# Summary statistics of the unscaled training features
print(
    "\n=== Feature Statistics (Training Set) ===",
    X_train.describe(),
    sep="\n",
)

print(
    "\n=== Workflow Summary ===",
    "1. Train your model on train_labeled*.csv",
    "2. Evaluate performance on test_labeled*.csv",
    "3. Tune hyperparameters using cross-validation on training data",
    "4. Once satisfied, predict on candidates_unlabeled*.csv",
    "5. Candidates with high confidence scores = potential lost exoplanets!",
    sep="\n",
)

# Additional insight: how many candidates does each survey contribute?
candidates_by_survey = candidate_data['survey'].value_counts()
print(
    "\n=== Candidate Distribution ===",
    candidates_by_survey,
    f"\nThat's {len(candidate_data)} potential exoplanets to discover!",
    sep="\n",
)

=== Original Dataset ===
Total samples: 15829

Disposition distribution:
disposition
CANDIDATE         5898
FALSE POSITIVE    5531
CONFIRMED         3971
APC                363
FA                  66
Name: count, dtype: int64

=== Data Split ===
Labeled data (for training/testing): 9502
  - CONFIRMED: 3971
  - FALSE POSITIVE: 5531

Candidate data (for prediction): 5898

=== Labeled Data by Survey ===
disposition  CONFIRMED  FALSE POSITIVE   All
survey                                      
Kepler            2744            4582  7326
TESS              1227             949  2176
All               3971            5531  9502

=== Candidate Data by Survey ===
survey
TESS      4023
Kepler    1875
Name: count, dtype: int64

=== Binary Classification Task ===
Target: 1 = CONFIRMED planet, 0 = FALSE POSITIVE
Class balance: 0.418 positive (CONFIRMED)

=== Train/Test Split ===
Training set: 7126 samples (75.0%)
  - CONFIRMED: 2978 (41.8%)
  - FALSE POSITIVE: 4148 (58.2%)

Test set: 2376 samples (