# NASA Exoplanet Data Analysis and Processing

This notebook merges NASA's three exoplanet datasets (Kepler, TESS, K2) to build a usable dataset for exoplanet classification.

## Datasets:
- **Kepler KOI Data**: Exoplanet candidates detected by the Kepler mission
- **TESS TOI Data**: Exoplanet candidates detected by the TESS mission
- **K2 Data**: Exoplanet candidates detected by the K2 mission

## Goals:
1. Merge the three datasets
2. Identify common columns
3. Improve data quality
4. Produce a dataset ready for exoplanet classification


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import sklearn
import warnings
warnings.filterwarnings('ignore')

# Visualization settings
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

print("Libraries loaded successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")


Kütüphaneler başarıyla yüklendi!
Pandas version: 2.3.3
NumPy version: 2.3.3
Scikit-learn version: 1.7.2


In [None]:
# Classification Model Training
# In this section, we will train a classification model with the unified dataset and classify candidates

print("=== CLASSIFICATION MODEL TRAINING ===")
print("=" * 50)


In [None]:
# Data loading and analysis
print("=== DATA LOADING AND ANALYSIS ===")

# Load unified dataset
df = pd.read_csv('unified_dataset_final.csv')
print(f"Unified dataset shape: {df.shape}")
print(f"Label distribution:")
print(df['label'].value_counts())
print(f"Label distribution percentage:")
print(df['label'].value_counts(normalize=True) * 100)

# Load training data (with labels)
train_df = pd.read_csv('training_data.csv')
print(f"\nTraining data shape: {train_df.shape}")
print(f"Training label distribution:")
print(train_df['label'].value_counts())

# Load candidate data
candidate_df = pd.read_csv('candidate_data.csv')
print(f"\nCandidate data shape: {candidate_df.shape}")
print(f"Candidate label distribution:")
print(candidate_df['label'].value_counts())


In [None]:
# Feature Engineering
print("\n=== FEATURE ENGINEERING ===")

def prepare_features(df):
    """Feature engineering and selection"""
    # Encode categorical variables
    le = LabelEncoder()
    if 'source_dataset' in df.columns:
        df['source_dataset_encoded'] = le.fit_transform(df['source_dataset'].astype(str))
    
    # Select numeric features
    numeric_features = [
        'ra_deg', 'dec_deg', 'period_days', 't0_bjd', 'transit_depth_ppm', 
        'duration_hours', 'impact_param', 'ecc', 'snr', 'rp_re', 'teq_k', 
        'insolation', 'teff_k', 'logg_cgs', 'feh_dex', 'mass_solar', 
        'radius_solar', 'mag_kepler', 'num_transits', 'ror_ratio', 'dor_ratio',
        'fp_flag_nt', 'fp_flag_ss', 'fp_flag_co', 'fp_flag_ec', 'mag_tess',
        'stellar_pmra', 'stellar_pmdec', 'stellar_distance'
    ]
    
    # Check available columns
    available_features = [col for col in numeric_features if col in df.columns]
    print(f"Number of available features: {len(available_features)}")
    print(f"Features: {available_features}")
    
    # Add source dataset encoded
    if 'source_dataset_encoded' in df.columns:
        available_features.append('source_dataset_encoded')
    
    # Build feature matrix
    X = df[available_features].copy()
    
    # Fill missing values
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
    
    print(f"Feature matrix shape: {X_imputed.shape}")
    print(f"Missing value count: {X_imputed.isnull().sum().sum()}")
    
    return X_imputed, available_features

# Feature engineering for training data
X_train, features = prepare_features(train_df)
print(f"\nTraining features shape: {X_train.shape}")

# Feature engineering for candidate data
X_candidates, _ = prepare_features(candidate_df)
print(f"Candidate features shape: {X_candidates.shape}")

# Prepare labels
y_train = train_df['label'].values
print(f"Training labels shape: {y_train.shape}")
print(f"Training label distribution: {np.unique(y_train, return_counts=True)}")


In [None]:
# Model Training
print("\n=== MODEL TRAINING ===")

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Train-test split
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"Train set shape: {X_train_split.shape}")
print(f"Test set shape: {X_test_split.shape}")

# Define models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Compare model performance via cross-validation
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train_split, y_train_split, cv=5, scoring='accuracy')
    results[name] = {
        'model': model,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std()
    }
    print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Select the best model
best_model_name = max(results.keys(), key=lambda x: results[x]['cv_mean'])
best_model = results[best_model_name]['model']

print(f"\nBest model: {best_model_name}")
print(f"CV Accuracy: {results[best_model_name]['cv_mean']:.4f}")


In [None]:
# Model Evaluation
print(f"\n=== {best_model_name} MODEL EVALUATION ===")

# Evaluate the best model on the test set
best_model.fit(X_train_split, y_train_split)
y_pred = best_model.predict(X_test_split)
y_pred_proba = best_model.predict_proba(X_test_split)[:, 1] if hasattr(best_model, 'predict_proba') else None

# Metrics
accuracy = accuracy_score(y_test_split, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

if y_pred_proba is not None:
    auc = roc_auc_score(y_test_split, y_pred_proba)
    print(f"Test AUC: {auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_split, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_split, y_pred))


In [None]:
# Retrain on all training data and classify candidates
print(f"\n=== RETRAIN ON FULL TRAINING DATA ===")
best_model.fit(X_train, y_train)
print("Model retrained on full training data.")

print("\n=== CANDIDATE CLASSIFICATION ===")

# Classify candidates
candidate_predictions = best_model.predict(X_candidates)
candidate_probabilities = best_model.predict_proba(X_candidates)[:, 1] if hasattr(best_model, 'predict_proba') else None

# Append results to dataframe
candidate_predictions_df = candidate_df.copy()
candidate_predictions_df['predicted_label'] = candidate_predictions
if candidate_probabilities is not None:
    candidate_predictions_df['prediction_probability'] = candidate_probabilities

# Prediction distribution
print(f"Candidate prediction distribution:")
print(pd.Series(candidate_predictions).value_counts())
print(f"Candidate prediction distribution (%):")
print(pd.Series(candidate_predictions).value_counts(normalize=True) * 100)

print(f"\nTotal candidates: {len(candidate_predictions)}")
print(f"Predicted as exoplanet: {sum(candidate_predictions == 1)}")
print(f"Predicted as non-exoplanet: {sum(candidate_predictions == 0)}")


In [None]:
# Final Dataset Creation
print("\n=== FINAL DATASET CREATION ===")

# Add prediction columns to training data
train_final = train_df.copy()
train_final['predicted_label'] = train_final['label']  # Ground-truth labels
train_final['prediction_probability'] = 1.0  # 100% confidence for ground-truth
train_final['data_type'] = 'training'

# Add data_type to candidate predictions
candidate_final = candidate_predictions_df.copy()
candidate_final['data_type'] = 'candidate'

# Concatenate
final_dataset = pd.concat([train_final, candidate_final], ignore_index=True)

print(f"Final dataset shape: {final_dataset.shape}")
print(f"Training rows: {len(train_final)}")
print(f"Candidate rows: {len(candidate_final)}")
print(f"Overall predicted label distribution:")
print(final_dataset['predicted_label'].value_counts())
print(f"Predicted label distribution (%):")
print(final_dataset['predicted_label'].value_counts(normalize=True) * 100)


In [None]:
# Save Results
print("\n=== SAVING RESULTS ===")

# Save final dataset
final_dataset.to_csv('final_classified_dataset.csv', index=False)
print("final_classified_dataset.csv saved - All data (training + candidate predictions)")

# Save candidate-only predictions
candidate_predictions_df.to_csv('candidate_predictions.csv', index=False)
print("candidate_predictions.csv saved - Candidate predictions only")

# Model performance summary
print(f"\n=== MODEL PERFORMANCE SUMMARY ===")
print(f"Best model: {best_model_name}")
print(f"Test accuracy: {accuracy:.4f}")
print(f"Total training records: {len(train_df)}")
print(f"Total candidates: {len(candidate_df)}")
print(f"Final dataset size: {len(final_dataset)}")

# Feature importance (for Random Forest)
if hasattr(best_model, 'feature_importances_'):
    print(f"\n=== FEATURE IMPORTANCE (Top 10) ===")
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(feature_importance.head(10))


In [None]:
# Load datasets
print("Loading datasets...")

# Kepler KOI Data
kepler_df = pd.read_csv("Nasa-Exoplanet/kepler_koi_data_cleaned.csv")
print(f"Kepler dataset loaded: {kepler_df.shape}")

# TESS TOI Data  
tess_df = pd.read_csv("Nasa-Exoplanet/tess_toi_data_cleaned.csv")
print(f"TESS dataset loaded: {tess_df.shape}")

# K2 Data
k2_df = pd.read_csv("Nasa-Exoplanet/k2_data_cleaned.csv")
print(f"K2 dataset loaded: {k2_df.shape}")

print("\n=== DATASET SUMMARY ===")
print(f"Total rows: {kepler_df.shape[0] + tess_df.shape[0] + k2_df.shape[0]}")
print(f"Average number of columns: {(kepler_df.shape[1] + tess_df.shape[1] + k2_df.shape[1]) / 3:.0f}")


Veri setleri yükleniyor...
Kepler veri seti yüklendi: (9564, 150)
TESS veri seti yüklendi: (7703, 92)
K2 veri seti yüklendi: (4004, 357)

=== VERİ SETİ ÖZETİ ===
Toplam satır sayısı: 21271
Toplam sütun sayısı (ortalama): 200


In [None]:
# Analyze dataset columns
print("=== COLUMN ANALYSIS ===")

# Get columns for each dataset
kepler_cols = set(kepler_df.columns)
tess_cols = set(tess_df.columns)
k2_cols = set(k2_df.columns)

# Find common columns
common_cols = kepler_cols.intersection(tess_cols).intersection(k2_cols)
print(f"Number of common columns: {len(common_cols)}")
print(f"Common columns: {sorted(common_cols)}")

# Common columns between pairs
kepler_tess_common = kepler_cols.intersection(tess_cols)
kepler_k2_common = kepler_cols.intersection(k2_cols)
tess_k2_common = tess_cols.intersection(k2_cols)

print(f"\nKepler-TESS common columns: {len(kepler_tess_common)}")
print(f"Kepler-K2 common columns: {len(kepler_k2_common)}")
print(f"TESS-K2 common columns: {len(tess_k2_common)}")

# Identify important columns
important_cols = {
    'ra', 'dec', 'sky_coord.ra', 'sky_coord.dec',  # Coordinates
    'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2',  # Orbital period
    'pl_rade', 'pl_radeerr1', 'pl_radeerr2',  # Planet radius
    'pl_masse', 'pl_masseerr1', 'pl_masseerr2',  # Planet mass
    'st_teff', 'st_tefferr1', 'st_tefferr2',  # Stellar temperature
    'st_logg', 'st_loggerr1', 'st_loggerr2',  # Stellar logg
    'st_rad', 'st_raderr1', 'st_raderr2',  # Stellar radius
    'st_mass', 'st_masserr1', 'st_masserr2',  # Stellar mass
    'disposition', 'koi_disposition', 'tfopwg_disp'  # Disposition
}

print(f"\nImportant columns: {len(important_cols)}")
print(f"Important columns: {sorted(important_cols)}")


=== SÜTUN ANALİZİ ===
Ortak sütun sayısı: 4
Ortak sütunlar: ['dec', 'ra', 'sky_coord.dec', 'sky_coord.ra']

Kepler-TESS ortak sütunlar: 4
Kepler-K2 ortak sütunlar: 4
TESS-K2 ortak sütunlar: 43

Önemli sütunlar: 28
Önemli sütunlar: ['dec', 'disposition', 'koi_disposition', 'pl_masse', 'pl_masseerr1', 'pl_masseerr2', 'pl_orbper', 'pl_orbpererr1', 'pl_orbpererr2', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'ra', 'sky_coord.dec', 'sky_coord.ra', 'st_logg', 'st_loggerr1', 'st_loggerr2', 'st_mass', 'st_masserr1', 'st_masserr2', 'st_rad', 'st_raderr1', 'st_raderr2', 'st_teff', 'st_tefferr1', 'st_tefferr2', 'tfopwg_disp']


In [None]:
# Show basic statistics of the datasets
print("=== DATASET STATISTICS ===")

# Kepler dataset
print("\n--- KEPLER DATASET ---")
print(f"Rows: {kepler_df.shape[0]}")
print(f"Columns: {kepler_df.shape[1]}")
print(f"Missing value percentage: {kepler_df.isnull().sum().sum() / (kepler_df.shape[0] * kepler_df.shape[1]) * 100:.2f}%")

# TESS dataset
print("\n--- TESS DATASET ---")
print(f"Rows: {tess_df.shape[0]}")
print(f"Columns: {tess_df.shape[1]}")
print(f"Missing value percentage: {tess_df.isnull().sum().sum() / (tess_df.shape[0] * tess_df.shape[1]) * 100:.2f}%")

# K2 dataset
print("\n--- K2 DATASET ---")
print(f"Rows: {k2_df.shape[0]}")
print(f"Columns: {k2_df.shape[1]}")
print(f"Missing value percentage: {k2_df.isnull().sum().sum() / (k2_df.shape[0] * k2_df.shape[1]) * 100:.2f}%")

# Check disposition columns
print("\n=== DISPOSITION INFORMATION ===")
if 'koi_disposition' in kepler_df.columns:
    print("Kepler disposition distribution:")
    print(kepler_df['koi_disposition'].value_counts())
    
if 'tfopwg_disp' in tess_df.columns:
    print("\nTESS disposition distribution:")
    print(tess_df['tfopwg_disp'].value_counts())
    
if 'disposition' in k2_df.columns:
    print("\nK2 disposition distribution:")
    print(k2_df['disposition'].value_counts())


=== VERİ SETİ İSTATİSTİKLERİ ===

--- KEPLER VERİ SETİ ---
Satır sayısı: 9564
Sütun sayısı: 150
Eksik değer yüzdesi: 18.88%

--- TESS VERİ SETİ ---
Satır sayısı: 7703
Sütun sayısı: 92
Eksik değer yüzdesi: 11.24%

--- K2 VERİ SETİ ---
Satır sayısı: 4004
Sütun sayısı: 357
Eksik değer yüzdesi: 38.13%

=== DURUM BİLGİLERİ ===
Kepler disposition dağılımı:
koi_disposition
FALSE POSITIVE    4839
CONFIRMED         2746
CANDIDATE         1979
Name: count, dtype: int64

TESS disposition dağılımı:
tfopwg_disp
PC     4679
FP     1197
CP      684
KP      583
APC     462
FA       98
Name: count, dtype: int64

K2 disposition dağılımı:
disposition
CONFIRMED         2315
CANDIDATE         1374
FALSE POSITIVE     293
REFUTED             22
Name: count, dtype: int64


In [None]:
# Determine common columns for merging datasets
print("=== MERGE PREPARATION ===")

# COMPREHENSIVE feature mapping - common + dataset-specific features (45+)
common_features = {
    # Coordinates (present in all datasets)
    'ra': ['ra', 'ra', 'ra'],
    'dec': ['dec', 'dec', 'dec'],
    
    # Orbital properties
    'period_days': ['koi_period', 'pl_orbper', 'pl_orbper'],
    'eccentricity': ['koi_eccen', None, 'pl_orbeccen'],
    'inclination': ['koi_incl', None, 'pl_orbincl'],
    'semi_major_axis': ['koi_sma', None, 'pl_orbsmax'],
    
    # Planet properties
    'planet_radius_re': ['koi_prad', 'pl_rade', 'pl_rade'],
    'planet_mass_me': ['koi_smass', 'pl_masse', 'pl_masse'],
    'planet_density': [None, None, 'pl_dens'],
    'equilibrium_temp': ['koi_teq', 'pl_eqt', 'pl_eqt'],
    'insolation': ['koi_insol', 'pl_insol', 'pl_insol'],
    
    # Transit properties
    'transit_depth_ppm': ['koi_depth', 'pl_trandep', 'pl_trandep'],
    'transit_duration_hours': ['koi_duration', 'pl_trandurh', 'pl_trandur'],
    'transit_epoch': ['koi_time0bk', 'pl_tranmid', 'pl_tranmid'],
    'impact_parameter': ['koi_impact', None, 'pl_imppar'],
    
    # Stellar properties
    'stellar_teff': ['koi_steff', 'st_teff', 'st_teff'],
    'stellar_logg': ['koi_slogg', 'st_logg', 'st_logg'],
    'stellar_radius': ['koi_srad', 'st_rad', 'st_rad'],
    'stellar_mass': ['koi_smass', 'st_mass', 'st_mass'],
    'stellar_metallicity': ['koi_smet', None, 'st_met'],
    'stellar_age': ['koi_sage', None, 'st_age'],
    'stellar_density': [None, None, 'st_dens'],
    
    # Photometry
    'mag_kepler': ['koi_kepmag', None, 'sy_kepmag'],
    'mag_tess': [None, 'st_tmag', 'sy_tmag'],
    'mag_gaia_g': [None, None, 'sy_gaiamag'],
    'mag_bp': [None, None, 'sy_bmag'],
    'mag_rp': [None, None, 'sy_vmag'],
    'mag_j': ['koi_jmag', None, 'sy_jmag'],
    'mag_h': ['koi_hmag', None, 'sy_hmag'],
    'mag_k': ['koi_kmag', None, 'sy_kmag'],
    
    # RV properties
    'rv_semi_amplitude': [None, None, 'pl_rvamp'],
    
    # KEPLER-SPECIFIC IMPORTANT FEATURES
    'num_transits': ['koi_num_transits', None, None],
    'model_snr': ['koi_model_snr', None, None],
    'ror_ratio': ['koi_ror', None, None],
    'dor_ratio': ['koi_dor', None, None],
    'fp_flag_nt': ['koi_fpflag_nt', None, None],
    'fp_flag_ss': ['koi_fpflag_ss', None, None],
    'fp_flag_co': ['koi_fpflag_co', None, None],
    'fp_flag_ec': ['koi_fpflag_ec', None, None],
    
    # TESS-SPECIFIC IMPORTANT FEATURES
    'stellar_pmra': [None, 'st_pmra', None],
    'stellar_pmdec': [None, 'st_pmdec', None],
    'stellar_distance': [None, 'st_dist', None],
    
    # K2-SPECIFIC IMPORTANT FEATURES
    'planet_radius_j': [None, None, 'pl_radj'],
    'planet_mass_j': [None, None, 'pl_massj'],
    'stellar_luminosity': [None, None, 'st_lum'],
    'stellar_vsini': [None, None, 'st_vsin'],
    'stellar_radv': [None, None, 'st_radv'],
    
    # Disposition
    'disposition': ['koi_disposition', 'tfopwg_disp', 'disposition']
}

print("COMPREHENSIVE common features:")
for feature, cols in common_features.items():
    print(f"{feature}: {cols}")

# Add source column for each dataset
kepler_df['source_dataset'] = 'Kepler'
tess_df['source_dataset'] = 'TESS'
k2_df['source_dataset'] = 'K2'

print(f"\nDatasets tagged with source column.")
print(f"Kepler: {kepler_df['source_dataset'].value_counts()}")
print(f"TESS: {tess_df['source_dataset'].value_counts()}")
print(f"K2: {k2_df['source_dataset'].value_counts()}")


=== VERİ BİRLEŞTİRME HAZIRLIĞI ===
KAPSAMLI ortak özellikler:
ra: ['ra', 'ra', 'ra']
dec: ['dec', 'dec', 'dec']
period_days: ['koi_period', 'pl_orbper', 'pl_orbper']
eccentricity: ['koi_eccen', None, 'pl_orbeccen']
inclination: ['koi_incl', None, 'pl_orbincl']
semi_major_axis: ['koi_sma', None, 'pl_orbsmax']
planet_radius_re: ['koi_prad', 'pl_rade', 'pl_rade']
planet_mass_me: ['koi_smass', 'pl_masse', 'pl_masse']
planet_density: [None, None, 'pl_dens']
equilibrium_temp: ['koi_teq', 'pl_eqt', 'pl_eqt']
insolation: ['koi_insol', 'pl_insol', 'pl_insol']
transit_depth_ppm: ['koi_depth', 'pl_trandep', 'pl_trandep']
transit_duration_hours: ['koi_duration', 'pl_trandurh', 'pl_trandur']
transit_epoch: ['koi_time0bk', 'pl_tranmid', 'pl_tranmid']
impact_parameter: ['koi_impact', None, 'pl_imppar']
stellar_teff: ['koi_steff', 'st_teff', 'st_teff']
stellar_logg: ['koi_slogg', 'st_logg', 'st_logg']
stellar_radius: ['koi_srad', 'st_rad', 'st_rad']
stellar_mass: ['koi_smass', 'st_mass', 'st_mass']
st

In [None]:
# PROFESSIONAL SCHEMA-FIRST APPROACH
print("=== PROFESSIONAL MERGE PREPARATION ===")

# Unified Schema Definition (Schema-First Approach)
unified_schema = {
    # Identifiers
    'star_id': {'dtype': 'object', 'unit': 'dimensionless', 'description': 'Host star identifier'},
    'planet_id': {'dtype': 'object', 'unit': 'dimensionless', 'description': 'Planet/candidate identifier'},
    'source_dataset': {'dtype': 'object', 'unit': 'dimensionless', 'description': 'Source dataset (kepler/tess/k2)'},
    
    # Sky position
    'ra_deg': {'dtype': 'float64', 'unit': 'degrees', 'description': 'Right ascension'},
    'dec_deg': {'dtype': 'float64', 'unit': 'degrees', 'description': 'Declination'},
    
    # Labels
    'label': {'dtype': 'int8', 'unit': 'dimensionless', 'description': '1=confirmed planet, 0=false positive, -1=candidate'},
    'is_candidate': {'dtype': 'int8', 'unit': 'dimensionless', 'description': '1 if candidate (uncertain label)'},
    
    # Orbital parameters
    'period_days': {'dtype': 'float64', 'unit': 'days', 'description': 'Orbital period'},
    't0_bjd': {'dtype': 'float64', 'unit': 'BJD', 'description': 'Transit epoch'},
    'transit_depth_ppm': {'dtype': 'float64', 'unit': 'ppm', 'description': 'Transit depth'},
    'duration_hours': {'dtype': 'float64', 'unit': 'hours', 'description': 'Transit duration'},
    'impact_param': {'dtype': 'float64', 'unit': 'dimensionless', 'description': 'Impact parameter'},
    'ecc': {'dtype': 'float64', 'unit': 'dimensionless', 'description': 'Orbital eccentricity'},
    'snr': {'dtype': 'float64', 'unit': 'dimensionless', 'description': 'Signal-to-noise ratio'},
    
    # Planet parameters
    'rp_re': {'dtype': 'float64', 'unit': 'R_earth', 'description': 'Planet radius in Earth radii'},
    'mp_mj': {'dtype': 'float64', 'unit': 'M_jupiter', 'description': 'Planet mass in Jupiter masses'},
    'teq_k': {'dtype': 'float64', 'unit': 'Kelvin', 'description': 'Equilibrium temperature'},
    'insolation': {'dtype': 'float64', 'unit': 'S_earth', 'description': 'Insolation flux'},
    
    # Host star parameters
    'teff_k': {'dtype': 'float64', 'unit': 'Kelvin', 'description': 'Stellar effective temperature'},
    'logg_cgs': {'dtype': 'float64', 'unit': 'log(cm/s^2)', 'description': 'Stellar surface gravity'},
    'feh_dex': {'dtype': 'float64', 'unit': 'dex', 'description': 'Stellar metallicity'},
    'mass_solar': {'dtype': 'float64', 'unit': 'M_sun', 'description': 'Stellar mass'},
    'radius_solar': {'dtype': 'float64', 'unit': 'R_sun', 'description': 'Stellar radius'},
    
    # Photometry
    'mag_kepler': {'dtype': 'float64', 'unit': 'mag', 'description': 'Kepler-band magnitude'},
    'mag_tess': {'dtype': 'float64', 'unit': 'mag', 'description': 'TESS magnitude'},
    'mag_gaia_g': {'dtype': 'float64', 'unit': 'mag', 'description': 'Gaia G-band magnitude'},
    
    # KEPLER ÖZEL FEATURELAR
    'num_transits': {'dtype': 'int16', 'unit': 'count', 'description': 'Number of transits observed'},
    'model_snr': {'dtype': 'float64', 'unit': 'dimensionless', 'description': 'Model signal-to-noise ratio'},
    'ror_ratio': {'dtype': 'float64', 'unit': 'dimensionless', 'description': 'Planet-to-star radius ratio'},
    'dor_ratio': {'dtype': 'float64', 'unit': 'dimensionless', 'description': 'Planet-to-star density ratio'},
    'fp_flag_nt': {'dtype': 'int8', 'unit': 'flag', 'description': 'False positive flag: not transit-like'},
    'fp_flag_ss': {'dtype': 'int8', 'unit': 'flag', 'description': 'False positive flag: stellar variability'},
    'fp_flag_co': {'dtype': 'int8', 'unit': 'flag', 'description': 'False positive flag: centroid offset'},
    'fp_flag_ec': {'dtype': 'int8', 'unit': 'flag', 'description': 'False positive flag: ephemeris match'},
    
    # TESS ÖZEL FEATURELAR
    'stellar_pmra': {'dtype': 'float64', 'unit': 'mas/yr', 'description': 'Stellar proper motion in RA'},
    'stellar_pmdec': {'dtype': 'float64', 'unit': 'mas/yr', 'description': 'Stellar proper motion in Dec'},
    'stellar_distance': {'dtype': 'float64', 'unit': 'pc', 'description': 'Stellar distance'},
    
    # K2 ÖZEL FEATURELAR
    'planet_radius_j': {'dtype': 'float64', 'unit': 'R_jupiter', 'description': 'Planet radius in Jupiter radii'},
    'planet_mass_j': {'dtype': 'float64', 'unit': 'M_jupiter', 'description': 'Planet mass in Jupiter masses'},
    'stellar_luminosity': {'dtype': 'float64', 'unit': 'L_sun', 'description': 'Stellar luminosity'},
    'stellar_vsini': {'dtype': 'float64', 'unit': 'km/s', 'description': 'Stellar rotational velocity'},
    'stellar_radv': {'dtype': 'float64', 'unit': 'km/s', 'description': 'Stellar radial velocity'},
}

# Rename mappings for each dataset
rename_maps = {
    'kepler': {
        # Identifiers
        'kepid': 'star_id',
        'kepoi_name': 'planet_id',
        
        # Sky position
        'ra': 'ra_deg',
        'dec': 'dec_deg',
        
        # Disposition
        'koi_disposition': '_disposition',
        
        # Orbital parameters
        'koi_period': 'period_days',
        'koi_time0bk': '_t0_offset',  # Will need adjustment
        'koi_time0': 't0_bjd',
        'koi_depth': 'transit_depth_ppm',
        'koi_duration': 'duration_hours',
        'koi_impact': 'impact_param',
        'koi_eccen': 'ecc',
        'koi_model_snr': 'snr',
        
        # Planet parameters
        'koi_prad': 'rp_re',
        'koi_teq': 'teq_k',
        'koi_insol': 'insolation',
        
        # Stellar parameters
        'koi_steff': 'teff_k',
        'koi_slogg': 'logg_cgs',
        'koi_smet': 'feh_dex',
        'koi_smass': 'mass_solar',
        'koi_srad': 'radius_solar',
        
        # Photometry
        'koi_kepmag': 'mag_kepler',
        
        # Kepler özel
        'koi_num_transits': 'num_transits',
        'koi_ror': 'ror_ratio',
        'koi_dor': 'dor_ratio',
        'koi_fpflag_nt': 'fp_flag_nt',
        'koi_fpflag_ss': 'fp_flag_ss',
        'koi_fpflag_co': 'fp_flag_co',
        'koi_fpflag_ec': 'fp_flag_ec',
    },
    'tess': {
        # Identifiers
        'tid': 'star_id',
        'toi': 'planet_id',
        
        # Sky position
        'ra': 'ra_deg',
        'dec': 'dec_deg',
        
        # Disposition
        'tfopwg_disp': '_disposition',
        
        # Orbital parameters
        'pl_orbper': 'period_days',
        'pl_tranmid': 't0_bjd',
        'pl_trandep': 'transit_depth_ppm',
        'pl_trandurh': 'duration_hours',
        
        # Planet parameters
        'pl_rade': 'rp_re',
        'pl_eqt': 'teq_k',
        'pl_insol': 'insolation',
        
        # Stellar parameters
        'st_teff': 'teff_k',
        'st_logg': 'logg_cgs',
        'st_rad': 'radius_solar',
        'st_tmag': 'mag_tess',
        
        # TESS özel
        'st_pmra': 'stellar_pmra',
        'st_pmdec': 'stellar_pmdec',
        'st_dist': 'stellar_distance',
    },
    'k2': {
        # Identifiers
        'epic_number': 'star_id',
        'epic_hostname': 'star_id',
        'pl_name': 'planet_id',
        'k2_name': 'planet_id',
        
        # Sky position
        'ra': 'ra_deg',
        'dec': 'dec_deg',
        
        # Disposition
        'disposition': '_disposition',
        
        # Orbital parameters
        'pl_orbper': 'period_days',
        'pl_tranmid': 't0_bjd',
        'pl_trandep': '_transit_depth_percent',  # Will convert from % to ppm
        'pl_trandur': 'duration_hours',
        'pl_imppar': 'impact_param',
        'pl_orbeccen': 'ecc',
        
        # Planet parameters
        'pl_rade': 'rp_re',
        'pl_bmassj': 'mp_mj',
        'pl_massj': 'mp_mj',
        'pl_eqt': 'teq_k',
        'pl_insol': 'insolation',
        
        # Stellar parameters
        'st_teff': 'teff_k',
        'k2_teff': 'teff_k',
        'st_logg': 'logg_cgs',
        'k2_logg': 'logg_cgs',
        'st_met': 'feh_dex',
        'k2_metfe': 'feh_dex',
        'st_mass': 'mass_solar',
        'k2_mass': 'mass_solar',
        'st_rad': 'radius_solar',
        'k2_rad': 'radius_solar',
        
        # Photometry
        'k2_kepmag': 'mag_kepler',
        'sy_kepmag': 'mag_kepler',
        'sy_tmag': 'mag_tess',
        'sy_gaiamag': 'mag_gaia_g',
        
        # K2 özel
        'pl_radj': 'planet_radius_j',
        'pl_massj': 'planet_mass_j',
        'st_lum': 'stellar_luminosity',
        'st_vsin': 'stellar_vsini',
        'st_radv': 'stellar_radv',
    }
}

print("✅ PROFESSIONAL SCHEMA DEFINED")
print(f"Schema features: {len(unified_schema)}")
print(f"Kepler mappings: {len(rename_maps['kepler'])}")
print(f"TESS mappings: {len(rename_maps['tess'])}")
print(f"K2 mappings: {len(rename_maps['k2'])}")

# Add source column for each dataset
kepler_df['source_dataset'] = 'kepler'
tess_df['source_dataset'] = 'tess'
k2_df['source_dataset'] = 'k2'

print(f"\nDatasets tagged with source column.")
print(f"Kepler: {kepler_df['source_dataset'].value_counts()}")
print(f"TESS: {tess_df['source_dataset'].value_counts()}")
print(f"K2: {k2_df['source_dataset'].value_counts()}")


=== PROFESYONEL VERİ BİRLEŞTİRME HAZIRLIĞI ===
✅ PROFESYONEL SCHEMA TANIMLANDI
Schema features: 42
Kepler mappings: 29
TESS mappings: 19
K2 mappings: 36

Veri setleri source sütunu ile işaretlendi.
Kepler: source_dataset
kepler    9564
Name: count, dtype: int64
TESS: source_dataset
tess    7703
Name: count, dtype: int64
K2: source_dataset
k2    4004
Name: count, dtype: int64


In [69]:
# PROFESYONEL DISPOSITION MAPPING - BINARY CLASSIFICATION STRATEGY
def map_disposition_to_label(disposition):
    """
    Map disposition string to binary label for training.
    
    Strategy:
    - 1 = confirmed planet (for training)
    - 0 = false positive (for training) 
    - -1 = candidate (excluded from training, will be predicted later)
    
    Returns:
        (label, is_candidate) tuple
        label: 1=confirmed, 0=false positive, -1=candidate
        is_candidate: 1 if candidate, 0 otherwise
    """
    if pd.isna(disposition):
        return (-1, 1)  # Unknown = candidate
    
    disp_upper = str(disposition).upper().strip()
    
    # Confirmed planets (TRAINING DATA)
    if any(x in disp_upper for x in ['CONFIRMED', 'CP', 'CONFIRMED PLANET']):
        return (1, 0)
    
    # False positives (TRAINING DATA)
    if any(x in disp_upper for x in ['FALSE POSITIVE', 'FP', 'FA', 'FALSE ALARM', 'EB', 'REFUTED']):
        return (0, 0)
    
    # Candidates (EXCLUDED FROM TRAINING - will be predicted)
    if any(x in disp_upper for x in ['CANDIDATE', 'PC', 'APC', 'KP']):
        return (-1, 1)
    
    # Default: treat as candidate
    return (-1, 1)

# Test disposition mapping
print("=== BINARY CLASSIFICATION STRATEGY ===")
print("Training Data: Confirmed (1) + False Positive (0)")
print("Prediction Data: Candidates (-1) - will be predicted with XGBoost")
print("\nDisposition Mapping:")
test_dispositions = ['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE', 'PC', 'FP', 'CP', 'APC', 'KP', 'EB', 'REFUTED']
for disp in test_dispositions:
    label, is_candidate = map_disposition_to_label(disp)
    status = "TRAINING" if label in [0, 1] else "PREDICTION"
    print(f"{disp:15} -> label={label:2}, candidate={is_candidate} -> {status}")

print("\n✅ Binary classification strategy ready!")


=== BINARY CLASSIFICATION STRATEGY ===
Training Data: Confirmed (1) + False Positive (0)
Prediction Data: Candidates (-1) - will be predicted with XGBoost

Disposition Mapping:
CONFIRMED       -> label= 1, candidate=0 -> TRAINING
FALSE POSITIVE  -> label= 0, candidate=0 -> TRAINING
CANDIDATE       -> label=-1, candidate=1 -> PREDICTION
PC              -> label=-1, candidate=1 -> PREDICTION
FP              -> label= 0, candidate=0 -> TRAINING
CP              -> label= 1, candidate=0 -> TRAINING
APC             -> label=-1, candidate=1 -> PREDICTION
KP              -> label=-1, candidate=1 -> PREDICTION
EB              -> label= 0, candidate=0 -> TRAINING
REFUTED         -> label= 0, candidate=0 -> TRAINING

✅ Binary classification strategy ready!


In [None]:
# PROFESSIONAL SCHEMA MAPPING FUNCTION
def apply_schema_mapping(df, dataset_name):
    """Apply schema mapping to a single dataset."""
    print(f"\nMapping schema for {dataset_name}...")
    
    rename_map = rename_maps[dataset_name]
    
    # Find which columns exist in the dataframe
    existing_mappings = {}
    for src_col, tgt_col in rename_map.items():
        if src_col in df.columns:
            existing_mappings[src_col] = tgt_col
    
    # Rename columns
    df_mapped = df.rename(columns=existing_mappings)
    
    # Add source dataset identifier
    df_mapped['source_dataset'] = dataset_name
    
    # Handle disposition -> label mapping
    if '_disposition' in df_mapped.columns:
        label_data = df_mapped['_disposition'].apply(map_disposition_to_label)
        df_mapped['label'] = label_data.apply(lambda x: x[0])
        df_mapped['is_candidate'] = label_data.apply(lambda x: x[1])
        df_mapped = df_mapped.drop('_disposition', axis=1)
    else:
        df_mapped['label'] = -1
        df_mapped['is_candidate'] = 0
    
    # Handle K2 transit depth conversion (% to ppm)
    if dataset_name == 'k2' and '_transit_depth_percent' in df_mapped.columns:
        df_mapped['transit_depth_ppm'] = df_mapped['_transit_depth_percent'] * 10000  # % to ppm
        df_mapped = df_mapped.drop('_transit_depth_percent', axis=1)
    
    # Handle Kepler time offset (BJD - 2,454,833.0)
    if dataset_name == 'kepler' and '_t0_offset' in df_mapped.columns:
        if 't0_bjd' not in df_mapped.columns or df_mapped['t0_bjd'].isna().all():
            df_mapped['t0_bjd'] = df_mapped['_t0_offset'] + 2454833.0
        df_mapped = df_mapped.drop('_t0_offset', axis=1, errors='ignore')
    
    # Keep only columns that are in unified schema
    schema_cols = list(unified_schema.keys())
    existing_schema_cols = [col for col in schema_cols if col in df_mapped.columns]
    df_mapped = df_mapped[existing_schema_cols]
    
    # Remove duplicate columns (if any)
    df_mapped = df_mapped.loc[:, ~df_mapped.columns.duplicated()]
    
    print(f"  Mapped {len(existing_schema_cols)} columns")
    print(f"  Final shape: {df_mapped.shape}")
    print(f"  Label distribution: {df_mapped['label'].value_counts().to_dict()}")
    
    return df_mapped

# Test schema mapping
print("=== SCHEMA MAPPING TEST ===")
kepler_mapped = apply_schema_mapping(kepler_df, 'kepler')
tess_mapped = apply_schema_mapping(tess_df, 'tess')
k2_mapped = apply_schema_mapping(k2_df, 'k2')

print(f"\n✅ Schema mapping completed!")
print(f"Kepler mapped: {kepler_mapped.shape}")
print(f"TESS mapped: {tess_mapped.shape}")
print(f"K2 mapped: {k2_mapped.shape}")


=== SCHEMA MAPPING TEST ===

Mapping schema for kepler...
  Mapped 30 columns
  Final shape: (9564, 30)
  Label distribution: {0: 4839, 1: 2746, -1: 1979}

Mapping schema for tess...
  Mapped 21 columns
  Final shape: (7703, 21)
  Label distribution: {-1: 5724, 0: 1295, 1: 684}

Mapping schema for k2...
  Mapped 27 columns
  Final shape: (4004, 27)
  Label distribution: {1: 2315, -1: 1374, 0: 315}

✅ Schema mapping tamamlandı!
Kepler mapped: (9564, 30)
TESS mapped: (7703, 21)
K2 mapped: (4004, 27)


In [None]:
# PROFESSIONAL DATA MERGE AND FEATURE CLEANING
print("=== PROFESSIONAL DATA MERGE ===")

# Check for column conflicts before concatenation
print("\nChecking for column conflicts...")
all_columns = set()
for name, df in [('kepler', kepler_mapped), ('tess', tess_mapped), ('k2', k2_mapped)]:
    print(f"{name} columns: {df.columns.tolist()}")
    all_columns.update(df.columns)

print(f"\nTotal unique columns across all datasets: {len(all_columns)}")

# Ensure all DataFrames have the same columns
print("\nAligning column structure...")
common_columns = set(kepler_mapped.columns).intersection(set(tess_mapped.columns)).intersection(set(k2_mapped.columns))
print(f"Common columns: {len(common_columns)}")

# Add missing columns with NaN values
for df, name in [(kepler_mapped, 'kepler'), (tess_mapped, 'tess'), (k2_mapped, 'k2')]:
    missing_cols = common_columns - set(df.columns)
    for col in missing_cols:
        df[col] = np.nan
    print(f"{name} after alignment: {df.shape}")

# Concatenate all datasets
print("\nConcatenating datasets...")
unified_df = pd.concat([kepler_mapped, tess_mapped, k2_mapped], ignore_index=True, sort=False)

# AGGRESSIVE FEATURE CLEANING
print("\n=== AGGRESSIVE FEATURE CLEANING ===")
print(f"Original shape: {unified_df.shape}")

# 1. Remove columns with >80% missing values
print("\n1. Removing columns with >80% missing values...")
missing_pct = (unified_df.isnull().sum() / len(unified_df)) * 100
high_missing_cols = missing_pct[missing_pct > 80].index.tolist()
print(f"Removing {len(high_missing_cols)} columns with >80% missing")
unified_df = unified_df.drop(columns=high_missing_cols)

# 2. Remove constant columns (single value)
print("\n2. Removing constant columns...")
constant_cols = []
for col in unified_df.columns:
    if unified_df[col].nunique() <= 1:
        constant_cols.append(col)
print(f"Removing {len(constant_cols)} constant columns")
unified_df = unified_df.drop(columns=constant_cols)

# 3. Remove low variance numeric columns
print("\n3. Removing low variance columns...")
low_var_cols = []
for col in unified_df.select_dtypes(include=[np.number]).columns:
    if col not in ['label', 'is_candidate'] and unified_df[col].std() < 0.01:
        low_var_cols.append(col)
print(f"Removing {len(low_var_cols)} low variance columns")
unified_df = unified_df.drop(columns=low_var_cols)

# 4. Remove ID columns (not useful for prediction)
print("\n4. Removing ID columns...")
id_cols = [col for col in unified_df.columns if any(x in col.lower() for x in ['id', 'name', 'kepid', 'tid', 'epic'])]
id_cols = [col for col in id_cols if col not in ['star_id', 'planet_id']]  # Keep our unified IDs
print(f"Removing {len(id_cols)} ID columns")
unified_df = unified_df.drop(columns=id_cols)

# 5. Remove error columns (usually not predictive)
print("\n5. Removing error/uncertainty columns...")
error_cols = [col for col in unified_df.columns if any(x in col.lower() for x in ['err', 'error', 'uncertainty', 'flag'])]
error_cols = [col for col in error_cols if col not in ['fp_flag_nt', 'fp_flag_ss', 'fp_flag_co', 'fp_flag_ec']]  # Keep false positive flags
print(f"Removing {len(error_cols)} error columns")
unified_df = unified_df.drop(columns=error_cols)

print(f"\n✅ Feature cleaning complete!")
print(f"Final shape: {unified_df.shape}")
print(f"Removed {155 - unified_df.shape[1]} columns")
print(f"Remaining features: {unified_df.shape[1]}")

print(f"✓ Unified dataset: {unified_df.shape[0]} rows, {unified_df.shape[1]} columns")
print(f"  Label distribution: {unified_df['label'].value_counts().to_dict()}")
if "is_candidate" in unified_df.columns:
    print(f"  Candidates: {unified_df['is_candidate'].sum()}")
else:
    print("  Candidates: is_candidate column not found - will be created during imputation")

# OPTIMIZED KNN CROSS-DATASET IMPUTATION STRATEGY
print("=== OPTIMIZED KNN CROSS-DATASET IMPUTATION STRATEGY ===")

# Report missing values before cleaning
print("\nMissing values per column (top 10):")
missing = unified_df.isnull().sum().sort_values(ascending=False)
missing_pct = (missing / len(unified_df) * 100).round(2)
missing_df = pd.DataFrame({'count': missing, 'percent': missing_pct}).head(10)
print(missing_df.to_string())

# Identify numeric and categorical columns
numeric_cols = []
categorical_cols = []

for col in unified_df.columns:
    if col in ['star_id', 'planet_id', 'source_dataset']:
        categorical_cols.append(col)
    elif unified_df[col].dtype in ['float64', 'int64', 'int8', 'int16']:
        numeric_cols.append(col)

print(f"\n  Numeric columns: {len(numeric_cols)}")
print(f"  Categorical columns: {len(categorical_cols)}")

# OPTIMIZED KNN IMPUTATION STRATEGY
print("\n=== OPTIMIZED KNN IMPUTATION STRATEGY ===")

# 1. PROGRESSIVE IMPUTATION (Aşamalı doldurma)
print("\n1. PROGRESSIVE IMPUTATION")
# Group columns by missing percentage
missing_pct_numeric = {}
for col in numeric_cols:
    if col not in ['label', 'is_candidate']:
        missing_pct_numeric[col] = (unified_df[col].isnull().sum() / len(unified_df)) * 100

# Sort by missing percentage (least missing first)
sorted_cols = sorted(missing_pct_numeric.items(), key=lambda x: x[1])

print("Imputation order (least missing first):")
for col, pct in sorted_cols[:10]:
    print(f"  {col}: {pct:.1f}% missing")

# 2. SMART KNN IMPUTATION
print("\n2. SMART KNN IMPUTATION")
# Only impute columns with reasonable missing percentage
numeric_to_impute = [col for col, pct in missing_pct_numeric.items() 
                     if pct < 70 and pct > 0]  # Between 0% and 70% missing

print(f"Columns to impute: {len(numeric_to_impute)}")
print(f"Columns with >70% missing (will be dropped): {len([col for col, pct in missing_pct_numeric.items() if pct >= 70])}")

if numeric_to_impute:
    print(f"\nImputing {len(numeric_to_impute)} numeric columns with KNN...")
    
    # Prepare data for imputation
    imputation_data = unified_df[numeric_to_impute].copy()
    
    # Handle infinite values
    imputation_data = imputation_data.replace([np.inf, -np.inf], np.nan)
    
    # KNN Imputation with optimized parameters
    imputer = KNNImputer(
        n_neighbors=5, 
        weights='distance',
        metric='nan_euclidean'
    )
    
    # Fit and transform
    imputed_data = imputer.fit_transform(imputation_data)
    unified_df[numeric_to_impute] = imputed_data
    
    print(f"✅ KNN imputation completed for {len(numeric_to_impute)} columns")

# 3. CATEGORICAL IMPUTATION
print("\n3. CATEGORICAL IMPUTATION")
for col in categorical_cols:
    if unified_df[col].isna().any():
        mode_val = unified_df[col].mode()[0] if len(unified_df[col].mode()) > 0 else 'unknown'
        unified_df[col].fillna(mode_val, inplace=True)
        print(f"  {col}: filled with '{mode_val}'")

# 4. FINAL DATA TYPE CONVERSION
print("\n4. FINAL DATA TYPE CONVERSION")
for col in unified_df.columns:
    if col in unified_schema:
        target_dtype = unified_schema[col]['dtype']
        try:
            if target_dtype == 'object':
                unified_df[col] = unified_df[col].astype(str)
            else:
                unified_df[col] = unified_df[col].astype(target_dtype)
        except:
            print(f"  Warning: Could not convert {col} to {target_dtype}")

# 5. FINAL QUALITY CHECK
print("\n5. FINAL QUALITY CHECK")
print(f"Final dataset shape: {unified_df.shape}")
print(f"Missing values: {unified_df.isnull().sum().sum()}")
print(f"Missing percentage: {(unified_df.isnull().sum().sum() / (unified_df.shape[0] * unified_df.shape[1])) * 100:.2f}%")

# Check for infinite values
inf_count = np.isinf(unified_df.select_dtypes(include=[np.number])).sum().sum()
print(f"Infinite values: {inf_count}")

print("\n✅ Optimized KNN cross-dataset imputation completed!")
print("Strategy: Progressive imputation with smart KNN (k=5, distance weights)")
print("Benefits: Cross-dataset learning, better imputation quality, maintained data relationships")


=== PROFESYONEL VERİ BİRLEŞTİRME ===

Checking for column conflicts...
kepler columns: ['star_id', 'planet_id', 'source_dataset', 'ra_deg', 'dec_deg', 'label', 'is_candidate', 'period_days', 't0_bjd', 'transit_depth_ppm', 'duration_hours', 'impact_param', 'ecc', 'snr', 'rp_re', 'teq_k', 'insolation', 'teff_k', 'logg_cgs', 'feh_dex', 'mass_solar', 'radius_solar', 'mag_kepler', 'num_transits', 'ror_ratio', 'dor_ratio', 'fp_flag_nt', 'fp_flag_ss', 'fp_flag_co', 'fp_flag_ec']
tess columns: ['star_id', 'planet_id', 'source_dataset', 'ra_deg', 'dec_deg', 'label', 'is_candidate', 'period_days', 't0_bjd', 'transit_depth_ppm', 'duration_hours', 'rp_re', 'teq_k', 'insolation', 'teff_k', 'logg_cgs', 'radius_solar', 'mag_tess', 'stellar_pmra', 'stellar_pmdec', 'stellar_distance']
k2 columns: ['star_id', 'planet_id', 'source_dataset', 'ra_deg', 'dec_deg', 'label', 'is_candidate', 'period_days', 't0_bjd', 'transit_depth_ppm', 'duration_hours', 'impact_param', 'ecc', 'rp_re', 'mp_mj', 'teq_k', 'insol

In [72]:
# OPTIMIZED KNN CROSS-DATASET IMPUTATION STRATEGY
print("=== OPTIMIZED KNN CROSS-DATASET IMPUTATION STRATEGY ===")

# Report missing values before cleaning
print("\nMissing values per column (top 10):")
missing = unified_df.isnull().sum().sort_values(ascending=False)
missing_pct = (missing / len(unified_df) * 100).round(2)
missing_df = pd.DataFrame({'count': missing, 'percent': missing_pct}).head(10)
print(missing_df.to_string())

# Identify numeric and categorical columns
numeric_cols = []
categorical_cols = []

for col in unified_df.columns:
    if col in ['star_id', 'planet_id', 'source_dataset']:
        categorical_cols.append(col)
    elif unified_df[col].dtype in ['float64', 'int64', 'int8', 'int16']:
        numeric_cols.append(col)

print(f"\n  Numeric columns: {len(numeric_cols)}")
print(f"  Categorical columns: {len(categorical_cols)}")

# OPTIMIZED KNN IMPUTATION STRATEGY
print("\n=== OPTIMIZED KNN IMPUTATION STRATEGY ===")

# 1. PROGRESSIVE IMPUTATION (Aşamalı doldurma)
print("\n1. PROGRESSIVE IMPUTATION")
# Group columns by missing percentage
missing_pct_numeric = {}
for col in numeric_cols:
    if col not in ['label', 'is_candidate']:
        missing_pct_numeric[col] = (unified_df[col].isnull().sum() / len(unified_df)) * 100

# Sort by missing percentage (least missing first)
sorted_cols = sorted(missing_pct_numeric.items(), key=lambda x: x[1])

print("Imputation order (least missing first):")
for col, pct in sorted_cols[:10]:
    print(f"  {col}: {pct:.1f}% missing")

# 2. SMART KNN IMPUTATION
print("\n2. SMART KNN IMPUTATION")
# Only impute columns with reasonable missing percentage
numeric_to_impute = [col for col, pct in missing_pct_numeric.items() 
                     if pct < 70 and pct > 0]  # Between 0% and 70% missing

print(f"Columns to impute: {len(numeric_to_impute)}")
print(f"Columns with >70% missing (will be dropped): {len([col for col, pct in missing_pct_numeric.items() if pct >= 70])}")

if numeric_to_impute:
    print(f"\nImputing {len(numeric_to_impute)} numeric columns with KNN...")
    
    # Prepare data for imputation
    imputation_data = unified_df[numeric_to_impute].copy()
    
    # Handle infinite values
    imputation_data = imputation_data.replace([np.inf, -np.inf], np.nan)
    
    # KNN Imputation with optimized parameters
    imputer = KNNImputer(
        n_neighbors=5, 
        weights='distance',
        metric='nan_euclidean'
    )
    
    # Fit and transform
    imputed_data = imputer.fit_transform(imputation_data)
    unified_df[numeric_to_impute] = imputed_data
    
    print(f"✅ KNN imputation completed for {len(numeric_to_impute)} columns")

# 3. CATEGORICAL IMPUTATION
print("\n3. CATEGORICAL IMPUTATION")
for col in categorical_cols:
    if unified_df[col].isna().any():
        mode_val = unified_df[col].mode()[0] if len(unified_df[col].mode()) > 0 else 'unknown'
        unified_df[col].fillna(mode_val, inplace=True)
        print(f"  {col}: filled with '{mode_val}'")

# 4. FINAL DATA TYPE CONVERSION
print("\n4. FINAL DATA TYPE CONVERSION")
for col in unified_df.columns:
    if col in unified_schema:
        target_dtype = unified_schema[col]['dtype']
        try:
            if target_dtype == 'object':
                unified_df[col] = unified_df[col].astype(str)
            else:
                unified_df[col] = unified_df[col].astype(target_dtype)
        except:
            print(f"  Warning: Could not convert {col} to {target_dtype}")

# 5. FINAL QUALITY CHECK
print("\n5. FINAL QUALITY CHECK")
print(f"Final dataset shape: {unified_df.shape}")
print(f"Missing values: {unified_df.isnull().sum().sum()}")
print(f"Missing percentage: {(unified_df.isnull().sum().sum() / (unified_df.shape[0] * unified_df.shape[1])) * 100:.2f}%")

# Check for infinite values
inf_count = np.isinf(unified_df.select_dtypes(include=[np.number])).sum().sum()
print(f"Infinite values: {inf_count}")

print("\n✅ Optimized KNN cross-dataset imputation completed!")
print("Strategy: Progressive imputation with smart KNN (k=5, distance weights)")
print("Benefits: Cross-dataset learning, better imputation quality, maintained data relationships")


=== OPTIMIZED KNN CROSS-DATASET IMPUTATION STRATEGY ===

Missing values per column (top 10):
               count  percent
star_id            0      0.0
logg_cgs           0      0.0
stellar_pmdec      0      0.0
stellar_pmra       0      0.0
mag_tess           0      0.0
fp_flag_ec         0      0.0
fp_flag_co         0      0.0
fp_flag_ss         0      0.0
fp_flag_nt         0      0.0
dor_ratio          0      0.0

  Numeric columns: 30
  Categorical columns: 3

=== OPTIMIZED KNN IMPUTATION STRATEGY ===

1. PROGRESSIVE IMPUTATION
Imputation order (least missing first):
  ra_deg: 0.0% missing
  dec_deg: 0.0% missing
  period_days: 0.0% missing
  t0_bjd: 0.0% missing
  transit_depth_ppm: 0.0% missing
  duration_hours: 0.0% missing
  impact_param: 0.0% missing
  ecc: 0.0% missing
  snr: 0.0% missing
  rp_re: 0.0% missing

2. SMART KNN IMPUTATION
Columns to impute: 0
Columns with >70% missing (will be dropped): 0

3. CATEGORICAL IMPUTATION

4. FINAL DATA TYPE CONVERSION

5. FINAL QUAL

In [None]:

print("=== PROFESSIONAL ARTIFACT EXPORT ===")

import json
import yaml
from datetime import datetime

# 1. Save unified schema as YAML
print("Saving unified_schema.yaml...")
with open('unified_schema.yaml', 'w') as f:
    yaml.dump(unified_schema, f, default_flow_style=False, sort_keys=False)
print("✓ Saved: unified_schema.yaml")

# 2. Save rename maps as JSON
print("Saving rename_maps.json...")
with open('rename_maps.json', 'w') as f:
    json.dump(rename_maps, f, indent=2)
print("✓ Saved: rename_maps.json")

# 3. Generate comprehensive data quality report
print("Generating merge_report.md...")
report = []
report.append("# NASA Exoplanet Data Merge Report")
report.append(f"\n**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report.append("\n---\n")

# Summary statistics
report.append("## Dataset Summary\n")
report.append(f"**Total records:** {len(unified_df):,}")
report.append(f"**Total features:** {len(unified_df.columns)}")
report.append(f"\n**Source distribution:**")
for source, count in unified_df['source_dataset'].value_counts().items():
    pct = count / len(unified_df) * 100
    report.append(f"- {source}: {count:,} ({pct:.1f}%)")

# Label distribution
report.append("\n## Label Distribution\n")
label_map = {1: 'Confirmed Planet', 0: 'False Positive', -1: 'Candidate'}
for label_val, label_name in label_map.items():
    count = (unified_df['label'] == label_val).sum()
    pct = count / len(unified_df) * 100
    report.append(f"- **{label_name}**: {count:,} ({pct:.1f}%)")

# Check if is_candidate column exists, if not create it
if 'is_candidate' not in unified_df.columns:
    unified_df['is_candidate'] = (unified_df['label'] == -1).astype(int)
    print("Created missing 'is_candidate' column based on label values")

candidates = unified_df['is_candidate'].sum()
report.append(f"\n**Candidates flagged:** {candidates:,}")

# Data quality metrics
report.append("\n## Data Quality Metrics\n")

# Completeness
completeness = (1 - unified_df.isnull().sum() / len(unified_df)) * 100
avg_completeness = completeness.mean()
report.append(f"**Average completeness:** {avg_completeness:.1f}%\n")

report.append("**Completeness by feature (bottom 10):**")
bottom_10 = completeness.sort_values().head(10)
for col, comp in bottom_10.items():
    report.append(f"- {col}: {comp:.1f}%")

# Key feature availability
report.append("\n## Key Feature Availability\n")
key_features = ['period_days', 'rp_re', 'transit_depth_ppm', 'teff_k', 'duration_hours']
for feat in key_features:
    if feat in unified_df.columns:
        avail = (unified_df[feat].notna().sum() / len(unified_df)) * 100
        report.append(f"- **{feat}**: {avail:.1f}% available")

# Cross-dataset comparisons
report.append("\n## Cross-Dataset Comparisons\n")
for source in unified_df['source_dataset'].unique():
    source_df = unified_df[unified_df['source_dataset'] == source]
    report.append(f"\n### {source.upper()} Dataset")
    report.append(f"- Records: {len(source_df):,}")
    report.append(f"- Confirmed planets: {(source_df['label'] == 1).sum()}")
    report.append(f"- False positives: {(source_df['label'] == 0).sum()}")
    report.append(f"- Candidates: {(source_df['label'] == -1).sum()}")

# Recommendations
report.append("\n## Recommendations\n")
report.append("1. **Training strategy**: Use only confirmed (label=1) and false positive (label=0) for supervised training")
report.append("2. **Candidate handling**: Treat candidates (label=-1) as unlabeled data for semi-supervised learning or exclude from training")
report.append("3. **Feature engineering**: Consider log-transforming skewed features (period_days, rp_re, etc.)")
report.append("4. **Imbalanced classes**: Apply SMOTE or class weighting if false positives significantly outnumber confirmed planets")
report.append("5. **Missing values**: Features with <50% completeness may need removal or careful imputation validation")

# Save report
with open('merge_report.md', 'w', encoding='utf-8') as f:
    f.write("\n".join(report))
print("✓ Saved: merge_report.md")

# 4. Save unified dataset as Parquet (efficient format)
print("Saving unified_exoplanets.parquet...")
unified_df.to_parquet('unified_exoplanets.parquet', index=False)
print(f"✓ Saved: unified_exoplanets.parquet ({unified_df.shape[0]} rows)")

# 5. Save unified dataset as CSV
print("Saving unified_exoplanets.csv...")
unified_df.to_csv('unified_exoplanets.csv', index=False)
print(f"✓ Saved: unified_exoplanets.csv")

# 6. Save training dataset (confirmed labels only)
print("Saving unified_exoplanets_train.csv...")
train_df = unified_df[unified_df['label'].isin([0, 1])].copy()
train_df.to_csv('unified_exoplanets_train.csv', index=False)
print(f"✓ Saved: unified_exoplanets_train.csv ({len(train_df)} rows with confirmed labels)")

# Summary
print("\n" + "="*80)
print("PROFESSIONAL PIPELINE COMPLETE")
print("="*80)
print(f"\nGenerated artifacts:")
print(f"  1. unified_schema.yaml - Feature schema with dtypes and units")
print(f"  2. rename_maps.json - Column mapping dictionaries")
print(f"  3. merge_report.md - Data quality report")
print(f"  4. unified_exoplanets.parquet - Full unified dataset (Parquet)")
print(f"  5. unified_exoplanets.csv - Full unified dataset (CSV)")
print(f"  6. unified_exoplanets_train.csv - Training subset (confirmed labels only)")
print(f"\nTotal unified records: {len(unified_df):,}")
print(f"Training records: {len(train_df):,}")
print(f"Confirmed planets: {(train_df['label'] == 1).sum():,}")
print(f"False positives: {(train_df['label'] == 0).sum():,}")
print(f"Total features: {len(unified_df.columns)}")


=== PROFESYONEL ARTIFACT EXPORT ===
Saving unified_schema.yaml...
✓ Saved: unified_schema.yaml
Saving rename_maps.json...
✓ Saved: rename_maps.json
Generating merge_report.md...
Created missing 'is_candidate' column based on label values
✓ Saved: merge_report.md
Saving unified_exoplanets.parquet...
✓ Saved: unified_exoplanets.parquet (21271 rows)
Saving unified_exoplanets.csv...
✓ Saved: unified_exoplanets.csv
Saving unified_exoplanets_train.csv...
✓ Saved: unified_exoplanets_train.csv (12194 rows with confirmed labels)

PROFESYONEL PIPELINE COMPLETE

Generated artifacts:
  1. unified_schema.yaml - Feature schema with dtypes and units
  2. rename_maps.json - Column mapping dictionaries
  3. merge_report.md - Data quality report
  4. unified_exoplanets.parquet - Full unified dataset (Parquet)
  5. unified_exoplanets.csv - Full unified dataset (CSV)
  6. unified_exoplanets_train.csv - Training subset (confirmed labels only)

Total unified records: 21,271
Training records: 12,194
Confirme

In [74]:
# SMART FEATURE SELECTION
print("=== SMART FEATURE SELECTION ===")

# 1. TRAINING DATA: Sadece confirmed + false positive
print("\n1. TRAINING DATA HAZIRLIĞI")
train_data = unified_df[unified_df['label'].isin([0, 1])].copy()
print(f"Training data: {train_data.shape}")
print(f"Confirmed planets: {(train_data['label'] == 1).sum()}")
print(f"False positives: {(train_data['label'] == 0).sum()}")

# 2. CANDIDATE DATA: Tahmin edilecek veriler
print("\n2. CANDIDATE DATA HAZIRLIĞI")
candidate_data = unified_df[unified_df['label'] == -1].copy()
print(f"Candidate data: {candidate_data.shape}")
print(f"Candidates to predict: {len(candidate_data)}")

# 3. SMART FEATURE SELECTION
print("\n3. SMART FEATURE SELECTION")
# Get all numeric features
all_numeric_features = [col for col in unified_df.columns 
                       if col not in ['star_id', 'planet_id', 'source_dataset', 'label', 'is_candidate']
                       and unified_df[col].dtype in ['float64', 'int64', 'int8', 'int16']]

print(f"All numeric features: {len(all_numeric_features)}")

# Feature correlation analysis
print("\n4. FEATURE CORRELATION ANALYSIS")
correlation_matrix = train_data[all_numeric_features].corr().abs()
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        if correlation_matrix.iloc[i, j] > 0.95:  # Very high correlation
            high_corr_pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))

print(f"High correlation pairs (>0.95): {len(high_corr_pairs)}")
if len(high_corr_pairs) > 0:
    print("Top 5 high correlation pairs:")
    for pair in high_corr_pairs[:5]:
        print(f"  {pair[0]} <-> {pair[1]}: {pair[2]:.3f}")

# Remove highly correlated features
features_to_remove = set()
for pair in high_corr_pairs:
    # Keep the feature with higher variance
    var1 = train_data[pair[0]].var()
    var2 = train_data[pair[1]].var()
    if var1 > var2:
        features_to_remove.add(pair[1])
    else:
        features_to_remove.add(pair[0])

print(f"Removing {len(features_to_remove)} highly correlated features")
selected_features = [col for col in all_numeric_features if col not in features_to_remove]

# 5. FEATURE IMPORTANCE WITH QUICK RANDOM FOREST
print("\n5. FEATURE IMPORTANCE ANALYSIS")
from sklearn.ensemble import RandomForestClassifier

# Quick feature importance analysis
rf_selector = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
X_temp = train_data[selected_features].fillna(train_data[selected_features].median())
y_temp = train_data['label']

rf_selector.fit(X_temp, y_temp)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': rf_selector.feature_importances_
}).sort_values('importance', ascending=False)

# Select top features (keep top 70% or minimum 20 features)
n_features = max(20, int(len(selected_features) * 0.7))
top_features = feature_importance.head(n_features)['feature'].tolist()

print(f"Selected {len(top_features)} most important features")
print(f"Top 10 features: {top_features[:10]}")

# 6. FINAL FEATURE PREPARATION
print("\n6. FINAL FEATURE PREPARATION")
numeric_features = top_features

print(f"Final numeric features: {len(numeric_features)}")

# Prepare training features
X_train = train_data[numeric_features].fillna(train_data[numeric_features].median())
y_train = train_data['label']

# Prepare candidate features
X_candidates = candidate_data[numeric_features].fillna(candidate_data[numeric_features].median())

print(f"Training X shape: {X_train.shape}")
print(f"Training y shape: {y_train.shape}")
print(f"Candidate X shape: {X_candidates.shape}")

print("\n✅ Smart feature selection complete!")
print("Next: XGBoost model training and candidate prediction...")


=== SMART FEATURE SELECTION ===

1. TRAINING DATA HAZIRLIĞI
Training data: (12194, 34)
Confirmed planets: 5745
False positives: 6449

2. CANDIDATE DATA HAZIRLIĞI
Candidate data: (9077, 34)
Candidates to predict: 9077

3. SMART FEATURE SELECTION
All numeric features: 29

4. FEATURE CORRELATION ANALYSIS
High correlation pairs (>0.95): 1
Top 5 high correlation pairs:
  impact_param <-> ror_ratio: 0.979
Removing 1 highly correlated features

5. FEATURE IMPORTANCE ANALYSIS
Selected 20 most important features
Top 10 features: ['fp_flag_co', 'fp_flag_ss', 'fp_flag_nt', 'rp_re', 'dor_ratio', 't0_bjd', 'num_transits', 'snr', 'radius_solar', 'teq_k']

6. FINAL FEATURE PREPARATION
Final numeric features: 20
Training X shape: (12194, 20)
Training y shape: (12194,)
Candidate X shape: (9077, 20)

✅ Smart feature selection complete!
Next: XGBoost model training and candidate prediction...


In [None]:
# DATA SAVE - CSV FORMAT
print("=== DATA SAVE ===")

# 1. Save training data
print("\n1. Saving training data...")
train_data.to_csv('training_data.csv', index=False)
print(f"✅ Training data saved: {train_data.shape}")

# 2. Save candidate data
print("\n2. Saving candidate data...")
candidate_data.to_csv('candidate_data.csv', index=False)
print(f"✅ Candidate data saved: {candidate_data.shape}")

# 3. Save feature matrices
print("\n3. Saving feature matrices...")
# Save X_train and y_train as a DataFrame
train_features_df = pd.DataFrame(X_train, columns=numeric_features)
train_features_df['label'] = y_train
train_features_df.to_csv('X_train_features.csv', index=False)
print(f"✅ Training features saved: {train_features_df.shape}")

# Save X_candidates
candidate_features_df = pd.DataFrame(X_candidates, columns=numeric_features)
candidate_features_df.to_csv('X_candidates_features.csv', index=False)
print(f"✅ Candidate features saved: {candidate_features_df.shape}")

# 4. Save unified dataset (updated)
print("\n4. Saving unified dataset...")
unified_df.to_csv('unified_dataset_final.csv', index=False)
print(f"✅ Unified dataset saved: {unified_df.shape}")

print("\n✅ All data saved in CSV format!")
print("Files:")
print("  - training_data.csv")
print("  - candidate_data.csv") 
print("  - X_train_features.csv")
print("  - X_candidates_features.csv")
print("  - unified_dataset_final.csv")


=== VERİ KAYDETME ===

1. Training data kaydediliyor...
✅ Training data kaydedildi: (12194, 34)

2. Candidate data kaydediliyor...
✅ Candidate data kaydedildi: (9077, 34)

3. Feature matrices kaydediliyor...
✅ Training features kaydedildi: (12194, 21)
✅ Candidate features kaydedildi: (9077, 20)

4. Unified dataset kaydediliyor...
✅ Unified dataset kaydedildi: (21271, 34)

✅ Tüm veriler CSV formatında kaydedildi!
Dosyalar:
  - training_data.csv
  - candidate_data.csv
  - X_train_features.csv
  - X_candidates_features.csv
  - unified_dataset_final.csv
