In [1]:
# import libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

In [2]:
df = pd.read_csv('../../data/processed/categorized_data.csv')
df.head()

Unnamed: 0,P_NAME,P_DETECTION,P_DISCOVERY_FACILITY,P_YEAR,P_MASS,P_MASS_ORIGIN,P_RADIUS,P_PERIOD,P_SEMI_MAJOR_AXIS,P_ECCENTRICITY,...,S_LUMINOSITY,S_SNOW_LINE,S_ABIO_ZONE,S_TIDAL_LOCK,P_HABZONE_OPT,P_HABZONE_CON,P_TYPE_TEMP,P_HABITABLE,P_ESI,HABITABILITY
0,OGLE-2016-BLG-1227L b,Microlensing,OGLE,2020,250.0,Mass,13.9,,3.4,,...,,,,0.214133,0,0,,0,,0
1,Kepler-276 c,Transit,Kepler,2013,16.6,Mass,2.9,31.884,0.1994,0.0,...,0.814704,2.437046,2.097783,0.31698,0,0,Hot,0,0.272032,0
2,Kepler-829 b,Transit,Kepler,2016,5.1,M-R relationship,2.11,6.883376,0.0678,0.0,...,1.096478,2.827247,1.756317,0.459559,0,0,Hot,0,0.254763,0
3,K2-283 b,Transit,K2,2018,12.2,M-R relationship,3.52,1.921036,0.0291,,...,0.299226,1.476943,0.568374,0.44376,0,0,Hot,0,0.193906,0
4,Kepler-477 b,Transit,Kepler,2016,4.94,M-R relationship,2.07,11.119907,0.0911,0.0,...,0.42462,1.759397,0.768502,0.38615,0,0,Hot,0,0.276721,0


In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
P_YEAR,5599.0,2016.189,4.527714,1992.0,2014.0,2016.0,2020.0,2024.0
P_MASS,5592.0,442.5213,2369.505,0.02,4.04,8.75,162.09249,89700.0
P_RADIUS,5592.0,5.715494,5.33145,0.31,1.78,2.78,11.9,77.342
P_PERIOD,5350.0,79940.82,5498596.0,0.09070629,4.465382,11.569511,41.505555,402000000.0
P_SEMI_MAJOR_AXIS,5595.0,6.307151,130.6763,0.0044,0.05275,0.1021,0.2846,7506.0
P_ECCENTRICITY,4822.0,0.07640731,0.1505245,0.0,0.0,0.0,0.09,0.95
P_INCLINATION,4288.0,86.86255,10.55637,0.37,86.9,88.7605,89.67,176.092
P_OMEGA,1659.0,119.026,119.4222,-233.0,28.3,104.49,210.0,395.341
S_MAG,5380.0,12.63606,3.101634,0.872,10.8375,13.466,15.009,44.61
S_DISTANCE,5578.0,696.6242,1106.374,1.30119,112.467,407.1005,852.0465,8500.0


In [4]:
df['HABITABILITY'].value_counts()

HABITABILITY
0    5529
1      70
Name: count, dtype: int64

In [5]:
# calculate the imbalance ratio
imbalance_ratio = df['HABITABILITY'].value_counts()[0] / df['HABITABILITY'].value_counts()[1]
print(f'Imbalance Ratio (Class 0 to Class 1): {imbalance_ratio:.2f}')

Imbalance Ratio (Class 0 to Class 1): 78.99


Given the imbalance ratio, we can see that the dataset is imbalanced, with significantly more non-habitable exoplanets than habitable ones. This information will be crucial when selecting appropriate modeling techniques and evaluation metrics to ensure that our model performs well across both classes.

In addition, we would like to avoid star properties leakage when spliting the train and test data.
To split train test data we try stratified groupfold method.

In [6]:
X = df.drop(columns=['HABITABILITY'])
y = df['HABITABILITY']
groups = df['S_NAME']

sgkf = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)
for train_idx, test_idx in sgkf.split(X, y, groups):
    df_train = df.iloc[train_idx]
    df_test = df.iloc[test_idx]
    break # only need the first split for train_test split

print("Total exoplanets:", len(df))
print("Train_size:", len(df_train)/len(df))
print("Test_size:", len(df_test)/len(df))
print("Train_set_total", len(df_train))
print("Test_set_total", len(df_test))

print("Train HABITABILITY distribution:")
print(df_train['HABITABILITY'].value_counts())
print("Test HABITABILITY distribution:")
print(df_test['HABITABILITY'].value_counts())

# Check for data leakage
print("Unique stars in train:", set(df_train['S_NAME'].unique()))
print("Unique stars in test:", set(df_test['S_NAME'].unique()))
print("Common stars in train and test:", set(df_train['S_NAME'].unique()) & set(df_test['S_NAME'].unique()))
print("Number of common stars:", len(set(df_train['S_NAME'].unique()) & set(df_test['S_NAME'].unique())))

Total exoplanets: 5599
Train_size: 0.6706554741918199
Test_size: 0.32934452580818
Train_set_total 3755
Test_set_total 1844
Train HABITABILITY distribution:
HABITABILITY
0    3704
1      51
Name: count, dtype: int64
Test HABITABILITY distribution:
HABITABILITY
0    1825
1      19
Name: count, dtype: int64
Unique stars in train: {'OGLE-2012-BLG-0358L', 'HD 110067', 'Kepler-1444', 'HD 163607', 'HD 100655', 'Kepler-290', 'HD 114613', 'HD 134606', 'Kepler-1293', 'TOI-1693', 'TOI-444', 'Kepler-1266', 'MOA-2009-BLG-266L', 'OGLE-2017-BLG-0173L', 'TOI-201', 'HD 216770', 'Kepler-1892', 'Kepler-727', 'HD 108874', 'AU Mic', '18 Del', 'K2-167', 'WASP-186', 'Kepler-338', 'Kepler-1282', 'Kepler-1748', 'TOI-451', 'Kepler-1093', 'Kepler-1524', 'Kepler-285', 'OGLE-2016-BLG-1067L', 'BD-00 4475', 'EPIC 246851721', 'Kepler-550', 'Kepler-593', 'Kepler-554', 'Kepler-77', 'HAT-P-65', 'Kepler-1142', 'HD 189567', 'HD 26161', 'Kepler-1180', 'Kepler-1952', 'Kepler-1684', 'Kepler-1490', 'Kepler-708', 'Kepler-644',

In [7]:
# save the splits and put the test set aside for final evaluation
df_train.to_csv("../../data/processed/train_phl.csv", index=False)
df_test.to_csv("../../data/processed/test_phl.csv", index=False)