In [2]:
from config import *

In [4]:
# Load dataset
df = pd.read_csv("../data/cleaned_planet_data.csv")
print(df.columns)

Index(['P_MASS', 'P_RADIUS', 'P_SEMI_MAJOR_AXIS', 'P_ECCENTRICITY',
       'P_IMPACT_PARAMETER', 'S_DISTANCE', 'S_MASS', 'S_RADIUS', 'S_AGE',
       'S_AGE_ERROR_MIN', 'S_AGE_ERROR_MAX', 'S_TEMPERATURE', 'P_ESCAPE',
       'P_POTENTIAL', 'P_GRAVITY', 'P_DENSITY', 'P_DISTANCE', 'P_FLUX',
       'P_TEMP_EQUIL', 'P_TEMP_SURF', 'S_LUMINOSITY', 'S_HZ_CON0_MAX',
       'S_ABIO_ZONE', 'S_TIDAL_LOCK', 'P_HABZONE_OPT', 'P_HABZONE_CON',
       'P_HABITABLE', 'P_ESI', 'CEESA'],
      dtype='object')


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5219 entries, 0 to 5218
Data columns (total 29 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   P_MASS              5219 non-null   float64
 1   P_RADIUS            5219 non-null   float64
 2   P_SEMI_MAJOR_AXIS   5219 non-null   float64
 3   P_ECCENTRICITY      5219 non-null   float64
 4   P_IMPACT_PARAMETER  5219 non-null   float64
 5   S_DISTANCE          5219 non-null   float64
 6   S_MASS              5219 non-null   float64
 7   S_RADIUS            5219 non-null   float64
 8   S_AGE               5219 non-null   float64
 9   S_AGE_ERROR_MIN     5219 non-null   float64
 10  S_AGE_ERROR_MAX     5219 non-null   float64
 11  S_TEMPERATURE       5219 non-null   float64
 12  P_ESCAPE            5219 non-null   float64
 13  P_POTENTIAL         5219 non-null   float64
 14  P_GRAVITY           5219 non-null   float64
 15  P_DENSITY           5219 non-null   float64
 16  P_DIST

### Normalizing the columns

In [5]:
from sklearn.preprocessing import StandardScaler

# Identify numeric columns automatically
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Remove target column from numeric columns
cols_to_exclude = ["P_HABITABLE","P_ESI","CEESA","P_HABZONE_CON","P_HABZONE_OPT"]
for col in cols_to_exclude:
    if col in numeric_cols:
        numeric_cols.remove(col)

# Create scaler object
scaler = StandardScaler()

# Fit on training numeric data and transform both train/test later
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Show sample output
df.head()

Unnamed: 0,P_MASS,P_RADIUS,P_SEMI_MAJOR_AXIS,P_ECCENTRICITY,P_IMPACT_PARAMETER,S_DISTANCE,S_MASS,S_RADIUS,S_AGE,S_AGE_ERROR_MIN,...,P_TEMP_SURF,S_LUMINOSITY,S_HZ_CON0_MAX,S_ABIO_ZONE,S_TIDAL_LOCK,P_HABZONE_OPT,P_HABZONE_CON,P_HABITABLE,P_ESI,CEESA
0,1.353895,1.212411,-0.041017,1.040274,0.438135,-0.582114,3.861587,4.02591,0.314205,-0.751818,...,0.054816,1.489149,4.536053,-0.024618,2.793402,0.0,0.0,0.0,0.083004,7.41912
1,0.998798,1.233453,-0.039243,0.019365,0.574099,-0.54911,4.039359,6.532509,-1.021311,1.344374,...,0.097682,2.334159,6.019105,-0.024618,1.484872,0.0,0.0,0.0,0.081792,6.696099
2,0.24136,1.359705,-0.044416,-0.521513,-0.055607,-0.600338,2.750514,2.167449,0.046411,0.211342,...,-0.037427,0.438891,2.379354,-0.024618,2.214053,0.0,0.0,0.0,0.081758,4.16652
3,0.495781,1.296579,-0.030056,1.998231,0.884954,-0.659397,-0.116054,-0.171889,-0.171492,-0.811468,...,-1.352668,-0.074742,-0.199411,-0.024618,0.09737,0.0,0.0,0.0,0.16331,3.362421
4,0.011233,1.464915,-0.038282,4.075955,-0.150845,-0.656103,0.261711,-0.125428,1.099605,1.710943,...,-1.218946,-0.069832,-0.112736,-0.024618,1.103524,1.0,1.0,0.0,0.368405,2.003263


### Splitting the data

In [6]:
# Separate features and target
X = df.drop(columns=["P_HABITABLE"])   # independent features
y = df["P_HABITABLE"]                  # target

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display output shapes
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)


Training features shape: (4175, 28)
Testing features shape: (1044, 28)
Training labels shape: (4175,)
Testing labels shape: (1044,)


### Handle class imbalance

In [14]:
# Apply SMOTE only on training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", X_train.shape, y_train.value_counts())
print("After SMOTE:", X_train_resampled.shape, y_train_resampled.value_counts())

Before SMOTE: (4175, 28) P_HABITABLE
0.0    4126
2.0      31
1.0      18
Name: count, dtype: int64
After SMOTE: (12378, 28) P_HABITABLE
0.0    4126
2.0    4126
1.0    4126
Name: count, dtype: int64
