This is everything from model.ipynb but with SMOTE applied. Refer to model.ipynb for explanations.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from IPython.display import Image
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna
from scipy.stats import kurtosis
from scipy.stats import entropy
from imblearn.over_sampling import SMOTE

df = pd.read_csv('star_classification.csv')
df = df.drop(['obj_ID','alpha','delta','run_ID','rerun_ID','cam_col','field_ID','fiber_ID','spec_obj_ID', 'plate', 'MJD'], axis=1)
df['class'] = df['class'].map({'GALAXY':0, 'STAR':1, 'QSO': 2})
df = df[df['u'] > -4000]
df = df[df['r'] < 26]
df = df[df['i'] < 26]
df = df[df['g'] < 27.5]
df = df[df['z'] < 24.5]
df = df[df['u'] > 13]
df = df[df['g'] > 12]
df = df[df['z'] > 11]

# Color indices — common in astronomy, indicate object temperature/type
df['u_g'] = df['u'] - df['g']  # Ultraviolet to green difference
df['g_r'] = df['g'] - df['r']  # Green to red difference
df['r_i'] = df['r'] - df['i']  # Red to near-infrared difference
df['i_z'] = df['i'] - df['z']  # Near-IR to IR difference

# Composite color indices — capture broader color range
df['u_r'] = df['u'] - df['r']  # UV to red
df['g_z'] = df['g'] - df['z']  # Green to IR

# Mean flux — average brightness across all bands, can reflect luminosity
df['mean_flux'] = df[['u', 'g', 'r', 'i', 'z']].mean(axis=1)

# Brightest and faintest filter — useful for contrast information
df['max_flux'] = df[['u', 'g', 'r', 'i', 'z']].max(axis=1)  # Brightest band
df['min_flux'] = df[['u', 'g', 'r', 'i', 'z']].min(axis=1)  # Faintest band

# Flux range — dynamic range of object's brightness
df['flux_range'] = df['max_flux'] - df['min_flux']

# Slope from UV to IR — indicates how brightness changes across spectrum
df['slope_u_z'] = (df['u'] - df['z']) / (df['u'] + df['z'] + 1e-5)

# Photometric curvature — measure of color curvature (nonlinearity)
df['photometric_curvature'] = df['g'] - 2 * df['r'] + df['i']

# Red/blue light ratio — can distinguish redder (older) vs bluer (younger) objects
df['red_blue_ratio'] = (df['i'] + df['z']) / (df['u'] + df['g'] + 1e-5)

# Standard deviation of magnitudes — measures variability across bands
df['color_std'] = df[['u', 'g', 'r', 'i', 'z']].std(axis=1)

# r-band centered — how r differs from average of g and i
df['r_band_centered'] = df['r'] - (df['g'] + df['i']) / 2

# Fractional fluxes — percent contribution of each band to total flux
total_flux = df[['u', 'g', 'r', 'i', 'z']].sum(axis=1) + 1e-5
df['u_frac'] = df['u'] / total_flux  # UV percent
df['g_frac'] = df['g'] / total_flux  # Green percent
df['r_frac'] = df['r'] / total_flux  # Red percent
df['i_frac'] = df['i'] / total_flux  # Near-IR percent
df['z_frac'] = df['z'] / total_flux  # IR percent

# Kurtosis of flux — measures 'peakedness' of flux distribution across bands
df['flux_kurtosis'] = df[['u', 'g', 'r', 'i', 'z']].kurtosis(axis=1)

# Color curvature — cumulative second derivative of color gradients
df['color_curvature'] = (df['u'] - 2 * df['g'] + df['r']) + \
                        (df['g'] - 2 * df['r'] + df['i']) + \
                        (df['r'] - 2 * df['i'] + df['z'])

#Tilt — overall shift from blue (u, g) to red (i, z) side
df['tilt'] = (df['i'] + df['z']) - (df['u'] + df['g'])

# Entropy of flux distribution — how evenly light is spread across bands
fluxes = df[['u', 'g', 'r', 'i', 'z']].values
fluxes = fluxes / (fluxes.sum(axis=1, keepdims=True) + 1e-8)
df['flux_entropy'] = entropy(fluxes.T)

y = df['class']
X = df.drop('class', axis=1)

sm = SMOTE(random_state=62)
X_res, y_res = sm.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=62)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (142612, 30)
X_test shape: (35654, 30)
y_train shape: (142612,)
y_test shape: (35654,)


LGBM Classifier with SMOTE. Parameters found in model.ipynb using optuna.

In [2]:
lg = LGBMClassifier(n_estimators= 506,
                    max_depth= 10,
                    learning_rate= 0.10775347157171246,
                    subsample= 0.9999465294210166,
                    colsample_bytree= 0.5960237817125218,
                    reg_alpha= 4.831241641356589,
                    reg_lambda= 2.6641968551150597,
                    min_child_samples= 13, 
                    random_state=62,
                    verbose=-1,
                    boosting_type='gbdt',
                    num_threads=-1,
                    device_type='cpu').fit(X_train, y_train)
y_pred = lg.predict(X_test)
print(f"Train Accuracy: {cross_val_score(lg, X_train, y_train)}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

Train Accuracy: [0.9819444  0.98208463 0.9817334  0.98124255 0.98166328]
Test Accuracy: 0.983732540528412


Train accuracy: 0.9817

XGB Classifier with SMOTE. Parameters found in model.ipynb using optuna.

In [3]:
xgb = XGBClassifier(n_estimators= 608,
                    max_depth= 12, 
                    learning_rate= 0.16907078997907377, 
                    subsample= 0.7064213244955417, 
                    colsample_bytree= 0.5513661838203805, 
                    reg_alpha= 3.4094709113646964, 
                    reg_lambda= 4.010194573368155, 
                    gamma= 1.2626609855347377, 
                    min_child_weight= 14,
                    random_state=62,
                    tree_method='hist',
                    n_jobs=-1).fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(f"Train Accuracy: {cross_val_score(xgb, X_train, y_train)}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

Train Accuracy: [0.97942012 0.97920976 0.97850782 0.97773648 0.97815721]
Test Accuracy: 0.9802266225388456


Train accuracy: 0.9786

In [None]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 200, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "od_type": "Iter",
        "od_wait": 50,
        "task_type": "CPU",
        "verbose": 0,
        "random_state": 62,
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, val_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        accuracies.append(accuracy_score(y_val, preds))

    return np.mean(accuracies)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("Best parameters:", study.best_params)

best_model = CatBoostClassifier(**study.best_params, random_state=62, verbose=0)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))


Best Score: 0.9827433791945758 and parameters: {'iterations': 996, 'learning_rate': 0.29936701106349517, 'depth': 10, 'l2_leaf_reg': 1.1585276709003958, 'border_count': 205, 'random_strength': 9.845839240524949, 'bagging_temperature': 0.44028243154808044}

In [7]:
cb = CatBoostClassifier(iterations= 996,
                        learning_rate= 0.29936701106349517, 
                        depth= 10, 
                        l2_leaf_reg= 1.1585276709003958, 
                        border_count= 205, 
                        random_strength= 9.845839240524949, 
                        bagging_temperature= 0.44028243154808044,
                        random_state=62,
                        verbose=0).fit(X_train, y_train)
y_pred = cb.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")

Test Accuracy: 0.9856117125708196
