In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
import os

# Load Data and Base Features
# Directory holding 'train_log.csv' and one sub-directory per split; the path is
# relative, so it is resolved against the kernel's current working directory.
DATA_PATH = '../data/'

In [None]:
# Object-level log; its 'split' column names the sub-directories that hold the light curves.
train_log_df = pd.read_csv(os.path.join(DATA_PATH, 'train_log.csv'))

# The pre-computed feature tables live under the project root. The default is the
# original author's location, so existing setups behave exactly as before; set the
# MALLORN_PROJECT_ROOT environment variable to run the notebook on another machine.
PROJECT_ROOT = os.environ.get(
    'MALLORN_PROJECT_ROOT',
    r"C:\Project\MALLORN Astronomical Classification Challenge",
)
base_stats_df = pd.read_csv(os.path.join(PROJECT_ROOT, 'train_features.csv'))
tsfresh_features_df = pd.read_csv(os.path.join(PROJECT_ROOT, 'notebooks', 'final_features.csv'))

# One light-curve file per distinct split listed in the log.
all_lc_df_list = [
    pd.read_csv(os.path.join(DATA_PATH, s, 'train_full_lightcurves.csv'))
    for s in train_log_df['split'].unique()
]

# dropna() removes every row that has a missing value in ANY column.
# NOTE(review): confirm that dropping whole observations is intended rather than
# only rows with a missing time or flux.
full_lc_df = pd.concat(all_lc_df_list).dropna()
print("All necessary data and feature sets loaded successfully.")


All necessary data and feature sets loaded successfully.


In [None]:
print("\n--- Engineering HIGH-QUALITY Color features (No Interpolation) ---")

# Largest gap on the 'Time (MJD)' axis across which observations from two filters
# are still paired (±1 MJD, i.e. about one day).
COLOR_MATCH_TOLERANCE = 1.0

# (feature prefix, left filter, right filter); each feature is Flux(left) - Flux(right).
COLOR_PAIRS = [('g_minus_r', 'g', 'r'), ('r_minus_i', 'r', 'i')]


def _paired_flux_difference(left_df, right_df, left_filter, right_filter,
                            tolerance=COLOR_MATCH_TOLERANCE):
    """Pair each left-filter observation with the nearest-in-time right-filter one.

    Both frames must already be sorted by 'Time (MJD)' (required by merge_asof).
    Left rows with no right-filter observation within `tolerance` get NaN.

    Returns a Series of Flux(left) - Flux(right), one entry per left row.
    """
    paired = pd.merge_asof(
        left_df, right_df, on='Time (MJD)', direction='nearest',
        suffixes=(f'_{left_filter}', f'_{right_filter}'), tolerance=tolerance,
    )
    return paired[f'Flux_{left_filter}'] - paired[f'Flux_{right_filter}']


color_features = []

for obj_id, group in tqdm(full_lc_df.groupby('object_id'), desc="Calculating Robust Color Features"):
    group = group.sort_values('Time (MJD)')

    # Slice each filter once; only the filters that take part in a colour are needed.
    by_filter = {
        flt: group[group['Filter'] == flt]
        for flt in {f for _, left, right in COLOR_PAIRS for f in (left, right)}
    }

    obj_stats = {'object_id': obj_id}
    for color, left_filter, right_filter in COLOR_PAIRS:
        diff = _paired_flux_difference(
            by_filter[left_filter], by_filter[right_filter], left_filter, right_filter
        )
        if not diff.empty:
            # pandas skips NaN (unmatched) pairs in these reductions.
            obj_stats[f'{color}_mean'] = diff.mean()
            obj_stats[f'{color}_std'] = diff.std()
            obj_stats[f'{color}_skew'] = diff.skew()
        else:
            # The object has no observation at all in the left filter.
            obj_stats[f'{color}_mean'] = np.nan
            obj_stats[f'{color}_std'] = np.nan
            obj_stats[f'{color}_skew'] = np.nan

    color_features.append(obj_stats)

# One row per object_id: mean/std/skew for each colour in COLOR_PAIRS.
color_features_df = pd.DataFrame(color_features)



--- Engineering HIGH-QUALITY Color features (No Interpolation) ---


Calculating Robust Color Features: 100%|██████████| 3043/3043 [00:07<00:00, 404.67it/s]


In [None]:
print("\n--- Creating Master Feature Set ---")

# From the base statistics keep the join key plus only those columns the tsfresh
# table does not already provide, so the merge creates no duplicated columns.
tsfresh_columns = tsfresh_features_df.columns
base_stats_cols = [
    col for col in base_stats_df.columns
    if col == 'object_id' or col not in tsfresh_columns
]

# Left joins keep exactly the rows of the tsfresh table. The final fillna(0)
# replaces every remaining NaN with 0, in all columns (not only numeric ones).
master_df = (
    tsfresh_features_df
    .merge(base_stats_df[base_stats_cols], on='object_id', how='left')
    .merge(color_features_df, on='object_id', how='left')
    .fillna(0)
)
print(f"Final Master Feature Set Shape: {master_df.shape}")



--- Creating Master Feature Set ---
Final Master Feature Set Shape: (3043, 244)


In [None]:
from sklearn.preprocessing import StandardScaler

print("\n--- Final Training with Master Feature Set ---")

# Separate the feature matrix from the label column.
X = master_df.drop(columns=['object_id', 'target'])
y = master_df['target']

# Replace every non-alphanumeric character of a column name with "_"
# (presumably so that LightGBM accepts the names - TODO confirm).
X.columns = [
    "".join(ch if ch.isalnum() else "_" for ch in str(name))
    for name in X.columns
]

# Partition the columns: numeric ones are standardised, the rest pass through untouched.
numeric_cols = X.select_dtypes(include=['number']).columns
X_numeric = X[numeric_cols]
non_numeric_df = X.drop(columns=numeric_cols)

# NOTE(review): the scaler is fitted on all rows, i.e. before the cross-validation
# split performed in a later cell, so validation rows contribute to the statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)
X_scaled_df = pd.DataFrame(X_scaled, columns=numeric_cols, index=X.index)

# Re-assemble: scaled numeric columns first, then the non-numeric columns.
X = pd.concat([X_scaled_df, non_numeric_df], axis=1)

print(f"Scaled {len(numeric_cols)} numeric features. Kept {len(non_numeric_df.columns)} non-numeric columns unchanged.")

# LightGBM hyperparameters taken from the Optuna search, one setting per line.
best_params = {
    'learning_rate': 0.03610120340507472,
    'num_leaves': 120,
    'max_depth': 11,
    'min_child_samples': 80,
    # NOTE(review): LightGBM only bags rows when subsample_freq (bagging_freq) > 0;
    # none is set here, so 'subsample' may have no effect - confirm.
    'subsample': 0.5577339749020074,
    'colsample_bytree': 0.5736019643417907,
    'reg_alpha': 0.31579743846984376,
    'reg_lambda': 0.3170517208465995,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'n_estimators': 2000,
    # NOTE(review): presumably requires a GPU-enabled LightGBM build - confirm.
    'device': 'gpu',
    'verbose': -1,
}



--- Final Training with Master Feature Set ---
Scaled 240 numeric features. Kept 2 non-numeric columns unchanged.


In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import lightgbm as lgb
import numpy as np

# Encode object/categorical columns as integer codes so LightGBM can consume them.
X_fixed = X.copy()
for col in X_fixed.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_fixed[col] = le.fit_transform(X_fixed[col].astype(str))

# Cross-validation setup
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
f1_scores = []

# Candidate decision thresholds; identical for every fold, so built once.
thresholds = np.linspace(0.01, 0.99, 100)
# Out-of-fold positive-class probabilities, filled in fold by fold (positional order of y).
oof_proba = np.zeros(len(y))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_fixed, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    X_train, y_train = X_fixed.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_fixed.iloc[val_idx], y.iloc[val_idx]

    # Handle class imbalance: weight positives by the negative/positive ratio of
    # this fold's training part. The value is stored in best_params, as before,
    # so the dict keeps the last fold's weight after the loop.
    class_counts = y_train.value_counts()
    pos_count = class_counts.get(1, 0)
    neg_count = class_counts.get(0, 0)
    best_params['scale_pos_weight'] = neg_count / pos_count if pos_count > 0 else 1

    # Train LightGBM. 'f1' is not a built-in LightGBM metric name, so the former
    # eval_metric='f1' never drove early stopping; the metric configured in
    # best_params is what is monitored, and it is now passed explicitly.
    model = lgb.LGBMClassifier(**best_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=best_params['metric'],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    # Threshold tuning. The threshold is chosen on the same fold it is scored on,
    # so this per-fold "best" F1 is an optimistic estimate.
    val_preds_proba = model.predict_proba(X_val)[:, 1]
    oof_proba[val_idx] = val_preds_proba
    f1_values = [f1_score(y_val, (val_preds_proba > t).astype(int)) for t in thresholds]
    best_f1 = np.max(f1_values)
    f1_scores.append(best_f1)
    print(f"Best F1 Score for fold: {best_f1:.4f}")

print("\n--- Final Summary (Master Feature Set) ---")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")

# Less optimistic companion number: one threshold shared by all out-of-fold predictions.
oof_f1_values = [f1_score(y, (oof_proba > t).astype(int)) for t in thresholds]
best_oof_idx = int(np.argmax(oof_f1_values))
print(
    f"OOF F1 Score at single threshold {thresholds[best_oof_idx]:.2f}: "
    f"{oof_f1_values[best_oof_idx]:.4f}"
)


--- Fold 1/5 ---
Best F1 Score for fold: 0.3951
--- Fold 2/5 ---
Best F1 Score for fold: 0.5818
--- Fold 3/5 ---
Best F1 Score for fold: 0.5909
--- Fold 4/5 ---
Best F1 Score for fold: 0.4731
--- Fold 5/5 ---
Best F1 Score for fold: 0.5614

--- Final Summary (Master Feature Set) ---
Mean F1 Score: 0.5205
