In [1]:
import pandas as pd
import numpy as np

# Read the engineered training features and their labels from ../data.
X_engineered = pd.read_csv("../data/X_engineered_train.csv")

# The label file is read as a frame and flattened into a 1-D array.
y_frame = pd.read_csv("../data/y_train.csv")
y_encoded = y_frame.to_numpy().ravel()

# Report both shapes so a row-count mismatch is visible immediately.
print("Loaded X:", X_engineered.shape)
print("Loaded y:", y_encoded.shape)

Loaded X: (7352, 574)
Loaded y: (7352,)


In [2]:
# Feature Selection
# Variance Threshold
from sklearn.feature_selection import VarianceThreshold

# Filter out low-variance features (threshold 0.01). The selector is fitted on
# the engineered training features and then applied to that same frame.
vt = VarianceThreshold(threshold=0.01)
vt.fit(X_engineered)
X_vt = vt.transform(X_engineered)

print("After variance threshold:", X_vt.shape)

After variance threshold: (7352, 570)


In [3]:
# Correlation Pruning
# Drop every feature whose absolute correlation with an earlier feature
# exceeds CORR_THRESHOLD, keeping the earlier feature of each such pair.
CORR_THRESHOLD = 0.95

# The variance filter returned a bare array, so label each surviving column
# with its position in X_engineered. Default 0..n-1 labels would instead be
# positions in the already-filtered array, and any feature indices derived
# from them could not be applied to the original feature matrix.
X_vt_df = pd.DataFrame(X_vt, columns=vt.get_support(indices=True))

corr = X_vt_df.corr().abs()

# Strict upper triangle (k=1): column c keeps only its correlations with the
# columns that precede it, so each pair is examined once and the diagonal is
# ignored. Masked cells become NaN.
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))

# Column-wise any(); NaN cells compare False and therefore never trigger a drop.
to_drop = upper.columns[(upper > CORR_THRESHOLD).any().to_numpy()].tolist()
X_uncorr = X_vt_df.drop(columns=to_drop)

print("After correlation pruning:", X_uncorr.shape)

After correlation pruning: (7352, 253)


In [4]:
# Model-Based Selection
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest on the pruned features; the seed is fixed so
# the fit is repeatable, and n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_uncorr, y_encoded)

# Attach each importance score to its column label, then rank high-to-low.
importances = pd.Series(rf.feature_importances_, index=X_uncorr.columns)
importances = importances.sort_values(ascending=False)

# Show the ten highest-ranked features.
importances.head(10)

40     0.060339
41     0.057231
57     0.048464
3      0.029600
42     0.028170
96     0.024128
58     0.021894
303    0.019768
37     0.019311
73     0.014986
dtype: float64

In [5]:
# Final Reduced Feature Set
# Keep the TOP_K columns that rank highest in `importances`, in ranked order.
TOP_K = 150
top_labels = importances.index[:TOP_K]
X_selected = X_uncorr.loc[:, top_labels]

print("Final selected features:", X_selected.shape)

Final selected features: (7352, 150)


In [9]:
# Save selected feature INDICES
import joblib
# Column labels of the final feature frame, as a plain Python list.
selected_feature_indices = X_selected.columns.tolist()

# Serialize the list with joblib into the ../model directory.
joblib.dump(
    selected_feature_indices,
    "../model/selected_features.pkl",
)

print("✅ Selected feature INDICES saved:", len(selected_feature_indices))

✅ Selected feature INDICES saved: 150


In [10]:
import os
# List the entries of ../model to confirm which artifacts are present.
model_dir_entries = os.listdir("../model")
print(model_dir_entries)

['.ipynb_checkpoints', 'extra_trees_model.pkl', 'label_encoder.pkl', 'selected_features.pkl']
