In [None]:
# 1) Load labeled dataset and initial features
import pandas as pd
from pathlib import Path
# Read the labeled feature table; the first CSV column is used as the index
# and converted to datetimes immediately after loading.
df = pd.read_csv('data/processed/sp500_features_labeled.csv', index_col=0)
df.index = pd.to_datetime(df.index)

# Every column except the 'label' and 'signal' columns is treated as a model input.
FEATURES = [name for name in df.columns if name != 'label' and name != 'signal']

print(f'Features: {FEATURES}')
print(f'Rows: {df.shape[0]}')

In [None]:
# 2) Permutation importance (uses trained model)
from IPython.display import Image, display
from src.models.feature_analysis import run as perm_run

# Run the feature-analysis entry point, then show the importance plot.
# NOTE(review): presumably perm_run() is what writes logs/perm_importance.png — confirm.
perm_run()
perm_importance_plot = Image('logs/perm_importance.png')
display(perm_importance_plot)

In [None]:
# 3) SHAP summary (if generated)
import os
shap_path = Path('logs/shap_summary.png')

# The SHAP plot is optional: report its absence first, otherwise render it.
if not shap_path.exists():
    print('SHAP summary not available. Run src/models/feature_analysis with SHAP installed.')
else:
    display(Image(str(shap_path)))

In [None]:
# 4) Simple iterative feature selection: retrain XGBoost, then drop the feature with the lowest model importance (feature_importances_)
import joblib
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

def retrain_and_eval(df, features):
    """Retrain an XGBoost classifier on a positional split and print test metrics.

    The split is positional (``iloc``): the first 70% of rows are used for
    training and the rows after the 85% mark for testing. The 15% slice in
    between is not used by this function. Because the split depends on row
    order, ``df`` should already be in the order the split is meant to respect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features`` plus a ``'label'``
        column holding the target.
    features : list
        Column names to use as model inputs.

    Returns
    -------
    XGBClassifier
        The fitted model. It is trained on integer-encoded labels
        (0..K-1 in sorted label order), so calling ``predict`` on it directly
        yields those codes rather than the original label values.

    Side effects
    ------------
    Prints a classification report for the test slice, and a notice if class
    weighting had to be skipped.
    """
    X = df[features]
    y = df['label']
    n = len(df)
    train_end = int(0.7 * n)
    val_end = train_end + int(0.15 * n)
    X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
    X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]

    # A single sorted label list drives encoding, decoding and class weights,
    # so the three can never disagree about label order.
    ordered_labels = sorted(y_train.unique())
    encode = {label: code for code, label in enumerate(ordered_labels)}
    decode = {code: label for label, code in encode.items()}

    # Balanced class weights computed on the original (un-encoded) labels.
    # `classes` is passed as an ndarray: scikit-learn's parameter validation
    # rejects a plain list, which previously made this step fail and silently
    # fall back to unweighted training.
    try:
        class_weights = compute_class_weight(
            'balanced',
            classes=pd.Series(ordered_labels).to_numpy(),
            y=y_train.to_numpy(),
        )
        weight_by_label = dict(zip(ordered_labels, class_weights))
        sample_weight = y_train.map(weight_by_label).to_numpy()
    except (ValueError, TypeError) as exc:
        # Still best-effort, but make the fallback visible instead of hiding it.
        print('Class weighting unavailable, training without sample weights:', exc)
        sample_weight = None

    model = XGBClassifier(
        n_estimators=200,
        max_depth=4,
        learning_rate=0.05,
        use_label_encoder=False,
        eval_metric='mlogloss',
    )
    model.fit(X_train, y_train.map(encode), sample_weight=sample_weight, verbose=False)

    # Map predicted codes back to the original label values before scoring.
    preds = pd.Series(model.predict(X_test)).map(decode)
    print(classification_report(y_test, preds, digits=4))
    return model

# Run iterative drop. Every pass retrains the model from scratch, so the
# number of passes is capped to avoid long runs.
MAX_DROP_STEPS = 3
features = FEATURES.copy()
for step in range(MAX_DROP_STEPS):
    print('Iteration', step, 'features count', len(features))
    m = retrain_and_eval(df, features)
    # Removing the last remaining feature would leave the next pass with
    # nothing to train on, so stop here instead of crashing.
    if len(features) <= 1:
        print('Only one feature left; stopping.')
        break
    # Rank by the model's built-in importance and drop the weakest feature.
    imp = m.feature_importances_
    # Remove by position so the dropped entry is exactly the one ranked lowest.
    weakest = features.pop(int(imp.argmin()))
    print('Dropping weakest feature:', weakest)