In [1]:
import pandas as pd
import numpy as np

x_train = pd.read_csv('data/x_train.txt', sep=r'\s+', header=None)
y_train = pd.read_csv('data/y_train.txt', sep=r'\s+', header=None)[0]
x_test  = pd.read_csv('data/x_test.txt',  sep=r'\s+', header=None)

stats = []
for i in x_train.columns:
    t_mean = x_train[i].mean()
    s_mean = x_test[i].mean()
    t_var  = x_train[i].var()
    s_var  = x_test[i].var()
    stats.append({
        'feature':             i,
        'train_mean':          t_mean,
        'test_mean':           s_mean,
        'mean_diff':           s_mean - t_mean,
        'train_variance':      t_var,
        'test_variance':       s_var,
        'variance_ratio':      (s_var / t_var) if t_var>0 else np.nan,
        'corr_with_target':    x_train[i].corr(y_train)
    })

summary_df = pd.DataFrame(stats).set_index('feature')

print(summary_df.head(10))



         train_mean  test_mean  mean_diff  train_variance  test_variance  \
feature                                                                    
0         15.560411  15.507078  -0.053333       18.730704      17.260227   
1         12.650449  12.655507   0.005058       14.317654      13.704232   
2         27.750084  27.736016  -0.014067       48.258792      44.019127   
3         18.796808  18.825133   0.028325       24.323544      22.632589   
4         19.071302  18.995343  -0.075959       27.471500      25.135808   
5         11.820110  11.769083  -0.051027       13.312292      12.482966   
6         19.365360  19.355964  -0.009395       28.260874      25.857634   
7         15.602632  15.517396  -0.085235       19.669401      18.001512   
8         14.163618  14.233636   0.070018       19.810112      19.051235   
9         15.989661  16.041487   0.051826       22.978933      22.298445   

         variance_ratio  corr_with_target  
feature                                    

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp



q_low  = x_train.quantile(0.01)
q_high = x_train.quantile(0.99)

stats = []
for col in x_train.columns:
    ks_stat, ks_p = ks_2samp(x_train[col], x_test[col])
    train_outliers = ((x_train[col] < q_low[col]) | (x_train[col] > q_high[col])).sum()
    test_outliers  = ((x_test[col]  < q_low[col]) | (x_test[col]  > q_high[col])).sum()
    stats.append({
        'feature':        col,
        'ks_stat':        ks_stat,
        'ks_pvalue':      ks_p,
        'train_outliers': train_outliers,
        'test_outliers':  test_outliers,
        'train_q01':      q_low[col],
        'train_q99':      q_high[col],
        'train_min':      x_train[col].min(),
        'train_max':      x_train[col].max(),
        'test_min':       x_test[col].min(),
        'test_max':       x_test[col].max(),
    })

stats_df = pd.DataFrame(stats).set_index('feature')

print("KS statistics for top 10 ks_stat features:")
print(stats_df.sort_values('ks_stat', ascending=False).head(10))

x_train_capped = x_train.clip(lower=q_low, upper=q_high, axis=1)
x_test_capped  = x_test.clip( lower=q_low, upper=q_high, axis=1)

ks_after = []
for col in x_train.columns:
    ks2, p2 = ks_2samp(x_train_capped[col], x_test_capped[col])
    ks_after.append(ks2)
stats_df['ks_after_capping'] = ks_after

print("\nKS statistics for top 10 ks_stat features: after capping:")
top10 = stats_df.sort_values('ks_stat', ascending=False).head(10)
print(top10[['ks_stat','ks_after_capping']])


KS statistics for top 10 ks_stat features:
         ks_stat  ks_pvalue  train_outliers  test_outliers  train_q01  \
feature                                                                 
428       0.0426   0.000229             100            112   2.665452   
130       0.0368   0.002291             100            105  -2.249431   
107       0.0368   0.002291             100             97  -2.315544   
135       0.0354   0.003798             100            110  -2.354010   
394       0.0340   0.006174             100            156   0.012440   
303       0.0338   0.006607             100             76   0.008643   
296       0.0336   0.007068             100             92   0.022230   
252       0.0324   0.010504             100            108   0.025570   
34        0.0314   0.014451             100             99  -2.257608   
266       0.0308   0.017417             100            144   0.030595   

         train_q99  train_min  train_max  test_min   test_max  
feature         

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score


q_low, q_high = x_train.quantile(0.01), x_train.quantile(0.99)

stats = []
for col in x_train.columns:
    ks_stat, _ = ks_2samp(x_train[col], x_test[col])
    test_out = ((x_test[col] < q_low[col]) | (x_test[col] > q_high[col])).sum()
    stats.append({'feature': col, 'ks_stat': ks_stat, 'test_outliers': test_out})
stats_df = pd.DataFrame(stats).set_index('feature')

ks_thresh = 0.035
outlier_thresh = 120
flagged = stats_df[
    (stats_df['ks_stat'] >= ks_thresh) |
    (stats_df['test_outliers'] > outlier_thresh)
].index.tolist()
print(f"Flagged features (KS >= {ks_thresh} or test_outliers > {outlier_thresh}):\n{flagged}\n")

qt = QuantileTransformer(output_distribution='uniform', random_state=0)
combined = pd.concat([x_train[flagged], x_test[flagged]], axis=0)
qt.fit(combined)

x_train_qt = x_train.copy()
x_test_qt  = x_test.copy()
x_train_qt[flagged] = qt.transform(x_train[flagged])
x_test_qt[flagged]  = qt.transform(x_test[flagged])

X_adv = pd.concat([x_train_qt[flagged], x_test_qt[flagged]], axis=0)
y_adv = np.concatenate([np.zeros(len(x_train_qt)), np.ones(len(x_test_qt))])

X_tr, X_val, y_tr, y_val = train_test_split(
    X_adv, y_adv, test_size=0.3, random_state=0, stratify=y_adv
)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_tr, y_tr)

y_prob = rf.predict_proba(X_val)[:,1]
print(f"Adversarial AUC: {roc_auc_score(y_val, y_prob):.3f}, "
      f"Accuracy: {accuracy_score(y_val, rf.predict(X_val)):.3f}\n")

importances = pd.Series(rf.feature_importances_, index=flagged).sort_values(ascending=False)
print("Top 10 adversarial-important features:")
print(importances.head(10), "\n")

new_stats = []
for col in flagged:
    ks2, _ = ks_2samp(x_train_qt[col], x_test_qt[col])
    new_stats.append({'feature': col, 'ks_after': ks2})
new_df = pd.DataFrame(new_stats).set_index('feature')
print("KS after QuantileTransform:")
print(new_df, "\n")

to_drop = new_df[new_df['ks_after'] > ks_thresh].index.tolist()
keep_after = [f for f in flagged if f not in to_drop]
print(f"Features to drop (ks_after > {ks_thresh}): {to_drop}")
print(f"Features to keep: {keep_after}\n")

x_train_processed = x_train_qt.drop(columns=to_drop)
x_test_processed  = x_test_qt.drop(columns=to_drop)

print(f"Final shapes --> x_train: {x_train_processed.shape}, x_test: {x_test_processed.shape}")


Flagged features (KS >= 0.035 or test_outliers > 120):
[11, 20, 24, 44, 47, 50, 55, 107, 114, 117, 121, 122, 123, 127, 130, 133, 135, 163, 175, 177, 184, 188, 193, 198, 204, 214, 225, 232, 244, 253, 264, 266, 279, 342, 343, 354, 388, 390, 394, 399, 404, 413, 426, 428, 430, 434, 439, 464]

Adversarial AUC: 0.501, Accuracy: 0.490

Top 10 adversarial-important features:
130    0.022881
107    0.022392
428    0.022245
388    0.022131
135    0.021990
184    0.021944
117    0.021707
394    0.021654
204    0.021605
232    0.021597
dtype: float64 

KS after QuantileTransform:
         ks_after
feature          
11         0.0078
20         0.0186
24         0.0134
44         0.0206
47         0.0164
50         0.0206
55         0.0242
107        0.0368
114        0.0084
117        0.0218
121        0.0184
122        0.0178
123        0.0180
127        0.0196
130        0.0368
133        0.0172
135        0.0354
163        0.0120
175        0.0176
177        0.0106
184        0.0180
188        

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.feature_selection import VarianceThreshold


sel = VarianceThreshold(threshold=1e-5)
sel.fit(x_train_processed)
keep_var = x_train_processed.columns[sel.get_support()]
drop_var = [c for c in x_train_processed.columns if c not in keep_var]
print("Dropped for near-zero variance:", drop_var)
x_train_var = x_train_processed[keep_var].copy()
x_test_var  = x_test_processed[keep_var].copy()

corr = x_train_var.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
drop_corr = [col for col in upper.columns if any(upper[col] > 0.95)]
print("Dropped for high correlation:", drop_corr)
x_train_corr = x_train_var.drop(columns=drop_corr).copy()
x_test_corr  = x_test_var.drop(columns=drop_corr).copy()

scaler = StandardScaler()
combined = pd.concat([x_train_corr, x_test_corr], axis=0)
scaler.fit(combined)

x_train_final = pd.DataFrame(
    scaler.transform(x_train_corr),
    columns=x_train_corr.columns,
    index=x_train_corr.index
)
x_test_final = pd.DataFrame(
    scaler.transform(x_test_corr),
    columns=x_test_corr.columns,
    index=x_test_corr.index
)

print("Final shapes -->", x_train_final.shape, x_test_final.shape)


Dropped for near-zero variance: []
Dropped for high correlation: [7]
Final shapes --> (5000, 495) (5000, 495)


In [12]:
!pip install optuna



In [5]:
import optuna
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import shap


def net_score(y_true, y_pred, n_features):
    acc = accuracy_score(y_true, y_pred)
    return 10 * acc * len(y_true) - 200 * n_features

def objective(trial):
    C = trial.suggest_loguniform('C', 0.001, 0.01)
    threshold = trial.suggest_float('threshold', 0.4, 0.6)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = []

    for train_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, X_val = x_train_final.iloc[train_idx], x_train_final.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = LogisticRegression(
            penalty='l1', solver='saga', C=C,
            max_iter=10000, random_state=0
        )
        model.fit(X_tr, y_tr)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= threshold).astype(int)
        n_feats = np.count_nonzero(model.coef_)

        scores.append(net_score(y_val, preds, n_feats))

    return np.mean(scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

best_params = study.best_params
best_C = best_params['C']
best_threshold = best_params['threshold']
print(f"Best params C: {best_C:.5f}, threshold: {best_threshold:.2f}")

final_model = LogisticRegression(
    penalty='l1', solver='saga', C=best_C,
    max_iter=10000, random_state=0
)
final_model.fit(x_train_final, y_train)

test_probs = final_model.predict_proba(x_test_final)[:, 1]
y_test_pred = (test_probs >= best_threshold).astype(int)

explainer = shap.LinearExplainer(
    final_model,
    x_train_final,
    feature_perturbation="interventional"
)
shap_values = explainer.shap_values(x_train_final)
mean_abs_shap = np.abs(shap_values).mean(axis=0)
shap_imp = pd.Series(mean_abs_shap, index=x_train_final.columns)
shap_imp = shap_imp.sort_values(ascending=False)
print("\nTop 10 features by mean |SHAP|:")
print(shap_imp.head(10))

B = 100
rng = np.random.RandomState(0)
feat_count = pd.Series(0, index=x_train_final.columns)

for i in range(B):
    idx = rng.choice(len(x_train_final), len(x_train_final), replace=True)
    Xb, yb = x_train_final.iloc[idx], y_train.iloc[idx]
    m = LogisticRegression(
        penalty='l1', solver='saga', C=best_C,
        max_iter=10000, random_state=i
    )
    m.fit(Xb, yb)
    sel = x_train_final.columns[m.coef_[0] != 0]
    feat_count[sel] += 1

stability = (feat_count / B).sort_values(ascending=False)
print("\nBootstrap selection frequency (top 10):")
print(stability.head(10))

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-02 10:21:13,069] A new study created in memory with name: no-name-967f0f26-0559-4bc7-9e0e-807f00b2c498
  C = trial.suggest_loguniform('C', 0.001, 0.01)
[I 2025-06-02 10:21:14,688] Trial 0 finished with value: 6676.0 and parameters: {'C': 0.004244948564139532, 'threshold': 0.513010733783312}. Best is trial 0 with value: 6676.0.
  C = trial.suggest_loguniform('C', 0.001, 0.01)
[I 2025-06-02 10:21:16,241] Trial 1 finished with value: 6002.0 and parameters: {'C': 0.0028034506997998213, 'threshold': 0.5595350989019569}. Best is trial 0 with value: 6676.0.
  C = trial.suggest_loguniform('C', 0.001, 0.01)
[I 2025-06-02 10:21:17,049] Trial 2 finished with value: 6330.0 and parameters: {'C': 0.003863402046440928, 'threshold': 0.5452278377537986}. Best is trial 0 with value: 6676.0.
  C = trial.suggest_loguniform('C', 0.001, 0.01)
[I 2025-06-02 10:21:18,573] Trial 3 finished with value: 6180.0 and parameters: {'C': 0.002052414557642375

Best params C: 0.00462, threshold: 0.50





Top 10 features by mean |SHAP|:
2      0.502722
343    0.000000
341    0.000000
340    0.000000
339    0.000000
338    0.000000
337    0.000000
336    0.000000
335    0.000000
334    0.000000
dtype: float64

Bootstrap selection frequency (top 10):
2      1.00
6      0.54
414    0.04
5      0.03
8      0.01
331    0.00
332    0.00
333    0.00
334    0.00
335    0.00
dtype: float64


In [10]:
import numpy as np
import pandas as pd
import optuna
import shap
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def net_score(y_true, y_pred, n_features):
    acc = accuracy_score(y_true, y_pred)
    return 10 * acc * len(y_true) - 200 * n_features

def count_tree_features(model):
    imp = model.feature_importances_
    return np.count_nonzero(imp)

def objective_xgb(trial):
    lambda_l1 = trial.suggest_float('lambda_l1', 50.0, 200.0, log=True)
    max_depth = trial.suggest_int('max_depth', 2, 4)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 0.3)
    threshold = trial.suggest_float('threshold', 0.3, 0.7)
    estimators = trial.suggest_int('n_estimators', 500, 2000)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, X_val = x_train_final.iloc[tr_idx], x_train_final.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        model = XGBClassifier(
            n_estimators=estimators,
            eval_metric='logloss',
            reg_lambda=lambda_l1,
            reg_alpha=0.0,
            max_depth=max_depth,
            colsample_bytree=colsample_bytree,
            random_state=0, n_jobs=-1
        )
        model.fit(X_tr, y_tr)
        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= threshold).astype(int)
        n_feats = count_tree_features(model)
        fold_scores.append(net_score(y_val, preds, n_feats))
    return np.mean(fold_scores)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=10)
best_xgb = study_xgb.best_params
best_l1_xgb = best_xgb['lambda_l1']
best_depth_xgb = best_xgb['max_depth']
best_col_xgb = best_xgb['colsample_bytree']
best_thr_xgb = best_xgb['threshold']
best_estimators_xgb = best_xgb['n_estimators']

print(f"\nBest XGBoost params: lambda_l1 = {best_l1_xgb:.2f}, max_depth = {best_depth_xgb}, colsample_bytree = {best_col_xgb:.2f}, threshold = {best_thr_xgb:.2f}, n_estimators = {best_estimators_xgb}")

model_xgb = XGBClassifier(
    n_estimators=best_estimators_xgb,
    eval_metric='logloss',
    reg_lambda=best_l1_xgb,
    max_depth=best_depth_xgb,
    colsample_bytree=best_col_xgb,
    random_state=0, n_jobs=-1
)
model_xgb.fit(x_train_final, y_train)

print("\nFinal model tuning results:")
print(f"XGBoost: lambda_l1 = {best_l1_xgb:.2f}, max_depth = {best_depth_xgb}, colsample_bytree = {best_col_xgb:.2f}, thresh = {best_thr_xgb:.2f}, #feat = {count_tree_features(model_xgb)}")

# SHAP analysis
print("\nXGBoost analysis")
test_probs = model_xgb.predict_proba(x_test_final)[:, 1]
y_test_pred = (test_probs >= best_thr_xgb).astype(int)
frac_pos = np.mean(y_test_pred)
print(f"Threshold = {best_thr_xgb:.2f}, # positives = {int(frac_pos * len(test_probs))} / {len(test_probs)}")

explainer = shap.TreeExplainer(model_xgb, x_train_final)
shap_vals = explainer.shap_values(x_train_final)
if isinstance(shap_vals, list):
    shap_vals = shap_vals[1]
mean_abs_shap = np.abs(shap_vals).mean(axis=0)
shap_imp = pd.Series(mean_abs_shap, index=x_train_final.columns).sort_values(ascending=False)
print("\nTop 10 features by mean |SHAP|:")
print(shap_imp.head(10))

# Bootstrap feature stability
B = 100
rng = np.random.RandomState(0)
feat_count = pd.Series(0, index=x_train_final.columns)
for i in range(B):
    idx = rng.choice(len(x_train_final), len(x_train_final), replace=True)
    Xb, yb = x_train_final.iloc[idx], y_train.iloc[idx]
    m = XGBClassifier(
        n_estimators=100,
        eval_metric='logloss',
        reg_lambda=best_l1_xgb, max_depth=best_depth_xgb,
        colsample_bytree=best_col_xgb,
        random_state=i, n_jobs=-1
    )
    m.fit(Xb, yb)
    sel = x_train_final.columns[m.feature_importances_ > 0]
    feat_count[sel] += 1

stability = (feat_count / B).sort_values(ascending=False)
print("\nBootstrap selection frequency (top 10):")
print(stability.head(10))

[I 2025-06-01 19:05:16,554] A new study created in memory with name: no-name-aad542cf-6870-4532-ac47-1f35cdc6ea07
[I 2025-06-01 19:05:53,458] Trial 0 finished with value: -92176.0 and parameters: {'lambda_l1': 64.79187284350219, 'max_depth': 4, 'colsample_bytree': 0.13997947586681198, 'threshold': 0.5674105803405487, 'n_estimators': 834}. Best is trial 0 with value: -92176.0.
[I 2025-06-01 19:06:31,605] Trial 1 finished with value: -92362.0 and parameters: {'lambda_l1': 86.48215786499351, 'max_depth': 2, 'colsample_bytree': 0.12827430274691726, 'threshold': 0.36389368299450253, 'n_estimators': 1964}. Best is trial 0 with value: -92176.0.
[I 2025-06-01 19:07:36,788] Trial 2 finished with value: -92286.0 and parameters: {'lambda_l1': 86.68266131541282, 'max_depth': 4, 'colsample_bytree': 0.10918418277237694, 'threshold': 0.693389824574521, 'n_estimators': 1874}. Best is trial 0 with value: -92176.0.
[I 2025-06-01 19:07:56,774] Trial 3 finished with value: -89764.0 and parameters: {'lambd


Best XGBoost params: lambda_l1 = 135.28, max_depth = 2, colsample_bytree = 0.16, threshold = 0.65, n_estimators = 544

Final model tuning results:
XGBoost: lambda_l1 = 135.28, max_depth = 2, colsample_bytree = 0.16, thresh = 0.65, #feat = 447

XGBoost analysis
Threshold = 0.65, # positives = 1723 / 5000





Top 10 features by mean |SHAP|:
2      0.515774
462    0.216470
8      0.183946
6      0.179436
3      0.166162
298    0.101901
348    0.086394
127    0.084769
146    0.081619
458    0.077183
dtype: float64


KeyboardInterrupt: 

In [19]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def evaluate_subset(feats):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_true, all_prob = [], []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr = x_train_final.iloc[tr_idx][feats]
        y_tr = y_train.iloc[tr_idx]
        X_val = x_train_final.iloc[val_idx][feats]
        y_val = y_train.iloc[val_idx]

        model = XGBClassifier(
            n_estimators=best_estimators_xgb,
            eval_metric='logloss',
            reg_lambda=best_l1_xgb,
            max_depth=best_depth_xgb,
            colsample_bytree=best_col_xgb,
            random_state=0, n_jobs=-1
        )
        model.fit(X_tr, y_tr)
        all_true.append(y_val.values)
        all_prob.append(model.predict_proba(X_val)[:,1])

    all_true = np.concatenate(all_true)
    all_prob = np.concatenate(all_prob)

    best_net, best_thr = -np.inf, 0.5
    for thr in np.linspace(0.1, 0.9, 81):
        preds = (all_prob >= thr).astype(int)
        net = net_score(all_true, preds, len(feats))
        if net > best_net:
            best_net, best_thr = net, thr
    return best_net, best_thr

candidates = [[2], [2, 462, 6]]

best_net = -np.inf
winning_subset = None
winning_thr = None

for subset in candidates:
    net, thr = evaluate_subset(subset)
    print(f"Subset {subset}: CV net-score = {net:.1f} and threshold {thr:.2f}")
    if net > best_net:
        best_net, winning_subset, winning_thr = net, subset, thr

print(f"\nWinning subset: {winning_subset} with net-score {best_net:.1f} and threshold {winning_thr:.2f}")

final = XGBClassifier(
    n_estimators=best_estimators_xgb,
    eval_metric='logloss',
    reg_lambda=best_l1_xgb,
    max_depth=best_depth_xgb,
    colsample_bytree=best_col_xgb,
    random_state=0, n_jobs=-1
)
final.fit(x_train_final[winning_subset], y_train)

y_test_pred = (final.predict_proba(x_test_final[winning_subset])[:,1] >= winning_thr).astype(int)

print(f"Final XGBoost model trained with features {winning_subset}; test predictions ready.")

NameError: name 'best_estimators_xgb' is not defined

In [27]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def evaluate_subset(feats):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_true, all_prob = [], []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr = x_train_final.iloc[tr_idx][feats]
        y_tr = y_train.iloc[tr_idx]
        X_val = x_train_final.iloc[val_idx][feats]
        y_val = y_train.iloc[val_idx]

        model = XGBClassifier(
            n_estimators=best_estimators_xgb,
            eval_metric='logloss',
            reg_lambda=best_l1_xgb,
            max_depth=best_depth_xgb,
            colsample_bytree=best_col_xgb,
            random_state=0, n_jobs=-1
        )
        model.fit(X_tr, y_tr)
        all_true.append(y_val.values)
        all_prob.append(model.predict_proba(X_val)[:,1])

    all_true = np.concatenate(all_true)
    all_prob = np.concatenate(all_prob)

    best_net, best_thr = -np.inf, 0.5
    for thr in np.linspace(0.1, 0.9, 81):
        preds = (all_prob >= thr).astype(int)
        net = net_score(all_true, preds, len(feats))
        if net > best_net:
            best_net, best_thr = net, thr
    return best_net, best_thr

candidates = [[2], [2, 462]]

best_net = -np.inf
winning_subset = None
winning_thr = None

for subset in candidates:
    net, thr = evaluate_subset(subset)
    print(f"Subset {subset}: CV net-score = {net:.1f} and threshold {thr:.2f}")
    if net > best_net:
        best_net, winning_subset, winning_thr = net, subset, thr

print(f"\nWinning subset: {winning_subset} with net-score {best_net:.1f} and threshold {winning_thr:.2f}")

final = XGBClassifier(
    n_estimators=best_estimators_xgb,
    eval_metric='logloss',
    reg_lambda=best_l1_xgb,
    max_depth=best_depth_xgb,
    colsample_bytree=best_col_xgb,
    random_state=0, n_jobs=-1
)
final.fit(x_train_final[winning_subset], y_train)

y_test_pred = (final.predict_proba(x_test_final[winning_subset])[:,1] >= winning_thr).astype(int)

print(f"Final XGBoost model trained with features {winning_subset}; test predictions ready.")

Subset [2]: CV net-score = 35130.0 and threshold 0.55
Subset [2, 462]: CV net-score = 35020.0 and threshold 0.52

Winning subset: [2] with net-score 35130.0 and threshold 0.55
Final XGBoost model trained with features [2]; test predictions ready.


In [27]:
import numpy as np
import pandas as pd
import optuna
import shap
from sklearn.naive_bayes import GaussianNB

def extract_phi(shap_out, n_features, target_class=1):
    if isinstance(shap_out, list):
        phi = shap_out[target_class]
        return phi[:, :n_features] if phi.shape[1] == n_features + 1 else phi
    if hasattr(shap_out, "values"):
        arr = shap_out.values
    else:
        arr = np.asarray(shap_out)
    if arr.ndim != 3:
        raise ValueError(f"Unexpected SHAP shape: {arr.shape}")
    N, A, B = arr.shape
    if A in {n_features, n_features + 1}:
        phi = arr[:, :n_features, target_class]
        return phi
    if B in {n_features, n_features + 1}:
        phi = arr[:, target_class, :n_features]
        return phi
    raise ValueError(f"Unknown axis configuration: {arr.shape}")

def net_score(y_true, y_pred, n_feats):
    return 10 * accuracy_score(y_true, y_pred) * len(y_true) - 200 * n_feats

def count_nb_features(model):
    return model.n_features_in_

def objective_nb(trial):
    vs  = trial.suggest_float("var_smoothing", 1e-12, 1e-6, log=True)
    thr = trial.suggest_float("threshold", 0.3, 0.7)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, X_val = x_train_final.iloc[tr_idx], x_train_final.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        m = GaussianNB(var_smoothing=vs).fit(X_tr, y_tr)
        preds = (m.predict_proba(X_val)[:, 1] >= thr).astype(int)
        scores.append(net_score(y_val, preds, count_nb_features(m)))
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_nb, n_trials=25, show_progress_bar=True)

best_vs, best_thr = study.best_params["var_smoothing"], study.best_params["threshold"]
print(f"\nBest parameters → var_smoothing={best_vs:.2e}, threshold={best_thr:.2f}")

model_nb = GaussianNB(var_smoothing=best_vs).fit(x_train_final, y_train)
print(f"\nFinal model: #features = {count_nb_features(model_nb)}")

print("\nNaive Bayes SHAP analysis")
background = shap.sample(x_train_final, 100, random_state=0)
explainer  = shap.KernelExplainer(model_nb.predict_proba, background)

sample = shap.sample(x_train_final, 300, random_state=1)
shap_out = explainer.shap_values(sample, nsamples=100)
phi      = extract_phi(shap_out, n_features=x_train_final.shape[1])
mean_abs = np.abs(phi).mean(axis=0)

shap_imp = pd.Series(mean_abs, index=x_train_final.columns).sort_values(ascending=False)
print("\nTop 10 features by mean |SHAP|:")
print(shap_imp.head(10))

B, rng = 20, np.random.RandomState(0)
feat_count = pd.Series(0, index=x_train_final.columns)

for b in range(B):
    idx = rng.choice(len(x_train_final), len(x_train_final), replace=True)
    Xb, yb = x_train_final.iloc[idx], y_train.iloc[idx]
    m = GaussianNB(var_smoothing=best_vs).fit(Xb, yb)
    expl = shap.KernelExplainer(m.predict_proba, shap.sample(Xb, 100, random_state=b))
    shap_b = expl.shap_values(Xb.iloc[:200], nsamples=100)
    phi_b  = extract_phi(shap_b, n_features=Xb.shape[1])
    mean_abs_b = np.abs(phi_b).mean(axis=0)
    sel = Xb.columns[mean_abs_b > np.median(mean_abs_b)]
    feat_count[sel] += 1

stability = (feat_count / B).sort_values(ascending=False)
print("\nBootstrap selection frequency – Top 10:")
print(stability.head(10))

[I 2025-06-02 12:22:30,603] A new study created in memory with name: no-name-04ec09a1-44ee-4bf4-ae76-6bf722c0b810
Best trial: 0. Best value: -92324:   8%|▊         | 2/25 [00:00<00:02, 11.34it/s]

[I 2025-06-02 12:22:30,696] Trial 0 finished with value: -92324.0 and parameters: {'var_smoothing': 7.806640133564179e-08, 'threshold': 0.5670128482223724}. Best is trial 0 with value: -92324.0.
[I 2025-06-02 12:22:30,779] Trial 1 finished with value: -92350.0 and parameters: {'var_smoothing': 3.6946144305347125e-10, 'threshold': 0.6777501925100201}. Best is trial 0 with value: -92324.0.
[I 2025-06-02 12:22:30,861] Trial 2 finished with value: -92346.0 and parameters: {'var_smoothing': 4.4280850319495966e-10, 'threshold': 0.6812700262924618}. Best is trial 0 with value: -92324.0.


Best trial: 4. Best value: -92312:  24%|██▍       | 6/25 [00:00<00:01, 11.50it/s]

[I 2025-06-02 12:22:30,951] Trial 3 finished with value: -92324.0 and parameters: {'var_smoothing': 1.1443956295431147e-10, 'threshold': 0.5464489499733285}. Best is trial 0 with value: -92324.0.
[I 2025-06-02 12:22:31,040] Trial 4 finished with value: -92312.0 and parameters: {'var_smoothing': 1.5615004361575018e-12, 'threshold': 0.5233114758446538}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,125] Trial 5 finished with value: -92344.0 and parameters: {'var_smoothing': 2.531897285295792e-07, 'threshold': 0.6527861950596133}. Best is trial 4 with value: -92312.0.


Best trial: 4. Best value: -92312:  32%|███▏      | 8/25 [00:00<00:01, 11.65it/s]

[I 2025-06-02 12:22:31,208] Trial 6 finished with value: -92324.0 and parameters: {'var_smoothing': 5.612068124478917e-07, 'threshold': 0.3013164320661367}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,292] Trial 7 finished with value: -92324.0 and parameters: {'var_smoothing': 8.421392174639971e-07, 'threshold': 0.5590293399001562}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,375] Trial 8 finished with value: -92336.0 and parameters: {'var_smoothing': 3.5016820775183155e-07, 'threshold': 0.5954485701083021}. Best is trial 4 with value: -92312.0.


Best trial: 4. Best value: -92312:  48%|████▊     | 12/25 [00:01<00:01, 11.60it/s]

[I 2025-06-02 12:22:31,458] Trial 9 finished with value: -92350.0 and parameters: {'var_smoothing': 6.920563501356304e-10, 'threshold': 0.6261521830697945}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,550] Trial 10 finished with value: -92336.0 and parameters: {'var_smoothing': 1.0476661175980006e-12, 'threshold': 0.43008344723578934}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,637] Trial 11 finished with value: -92314.0 and parameters: {'var_smoothing': 1.5626611176841904e-08, 'threshold': 0.48093766862532544}. Best is trial 4 with value: -92312.0.


Best trial: 4. Best value: -92312:  56%|█████▌    | 14/25 [00:01<00:00, 11.45it/s]

[I 2025-06-02 12:22:31,727] Trial 12 finished with value: -92334.0 and parameters: {'var_smoothing': 8.015580824366263e-09, 'threshold': 0.45213367279501626}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,815] Trial 13 finished with value: -92316.0 and parameters: {'var_smoothing': 8.455925772623108e-12, 'threshold': 0.48053210349413944}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:31,906] Trial 14 finished with value: -92318.0 and parameters: {'var_smoothing': 1.4185352496255862e-08, 'threshold': 0.37243471921676996}. Best is trial 4 with value: -92312.0.


Best trial: 17. Best value: -92304:  72%|███████▏  | 18/25 [00:01<00:00, 11.32it/s]

[I 2025-06-02 12:22:31,998] Trial 15 finished with value: -92320.0 and parameters: {'var_smoothing': 6.619924525010713e-09, 'threshold': 0.5268003873675288}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:32,089] Trial 16 finished with value: -92332.0 and parameters: {'var_smoothing': 5.3649093003823575e-11, 'threshold': 0.4001667815640613}. Best is trial 4 with value: -92312.0.
[I 2025-06-02 12:22:32,173] Trial 17 finished with value: -92304.0 and parameters: {'var_smoothing': 2.0677188559998505e-12, 'threshold': 0.4989169883295357}. Best is trial 17 with value: -92304.0.


Best trial: 17. Best value: -92304:  80%|████████  | 20/25 [00:01<00:00, 11.35it/s]

[I 2025-06-02 12:22:32,261] Trial 18 finished with value: -92310.0 and parameters: {'var_smoothing': 1.2452497875444768e-12, 'threshold': 0.5070617695926103}. Best is trial 17 with value: -92304.0.
[I 2025-06-02 12:22:32,349] Trial 19 finished with value: -92322.0 and parameters: {'var_smoothing': 8.75395343094027e-12, 'threshold': 0.3587311442487369}. Best is trial 17 with value: -92304.0.
[I 2025-06-02 12:22:32,436] Trial 20 finished with value: -92334.0 and parameters: {'var_smoothing': 5.891522297720488e-12, 'threshold': 0.44319204628776143}. Best is trial 17 with value: -92304.0.


Best trial: 17. Best value: -92304:  96%|█████████▌| 24/25 [00:02<00:00, 11.27it/s]

[I 2025-06-02 12:22:32,524] Trial 21 finished with value: -92308.0 and parameters: {'var_smoothing': 1.079936788179908e-12, 'threshold': 0.5100691654390016}. Best is trial 17 with value: -92304.0.
[I 2025-06-02 12:22:32,612] Trial 22 finished with value: -92318.0 and parameters: {'var_smoothing': 3.055820240868782e-11, 'threshold': 0.4765185459414491}. Best is trial 17 with value: -92304.0.
[I 2025-06-02 12:22:32,705] Trial 23 finished with value: -92308.0 and parameters: {'var_smoothing': 3.3873676775185963e-12, 'threshold': 0.5118132539980456}. Best is trial 17 with value: -92304.0.


Best trial: 17. Best value: -92304: 100%|██████████| 25/25 [00:02<00:00, 11.41it/s]


[I 2025-06-02 12:22:32,792] Trial 24 finished with value: -92334.0 and parameters: {'var_smoothing': 4.414652830651082e-12, 'threshold': 0.5890243410136639}. Best is trial 17 with value: -92304.0.

Best parameters → var_smoothing=2.07e-12, threshold=0.50

Final model: #features = 495

Naive Bayes SHAP analysis


To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,
3) group features together to reduce the number of inputs that need to be explained.
To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,
3) group features together to reduce the number of inputs that need to be explained.
To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,
3) group features together to reduce the number of inputs that need to be explained.
To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 r


Top 10 features by mean |SHAP|:
274    1.635562e+13
214    1.635562e+13
3      1.457642e-02
2      1.361911e-02
6      1.255340e-02
5      1.220861e-02
414    1.026418e-02
4      1.019199e-02
9      9.913078e-03
462    9.320831e-03
dtype: float64


To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,
3) group features together to reduce the number of inputs that need to be explained.
To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,
3) group features together to reduce the number of inputs that need to be explained.
To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 regularization with num_features(N) where N is less than the number of samples,
3) group features together to reduce the number of inputs that need to be explained.
To avoid this situation and get a regular matrix do one of the following:
1) turn up the number of samples,
2) turn up the L1 r


Bootstrap selection frequency – Top 10:
0      1.0
1      1.0
2      1.0
3      1.0
8      1.0
4      1.0
5      1.0
6      1.0
9      1.0
462    1.0
dtype: float64





In [28]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold

def evaluate_subset_nb(feats, var_smoothing):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_true, all_prob = [], []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, y_tr = x_train_final.iloc[tr_idx][feats], y_train.iloc[tr_idx]
        X_val, y_val = x_train_final.iloc[val_idx][feats], y_train.iloc[val_idx]
        model = GaussianNB(var_smoothing=var_smoothing).fit(X_tr, y_tr)
        all_true.append(y_val.values)
        all_prob.append(model.predict_proba(X_val)[:, 1])
    all_true = np.concatenate(all_true)
    all_prob = np.concatenate(all_prob)
    best_net, best_thr = -np.inf, 0.5
    for thr in np.linspace(0.1, 0.9, 81):
        preds = (all_prob >= thr).astype(int)
        net = net_score(all_true, preds, len(feats))
        if net > best_net:
            best_net, best_thr = net, thr
    return best_net, best_thr

top_shap_features = [274, 214, 3, 2, 6, 5, 414, 4, 9, 462]
candidates = [
    [274],
    [274, 214, 3],
    [274, 214, 3, 2, 6]
]

best_net, winning_subset, winning_thr = -np.inf, None, None
for subset in candidates:
    net, thr = evaluate_subset_nb(subset, best_vs)
    print(f"Subset {subset}: CV net-score = {net:.1f} | thr = {thr:.2f}")
    if net > best_net:
        best_net, winning_subset, winning_thr = net, subset, thr

print(f"\n Winning subset: {winning_subset} | net-score = {best_net:.1f} | thr = {winning_thr:.2f}")

final_nb = GaussianNB(var_smoothing=best_vs).fit(x_train_final[winning_subset], y_train)
y_test_pred = (final_nb.predict_proba(x_test_final[winning_subset])[:, 1] >= winning_thr).astype(int)

print(f"Final GaussianNB model trained on features {winning_subset}; test predictions ready.")


Subset [274]: CV net-score = 25580.0 | thr = 0.50
Subset [274, 214, 3]: CV net-score = 32630.0 | thr = 0.46
Subset [274, 214, 3, 2, 6]: CV net-score = 33640.0 | thr = 0.40

🏆 Winning subset: [274, 214, 3, 2, 6] | net-score = 33640.0 | thr = 0.40
Final GaussianNB model trained on features [274, 214, 3, 2, 6]; test predictions ready.


In [29]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold

def evaluate_subset_nb(feats, var_smoothing):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_true, all_prob = [], []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, y_tr = x_train_final.iloc[tr_idx][feats], y_train.iloc[tr_idx]
        X_val, y_val = x_train_final.iloc[val_idx][feats], y_train.iloc[val_idx]
        model = GaussianNB(var_smoothing=var_smoothing).fit(X_tr, y_tr)
        all_true.append(y_val.values)
        all_prob.append(model.predict_proba(X_val)[:, 1])
    all_true = np.concatenate(all_true)
    all_prob = np.concatenate(all_prob)
    best_net, best_thr = -np.inf, 0.5
    for thr in np.linspace(0.1, 0.9, 81):
        preds = (all_prob >= thr).astype(int)
        net = net_score(all_true, preds, len(feats))
        if net > best_net:
            best_net, best_thr = net, thr
    return best_net, best_thr

candidates = [
    [2],
    [2, 3],
    [2, 3, 6],
    [2, 3, 6, 5],
    [2, 3, 6, 5, 462]
]

best_net, winning_subset, winning_thr = -np.inf, None, None
for subset in candidates:
    net, thr = evaluate_subset_nb(subset, best_vs)
    print(f"Subset {subset}: CV net-score = {net:.1f} | thr = {thr:.2f}")
    if net > best_net:
        best_net, winning_subset, winning_thr = net, subset, thr

print(f"\nWinning subset: {winning_subset} | net-score = {best_net:.1f} | thr = {winning_thr:.2f}")

final_nb = GaussianNB(var_smoothing=best_vs).fit(x_train_final[winning_subset], y_train)
y_test_pred = (final_nb.predict_proba(x_test_final[winning_subset])[:, 1] >= winning_thr).astype(int)

print(f"Final GaussianNB model trained on features {winning_subset}; test predictions ready.")


Subset [2]: CV net-score = 35160.0 | thr = 0.45
Subset [2, 3]: CV net-score = 34380.0 | thr = 0.41
Subset [2, 3, 6]: CV net-score = 34190.0 | thr = 0.41
Subset [2, 3, 6, 5]: CV net-score = 33710.0 | thr = 0.36
Subset [2, 3, 6, 5, 462]: CV net-score = 33830.0 | thr = 0.37

Winning subset: [2] | net-score = 35160.0 | thr = 0.45
Final GaussianNB model trained on features [2]; test predictions ready.


In [None]:
import numpy as np
import pandas as pd
import optuna
import shap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def net_score(y_true, y_pred, n_feats):
    return 10 * accuracy_score(y_true, y_pred) * len(y_true) - 200 * n_feats

def count_rf_features(model):
    return np.count_nonzero(model.feature_importances_)

def objective_rf(trial):
    n_estimators = trial.suggest_int("n_estimators", 200, 800)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    max_features = trial.suggest_float("max_features", 0.1, 0.8)
    threshold = trial.suggest_float("threshold", 0.3, 0.7)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = []
    for tr_idx, val_idx in cv.split(x_train_final, y_train):
        X_tr, X_val = x_train_final.iloc[tr_idx], x_train_final.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            n_jobs=-1,
            random_state=0
        ).fit(X_tr, y_tr)
        preds = (clf.predict_proba(X_val)[:, 1] >= threshold).astype(int)
        scores.append(net_score(y_val, preds, count_rf_features(clf)))
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_rf, n_trials=30, show_progress_bar=True)

best_params = study.best_params
best_thr = best_params.pop("threshold")
print(f"\nBest RF params: {best_params}, threshold={best_thr:.2f}")

rf = RandomForestClassifier(**best_params, n_jobs=-1, random_state=0).fit(x_train_final, y_train)
print(f"\nFinal model: #features used = {count_rf_features(rf)}")

print("\nRandom Forest SHAP analysis")
explainer = shap.TreeExplainer(
    rf,
    model_output="probability",
    feature_perturbation="interventional"
)

sample = shap.sample(x_train_final, 500, random_state=1)
phi = explainer.shap_values(sample)[1]           # class-1 (N, F)
mean_abs = np.abs(phi).mean(axis=0)
shap_imp = pd.Series(mean_abs, index=x_train_final.columns).sort_values(ascending=False)
print("\nTop-10 features by mean |SHAP|:")
print(shap_imp.head(10))

B, rng = 20, np.random.RandomState(0)
feat_count = pd.Series(0, index=x_train_final.columns)

for b in range(B):
    idx = rng.choice(len(x_train_final), len(x_train_final), replace=True)
    Xb, yb = x_train_final.iloc[idx], y_train.iloc[idx]
    m = RandomForestClassifier(**best_params, n_jobs=-1, random_state=b).fit(Xb, yb)
    expl_b = shap.TreeExplainer(
        m,
        model_output="probability",
        feature_perturbation="interventional"
    )
    phi_b = expl_b.shap_values(Xb.iloc[:300])[1]
    mean_abs_b = np.abs(phi_b).mean(axis=0)
    sel = Xb.columns[mean_abs_b > np.median(mean_abs_b)]
    feat_count[sel] += 1

stability = (feat_count / B).sort_values(ascending=False)
print("\nBootstrap selection frequency – Top-10:")
print(stability.head(10))


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

def evaluate_subset_rf(feats, rf_params):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_true, all_prob = [], []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, y_tr = x_train_final.iloc[tr_idx][feats], y_train.iloc[tr_idx]
        X_val, y_val = x_train_final.iloc[val_idx][feats], y_train.iloc[val_idx]
        model = RandomForestClassifier(
            n_estimators = rf_params["n_estimators"],
            max_depth    = rf_params["max_depth"],
            max_features = rf_params["max_features"],
            n_jobs       = -1,
            random_state = 0
        ).fit(X_tr, y_tr)
        all_true.append(y_val.values)
        all_prob.append(model.predict_proba(X_val)[:, 1])
    all_true = np.concatenate(all_true)
    all_prob = np.concatenate(all_prob)
    best_net, best_thr = -np.inf, 0.5
    for thr in np.linspace(0.1, 0.9, 81):
        preds = (all_prob >= thr).astype(int)
        net = net_score(all_true, preds, len(feats))
        if net > best_net:
            best_net, best_thr = net, thr
    return best_net, best_thr

candidates = [[2], [2, 462, 6]]

best_net, winning_subset, winning_thr = -np.inf, None, None
for subset in candidates:
    net, thr = evaluate_subset_rf(subset, best_params)
    print(f"Subset {subset}: CV net-score = {net:.1f} | thr = {thr:.2f}")
    if net > best_net:
        best_net, winning_subset, winning_thr = net, subset, thr

print(f"\n🏆 Winning subset: {winning_subset} | net-score = {best_net:.1f} | thr = {winning_thr:.2f}")

final_rf = RandomForestClassifier(
    n_estimators = best_params["n_estimators"],
    max_depth    = best_params["max_depth"],
    max_features = best_params["max_features"],
    n_jobs       = -1,
    random_state = 0
).fit(x_train_final[winning_subset], y_train)

y_test_pred = (final_rf.predict_proba(x_test_final[winning_subset])[:, 1] >= winning_thr).astype(int)

print(f"✅ Final RandomForest model trained on features {winning_subset}; test predictions ready.")

In [26]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

def evaluate_subset_rf(feats, rf_params):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    all_true, all_prob = [], []
    for tr_idx, val_idx in skf.split(x_train_final, y_train):
        X_tr, y_tr = x_train_final.iloc[tr_idx][feats], y_train.iloc[tr_idx]
        X_val, y_val = x_train_final.iloc[val_idx][feats], y_train.iloc[val_idx]
        model = RandomForestClassifier(
            n_estimators = rf_params["n_estimators"],
            max_depth    = rf_params["max_depth"],
            max_features = rf_params["max_features"],
            n_jobs       = -1,
            random_state = 0
        ).fit(X_tr, y_tr)
        all_true.append(y_val.values)
        all_prob.append(model.predict_proba(X_val)[:, 1])
    all_true = np.concatenate(all_true)
    all_prob = np.concatenate(all_prob)
    best_net, best_thr = -np.inf, 0.5
    for thr in np.linspace(0.1, 0.9, 81):
        preds = (all_prob >= thr).astype(int)
        net = net_score(all_true, preds, len(feats))
        if net > best_net:
            best_net, best_thr = net, thr
    return best_net, best_thr

candidates = [[2], [2, 462]]

best_net, winning_subset, winning_thr = -np.inf, None, None
for subset in candidates:
    net, thr = evaluate_subset_rf(subset, best_params)
    print(f"Subset {subset}: CV net-score = {net:.1f}  |  thr = {thr:.2f}")
    if net > best_net:
        best_net, winning_subset, winning_thr = net, subset, thr

print(f"\n🏆 Winning subset: {winning_subset} | net-score = {best_net:.1f} | thr = {winning_thr:.2f}")

final_rf = RandomForestClassifier(
    n_estimators = best_params["n_estimators"],
    max_depth    = best_params["max_depth"],
    max_features = best_params["max_features"],
    n_jobs       = -1,
    random_state = 0
).fit(x_train_final[winning_subset], y_train)

y_test_pred = (final_rf.predict_proba(x_test_final[winning_subset])[:, 1] >= winning_thr).astype(int)

print(f"✅ Final RandomForest model trained on features {winning_subset}; test predictions ready.")

[I 2025-06-02 11:46:09,492] A new study created in memory with name: no-name-571ff3a9-4693-4c20-b9a0-48150a1990d8
Best trial: 0. Best value: -91894:   3%|▎         | 1/30 [00:31<15:16, 31.61s/it]

[I 2025-06-02 11:46:41,098] Trial 0 finished with value: -91894.0 and parameters: {'n_estimators': 344, 'max_depth': 9, 'max_features': 0.18847739044982803, 'threshold': 0.4271921442229651}. Best is trial 0 with value: -91894.0.


Best trial: 0. Best value: -91894:   7%|▋         | 2/30 [00:53<12:09, 26.07s/it]

[I 2025-06-02 11:47:03,289] Trial 1 finished with value: -92010.0 and parameters: {'n_estimators': 415, 'max_depth': 4, 'max_features': 0.22766819060550955, 'threshold': 0.6585208840082575}. Best is trial 0 with value: -91894.0.


Best trial: 0. Best value: -91894:  10%|█         | 3/30 [04:04<45:28, 101.05s/it]

[I 2025-06-02 11:50:13,574] Trial 2 finished with value: -91998.0 and parameters: {'n_estimators': 736, 'max_depth': 7, 'max_features': 0.6973167370829153, 'threshold': 0.6660985785106917}. Best is trial 0 with value: -91894.0.


Best trial: 0. Best value: -91894:  13%|█▎        | 4/30 [06:10<48:08, 111.11s/it]

[I 2025-06-02 11:52:20,099] Trial 3 finished with value: -91942.0 and parameters: {'n_estimators': 720, 'max_depth': 4, 'max_features': 0.7972841041016869, 'threshold': 0.3223928643244902}. Best is trial 0 with value: -91894.0.


Best trial: 0. Best value: -91894:  17%|█▋        | 5/30 [07:22<40:20, 96.81s/it] 

[I 2025-06-02 11:53:31,550] Trial 4 finished with value: -91970.0 and parameters: {'n_estimators': 444, 'max_depth': 6, 'max_features': 0.48927598985324205, 'threshold': 0.3388431777743617}. Best is trial 0 with value: -91894.0.


Best trial: 0. Best value: -91894:  20%|██        | 6/30 [09:08<39:58, 99.95s/it]

[I 2025-06-02 11:55:17,594] Trial 5 finished with value: -91952.0 and parameters: {'n_estimators': 684, 'max_depth': 10, 'max_features': 0.3149363223607321, 'threshold': 0.6341543894814001}. Best is trial 0 with value: -91894.0.


Best trial: 6. Best value: -91870:  23%|██▎       | 7/30 [09:51<31:15, 81.56s/it]

[I 2025-06-02 11:56:01,300] Trial 6 finished with value: -91870.0 and parameters: {'n_estimators': 562, 'max_depth': 8, 'max_features': 0.18608115136541897, 'threshold': 0.4473380843538748}. Best is trial 6 with value: -91870.0.


Best trial: 6. Best value: -91870:  27%|██▋       | 8/30 [11:07<29:15, 79.78s/it]

[I 2025-06-02 11:57:17,271] Trial 7 finished with value: -92338.0 and parameters: {'n_estimators': 230, 'max_depth': 10, 'max_features': 0.6624210213447221, 'threshold': 0.6983390677193417}. Best is trial 6 with value: -91870.0.


Best trial: 6. Best value: -91870:  30%|███       | 9/30 [11:48<23:39, 67.61s/it]

[I 2025-06-02 11:57:58,119] Trial 8 finished with value: -92098.0 and parameters: {'n_estimators': 602, 'max_depth': 6, 'max_features': 0.20907072690849132, 'threshold': 0.6511721885870821}. Best is trial 6 with value: -91870.0.


Best trial: 6. Best value: -91870:  33%|███▎      | 10/30 [13:07<23:41, 71.10s/it]

[I 2025-06-02 11:59:17,017] Trial 9 finished with value: -92022.0 and parameters: {'n_estimators': 520, 'max_depth': 4, 'max_features': 0.6837140856190007, 'threshold': 0.6975245268381826}. Best is trial 6 with value: -91870.0.


Best trial: 6. Best value: -91870:  37%|███▋      | 11/30 [13:34<18:16, 57.70s/it]

[I 2025-06-02 11:59:44,353] Trial 10 finished with value: -91896.0 and parameters: {'n_estimators': 582, 'max_depth': 8, 'max_features': 0.10301538686723477, 'threshold': 0.5254710291224127}. Best is trial 6 with value: -91870.0.


Best trial: 11. Best value: -91854:  40%|████      | 12/30 [14:22<16:26, 54.79s/it]

[I 2025-06-02 12:00:32,491] Trial 11 finished with value: -91854.0 and parameters: {'n_estimators': 284, 'max_depth': 8, 'max_features': 0.3970582860392865, 'threshold': 0.4273549330379627}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  43%|████▎     | 13/30 [15:05<14:25, 50.93s/it]

[I 2025-06-02 12:01:14,534] Trial 12 finished with value: -91856.0 and parameters: {'n_estimators': 235, 'max_depth': 8, 'max_features': 0.41790978445798616, 'threshold': 0.44583121283668553}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  47%|████▋     | 14/30 [15:49<13:02, 48.88s/it]

[I 2025-06-02 12:01:58,668] Trial 13 finished with value: -91858.0 and parameters: {'n_estimators': 228, 'max_depth': 8, 'max_features': 0.4500657545988904, 'threshold': 0.5206571195567813}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  50%|█████     | 15/30 [16:33<11:54, 47.63s/it]

[I 2025-06-02 12:02:43,416] Trial 14 finished with value: -91862.0 and parameters: {'n_estimators': 328, 'max_depth': 7, 'max_features': 0.36331943289713486, 'threshold': 0.4051277374310737}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  53%|█████▎    | 16/30 [17:52<13:18, 57.04s/it]

[I 2025-06-02 12:04:02,285] Trial 15 finished with value: -91900.0 and parameters: {'n_estimators': 310, 'max_depth': 9, 'max_features': 0.5435189077857945, 'threshold': 0.5759840470464763}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  57%|█████▋    | 17/30 [18:14<10:04, 46.50s/it]

[I 2025-06-02 12:04:24,296] Trial 16 finished with value: -91874.0 and parameters: {'n_estimators': 205, 'max_depth': 5, 'max_features': 0.37622501578405426, 'threshold': 0.3827422158959402}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  60%|██████    | 18/30 [19:22<10:35, 52.94s/it]

[I 2025-06-02 12:05:32,224] Trial 17 finished with value: -91866.0 and parameters: {'n_estimators': 284, 'max_depth': 9, 'max_features': 0.5105892049904671, 'threshold': 0.47868346148394325}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  63%|██████▎   | 19/30 [20:48<11:32, 62.95s/it]

[I 2025-06-02 12:06:58,482] Trial 18 finished with value: -91864.0 and parameters: {'n_estimators': 387, 'max_depth': 7, 'max_features': 0.5766385906782249, 'threshold': 0.37154165089173363}. Best is trial 11 with value: -91854.0.


Best trial: 11. Best value: -91854:  67%|██████▋   | 20/30 [22:07<11:14, 67.47s/it]

[I 2025-06-02 12:08:16,493] Trial 19 finished with value: -91864.0 and parameters: {'n_estimators': 451, 'max_depth': 8, 'max_features': 0.39637035210870475, 'threshold': 0.464506552414341}. Best is trial 11 with value: -91854.0.


Best trial: 20. Best value: -64960:  70%|███████   | 21/30 [22:22<07:47, 51.95s/it]

[I 2025-06-02 12:08:32,271] Trial 20 finished with value: -64960.0 and parameters: {'n_estimators': 278, 'max_depth': 3, 'max_features': 0.30709732119463856, 'threshold': 0.5423754961473267}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  73%|███████▎  | 22/30 [22:39<05:30, 41.31s/it]

[I 2025-06-02 12:08:48,743] Trial 21 finished with value: -65514.0 and parameters: {'n_estimators': 286, 'max_depth': 3, 'max_features': 0.29036784168786073, 'threshold': 0.5464966217923791}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  77%|███████▋  | 23/30 [22:54<03:54, 33.45s/it]

[I 2025-06-02 12:09:03,862] Trial 22 finished with value: -65978.0 and parameters: {'n_estimators': 283, 'max_depth': 3, 'max_features': 0.293529365141568, 'threshold': 0.571269023031321}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  80%|████████  | 24/30 [23:14<02:57, 29.58s/it]

[I 2025-06-02 12:09:24,432] Trial 23 finished with value: -73576.0 and parameters: {'n_estimators': 375, 'max_depth': 3, 'max_features': 0.2957493675620744, 'threshold': 0.5733267862686489}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  83%|████████▎ | 25/30 [23:31<02:07, 25.58s/it]

[I 2025-06-02 12:09:40,671] Trial 24 finished with value: -65012.0 and parameters: {'n_estimators': 278, 'max_depth': 3, 'max_features': 0.2890482773080426, 'threshold': 0.573106654955077}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  87%|████████▋ | 26/30 [23:43<01:26, 21.70s/it]

[I 2025-06-02 12:09:53,329] Trial 25 finished with value: -85708.0 and parameters: {'n_estimators': 495, 'max_depth': 3, 'max_features': 0.11293707775546855, 'threshold': 0.6069574248823179}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  90%|█████████ | 27/30 [24:13<01:12, 24.00s/it]

[I 2025-06-02 12:10:22,693] Trial 26 finished with value: -91878.0 and parameters: {'n_estimators': 361, 'max_depth': 5, 'max_features': 0.26869923863980477, 'threshold': 0.5368649444400233}. Best is trial 20 with value: -64960.0.


Best trial: 20. Best value: -64960:  93%|█████████▎| 28/30 [24:33<00:46, 23.02s/it]

[I 2025-06-02 12:10:43,434] Trial 27 finished with value: -89894.0 and parameters: {'n_estimators': 260, 'max_depth': 4, 'max_features': 0.32973319029320675, 'threshold': 0.49045738656113513}. Best is trial 20 with value: -64960.0.


Best trial: 28. Best value: -55042:  97%|█████████▋| 29/30 [24:44<00:19, 19.14s/it]

[I 2025-06-02 12:10:53,509] Trial 28 finished with value: -55042.0 and parameters: {'n_estimators': 200, 'max_depth': 3, 'max_features': 0.24582761191885866, 'threshold': 0.6055500455796807}. Best is trial 28 with value: -55042.0.


Best trial: 28. Best value: -55042: 100%|██████████| 30/30 [25:00<00:00, 50.02s/it]


[I 2025-06-02 12:11:10,189] Trial 29 finished with value: -92006.0 and parameters: {'n_estimators': 336, 'max_depth': 5, 'max_features': 0.15917912600070916, 'threshold': 0.6047295366923146}. Best is trial 28 with value: -55042.0.

Best RF params: {'n_estimators': 200, 'max_depth': 3, 'max_features': 0.24582761191885866}, threshold=0.61

Final model: #feat = 296

Random Forest • SHAP analysis


ExplainerError: The background dataset you provided does not cover all the leaves in the model, so TreeExplainer cannot run with the feature_perturbation="tree_path_dependent" option! Try providing a larger background dataset, no background dataset, or using feature_perturbation="interventional".