## Basically adjusted code from Ola so that we use Optuna search

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, RFE, mutual_info_classif, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import optuna
import shap

X_train = pd.read_csv('data/x_train.txt', sep=r'\s+', header=None)
y_train = pd.read_csv('data/y_train.txt', sep=r'\s+', header=None).iloc[:,0]
X_test  = pd.read_csv('data/x_test.txt',  sep=r'\s+', header=None)

preprocessing = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('vthresh', VarianceThreshold(threshold=0.01)),
])

def net_score_cv(pipe, X, y, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    X_arr = X.values if hasattr(X, 'values') else X
    for tr, va in skf.split(X_arr, y):
        pl = pipe.fit(X_arr[tr], y.iloc[tr])
        probs = pl.predict_proba(X_arr[va])[:,1]
        top_idx = np.argsort(probs)[-1000:]
        tp = y.iloc[va].iloc[top_idx].sum()
        # number of features used
        if 'fs' in pipe.named_steps:
            fs = pipe.named_steps['fs']
            nf = fs.n_features_ if hasattr(fs, 'n_features_') else fs.get_support().sum()
        else:
            nf = X_arr.shape[1]
        scores.append(10 * tp - 200 * nf)
    return np.mean(scores)

def objective(trial):
    strategy = trial.suggest_categorical('strategy', ['logistic_l1', 'rf_kbest', 'xgb_shap', 'svm_uni', 'enet_rfe'])
    if strategy == 'logistic_l1':
        C = trial.suggest_loguniform('log_l1_C', 1e-4, 1e1)
        fs = SelectFromModel(LogisticRegression(penalty='l1', solver='saga', C=C, max_iter=5000), threshold='mean')
        clf = LogisticRegression(random_state=42)
        pipe = Pipeline([*preprocessing.steps, ('fs', fs), ('clf', clf)])

    elif strategy == 'rf_kbest':
        k = trial.suggest_int('rf_k', 5, 100)
        def rf_score(X, y):
            rf_full = RandomForestClassifier(n_estimators=200, random_state=42)
            rf_full.fit(X, y)
            return rf_full.feature_importances_
        fs = SelectKBest(rf_score, k=k)
        clf = RandomForestClassifier(n_estimators=trial.suggest_int('rf_n_est', 100, 500), random_state=42)
        pipe = Pipeline([*preprocessing.steps, ('fs', fs), ('clf', clf)])

    elif strategy == 'xgb_shap':
        k = trial.suggest_int('xgb_k', 5, 50)
        Xp = preprocessing.fit_transform(X_train)
        xgb_full = XGBClassifier(
            n_estimators=trial.suggest_int('xgb_n_est', 500, 2000),
            learning_rate=trial.suggest_loguniform('xgb_lr', 1e-3, 1e-1),
            max_depth=trial.suggest_int('xgb_md', 3, 8),
            tree_method='hist', random_state=42, use_label_encoder=False, eval_metric='logloss'
        )
        xgb_full.fit(Xp, y_train, eval_set=[(Xp, y_train)], verbose=False)
        expl = shap.TreeExplainer(xgb_full)
        shap_vals = expl.shap_values(Xp)
        mean_abs = np.abs(shap_vals).mean(axis=0)
        fs = SelectKBest(lambda X, y: mean_abs, k=k)
        clf = LogisticRegression(random_state=42)
        pipe = Pipeline([*preprocessing.steps, ('fs', fs), ('clf', clf)])

    elif strategy == 'svm_uni':
        k = trial.suggest_int('svm_k', 5, 100)
        C = trial.suggest_loguniform('svm_C', 1e-2, 1e2)
        gamma = trial.suggest_categorical('svm_gamma', ['scale','auto'])
        fs = SelectKBest(score_func=lambda X, y: mutual_info_classif(X, y, random_state=42), k=k)
        clf = SVC(kernel='rbf', C=C, gamma=gamma, probability=True, random_state=42)
        pipe = Pipeline([*preprocessing.steps, ('fs', fs), ('clf', clf)])

    else:
        n_feat = trial.suggest_int('enet_n', 5, 100)
        enet = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=trial.suggest_uniform('enet_l1', 0.0, 1.0), C=trial.suggest_loguniform('enet_C', 1e-4, 1e1), max_iter=5000, random_state=42)
        fs = RFE(estimator=enet, n_features_to_select=n_feat, step=0.1)
        clf = LogisticRegression(random_state=42)
        pipe = Pipeline([*preprocessing.steps, ('fs', fs), ('clf', clf)])

    score = net_score_cv(pipe, X_train, y_train)
    return score

study = optuna.create_study(direction='maximize', study_name='pipeline_net_score', sampler=optuna.samplers.TPESampler(seed=42), pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100, n_jobs=1)

print('Best strategy & hyperparams:', study.best_params)
print('Best net-score:', study.best_value)

[I 2025-05-27 23:58:39,868] A new study created in memory with name: pipeline_net_score
[I 2025-05-28 00:00:13,500] Trial 0 finished with value: 1086.0 and parameters: {'strategy': 'rf_kbest', 'rf_k': 19, 'rf_n_est': 123}. Best is trial 0 with value: 1086.0.
  enet = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=trial.suggest_uniform('enet_l1', 0.0, 1.0), C=trial.suggest_loguniform('enet_C', 1e-4, 1e1), max_iter=5000, random_state=42)
  enet = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=trial.suggest_uniform('enet_l1', 0.0, 1.0), C=trial.suggest_loguniform('enet_C', 1e-4, 1e1), max_iter=5000, random_state=42)
[I 2025-05-28 00:00:21,158] Trial 1 finished with value: -11914.0 and parameters: {'strategy': 'enet_rfe', 'enet_n': 84, 'enet_l1': 0.21233911067827616, 'enet_C': 0.0008111941985431928}. Best is trial 0 with value: 1086.0.
  learning_rate=trial.suggest_loguniform('xgb_lr', 1e-3, 1e-1),
Parameters: { "use_label_encoder" } are not used.

  bst

Best strategy & hyperparams: {'strategy': 'logistic_l1', 'log_l1_C': 0.00256161684385224}
Best net-score: 4686.0


In [None]:
def build_best_pipe(params):
    strat = params['strategy']
    steps = preprocessing.steps.copy()
    if strat == 'logistic_l1':
        fs  = SelectFromModel(
                  LogisticRegression(penalty='l1', solver='saga',
                                     C=params['log_l1_C'], max_iter=5000),
                  threshold='mean'
               )
        clf = LogisticRegression(random_state=42)
    elif strat == 'rf_kbest':
        fs  = SelectKBest(
                  lambda X,y: RandomForestClassifier(
                                  n_estimators=params['rf_n_est'],
                                  random_state=42
                              ).fit(X,y).feature_importances_,
                  k=params['rf_k']
               )
        clf = RandomForestClassifier(n_estimators=params['rf_n_est2'],
                                     random_state=42)
    elif strat == 'xgb_shap':
        Xp    = preprocessing.fit_transform(X_train)
        xgb_r = XGBClassifier(
                  n_estimators=params['xgb_n_est'],
                  learning_rate=params['xgb_lr'],
                  max_depth=params['xgb_md'],
                  tree_method='hist',
                  use_label_encoder=False,
                  eval_metric='logloss',
                  random_state=42
               )
        xgb_r.fit(Xp, y_train, verbose=False)
        expl  = shap.TreeExplainer(xgb_r)
        gain_arr = np.abs(expl.shap_values(Xp)).mean(axis=0)
        fs    = SelectKBest(lambda X,y: gain_arr, k=params['xgb_k'])
        clf   = LogisticRegression(random_state=42)
    elif strat == 'svm_uni':
        fs  = SelectKBest(
                  score_func=lambda X,y: mutual_info_classif(
                                             X,y,random_state=42
                                          ),
                  k=params['svm_k']
               )
        clf = SVC(kernel='rbf',
                  C=params['svm_C'],
                  gamma=params['svm_gamma'],
                  probability=True,
                  random_state=42)
    else:
        enet = LogisticRegression(
                   penalty='elasticnet',
                   solver='saga',
                   l1_ratio=params['enet_l1'],
                   C=params['enet_C'],
                   max_iter=5000,
                   random_state=42
               )
        fs   = RFE(estimator=enet,
                   n_features_to_select=params['enet_n'],
                   step=0.1)
        clf  = LogisticRegression(random_state=42)

    steps += [('fs', fs), ('clf', clf)]
    return Pipeline(steps)

best_pipe = build_best_pipe(study.best_params)

cv_score = net_score_cv(best_pipe, X_train, y_train)
print(f"Net‐score of Optuna pipeline (5-fold CV): {cv_score:.1f}")

best_pipe.fit(X_train, y_train)
probs = best_pipe.predict_proba(X_test)[:, 1]

top_idx = np.argsort(probs)[-1000:]
with open('STUDENTID_obs.txt','w') as f:
    for i in top_idx:
        f.write(f"{i}\n")

fs = best_pipe.named_steps['fs']
mask = fs.get_support() if hasattr(fs, 'get_support') else np.ones(X_train.shape[1], dtype=bool)
feat_idxs = np.where(mask)[0]
with open('STUDENTID_vars.txt','w') as f:
    for i in feat_idxs:
        f.write(f"{i}\n")

Net‐score of Optuna pipeline (5-fold CV): 4686.0
