In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
# Read the feature and label tables; both files are parsed as space-separated
# text with no header row, so columns get integer labels.
X_train, y_train = (
    pd.read_csv(path, header=None, sep=" ")
    for path in ("data/x_train.txt", "data/y_train.txt")
)
def eval_proba(probas, y, n_features, num_target = 1000):
    """Estimate the net gain of targeting the ``num_target`` best-scored rows.

    The ``num_target``-th largest score is used as a cut-off and every row
    whose score is at least that cut-off is treated as selected.  When several
    rows tie on the cut-off value, more than ``num_target`` rows are selected;
    the precision over all of them is then scaled back to ``num_target``.

    Parameters
    ----------
    probas : array-like
        Score of the positive class for each row (flattened to 1-D).
    y : array-like
        True labels, same length as ``probas``; a label equal to 1 counts as a
        hit.  Column vectors are flattened.
    n_features : int
        Number of features the model used; each one costs 200.
    num_target : int, default 1000
        How many rows are to be targeted.

    Returns
    -------
    numpy.float64
        ``round(10 * precision * num_target - 200 * n_features)``.

    Raises
    ------
    ValueError
        If ``probas`` and ``y`` differ in length or ``num_target`` is below 1.
    IndexError
        If ``num_target`` exceeds the number of rows.
    """
    scores = np.asarray(probas).ravel()
    labels = np.asarray(y).ravel()

    if scores.shape[0] != labels.shape[0]:
        raise ValueError(
            f"probas and y must have the same length, "
            f"got {scores.shape[0]} and {labels.shape[0]}"
        )
    # A non-positive num_target would index from the end of the sorted array
    # and silently produce a meaningless cut-off, so reject it explicitly.
    if num_target < 1:
        raise ValueError(f"num_target must be at least 1, got {num_target}")
    if num_target > scores.shape[0]:
        raise IndexError(
            f"num_target={num_target} exceeds the number of rows ({scores.shape[0]})"
        )

    # Cut-off = the num_target-th largest score.
    threshold = np.sort(scores)[::-1][num_target - 1]
    selected = scores >= threshold

    # Precision of the selection: share of selected rows whose label is 1.
    # The cut-off is itself one of the scores, so the selection is never empty.
    precision = np.mean(labels[selected] == 1)

    gain = 10 * precision * num_target
    return np.round(gain - 200 * n_features)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import precision_score, f1_score, accuracy_score
import shap
import pandas as pd

# Search space. 'n_features' is not a RandomForest argument: it is the number
# of top SHAP-ranked columns kept for the reduced model.
param_grid = {
    'max_depth': [None, 20, 10],
    'n_estimators': [50, 100, 150, 200],
    "min_samples_split": [2, 4, 8],
    'n_features': list(range(1, 7))
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# The splitter is seeded with an int, so it yields the same folds on every
# call; materialise them once instead of re-splitting per parameter set.
folds = list(cv.split(X_train, y_train))


def _build_forest(params):
    """Create an unfitted, seeded RandomForestClassifier from a grid entry."""
    return RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        n_jobs=-1,
        random_state=42
    )


def _positive_class_shap(shap_output):
    """Return the (n_samples, n_features) SHAP matrix of class 1.

    Depending on the installed shap version, ``TreeExplainer.shap_values``
    returns either a list with one matrix per class or a single array shaped
    (n_samples, n_features, n_classes).  Indexing the array form with ``[1]``
    would select the second *sample* rather than the second class, so both
    layouts are handled explicitly.
    """
    if isinstance(shap_output, list):
        return np.asarray(shap_output[1])
    shap_output = np.asarray(shap_output)
    if shap_output.ndim == 3:
        return shap_output[:, :, 1]
    return shap_output


def _rank_features_by_shap(model, X):
    """Return column positions of ``X`` ordered by decreasing mean |SHAP|."""
    explainer = shap.TreeExplainer(model)
    shap_values = _positive_class_shap(explainer.shap_values(X))
    mean_shap = np.abs(shap_values).mean(axis=0)
    # Guard against a silently mis-shaped SHAP output: there must be exactly
    # one importance value per input column.
    if mean_shap.shape != (X.shape[1],):
        raise ValueError(
            f"unexpected SHAP importance shape {mean_shap.shape}; "
            f"expected ({X.shape[1]},)"
        )
    return np.argsort(mean_shap)[::-1]


best_gain = -np.inf
best_params = None
history = []

# The full-feature forest and its SHAP ranking do not depend on 'n_features',
# so they are computed once per (fold, forest hyper-parameters) and reused.
shap_rankings = {}

for params in ParameterGrid(param_grid):
    cv_gains = []
    cv_precisions = []
    cv_f1s = []
    cv_accuracies = []

    for fold_id, (train_idx, val_idx) in enumerate(folds):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx, 0], y_train.iloc[val_idx, 0]

        ranking_key = (
            fold_id,
            params['max_depth'],
            params['min_samples_split'],
            params['n_estimators'],
        )
        if ranking_key not in shap_rankings:
            rf = _build_forest(params)
            rf.fit(X_tr, y_tr)
            shap_rankings[ranking_key] = _rank_features_by_shap(rf, X_tr)
        top_idx = shap_rankings[ranking_key][:params['n_features']]

        # Refit on the selected columns only and score the held-out fold.
        rf_small = _build_forest(params)
        rf_small.fit(X_tr.iloc[:, top_idx], y_tr)

        probas = rf_small.predict_proba(X_val.iloc[:, top_idx])[:, 1]
        y_pred = (probas >= 0.5).astype(int)

        # NOTE(review): eval_proba scales the reward with num_target but charges
        # a flat 200 per feature, so with num_target=200 the feature cost weighs
        # more than with the default of 1000 -- confirm this is intended.
        gain = eval_proba(probas, y_val.values, params['n_features'], num_target=200)
        precision = precision_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        accuracy = accuracy_score(y_val, y_pred)

        cv_gains.append(gain)
        cv_precisions.append(precision)
        cv_f1s.append(f1)
        cv_accuracies.append(accuracy)

    avg_gain = np.mean(cv_gains)
    avg_precision = np.mean(cv_precisions)
    avg_f1 = np.mean(cv_f1s)
    avg_accuracy = np.mean(cv_accuracies)

    print(f"Params: {params}, gain: {avg_gain:.2f}, precision: {avg_precision:.3f}, f1: {avg_f1:.3f}, accuracy: {avg_accuracy:.3f}")

    history.append({**params,
                    "gain": avg_gain,
                    "precision": avg_precision,
                    "f1": avg_f1,
                    "accuracy": avg_accuracy})

    if avg_gain > best_gain:
        best_gain = avg_gain
        best_params = params

print(f"Best parameters: {best_params}")
print(f"Max gain: {best_gain}")

# One row per grid entry with fold-averaged metrics; show the ten best by gain.
history_df = pd.DataFrame(history)
display(history_df.sort_values("gain", ascending=False).head(10))

Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50, 'n_features': 1}, gain: 986.40, precision: 0.538, f1: 0.543, accuracy: 0.549
Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50, 'n_features': 2}, gain: 846.00, precision: 0.574, f1: 0.579, accuracy: 0.585
Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50, 'n_features': 3}, gain: 646.00, precision: 0.574, f1: 0.579, accuracy: 0.585
Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50, 'n_features': 4}, gain: 446.00, precision: 0.574, f1: 0.579, accuracy: 0.585
Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50, 'n_features': 5}, gain: 246.00, precision: 0.574, f1: 0.579, accuracy: 0.585
Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50, 'n_features': 6}, gain: 46.00, precision: 0.574, f1: 0.579, accuracy: 0.585
Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100, 'n_features': 1}, gain: 968.20, pr

Unnamed: 0,max_depth,min_samples_split,n_estimators,n_features,gain,precision,f1,accuracy
198,10.0,8,100,1,1173.2,0.627534,0.605402,0.6274
180,10.0,4,150,1,1146.0,0.636084,0.599415,0.63
168,10.0,4,50,1,1135.2,0.624789,0.611152,0.6282
144,10.0,2,50,1,1134.6,0.630871,0.606776,0.63
210,10.0,8,200,1,1134.0,0.626964,0.612881,0.6302
204,10.0,8,150,1,1130.0,0.635653,0.60815,0.633
186,10.0,4,200,1,1128.8,0.634759,0.606402,0.632
192,10.0,8,50,1,1126.6,0.630268,0.612257,0.6318
156,10.0,2,150,1,1120.0,0.629725,0.599702,0.6268
162,10.0,2,200,1,1114.0,0.632278,0.604432,0.63


In [12]:
# Show the first 40 rows of the run history after ordering by descending gain.
display(history_df.sort_values(by="gain", ascending=False).iloc[:40])

Unnamed: 0,max_depth,min_samples_split,n_estimators,n_features,gain,precision,f1,accuracy
198,10.0,8,100,1,1173.2,0.627534,0.605402,0.6274
180,10.0,4,150,1,1146.0,0.636084,0.599415,0.63
168,10.0,4,50,1,1135.2,0.624789,0.611152,0.6282
144,10.0,2,50,1,1134.6,0.630871,0.606776,0.63
210,10.0,8,200,1,1134.0,0.626964,0.612881,0.6302
204,10.0,8,150,1,1130.0,0.635653,0.60815,0.633
186,10.0,4,200,1,1128.8,0.634759,0.606402,0.632
192,10.0,8,50,1,1126.6,0.630268,0.612257,0.6318
156,10.0,2,150,1,1120.0,0.629725,0.599702,0.6268
162,10.0,2,200,1,1114.0,0.632278,0.604432,0.63
