In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
X_train = pd.read_csv("data/x_train.txt",header=None,sep=" ")
y_train = pd.read_csv("data/y_train.txt",header=None,sep=" ")
X_train, y_train = np.array(X_train), np.array(y_train).T[0]
def eval_proba(probas,y,n_features, num_target = 1000):
    sorted_probas = np.sort(probas)[::-1]  
    threshold = sorted_probas[num_target - 1]

    y_pred = (probas >= threshold).astype(int)
    gain = 10*precision_score(y, y_pred)*num_target
    return np.round(gain - 200*n_features)

In [6]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import ParameterGrid, cross_val_predict,StratifiedKFold
from xgboost import XGBClassifier
import numpy as np

param_grid = {
    'max_depth': [5,10, 15, 25],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.001, 0.01,  0.1, 1],
    'n_features': [1, 2, 3, 4, 5]
}
#Choosen based on previous test
start_features = np.array([2,  4,  5, 414, 425, 462])
best_score = 0
best_params = None
history = []
X_start = X_train[:,start_features]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for params in ParameterGrid(param_grid):
    
    model = XGBClassifier(n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'], 
            objective='binary:logistic',
            random_state=42,
            n_jobs=-1)
    
    selector = RFE(estimator=model, n_features_to_select=params['n_features'], step=1, verbose=0)
    selector.fit(X_start, y_train)
    selected = start_features[selector.get_support(indices=True)]
    X_selected = selector.transform(X_start)
    probas = cross_val_predict(model, X_selected, y_train, cv=cv, method='predict_proba')[:,1]
    score = eval_proba(probas, y_train,params['n_features'])
    print(f"Params: {params}, score:{score}, features:{selected}")

    history.append({**params,
                    "score": score,
                    "features":selected})
    if score > best_score:
        best_score = score
        best_params = params
        best_features = selected

print(f"Best parameters: {best_params}")
print(f"Best features: {best_features}")
print(f"Max gain: {best_score}")

Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 1}, score:6991.0, features:[2]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 2}, score:6721.0, features:[  2 414]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 3}, score:6335.0, features:[  2 414 462]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 4}, score:6303.0, features:[  2   4 414 462]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 5}, score:6099.0, features:[  2   4 414 425 462]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'n_features': 1}, score:6946.0, features:[2]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'n_features': 2}, score:6630.0, features:[  2 414]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'n_features': 3}, score:6393.0, features:[  2 414 462]
Params: {'learning_ra