In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
# Read the space-separated, header-less training files, then keep only plain
# NumPy arrays: the full feature matrix, and the first label column as 1-D.
features_frame = pd.read_csv("data/x_train.txt", header=None, sep=" ")
labels_frame = pd.read_csv("data/y_train.txt", header=None, sep=" ")
X_train = np.array(features_frame)
y_train = np.array(labels_frame)[:, 0]
def eval_proba(probas,y,n_features, num_target = 1000):
    """Estimate the net gain of targeting the `num_target` best-scored samples.

    The score found at rank `num_target` (descending order) is the cut-off, and
    every sample scoring at or above it counts as targeted. Ties at the cut-off
    can therefore target more than `num_target` samples; the precision measured
    over that whole group is then scaled back to `num_target`.

    Parameters
    ----------
    probas : array-like of shape (n_samples,)
        Predicted scores (e.g. positive-class probabilities).
    y : array-like of shape (n_samples,)
        True binary labels; 1 is the positive class.
    n_features : int
        Number of features used by the model; each one costs 200.
    num_target : int, default=1000
        How many samples to target; must satisfy 1 <= num_target <= n_samples.

    Returns
    -------
    numpy.float64
        ``round(10 * precision * num_target - 200 * n_features)``.

    Raises
    ------
    ValueError
        If `probas` and `y` differ in length, or `num_target` is out of range.
    """
    probas = np.asarray(probas).ravel()
    y = np.asarray(y).ravel()
    n_samples = probas.shape[0]

    if y.shape[0] != n_samples:
        raise ValueError(
            f"probas and y must have the same length, got {n_samples} and {y.shape[0]}"
        )
    # Without this guard num_target <= 0 would wrap around to a negative index
    # (0 -> the *lowest* score, i.e. everything targeted) and a too-large value
    # would surface as an opaque IndexError.
    if not 1 <= num_target <= n_samples:
        raise ValueError(
            f"num_target must be between 1 and {n_samples}, got {num_target}"
        )

    # Score of the num_target-th best sample.
    threshold = np.sort(probas)[::-1][num_target - 1]
    targeted = probas >= threshold

    # Precision of the targeted group; 0 when nothing is targeted (only possible
    # if the cut-off is NaN).
    precision = np.mean(y[targeted] == 1) if targeted.any() else 0.0

    gain = 10*precision*num_target
    return np.round(gain - 200*n_features)

In [29]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import ParameterGrid, cross_val_predict
from xgboost import XGBClassifier
import numpy as np

# Search space: XGBoost hyper-parameters plus the number of features RFE keeps.
param_grid = {
    'max_depth': [5,10, 15, 25],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'n_features': [1, 2, 3, 4, 5]
}
# Candidate feature indices, chosen based on a previous test.
start_features = np.array([2,  4,  5, 414, 425, 462])

# Start from -inf rather than 0: eval_proba subtracts 200 per feature, so every
# score may be negative and a 0 baseline would leave best_params as None.
best_score = -np.inf
best_params = None
best_features = None  # bound up-front so it exists even if the loop never improves
history = []
X_start = X_train[:,start_features]

for params in ParameterGrid(param_grid):

    model = XGBClassifier(n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            objective='binary:logistic',
            random_state=42,
            n_jobs=-1)

    # Recursively drop one feature at a time until n_features remain.
    # NOTE(review): the selector is fitted on *all* rows before the
    # cross-validation below, so the chosen features have already seen the
    # validation folds and the reported score is optimistic.
    selector = RFE(estimator=model, n_features_to_select=params['n_features'], step=1, verbose=0)
    selector.fit(X_start, y_train)

    # Map positions within start_features back to original column indices.
    selected = start_features[selector.get_support(indices=True)]
    X_selected = selector.transform(X_start)

    # Out-of-fold positive-class probabilities for every training row.
    probas = cross_val_predict(model, X_selected, y_train,  method='predict_proba')[:,1]
    score = eval_proba(probas, y_train,params['n_features'])
    print(f"Params: {params}, score:{score}, features:{selected}")

    history.append({**params,
                    "score": score,
                    "features":selected})
    if score > best_score:
        best_score = score
        best_params = params
        best_features = selected

print(f"Best parameters: {best_params}")
print(f"Max gain: {best_score}")

(5000, 6)
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 1}, score:6986.0, features:[2]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 2}, score:6596.0, features:[  2 414]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 3}, score:6595.0, features:[  2 414 462]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 4}, score:6655.0, features:[  2   4 414 462]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50, 'n_features': 5}, score:6317.0, features:[  2   4 414 425 462]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'n_features': 1}, score:6982.0, features:[2]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'n_features': 2}, score:6620.0, features:[  2 414]
Params: {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 100, 'n_features': 3}, score:6599.0, features:[  2 414 462]
Params: {'l

KeyboardInterrupt: 

In [11]:
# Preview the first three feature columns. Wrapping in a DataFrame makes this
# work whether X_train is still a DataFrame or has already been converted to a
# NumPy array (where X_train[[0, 1, 2]] would select three *rows* instead).
pd.DataFrame(X_train)[[0, 1, 2]]

Unnamed: 0,0,1,2
0,20.454647,16.739345,39.811892
1,16.175225,10.483281,27.471017
2,10.577212,10.795115,24.621388
3,26.299206,13.471215,51.725934
4,23.193955,20.037969,37.780290
...,...,...,...
4995,22.390013,15.116628,25.835180
4996,15.482546,14.319951,31.035257
4997,10.607588,8.713746,17.567374
4998,22.984365,13.822107,30.838448


In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, f1_score, accuracy_score
import pandas as pd
import numpy as np

# Search space: XGBoost hyper-parameters plus how many top mutual-information
# features to keep.
param_grid = {
    'max_depth': [None, 25, 15],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'n_features': [1, 2, 3, 4, 5, 6]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
best_gain = -np.inf
best_params = None
history = []

# Work on plain arrays so the cell runs whether X_train / y_train are still
# DataFrames or were already converted to NumPy arrays (which have no .iloc).
X_all = np.asarray(X_train)
y_all = np.asarray(y_train).ravel()

# The split and the mutual-information ranking depend only on the fold, not on
# the hyper-parameters, so compute them once instead of once per grid point.
# Both are seeded, so the result is the same as recomputing them every time.
folds = []
for train_idx, val_idx in cv.split(X_all, y_all):
    mi = mutual_info_classif(X_all[train_idx], y_all[train_idx], random_state=42)
    folds.append((train_idx, val_idx, np.argsort(mi)[::-1]))

for params in ParameterGrid(param_grid):
    cv_gains = []
    cv_precisions = []
    cv_f1s = []
    cv_accuracies = []

    for train_idx, val_idx, mi_ranking in folds:
        # Features ranked on the training fold only, so validation rows do not
        # influence the selection.
        top_idx = mi_ranking[:params['n_features']]
        X_tr, X_val = X_all[train_idx][:, top_idx], X_all[val_idx][:, top_idx]
        y_tr, y_val = y_all[train_idx], y_all[val_idx]

        clf = XGBClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1
        )
        clf.fit(X_tr, y_tr)

        probas = clf.predict_proba(X_val)[:, 1]
        # Hard labels at 0.5 feed the classification metrics; the gain uses the
        # ranking of the raw probabilities instead.
        y_pred = (probas >= 0.5).astype(int)

        # 200 targets per validation fold.
        gain = eval_proba(probas, y_val, params['n_features'], num_target=200)
        precision = precision_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        accuracy = accuracy_score(y_val, y_pred)

        cv_gains.append(gain)
        cv_precisions.append(precision)
        cv_f1s.append(f1)
        cv_accuracies.append(accuracy)

    avg_gain = np.mean(cv_gains)
    avg_precision = np.mean(cv_precisions)
    avg_f1 = np.mean(cv_f1s)
    avg_accuracy = np.mean(cv_accuracies)

    print(f"Params: {params}, gain: {avg_gain:.2f}, precision: {avg_precision:.3f}, f1: {avg_f1:.3f}, accuracy: {avg_accuracy:.3f}")

    history.append({**params,
                    "gain": avg_gain,
                    "precision": avg_precision,
                    "f1": avg_f1,
                    "accuracy": avg_accuracy})

    if avg_gain > best_gain:
        best_gain = avg_gain
        best_params = params

print(f"Best parameters: {best_params}")
print(f"Max gain: {best_gain}")

# One row per grid point; show the ten configurations with the highest mean gain.
history_df = pd.DataFrame(history)
display(history_df.sort_values("gain", ascending=False).head(10))

Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 50, 'n_features': 1}, gain: 1227.00, precision: 0.712, f1: 0.498, accuracy: 0.630
Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 50, 'n_features': 2}, gain: 1057.00, precision: 0.719, f1: 0.518, accuracy: 0.633
Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 50, 'n_features': 3}, gain: 835.60, precision: 0.711, f1: 0.503, accuracy: 0.628
Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 50, 'n_features': 4}, gain: 632.40, precision: 0.719, f1: 0.587, accuracy: 0.661
Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 50, 'n_features': 5}, gain: 430.00, precision: 0.707, f1: 0.566, accuracy: 0.650
Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 50, 'n_features': 6}, gain: 242.20, precision: 0.713, f1: 0.572, accuracy: 0.653
Params: {'learning_rate': 0.001, 'max_depth': None, 'n_estimators': 100, 'n_features': 1}, gain: 1250.40

Unnamed: 0,learning_rate,max_depth,n_estimators,n_features,gain,precision,f1,accuracy
36,0.001,25.0,150,1,1259.8,0.702186,0.665784,0.6898
30,0.001,25.0,100,1,1256.6,0.713108,0.661783,0.692
54,0.001,15.0,100,1,1256.6,0.713108,0.661783,0.692
222,0.1,,100,1,1255.6,0.692001,0.687416,0.6964
60,0.001,15.0,150,1,1254.0,0.702186,0.665784,0.6898
108,0.01,25.0,150,1,1253.0,0.688923,0.67762,0.69
132,0.01,15.0,150,1,1252.2,0.688482,0.678296,0.6902
42,0.001,25.0,200,1,1251.4,0.701375,0.670499,0.6916
6,0.001,,100,1,1250.4,0.714165,0.670017,0.6964
162,0.05,,200,1,1250.0,0.692322,0.689737,0.6978


In [8]:
# Rank every evaluated configuration by mean gain and show the 50 best.
ranked_by_gain = history_df.sort_values("gain", ascending=False)
display(ranked_by_gain.head(50))

Unnamed: 0,learning_rate,max_depth,n_estimators,n_features,gain,precision,f1,accuracy
36,0.001,25.0,150,1,1259.8,0.702186,0.665784,0.6898
30,0.001,25.0,100,1,1256.6,0.713108,0.661783,0.692
54,0.001,15.0,100,1,1256.6,0.713108,0.661783,0.692
222,0.1,,100,1,1255.6,0.692001,0.687416,0.6964
60,0.001,15.0,150,1,1254.0,0.702186,0.665784,0.6898
108,0.01,25.0,150,1,1253.0,0.688923,0.67762,0.69
132,0.01,15.0,150,1,1252.2,0.688482,0.678296,0.6902
42,0.001,25.0,200,1,1251.4,0.701375,0.670499,0.6916
6,0.001,,100,1,1250.4,0.714165,0.670017,0.6964
162,0.05,,200,1,1250.0,0.692322,0.689737,0.6978
