In [1]:
import sys
from pathlib import Path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import re
import sklearn as skt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from noise_remover import NoiseRemover
from titanic_preprocessor import TitanicPreprocessor
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from xgboost.callback import EarlyStopping

from utils import load_config, load_datasets

load_config()
dataset, dt = load_datasets()

model= XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",       # modela conteos
    tree_method="hist",         # rápido y eficiente en CPU
    n_estimators=5000,
    learning_rate=0.1,
    callbacks= [EarlyStopping(rounds=50, save_best=True)],
    max_depth=3,
    enable_categorical=True,
    verbosity=0
)

def generate_predefined_split(y, prep):
    X_train, X_eval, y_train, y_eval = train_test_split(df, y, test_size=0.2, stratify=y)
    X_train_preprocessed = prep.fit_transform(X_train, y_train)  
    X_eval_preprocessed  = prep.transform(X_eval)   
    X_preprocessed = pd.concat([X_train_preprocessed, X_eval_preprocessed], axis=0)
    y_all   = pd.concat([y_train, y_eval], axis=0)
    test_fold = np.r_[[-1]*len(X_train_preprocessed), [0]*len(X_eval_preprocessed)]
    return X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold


In [None]:
param_grid = {
    'max_depth': [3, 4, 5, 7, 9 ,12, 13, 14, 15],
    'learning_rate': [0.008,0.009 ,0.01, 0.1, 0.2, 0.8],
    'subsample': [0.5, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.3, 0.8, 0.9, 1.0]
}

df=dataset.copy()
y = df.pop('Survived')
dfcv=df.copy()
params=[]

for i in range(100):
    data_preprocessor = TitanicPreprocessor()

    prep = Pipeline([('pre-processor', TitanicPreprocessor()),('noise_remover', NoiseRemover())])
    X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold = generate_predefined_split(y, prep)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=PredefinedSplit(test_fold),
        scoring='accuracy',
        n_jobs=-1,  # Use all available cores
        verbose=0  # Print progress
    )

    print("Starting grid search...")
    grid_search.fit(X_preprocessed, y_all,eval_set=[(X_eval_preprocessed, y_eval)],verbose=False)

    best_xgb = grid_search.best_estimator_
    best_iter = getattr(best_xgb, "best_iteration", None)
    print("Best params:", grid_search.best_params_)
    print("Best (hold-out) score:", grid_search.best_score_)
    n_estimators=(best_iter + 1) if best_iter is not None else 0
    print("Best n_estimators (ES):", n_estimators)
    grid_search.best_params_["n_estimators"]=n_estimators
    grid_search.best_params_["score"]=grid_search.best_score_
    params.append(grid_search.best_params_)

paramsFileContent=pd.DataFrame(params)
paramsFileContent.to_csv("params.csv", index=False)

print(params)

#[{'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5}]





Starting grid search...
Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 12, 'subsample': 0.5}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 665
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.9}
Best (hold-out) score: 0.8491620111731844
Best n_estimators (ES): 277
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 3339
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 257
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.5}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 783
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.8}
Best (hold-out) score: 0.8491620111731844
Best n_estimators (ES): 486
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 306
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.008, 'max_depth': 3, 'subsample': 1.0}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 1138
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 1.0}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 4876
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 15, 'subsample': 0.8}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 1368
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 655
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 1.0}
Best (hold-out) score: 0.8324022346368715
Best n_estimators (ES): 1252
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 13, 'subsample': 0.9}
Best (hold-out) score: 0.8268156424581006
Best n_estimators (ES): 1098
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 1220
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 122
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 14, 'subsample': 0.5}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 1025
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 15, 'subsample': 0.5}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 731
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 14, 'subsample': 0.8}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 255
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 531
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 12, 'subsample': 0.8}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 200
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.8}
Best (hold-out) score: 0.8324022346368715
Best n_estimators (ES): 480
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 431
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 389
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.5}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 352
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.9}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 499
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 426
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.8}
Best (hold-out) score: 0.7988826815642458
Best n_estimators (ES): 1364
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 14, 'subsample': 0.8}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 532
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 12, 'subsample': 1.0}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 2738
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 1.0}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 1393
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 1.0}
Best (hold-out) score: 0.8938547486033519
Best n_estimators (ES): 1886
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.8}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 535
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8268156424581006
Best n_estimators (ES): 446
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.008, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 4995
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 519
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 12, 'subsample': 0.8}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 245
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.008, 'max_depth': 4, 'subsample': 1.0}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 12, 'subsample': 0.8}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 292
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 14, 'subsample': 0.8}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 502
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 774
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'subsample': 0.8}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 701
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 0.9}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 294
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 766
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'subsample': 0.5}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 642
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 202
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 1114
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.009, 'max_depth': 13, 'subsample': 1.0}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 633
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.8}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 152
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 13, 'subsample': 0.5}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 56
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 680
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 235
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 1.0}
Best (hold-out) score: 0.8491620111731844
Best n_estimators (ES): 1663
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 1.0}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 1292
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 856
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8324022346368715
Best n_estimators (ES): 211
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.8}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 379
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.5}
Best (hold-out) score: 0.888268156424581
Best n_estimators (ES): 181
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 276
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 9, 'subsample': 0.5}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 451
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 395
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.008, 'max_depth': 12, 'subsample': 1.0}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 1043
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 3, 'subsample': 0.8}
Best (hold-out) score: 0.8379888268156425
Best n_estimators (ES): 1024
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 9, 'subsample': 1.0}
Best (hold-out) score: 0.8491620111731844
Best n_estimators (ES): 2900
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 1.0}
Best (hold-out) score: 0.8100558659217877
Best n_estimators (ES): 1514
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 1.0}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 912
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8938547486033519
Best n_estimators (ES): 341
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 629
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 0.8}
Best (hold-out) score: 0.8938547486033519
Best n_estimators (ES): 405
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.008, 'max_depth': 7, 'subsample': 1.0}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 911
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.008, 'max_depth': 12, 'subsample': 0.9}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 1.0}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 1142
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 12, 'subsample': 0.8}
Best (hold-out) score: 0.8324022346368715
Best n_estimators (ES): 456
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 458
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.8324022346368715
Best n_estimators (ES): 1541
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 755
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.009, 'max_depth': 3, 'subsample': 1.0}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 4999
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 266
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 12, 'subsample': 0.5}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 509
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 152
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8491620111731844
Best n_estimators (ES): 267
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.01, 'max_depth': 7, 'subsample': 1.0}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 5000
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8938547486033519
Best n_estimators (ES): 308
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.9050279329608939
Best n_estimators (ES): 997
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 3, 'subsample': 0.9}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 873
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 225
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.9}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 538
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'subsample': 0.8}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 4994
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 14, 'subsample': 0.5}
Best (hold-out) score: 0.9162011173184358
Best n_estimators (ES): 616
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.9}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 301
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 907
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8156424581005587
Best n_estimators (ES): 907
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 817
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 0.9}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 589
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.5}
Best (hold-out) score: 0.8603351955307262
Best n_estimators (ES): 240
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 13, 'subsample': 0.9}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 386
[{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 12, 'subsample': 0.5, 'n_estimators': 665, 'score': 0.8435754189944135}, {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.9, 'n_estimators': 277, 'score': 0.8491620111731844}, {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5, 'n_estimators': 3339, 'score': 0.8659217877094972}, {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.8, 'n_estimators': 257, 'score': 0.888268156424581}, {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.5, 'n_estimators': 783, 'score': 0.8715083798882681}, {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.8, 'n_estimators': 486, 'score': 0.8491620111731844}, {'colsample_bytree': 0.3, 'learning

In [4]:

params_and_results=[]
cv=StratifiedKFold(n_splits=5,shuffle=False,random_state=None)
for setOfParams in params:
    tunedModel= XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",       # modela conteos
    tree_method="hist",         # rápido y eficiente en CPU
    n_estimators=setOfParams["n_estimators"],
    learning_rate=setOfParams["learning_rate"],
    max_depth=setOfParams["max_depth"],
    colsample_bytree=setOfParams["colsample_bytree"],
    subsample=setOfParams["subsample"],
    enable_categorical=True,
    verbosity=0
    )
    pipeline = Pipeline([('pre-processor', data_preprocessor), ('nosie remover', NoiseRemover()),('rfb', tunedModel)])
    final_score_raw=cross_val_score(pipeline,dfcv,y,cv=cv,scoring="accuracy")
    setOfParams["cross_validation_score"] = final_score_raw.mean()
    setOfParams["cross_validation_error_margin"] = final_score_raw.std()
    params_and_results.append(setOfParams)
    print("para los siguientes hiperparametros")
    print(setOfParams)
    print(f"El promedio de precision en cross validation es: {final_score_raw.mean():.4f} (+/- {final_score_raw.std() * 2:.4f})")



params_and_results.sort(key=lambda x : x["cross_validation_score"])    
paramsFileContent=pd.DataFrame(params_and_results)
paramsFileContent.to_csv("params_with_results.csv", index=False)
 
    

para los siguientes hiperparametros
{'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 12, 'subsample': 0.5, 'n_estimators': 665, 'score': 0.8435754189944135, 'cross_validation_score': 0.8058627832527776, 'cross_validation_error_margin': 0.013132293251322733}
El promedio de precision en cross validation es: 0.8059 (+/- 0.0263)
para los siguientes hiperparametros
{'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.9, 'n_estimators': 277, 'score': 0.8491620111731844, 'cross_validation_score': 0.8024794425961961, 'cross_validation_error_margin': 0.021915880417229545}
El promedio de precision en cross validation es: 0.8025 (+/- 0.0438)
para los siguientes hiperparametros
{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5, 'n_estimators': 3339, 'score': 0.8659217877094972, 'cross_validation_score': 0.8271922666499277, 'cross_validation_error_margin': 0.01958100729447747}
El promedio de precision en cross validation es: 0.82