In [1]:
import sys
from pathlib import Path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

import pandas as pd
import numpy as np
import re
import sklearn as skt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from noise_remover import NoiseRemover
from titanic_preprocessor import TitanicPreprocessor
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from xgboost.callback import EarlyStopping

from utils import load_config, load_datasets

load_config()
dataset, dt = load_datasets()

model= XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",       # modela conteos
    tree_method="hist",         # rápido y eficiente en CPU
    n_estimators=5000,
    learning_rate=0.1,
    callbacks= [EarlyStopping(rounds=50, save_best=True)],
    max_depth=3,
    enable_categorical=True,
    verbosity=0
)

def generate_predefined_split(y, prep):
    X_train, X_eval, y_train, y_eval = train_test_split(df, y, test_size=0.2, stratify=y)
    X_train_preprocessed = prep.fit_transform(X_train, y_train)  
    X_eval_preprocessed  = prep.transform(X_eval)   
    X_preprocessed = pd.concat([X_train_preprocessed, X_eval_preprocessed], axis=0)
    y_all   = pd.concat([y_train, y_eval], axis=0)
    test_fold = np.r_[[-1]*len(X_train_preprocessed), [0]*len(X_eval_preprocessed)]
    return X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold


In [4]:
param_grid = {
    'max_depth': [3, 4, 5, 7, 9 ,12, 13, 14, 15],
    'learning_rate': [0.009 ,0.01, 0.1, 0.2, 0.8],
    'subsample': [0.5, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.3, 0.8, 0.9, 1.0]
}
df=dataset.copy()
y = df.pop('Survived')
params=[]

for i in range(20):
    data_preprocessor = TitanicPreprocessor()

    prep = Pipeline([('pre-processor', TitanicPreprocessor()),('noise_remover', NoiseRemover())])
    X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold = generate_predefined_split(y, prep)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=PredefinedSplit(test_fold),
        scoring='accuracy',
        n_jobs=-1,  # Use all available cores
        verbose=0  # Print progress
    )

    print("Starting grid search...")
    grid_search.fit(X_preprocessed, y_all,eval_set=[(X_eval_preprocessed, y_eval)],verbose=False)

    best_xgb = grid_search.best_estimator_
    best_iter = getattr(best_xgb, "best_iteration", None)
    print("Best params:", grid_search.best_params_)
    print("Best (hold-out) score:", grid_search.best_score_)
    print("Best n_estimators (ES):", (best_iter + 1) if best_iter is not None else "N/A")
    params.append(grid_search.best_params_)

paramsFileContent=pd.DataFrame(params.to_dict)
paramsFileContent.to_csv("params.csv", index=False)

print(params)

#[{'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5}]





Starting grid search...
Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8268156424581006
Best n_estimators (ES): 208
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 7, 'subsample': 0.8}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 275
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'subsample': 1.0}
Best (hold-out) score: 0.8938547486033519
Best n_estimators (ES): 4124
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 13, 'subsample': 0.5}
Best (hold-out) score: 0.8268156424581006
Best n_estimators (ES): 189
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8268156424581006
Best n_estimators (ES): 290
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.2, 'max_depth': 4, 'subsample': 1.0}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 1850
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 323
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.009, 'max_depth': 7, 'subsample': 0.5}
Best (hold-out) score: 0.8547486033519553
Best n_estimators (ES): 4999
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 0.8}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 253
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}
Best (hold-out) score: 0.8770949720670391
Best n_estimators (ES): 191
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.8}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 458
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.01, 'max_depth': 14, 'subsample': 1.0}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 4999
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.8}
Best (hold-out) score: 0.88268156424581
Best n_estimators (ES): 654
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 0.5}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 605
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 12, 'subsample': 1.0}
Best (hold-out) score: 0.8715083798882681
Best n_estimators (ES): 3562
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 5, 'subsample': 1.0}
Best (hold-out) score: 0.8938547486033519
Best n_estimators (ES): 1780
Starting grid search...




Best params: {'colsample_bytree': 0.3, 'learning_rate': 0.8, 'max_depth': 3, 'subsample': 1.0}
Best (hold-out) score: 0.8491620111731844
Best n_estimators (ES): 626
Starting grid search...




Best params: {'colsample_bytree': 1.0, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.8}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 265
Starting grid search...




Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 4, 'subsample': 0.8}
Best (hold-out) score: 0.8435754189944135
Best n_estimators (ES): 450
Starting grid search...




Best params: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.5}
Best (hold-out) score: 0.8659217877094972
Best n_estimators (ES): 1015


AttributeError: 'list' object has no attribute 'to_dict'