In [1]:
import sys
from pathlib import Path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb
import pandas as pd
import numpy as np
import re
import sklearn as skt
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from noise_remover import NoiseRemover
from titanic_preprocessor import TitanicPreprocessor
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from xgboost.callback import EarlyStopping

from utils import load_config, load_datasets

# Run the project's config loader, then load the datasets used by the notebook.
load_config()
dataset, dt = load_datasets()

# Base XGBoost classifier handed to the hold-out grid search below.
# `n_estimators` acts as an upper bound: the EarlyStopping callback is configured
# with a 50-round patience and `save_best=True`.
model = XGBClassifier(
    # --- learning task ---
    objective="binary:logistic",
    eval_metric="logloss",
    # --- tree construction ---
    tree_method="hist",  # fast and efficient on CPU
    max_depth=3,
    enable_categorical=True,
    # --- boosting schedule ---
    learning_rate=0.1,
    n_estimators=5000,
    callbacks=[EarlyStopping(rounds=50, save_best=True)],
    verbosity=0,
)

def generate_predefined_split(y, prep, X=None, test_size=0.2, random_state=None):
    """Build a preprocessed train/hold-out split laid out for ``PredefinedSplit``.

    The features are split (stratified on ``y``), ``prep`` is fitted on the
    training part only and then applied to the hold-out part, and both parts
    are stacked back together (training rows first, hold-out rows last).

    Parameters
    ----------
    y : pandas.Series
        Target values, row-aligned with the feature frame.
    prep : transformer / Pipeline
        Object exposing ``fit_transform(X, y)`` and ``transform(X)``.
        Assumes both return pandas objects (they are passed to ``pd.concat``).
    X : pandas.DataFrame, optional
        Feature frame to split. When omitted, the notebook-global ``df`` is
        used, which keeps older ``generate_predefined_split(y, prep)`` calls
        working.
    test_size : float, default 0.2
        Share of rows placed in the hold-out part.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``; ``None`` gives a different
        split on every call (the original behaviour).

    Returns
    -------
    X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold
        ``test_fold`` holds ``-1`` for every training row and ``0`` for every
        hold-out row, in the same order as ``X_preprocessed`` / ``y_all``.
    """
    if X is None:
        # Backward compatibility: earlier callers rely on the global `df`.
        X = df

    X_train, X_eval, y_train, y_eval = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Fit the preprocessing on the training rows only; the hold-out rows are
    # merely transformed.
    X_train_preprocessed = prep.fit_transform(X_train, y_train)
    X_eval_preprocessed = prep.transform(X_eval)

    X_preprocessed = pd.concat([X_train_preprocessed, X_eval_preprocessed], axis=0)
    y_all = pd.concat([y_train, y_eval], axis=0)

    # Explicit integer dtype so the fold labels stay ints even if one part is empty.
    test_fold = np.concatenate([
        np.full(len(X_train_preprocessed), -1, dtype=int),
        np.zeros(len(X_eval_preprocessed), dtype=int),
    ])
    return X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold


In [None]:
# Hyperparameter grid for the hold-out search (keys are XGBClassifier arguments).
param_grid = {
    'max_depth': [2,14],
    'learning_rate': [0.1, 0.2],
    'subsample': [0.9, 1.0],
    'colsample_bytree': [0.8, 0.9]
}

# Work on a copy so `dataset` keeps its 'Survived' column; `dfcv` is a second
# copy of the raw features that other cells cross-validate on.
df=dataset.copy()
y = df.pop('Survived')
dfcv=df.copy()
# One dict of best hyperparameters (+ n_estimators and score) per repetition.
params=[]

# Repeat the search 100 times; every repetition draws a new train/hold-out split.
for i in range(100):
    # Not used inside this loop. NOTE(review): the cell that re-scores `params`
    # with cross-validation reads a global named `data_preprocessor`, so this
    # assignment may be what that cell relies on - confirm before removing.
    data_preprocessor = TitanicPreprocessor()

    prep = Pipeline([('pre-processor', TitanicPreprocessor()),('noise_remover', NoiseRemover())])
    # test_fold marks training rows with -1 and hold-out rows with 0.
    X_preprocessed, X_eval_preprocessed, y_all, y_eval, test_fold = generate_predefined_split(y, prep)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=PredefinedSplit(test_fold),
        scoring='accuracy',
        n_jobs=-1,  # Use all available cores
        verbose=0  # 0 = silent (no progress messages)
    )

    print("Starting grid search...")
    # eval_set / verbose are extra fit arguments passed on to the XGBoost estimator.
    # NOTE(review): the hold-out rows are used both as the early-stopping eval set
    # and as the scoring fold, and with the default refit the best estimator is
    # refitted on all of X_preprocessed (which contains those rows) - confirm
    # that this optimism is acceptable.
    grid_search.fit(X_preprocessed, y_all,eval_set=[(X_eval_preprocessed, y_eval)],verbose=False)

    best_xgb = grid_search.best_estimator_
    # `best_iteration` is read defensively; None when the attribute is missing.
    best_iter = getattr(best_xgb, "best_iteration", None)
    print("Best params:", grid_search.best_params_)
    print("Best (hold-out) score:", grid_search.best_score_)
    # best_iteration is a 0-based index, hence +1; falls back to 0 when unknown.
    n_estimators=(best_iter + 1) if best_iter is not None else 0
    print("Best n_estimators (ES):", n_estimators)
    # The best_params_ dict itself is extended in place and stored in `params`.
    grid_search.best_params_["n_estimators"]=n_estimators
    grid_search.best_params_["score"]=grid_search.best_score_
    params.append(grid_search.best_params_)

# Persist one row per repetition.
paramsFileContent=pd.DataFrame(params)
paramsFileContent.to_csv("params.csv", index=False)

print(params)

#[{'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5}]



In [None]:
# Search space for the pipeline-based grid search. Every key carries the
# "rfbes__" prefix so the value is routed to the pipeline step named 'rfbes'.
param_grid = {
    "rfbes__max_depth": [3, 4, 5, 7, 9, 12, 13, 14, 15],
    "rfbes__learning_rate": [0.008, 0.009, 0.01, 0.1, 0.2, 0.8],
    "rfbes__subsample": [0.5, 0.8, 0.9, 1.0],
    "rfbes__colsample_bytree": [0.3, 0.8, 0.9, 1.0],
}

# Fresh feature/target split taken from a copy, so `dataset` is left untouched.
df = dataset.copy()
y = df.pop("Survived")
dfcv = df.copy()  # second copy of the raw features
params = []       # collected search results


class ModelWithEarlyStop(BaseEstimator, ClassifierMixin):
    """XGBoost binary classifier with early stopping, usable in sklearn pipelines.

    The constructor only stores the hyperparameters; the underlying
    ``XGBClassifier`` is built inside ``fit``. This way the values set by
    ``clone`` / ``set_params`` (which is how ``GridSearchCV`` applies a grid
    point) actually reach the trained model, and every fit gets its own
    ``EarlyStopping`` callback instance.

    Parameters
    ----------
    n_estimators : int, default 5000
        Upper bound on boosting rounds (early stopping may end sooner).
    learning_rate : float, default 0.1
    max_depth : int, default 3
    subsample : float, default 1.0
    colsample_bytree : float, default 1.0

    Attributes (set by ``fit``)
    ---------------------------
    model_ : XGBClassifier
        The fitted underlying model.
    classes_ : array
        Class labels seen during ``fit``.
    best_iteration_ : int or None
        ``best_iteration`` reported by the underlying model, if available.
    """

    def __init__(self, n_estimators=5000,
                 learning_rate=0.1,
                 max_depth=3,
                 subsample=1.0,
                 colsample_bytree=1.0):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree

    def _build_model(self):
        """Create an unfitted XGBClassifier from the current hyperparameters."""
        return XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            tree_method="hist",  # fast and efficient on CPU
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            callbacks=[EarlyStopping(rounds=50, save_best=True)],
            enable_categorical=True,
            verbosity=0,
        )

    @property
    def model(self):
        """Backward-compatible alias for ``model_`` (``None`` before ``fit``)."""
        return getattr(self, "model_", None)

    def fit(self, X, y=None):
        """Train the underlying model on ``X``/``y`` and return ``self``."""
        self.model_ = self._build_model()
        # NOTE(review): the early-stopping eval set is the training data itself,
        # as in the original code; a separate validation split would make the
        # stopping criterion meaningful - confirm the intended behaviour.
        self.model_.fit(X, y, eval_set=[(X, y)], verbose=0)
        self.classes_ = self.model_.classes_
        self.best_iteration_ = getattr(self.model_, "best_iteration", None)
        print(f"llamando fit modelo con {len(X):d}")
        return self

    def predict(self, X):
        """Return the predicted class labels for ``X``."""
        return self.model_.predict(X)

    def predict_proba(self, X):
        """Return the predicted class probabilities for ``X``."""
        return self.model_.predict_proba(X)


cv=StratifiedKFold(n_splits=5,shuffle=False,random_state=None)
data_preprocessor = TitanicPreprocessor()
# Default hyperparameters; the grid search overrides them through 'rfbes__*'.
# (The CV splitter must not be passed here: the first positional argument is
# n_estimators.)
model_with_ES = ModelWithEarlyStop()
pipeline = Pipeline([('pre-processor', data_preprocessor), ('nosie remover', NoiseRemover()),('rfbes', model_with_ES)])

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,  # Use all available cores
    verbose=0  # 0 = silent (no progress messages)
)

print("Starting grid search...")
grid_search.fit(df, y)

# best_estimator_ is the whole pipeline; the early-stopping information lives on
# its final 'rfbes' step.
best_xgb = grid_search.best_estimator_.named_steps['rfbes']
best_iter = getattr(best_xgb, "best_iteration_", None)
if best_iter is None:
    # Fall back to the wrapped XGBClassifier in case the wrapper does not expose
    # `best_iteration_` itself.
    inner_model = getattr(best_xgb, "model_", None)
    if inner_model is None:
        inner_model = getattr(best_xgb, "model", None)
    best_iter = getattr(inner_model, "best_iteration", None)
print("Best params:", grid_search.best_params_)
print("Best (hold-out) score:", grid_search.best_score_)
# best_iteration is a 0-based index, hence +1; 0 means "unknown".
n_estimators=(best_iter + 1) if best_iter is not None else 0
print("Best n_estimators (ES):", n_estimators)

# Store a copy with the pipeline prefix removed ('rfbes__max_depth' -> 'max_depth')
# so the entries use the same keys as the hold-out search results and can be read
# by the cell that re-scores `params`. grid_search.best_params_ is left untouched.
best_params = {name.split("__", 1)[-1]: value for name, value in grid_search.best_params_.items()}
best_params["n_estimators"]=n_estimators
best_params["score"]=grid_search.best_score_
params.append(best_params)

paramsFileContent=pd.DataFrame(params)
paramsFileContent.to_csv("params.csv", index=False)

print(params)

#[{'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5}]



In [2]:
from data_filler import DataFiller
from ticket_preprocessor import TicketPreprocessor


# Search space for XGBoostCVWrapper; the keys match its constructor arguments.
param_grid = {
    "max_depth": [2, 3, 4, 5],
    "learning_rate": [0.009, 0.01, 0.1, 0.2, 0.25, 0.28, 0.30, 0.4],
    "subsample": [0.8, 0.9, 1.0],
    "colsample_bytree": [0.8, 0.85, 0.9, 1.0],
    "reg_alpha": [2, 5, 10, 15, 20],
    "reg_lambda": [0, 0.01, 0.1, 1, 5, 10],
}

# Fresh feature/target split taken from a copy, so `dataset` is left untouched.
df = dataset.copy()
y = df.pop("Survived")
dfcv = df.copy()  # raw features; XGBoostCVWrapper.fit reads this global
params = []

class XGBoostCVWrapper(BaseEstimator, TransformerMixin):
    """Estimator that picks the number of boosting rounds with ``xgb.cv`` and
    reports a custom, overfitting-penalised score.

    ``fit`` runs ``xgb.cv`` to choose the round count, trains an
    ``XGBClassifier`` with that many estimators, and computes
    ``score_val = cv_accuracy - (train_accuracy - cv_accuracy)``.
    ``score`` returns that stored value and ignores its arguments.

    NOTE(review): the class inherits ``TransformerMixin`` but defines no
    ``transform`` in the visible code - confirm the mixin is intended.
    NOTE(review): ``n_estimators`` is stored but ``fit`` uses a hard-coded
    ``num_boost_round=5000`` and then ``best_iteration_`` for the final model.
    """
    def __init__(self,max_depth=7, learning_rate=0.01, subsample=0.9, colsample_bytree=0.9, n_estimators=5000, reg_lambda=2,  reg_alpha=0.0):
        self.max_depth = max_depth
        self.reg_lambda = reg_lambda
        self.reg_alpha = reg_alpha
        self.subsample = subsample
        self.n_estimators = n_estimators
        self.colsample_bytree = colsample_bytree
        self.learning_rate = learning_rate
        # Filled in by fit().
        self.cv_results_ = None
        self.best_iteration_ = None
    
    def fit(self, X, y=None):
        """Select the round count via ``xgb.cv``, train the final model on
        ``X``/``y`` and compute the penalised score.

        Besides its arguments this method reads the global ``dfcv`` (raw
        features) for the pipeline cross-validation; it assumes ``dfcv`` is
        row-aligned with ``y`` - TODO confirm. Returns ``self``.
        """
        dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
        # 5-fold xgb.cv on logloss with a 50-round early-stopping patience.
        self.cv_results_ = xgb.cv(
        params={'max_depth': self.max_depth, 'learning_rate': self.learning_rate, 'subsample': self.subsample, 'colsample_bytree': self.colsample_bytree,  'objective': 'binary:logistic', 'reg_lambda': self.reg_lambda,  'reg_alpha': self.reg_alpha},  # binary:logistic objective (binary classification)
        dtrain=dtrain,
        num_boost_round=5000,
        nfold=5,
        metrics='logloss',
        early_stopping_rounds=50,
        seed=42,
        as_pandas=True,
        verbose_eval=False
        )
        # Number of rows in the CV history, used as the round count below
        # (presumably truncated at the best round by early stopping - confirm).
        self.best_iteration_ = len(self.cv_results_)

        cv=StratifiedKFold(n_splits=5,shuffle=False,random_state=None)

        self.model_ = xgb.XGBClassifier(
            max_depth=self.max_depth,
            learning_rate=self.learning_rate,
            subsample=self.subsample,
            colsample_bytree=self.colsample_bytree,
            n_estimators=self.best_iteration_,
            enable_categorical=True,
            eval_metric='logloss',
            tree_method="hist",  
            reg_lambda=self.reg_lambda,            # L2 helps to stabilise
            reg_alpha=self.reg_alpha,
        )

        


        # Full pipeline (raw preprocessing + model), cross-validated on the
        # global raw feature frame `dfcv`, not on the already preprocessed X.
        pipeline = Pipeline([('pre-processor', TicketPreprocessor()),('fill_ticket', DataFiller()), ('nosie remover', NoiseRemover()),('rfb',  self.model_)])

        cv_score = cross_val_score(pipeline, dfcv,y,cv=cv,scoring="accuracy")
        model_score = cv_score.mean()
        self.model_.fit(X, y)

        # Gap between the score on the training data and the CV accuracy.
        overfitting_coeficient = self.model_.score(X, y) - model_score


        # Final score: CV accuracy penalised by the overfitting gap.
        self.score_val = model_score - overfitting_coeficient
        self.best_result_ = self.cv_results_['test-logloss-mean'].min()

        # Progress line; note that reg_lambda / reg_alpha are not included.
        print(f"'max_depth': {self.max_depth}, 'learning_rate': {self.learning_rate}, 'subsample': {self.subsample}, 'colsample_bytree': {self.colsample_bytree} 'n_estimators': {self.best_iteration_} , result: {self.score_val}, cv: {model_score}, ov_cof: {overfitting_coeficient}")
        return self
    
    
    def score(self, X, y):
        """Return the score computed during ``fit``; ``X`` and ``y`` are ignored."""
        return self.score_val
    
    
data_preprocessor = TitanicPreprocessor()
modelo_falopa = XGBoostCVWrapper()

# Preprocessing-only pipeline (no model step): the data is transformed once up
# front and the search then runs on the transformed frame.
preprocessing_steps = [
    ('pre-processor', TicketPreprocessor()),
    ('fill_ticket', DataFiller()),
    ('nosie remover', NoiseRemover()),
]
prep = Pipeline(preprocessing_steps)
X_preprocessed = prep.fit(df).transform(df)

# Single "fold" whose train and test indices are both the full row range; the
# wrapper computes its own cross-validated score inside fit().
all_rows = np.arange(len(X_preprocessed))
dummy_cv = [(all_rows, np.arange(len(X_preprocessed)))]

search_settings = dict(
    estimator=modelo_falopa,
    param_grid=param_grid,
    scoring=None,   # use the estimator's own score() method
    cv=dummy_cv,
    n_jobs=-1,      # use all available cores
    verbose=0,      # silent
)
grid_search = GridSearchCV(**search_settings)

print("Starting grid search...")
grid_search.fit(X_preprocessed, y)
print("Best score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)
print()







#[{'colsample_bytree': 0.8, 'learning_rate': 0.8, 'max_depth': 9, 'subsample': 0.5}, {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'subsample': 0.5}]



     Pclass  Sex    Age      Fare  Cabin  FamilyNumber  OheEmbarked_C  \
0         3    1  22.00    7.2500  False             1          False   
1         1    0  38.00   71.2833   True             1           True   
2         3    0  26.00    7.9250  False             0          False   
3         1    0  35.00   53.1000   True             1          False   
4         3    1  35.00    8.0500  False             0          False   
5         3    1  25.00    8.4583  False             0          False   
6         1    1  54.00   51.8625   True             0          False   
7         3    1   2.00   21.0750  False             4          False   
8         3    0  27.00   11.1333  False             2          False   
9         2    0  14.00   30.0708  False             1           True   
10        3    0   4.00   16.7000   True             2          False   
11        1    0  58.00   26.5500   True             0          False   
12        3    1  20.00    8.0500  False           



     Pclass  Sex   Age  ...  OheTitle_Mr  OheTitle_Mrs  OheTitle_Rare
168       1    1  40.0  ...         True         False          False
169       3    1  28.0  ...         True         False          False
170       1    1  61.0  ...         True         False          False
171       3    1   4.0  ...        False         False          False
173       3    1  21.0  ...         True         False          False
..      ...  ...   ...  ...          ...           ...            ...
886       2    1  27.0  ...        False         False           True
887       1    0  19.0  ...        False         False          False
888       3    0  22.0  ...        False         False          False
889       1    1  26.0  ...         True         False          False
890       3    1  32.0  ...         True         False          False

[712 rows x 14 columns]
     Pclass  Sex   Age  ...  OheTitle_Mr  OheTitle_Mrs  OheTitle_Rare
168       1    1  40.0  ...         True         False          F

KeyboardInterrupt: 

In [None]:
# Re-run of the XGBoostCVWrapper search on the preprocessed frame.
grid_search = GridSearchCV(
    estimator=modelo_falopa,
    param_grid=param_grid,
    scoring=None,   # use the estimator's own score() method
    cv=dummy_cv,
    n_jobs=-1,      # use all available cores
    verbose=0       # silent
)

print("Starting grid search...")
# Fit against `y`, not `y_all`: `modelo_falopa` and `dummy_cv` come from the cell
# that builds X_preprocessed with prep.transform(df), which is row-aligned with
# `y`. `y_all` belongs to a shuffled train/hold-out split from another cell and
# would pair the rows with the wrong labels.
grid_search.fit(X_preprocessed, y)

best_xgb = grid_search.best_estimator_
# XGBoostCVWrapper stores the selected round count as `best_iteration_`; it is
# already a count (length of the CV history), so no +1 is applied.
best_iter = getattr(best_xgb, "best_iteration_", None)
print("Best params:", grid_search.best_params_)
n_estimators = best_iter if best_iter is not None else 0
print("Best n_estimators (ES):", n_estimators)
print("Best score:", grid_search.best_score_)

In [None]:

# Re-score every stored hyperparameter set with 5-fold stratified
# cross-validation on the raw features, then save the results sorted by score.
params_and_results = []
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

for setOfParams in params:
    # Tuned values are taken verbatim from the stored search result.
    tuned_values = {
        key: setOfParams[key]
        for key in ("n_estimators", "learning_rate", "max_depth", "colsample_bytree", "subsample")
    }
    tunedModel = XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",  # fast and efficient on CPU
        enable_categorical=True,
        verbosity=0,
        **tuned_values,
    )
    pipeline = Pipeline([
        ('pre-processor', data_preprocessor),
        ('nosie remover', NoiseRemover()),
        ('rfb', tunedModel),
    ])
    final_score_raw = cross_val_score(pipeline, dfcv, y, cv=cv, scoring="accuracy")

    # The entry is extended in place (the same dict object stays in `params`).
    setOfParams.update({
        "cross_validation_score": final_score_raw.mean(),
        "cross_validation_error_margin": final_score_raw.std(),
    })
    params_and_results.append(setOfParams)

    print("para los siguientes hiperparametros")
    print(setOfParams)
    print(f"El promedio de precision en cross validation es: {final_score_raw.mean():.4f} (+/- {final_score_raw.std() * 2:.4f})")

# Ascending sort: the best-scoring configuration ends up last.
params_and_results.sort(key=lambda entry: entry["cross_validation_score"])
paramsFileContent = pd.DataFrame(params_and_results)
paramsFileContent.to_csv("params_with_results.csv", index=False)
 
    