In [5]:
# Adapted from the book "Feature Engineering Bookcamp"; the helper code in the cells below is modified to also handle the regression case.
# https://github.com/sinanuozdemir/feature_engineering_bookcamp/blob/main/notebooks/Base.ipynb

# seed our random values for reproducible code
import numpy as np
np.random.seed(0)
import random
random.seed(0)

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.pipeline import Pipeline
import time

# Baselines - for classification problems

In [3]:
def cla_simple_grid_search(x_train, y_train, x_test, y_test, feature_engineering_pipeline=None):
    '''
    Grid search an ExtraTreesClassifier and print a classification report for the
    best parameter set.

    Best here is defined as having the best cross-validated accuracy on the
    training set (3-fold GridSearchCV with its default scoring).

    Parameters
    ----------
    x_train, y_train : training features and labels used for the grid search.
    x_test, y_test : held-out features and labels, used only for the printed report.
    feature_engineering_pipeline : optional object exposing ``fit_transform`` and
        ``transform``. It is fitted on the training data and then applied to the
        test data. With ``None`` (the default) the features are used as given.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    '''

    params = {  # a small parameter grid to search
        'max_depth': [10, None],
        'n_estimators': [10, 50, 100, 500],
        'criterion': ['gini', 'entropy']
    }

    base_model = ExtraTreesClassifier()
    model_grid_search = GridSearchCV(base_model, param_grid=params, cv=3)

    start_time = time.time()  # capture the start time

    # Explicit None check: an object that defines __len__ or __bool__ must not be
    # skipped just because it happens to evaluate as falsy.
    if feature_engineering_pipeline is not None:
        # fit the FE pipeline on training data only, then reuse it on the test data
        parsed_x_train = feature_engineering_pipeline.fit_transform(x_train, y_train)
        parsed_x_test = feature_engineering_pipeline.transform(x_test)
    else:
        parsed_x_train = x_train
        parsed_x_test = x_test

    parse_time = time.time()
    print(f"Parsing took {(parse_time - start_time):.2f} seconds")

    model_grid_search.fit(parsed_x_train, y_train)
    fit_time = time.time()
    # Measured from parse_time so the figure covers the grid search alone and does
    # not double-count the parsing time reported above.
    print(f"Training took {(fit_time - parse_time):.2f} seconds")

    best_model = model_grid_search.best_estimator_

    print(classification_report(y_true=y_test, y_pred=best_model.predict(parsed_x_test)))
    end_time = time.time()
    print(f"Overall took {(end_time - start_time):.2f} seconds")

    return best_model

In [4]:
def cla_advanced_grid_search(x_train, y_train, x_test, y_test, ml_pipeline, params, cv=3, include_probas=False, is_regression=False):
    '''
    Grid search a machine learning pipeline (feature engineering included) and
    report how the best parameter set performs on the test data.

    Prints the RMSE when ``is_regression`` is true, otherwise a classification
    report, followed by the best parameters and the elapsed time.
    Best here is defined by GridSearchCV's cross-validated score on the training set.

    Parameters
    ----------
    x_train, y_train : training features and targets used for the grid search.
    x_test, y_test : held-out features and targets used for the printed metric.
    ml_pipeline : estimator or pipeline handed to GridSearchCV.
    params : parameter grid passed as ``param_grid``.
    cv : cross-validation setting forwarded to GridSearchCV.
    include_probas : when true on a classification run, also return the highest
        predicted class probability per test sample.
    is_regression : selects the RMSE report instead of the classification report.

    Returns
    -------
    ``(best_model, y_preds)``, or ``(best_model, y_preds, y_probas)`` when
    ``include_probas`` is true and ``is_regression`` is false.
    '''

    # error_score=-1 is forwarded to GridSearchCV as the score for failing fits
    model_grid_search = GridSearchCV(ml_pipeline, param_grid=params, cv=cv, error_score=-1)
    start_time = time.time()  # capture the start time

    model_grid_search.fit(x_train, y_train)

    best_model = model_grid_search.best_estimator_

    y_preds = best_model.predict(x_test)

    if is_regression:
        # Score against the y_test argument; the function must not depend on a
        # notebook-level variable for its ground truth.
        rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_preds))
        print(f'RMSE: {rmse:.5f}')
    else:
        print(classification_report(y_true=y_test, y_pred=y_preds))
    print(f'Best params: {model_grid_search.best_params_}')
    end_time = time.time()
    print(f"Overall took {(end_time - start_time):.2f} seconds")

    # Probabilities are only requested from the model on classification runs
    if include_probas and not is_regression:
        y_probas = best_model.predict_proba(x_test).max(axis=1)
        return best_model, y_preds, y_probas

    return best_model, y_preds

# Baselines - for regression problems

In [2]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
import time

def simple_grid_search_regressor(x_train, y_train, x_test, y_test, feature_engineering_pipeline=None):
    '''
    Grid search an LGBMRegressor and print test-set error metrics for the best
    parameter set.

    The grid search uses 3-fold cross-validation on the training data. After
    fitting, the elapsed time, MSE, MAE and RMSLE on the test data are printed.
    RMSLE is reported as undefined when it cannot be computed (the metric rejects
    negative targets or predictions), so the fitted model is still returned.

    Parameters
    ----------
    x_train, y_train : training features and targets used for the grid search.
    x_test, y_test : held-out features and targets used for the printed metrics.
    feature_engineering_pipeline : optional object exposing ``fit_transform`` and
        ``transform``. It is fitted on the training data and then applied to the
        test data. With ``None`` the features are used as given.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    '''

    params = {  # a small parameter grid to search
        'max_depth': [10, None],
        'n_estimators': [10, 50, 100, 500],
        'objective': ['regression']  # for regression
    }

    # NOTE(review): both the model and GridSearchCV request all cores (n_jobs=-1);
    # this may oversubscribe the CPU - confirm on the target machine.
    base_model = LGBMRegressor(
        n_jobs=-1,   # use every available core
        verbose=-1   # silence the model output
    )

    model_grid_search = GridSearchCV(
        base_model,
        param_grid=params,
        cv=3,
        verbose=0,  # silence the search output
        n_jobs=-1   # use every available core
    )

    start_time = time.time()  # capture the start time

    if feature_engineering_pipeline is not None:
        # fit the FE pipeline on training data only, then reuse it on the test data
        parsed_x_train = feature_engineering_pipeline.fit_transform(x_train, y_train)
        parsed_x_test = feature_engineering_pipeline.transform(x_test)
    else:
        parsed_x_train = x_train
        parsed_x_test = x_test

    model_grid_search.fit(parsed_x_train, y_train)
    best_model = model_grid_search.best_estimator_

    # Predict on the test set and compute the error metrics
    y_pred = best_model.predict(parsed_x_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # mean_squared_log_error raises ValueError on negative values; do not let that
    # throw away a model that took a full grid search to fit.
    try:
        rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    except ValueError:
        rmsle = None

    end_time = time.time()
    print(f"Overall took {(end_time - start_time):.2f} seconds")
    print("MSE: ", mse)
    print("MAE: ", mae)
    if rmsle is None:
        print("RMSLE: ", "undefined (negative values in y_test or predictions)")
    else:
        print("RMSLE: ", rmsle)

    return best_model

In [6]:
from sklearn.metrics import mean_squared_error
import numpy as np
import time

def adv_grid_search(x_train, y_train, x_test, y_test, ml_pipeline, params, cv=3, include_probas=False, is_regression=False):
    '''
    Tune ``ml_pipeline`` with GridSearchCV and evaluate the winning model on the
    test data.

    Prints the RMSE for regression runs or a classification report otherwise,
    then the best parameter set and the elapsed time. Returns
    ``(best_model, y_preds)``; on a classification run with ``include_probas``
    set, the per-sample maximum of ``predict_proba`` is appended as a third item.
    '''

    search = GridSearchCV(ml_pipeline, param_grid=params, cv=cv, error_score=-1)
    t0 = time.time()  # timing starts right before the fit

    search.fit(x_train, y_train)
    winner = search.best_estimator_
    predictions = winner.predict(x_test)

    # Report the metric that matches the task type
    if not is_regression:
        print(classification_report(y_true=y_test, y_pred=predictions))
    else:
        mse = mean_squared_error(y_true=y_test, y_pred=predictions)
        print(f'RMSE: {np.sqrt(mse):.5f}')

    print(f'Best params: {search.best_params_}')

    elapsed = time.time() - t0
    print(f"Overall took {elapsed:.2f} seconds")

    # Probabilities are only produced for classification runs that ask for them
    wants_probas = include_probas and not is_regression
    if not wants_probas:
        return winner, predictions

    return winner, predictions, winner.predict_proba(x_test).max(axis=1)

In [None]:
def initial_grid_search(x_train, y_train, feature_engineering_pipeline=None, params=None, cv=3):
    '''
    Run a coarse GridSearchCV over LGBMRegressor hyperparameters and return the
    best parameter set.

    Parameters
    ----------
    x_train, y_train : training features and targets used for the grid search.
    feature_engineering_pipeline : optional object exposing ``fit_transform``;
        when given it is fitted on the training data before the search.
    params : optional parameter grid. When ``None`` the built-in grid below is
        used. That grid has 2 * 3**9 = 39,366 combinations, so pass a smaller
        grid when that many fits per fold is not affordable. ``fine_tuning_optuna``
        reads the keys ``max_depth``, ``n_estimators``, ``learning_rate``,
        ``num_leaves``, ``min_child_samples`` and ``max_bin`` from the result, so
        a custom grid must keep them if the result is passed on to it.
    cv : cross-validation setting forwarded to GridSearchCV (default 3).

    Returns
    -------
    The ``best_params_`` dictionary of the fitted grid search.
    '''
    if params is None:
        # Default starting grid for the search.
        # NOTE(review): per the LightGBM docs bagging_fraction only takes effect
        # when bagging_freq > 0, which is not set here - confirm it is worth
        # keeping in the grid.
        params = {
            'max_depth': [10, None],
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.05, 0.1],
            'num_leaves': [20, 31, 50],
            'feature_fraction': [0.8, 0.9, 1.0],
            'bagging_fraction': [0.8, 0.9, 1.0],
            'min_child_samples': [10, 20, 30],
            'lambda_l1': [0, 0.1, 0.5],
            'lambda_l2': [0, 0.1, 0.5],
            'max_bin': [255, 500, 1000],
            'objective': ['regression']
        }

    base_model = LGBMRegressor(n_jobs=-1, verbose=-1)

    model_grid_search = GridSearchCV(
        base_model,
        param_grid=params,
        cv=cv,
        verbose=0,
        n_jobs=-1
    )

    # Explicit None check so a pipeline object is never skipped for being falsy
    if feature_engineering_pipeline is not None:
        parsed_x_train = feature_engineering_pipeline.fit_transform(x_train, y_train)
    else:
        parsed_x_train = x_train

    model_grid_search.fit(parsed_x_train, y_train)
    best_params = model_grid_search.best_params_

    print("Best parameters from GridSearchCV:", best_params)

    return best_params

def fine_tuning_optuna(x_train, y_train, x_test, y_test, best_params, feature_engineering_pipeline=None, n_trials=33):
    '''
    Fine-tune LGBMRegressor hyperparameters with Optuna, searching in ranges
    centred on ``best_params``.

    Each trial fits a model on the training data and is scored by the RMSLE of
    its predictions on ``(x_test, y_test)``; the study minimises that value.

    Parameters
    ----------
    x_train, y_train : training features and targets.
    x_test, y_test : data the objective is scored on.
    best_params : dict with the keys ``max_depth``, ``n_estimators``,
        ``learning_rate``, ``num_leaves``, ``min_child_samples`` and ``max_bin``.
        A ``max_depth`` of ``None`` or <= 0 selects the range 1..15.
    feature_engineering_pipeline : optional object exposing ``fit_transform`` and
        ``transform``; fitted once on the training data and reused by all trials.
    n_trials : number of Optuna trials to run.

    Returns
    -------
    ``study.best_trial`` of the finished study.

    Notes
    -----
    * ``optuna`` must already be imported in the notebook; no import for it is
      visible in this section.
    * Predictions are floored at 0 before scoring because
      ``mean_squared_log_error`` raises on negative values, which would abort the
      whole study.
    * NOTE(review): the objective is evaluated on the same data passed as the
      test set, so the reported best value is tuned to that data - consider a
      separate validation split.
    * NOTE(review): per the LightGBM docs ``bagging_fraction`` only takes effect
      when ``bagging_freq`` > 0, which is not set here - confirm.
    '''
    # Feature engineering does not depend on the trial, so do it once up front
    # instead of re-fitting the pipeline inside every objective call.
    if feature_engineering_pipeline is not None:
        parsed_x_train = feature_engineering_pipeline.fit_transform(x_train, y_train)
        parsed_x_test = feature_engineering_pipeline.transform(x_test)
    else:
        parsed_x_train = x_train
        parsed_x_test = x_test

    # LightGBM documents max_depth <= 0 as "no limit"; treat it like None and fall
    # back to a generic range rather than building a window around it.
    base_depth = best_params['max_depth']
    if base_depth is not None and base_depth > 0:
        depth_low, depth_high = max(1, base_depth - 3), base_depth + 3
    else:
        depth_low, depth_high = 1, 15

    def objective(trial):
        # Suggestions are made in a fixed order, centred on the grid-search result
        suggested = {
            'max_depth': trial.suggest_int('max_depth', depth_low, depth_high),
            'n_estimators': trial.suggest_int(
                'n_estimators',
                max(50, best_params['n_estimators'] - 50),
                best_params['n_estimators'] + 50,
                step=10,
            ),
            'learning_rate': trial.suggest_float(
                'learning_rate',
                best_params['learning_rate'] * 0.1,
                best_params['learning_rate'] * 10,
                log=True,
            ),
            'num_leaves': trial.suggest_int(
                'num_leaves',
                max(20, best_params['num_leaves'] - 10),
                best_params['num_leaves'] + 10,
            ),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
            'min_child_samples': trial.suggest_int(
                'min_child_samples',
                max(5, best_params['min_child_samples'] - 5),
                best_params['min_child_samples'] + 5,
            ),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'max_bin': trial.suggest_int(
                'max_bin',
                max(50, best_params['max_bin'] - 200),
                best_params['max_bin'] + 200,
            ),
        }

        model = LGBMRegressor(
            **suggested,
            objective='regression',
            n_jobs=-1,
            verbose=-1
        )
        model.fit(parsed_x_train, y_train)

        # Floor at 0 so a single negative prediction cannot make the metric raise
        y_pred = np.clip(model.predict(parsed_x_test), 0, None)

        return np.sqrt(mean_squared_log_error(y_test, y_pred))  # RMSLE, minimised

    # Create and run the Optuna study
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_trial = study.best_trial
    print("Best trial with Optuna:")
    print("  Value: ", best_trial.value)
    print("  Params: ")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")

    return best_trial

# Example usage: coarse grid search first, then Optuna fine-tuning around its result
best_params = initial_grid_search(
    x_train,
    y_train,
    feature_engineering_pipeline=feature_engineering_pipeline,
)
best_trial = fine_tuning_optuna(
    x_train,
    y_train,
    x_test,
    y_test,
    best_params,
    feature_engineering_pipeline=feature_engineering_pipeline,
)