In [1]:
# bibliotecas
import json
import numpy as np
import pandas as pd

# regressores
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor

# ferramentas extras
from sklearn.model_selection import GridSearchCV, KFold

In [2]:
# Load the training data: feature matrix and target vector.
X_train = pd.read_csv("../outputs/X_train.csv").to_numpy()
# The target CSV is flattened to a 1-D array.
y_train = pd.read_csv("../outputs/y_train.csv").to_numpy().ravel()

In [3]:
# Algorithms and the hyperparameter grids to be searched for each of them.
# Each entry maps a display name to the estimator ("model") and its grid ("params").
models_params_grid = {
    # --- linear models ---
    "Regressão Linear": dict(
        model=LinearRegression(),
        params=dict(
            fit_intercept=[True, False],
            positive=[True, False],
            n_jobs=[-1],
        ),
    ),
    "Regressão LASSO": dict(
        model=Lasso(),
        params=dict(
            max_iter=[1000],
            alpha=[2**exp for exp in range(-12, 13)],
            random_state=[1],
        ),
    ),
    "Regressão Ridge": dict(
        model=Ridge(),
        params=dict(
            max_iter=[1000],
            alpha=[2**exp for exp in range(-12, 13)],
            random_state=[1],
        ),
    ),
    "Rede Elástica": dict(
        model=ElasticNet(),
        params=dict(
            max_iter=[1000],
            alpha=[2**exp for exp in range(-12, 13)],
            l1_ratio=[tenth / 10 for tenth in range(1, 10)],
            random_state=[1],
        ),
    ),
    # --- neighbours ---
    "K-Vizinhos Mais Próximos": dict(
        model=KNeighborsRegressor(),
        params=dict(
            n_neighbors=list(range(1, 16)),
            weights=['uniform', 'distance'],
            p=[1, 2],
        ),
    ),
    # --- single tree and bagging-style ensembles ---
    "Árvore de Decisão": dict(
        model=DecisionTreeRegressor(),
        params=dict(
            max_depth=[2, 4, 8, 16],
            max_leaf_nodes=[64, 128, 256, 512],
            min_samples_leaf=[1, 2, 4],
            min_samples_split=[2, 4, 8],
            random_state=[1],
        ),
    ),
    "Árvores Extras": dict(
        model=ExtraTreesRegressor(),
        params=dict(
            n_estimators=[64, 128, 256, 512],
            max_depth=[2, 4, 8, 16],
            min_samples_leaf=[1, 2, 4],
            min_samples_split=[2, 4, 8],
            random_state=[1],
        ),
    ),
    "Floresta Aleatória": dict(
        model=RandomForestRegressor(),
        params=dict(
            n_estimators=[64, 128, 256, 512],
            max_depth=[4, 8, 16, 32],
            min_samples_leaf=[1, 2, 4],
            min_samples_split=[2, 4, 8],
            random_state=[1],
        ),
    ),
    # --- boosting ---
    "Gradiente Boosting Machine": dict(
        model=GradientBoostingRegressor(),
        params=dict(
            n_estimators=[32, 64, 128, 256],
            learning_rate=[1/32, 1/16, 1/8, 1/4],
            max_depth=[3, 4, 5],
            min_samples_split=[2, 4, 8],
            min_samples_leaf=[1, 2, 4],
            random_state=[1],
        ),
    ),
    "eXtreme Gradient Boosting": dict(
        model=XGBRegressor(),
        params=dict(
            n_estimators=[64, 128, 256, 512],
            learning_rate=[1/32, 1/16, 1/8],
            max_depth=[3, 4, 5, 6, 7],
            min_child_weight=[1, 2, 4],
            subsample=[0.6, 0.7, 0.8, 0.9],
            colsample_bytree=[0.6, 0.7, 0.8, 0.9],
            random_state=[1],
        ),
    ),
    "Light Gradient Boosting": dict(
        model=LGBMRegressor(),
        params=dict(
            n_estimators=[32, 64, 128, 256],
            learning_rate=[1/32, 1/16, 1/8, 1/4],
            num_leaves=[31, 63, 127],
            max_depth=[-1, 8, 16, 32],
            min_child_samples=[8, 16, 32],
            random_state=[1],
            force_row_wise=[True],
            verbose=[-1],
        ),
    ),
    "Categorical Boosting": dict(
        model=CatBoostRegressor(),
        params=dict(
            depth=[2, 3, 4, 6, 7],
            learning_rate=[1/64, 1/32, 1/16, 1/8],
            iterations=[128, 256, 512, 1024],
            l2_leaf_reg=[1, 2, 3, 4],
            border_count=[8, 16, 32, 64, 128],
            random_state=[1],
            silent=[True],
        ),
    ),
}

In [4]:
# Control configuration: the same algorithms without a hyperparameter search,
# only pinning random states (and a few fixed/verbosity settings).
control_models_params = {
    "Regressão Linear": dict(
        model=LinearRegression(),
        params=dict(),
    ),
    "Regressão LASSO": dict(
        model=Lasso(),
        params=dict(max_iter=[1000], random_state=[1]),
    ),
    "Regressão Ridge": dict(
        model=Ridge(),
        params=dict(max_iter=[1000], random_state=[1]),
    ),
    "Rede Elástica": dict(
        model=ElasticNet(),
        params=dict(max_iter=[1000], random_state=[1]),
    ),
    "K-Vizinhos Mais Próximos": dict(
        model=KNeighborsRegressor(),
        params=dict(),
    ),
    "Árvore de Decisão": dict(
        model=DecisionTreeRegressor(),
        params=dict(random_state=[1]),
    ),
    "Árvores Extras": dict(
        model=ExtraTreesRegressor(),
        params=dict(random_state=[1]),
    ),
    "Floresta Aleatória": dict(
        model=RandomForestRegressor(),
        params=dict(random_state=[1]),
    ),
    "Gradiente Boosting Machine": dict(
        model=GradientBoostingRegressor(),
        params=dict(random_state=[1]),
    ),
    "eXtreme Gradient Boosting": dict(
        model=XGBRegressor(),
        params=dict(random_state=[1]),
    ),
    "Light Gradient Boosting": dict(
        model=LGBMRegressor(),
        params=dict(
            random_state=[1],
            force_row_wise=[True],
            verbose=[-1],
        ),
    ),
    "Categorical Boosting": dict(
        model=CatBoostRegressor(),
        params=dict(random_state=[1], silent=[True]),
    ),
}


In [5]:
# Runs the hyperparameter search for every algorithm, one after another.
def _json_default(value):
    """Convert NumPy scalars/arrays to built-in types so ``json.dump`` accepts them."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


def perform_grid_search(models_params, X_train, y_train, n_splits, filename=None):
    """Run a cross-validated grid search for each configured model.

    Parameters
    ----------
    models_params : dict
        Maps a display name to a dict with the keys ``'model'`` (the
        estimator instance) and ``'params'`` (the grid handed to
        ``GridSearchCV`` as ``param_grid``).
    X_train, y_train
        Training data, forwarded unchanged to ``GridSearchCV.fit``.
    n_splits : int
        Number of folds used by the shuffled ``KFold`` splitter.
    filename : str or None, optional
        When given, the collected results are also written to this path
        as indented JSON.

    Returns
    -------
    dict
        ``{name: {'params': best_params_, 'score': best_score_}}`` with the
        values reported by each fitted ``GridSearchCV``.
    """
    # The splitter does not depend on the model, so build it once. The fixed
    # seed means every model is evaluated on the same folds.
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=0)

    results = {}
    for name, config in models_params.items():
        print(f'Training model: {name}')

        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=cv,
            verbose=1,
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)

        # Keep only the winning configuration and its score.
        results[name] = {
            'params': grid_search.best_params_,
            'score': grid_search.best_score_,
        }

    # If a destination was given, persist the results as JSON.
    if filename is not None:
        with open(filename, 'w', encoding='utf-8') as f:
            # `default` keeps the dump from failing (and discarding a long
            # search) when a grid was built from NumPy values.
            json.dump(results, f, indent=4, default=_json_default)
        print(f'\nResultados salvos em {filename}')

    return results

In [6]:
# Compares the results of two hyperparameter searches.
def params_gain(old_params, new_params):
    """Return the percentage score change of ``new_params`` over ``old_params``.

    Parameters
    ----------
    old_params, new_params : dict
        Mappings of model name to a dict holding a ``'score'`` entry.
        Every name in ``old_params`` must also be present in ``new_params``.

    Returns
    -------
    dict
        ``{name: gain}`` where ``gain`` is
        ``100 * (new_score - old_score) / abs(old_score)``. A positive value
        always means the new score is higher. The gain is NaN when the
        baseline score is exactly zero, since a relative change is undefined.
    """
    score_gain = {}
    for name, old_entry in old_params.items():
        old_score = old_entry['score']
        new_score = new_params[name]['score']

        if old_score == 0:
            # Relative change against a zero baseline is undefined.
            score_gain[name] = float('nan')
            continue

        # Divide by the magnitude of the baseline: dividing by a negative
        # score would flip the sign and report an improvement as a loss.
        score_gain[name] = 100 * (new_score - old_score) / abs(old_score)

    return score_gain

In [7]:
# Number of folds used for cross-validation.
n_splits = 5

In [9]:
# Evaluate each algorithm with its control (non-searched) settings.
params_control = perform_grid_search(
    models_params=control_models_params,
    X_train=X_train,
    y_train=y_train,
    n_splits=n_splits,
    filename='../outputs/params_control.json',
)

Training model: Regressão Linear
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Regressão LASSO
Fitting 5 folds for each of 1 candidates, totalling 5 fits


  model = cd_fast.enet_coordinate_descent(


Training model: Regressão Ridge
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Rede Elástica
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: K-Vizinhos Mais Próximos
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Árvore de Decisão
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Árvores Extras
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Floresta Aleatória
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Gradiente Boosting Machine
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: eXtreme Gradient Boosting
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Light Gradient Boosting
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Training model: Categorical Boosting
Fitting 5 folds for each of 1 candidates, totalling 5 fits

Resultados salvos em ../outputs/params_contr

In [None]:
# Grid-search the best parameters among those defined for each algorithm.
params_best = perform_grid_search(
    models_params=models_params_grid,
    X_train=X_train,
    y_train=y_train,
    n_splits=n_splits,
    filename='../outputs/params_best.json',
)

In [None]:
# Percentage improvement of the grid-search scores over the control scores.
params_gain(old_params=params_control, new_params=params_best)

In [None]:
# Display the best parameters found for each algorithm.
params_best