In [126]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np


In [127]:
# Feature matrix saved as a NumPy binary file.
X_train = np.load("data/processed/X_train.npy")

# Targets are stored as CSV; flatten the loaded table into a 1-D array,
# which is the form passed as `y` to the estimators below.
y_train = pd.read_csv("data/processed/y_train.csv").to_numpy().ravel()

In [128]:
def rmse_cv(model, X, y, cv=5):
    """Return the mean cross-validated RMSE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    X, y :
        Features and targets forwarded unchanged to ``cross_val_score``.
    cv : int or cross-validation splitter, default=5
        Cross-validation strategy forwarded to ``cross_val_score``. The default
        keeps the previous hard-coded behaviour of 5 folds.

    Returns
    -------
    float
        Mean over folds of the per-fold RMSE.
    """
    # The scorer reports *negative* MSE (higher is better), so flip the sign
    # before taking the square root of each fold's score.
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=cv
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()


In [129]:
# Baseline model: ordinary least squares fitted on the full training set.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(X_train, y_train)

# Cross-validated RMSE of the baseline, used as the reference for the tuned models.
rmse_lin = rmse_cv(lin_reg, X_train, y_train)

print("RMSE Regressão Linear:", rmse_lin)

RMSE Regressão Linear: 69065.59069601324


In [130]:
def train_model(model, param_grid, X, y, cv=5, n_jobs=-1):
    """Tune ``model`` with a grid search and report the winner's cross-validated RMSE.

    Parameters
    ----------
    model : estimator
        Base estimator whose hyperparameters are searched.
    param_grid : dict or list of dict
        Search space passed to ``GridSearchCV``.
    X, y :
        Training features and targets.
    cv : int or cross-validation splitter, default=5
        Cross-validation strategy, used both for the search and for the
        final RMSE estimate.
    n_jobs : int or None, default=-1
        Parallelism for the grid search and for the final cross-validation.
        The default matches the previously hard-coded ``n_jobs=-1`` of the search.

    Returns
    -------
    best_model : estimator
        ``best_estimator_`` of the fitted search.
    rmse : float
        Mean over folds of the per-fold RMSE of ``best_model``.
    best_params : dict
        ``best_params_`` of the fitted search.

    Notes
    -----
    The RMSE is computed on the same ``X``/``y`` that were used to select the
    hyperparameters, so it should be read as a model-selection score rather
    than an estimate of performance on unseen data.
    """
    search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=n_jobs,
    )
    search.fit(X, y)

    best_model = search.best_estimator_

    # Re-score the selected configuration fold by fold; the scorer returns
    # negative MSE, hence the sign flip before the square root. This step now
    # shares the search's n_jobs instead of always running serially.
    fold_neg_mse = cross_val_score(
        best_model,
        X,
        y,
        scoring='neg_mean_squared_error',
        cv=cv,
        n_jobs=n_jobs,
    )
    rmse = np.sqrt(-fold_neg_mse).mean()

    return best_model, rmse, search.best_params_

In [131]:
# Search space for the decision tree: a depth limit (None = no limit) plus the
# minimum sample counts required to split a node and to form a leaf.
param_grid_tree = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Fixed seed so repeated runs of the cell give the same tree.
tree = DecisionTreeRegressor(random_state=42)

best_tree, rmse_tree, best_params_tree = train_model(
    tree,
    param_grid_tree,
    X_train,
    y_train,
)

print("Melhores parâmetros Decision Tree:", best_params_tree)
print("RMSE Decision Tree:", rmse_tree)

Melhores parâmetros Decision Tree: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
RMSE Decision Tree: 60717.52245459622


In [132]:

# Two sub-grids are searched:
#   1. default bootstrap setting, varying forest size and features per split;
#   2. bootstrap disabled, over a smaller range of the same two parameters.
param_grid_rf = [
    {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

# Fixed seed so repeated runs of the cell give the same forest.
rf = RandomForestRegressor(random_state=42)

best_rf, rmse_rf, best_params_rf = train_model(
    rf,
    param_grid_rf,
    X_train,
    y_train,
)

print("Melhores parâmetros Random Forest:", best_params_rf)
print("RMSE Random Forest:", rmse_rf)

Melhores parâmetros Random Forest: {'max_features': 8, 'n_estimators': 30}
RMSE Random Forest: 49893.91429940498


In [133]:
import joblib
import os

# Create the output directory if needed; exist_ok=True avoids an error when
# the directory is already there (e.g. when the cell is run again).
os.makedirs("models", exist_ok=True)

# Persist the two tuned tree-based estimators.
joblib.dump(best_rf, "models/random_forest_model.pkl")
joblib.dump(best_tree, "models/decision_tree_model.pkl")

# Persist the linear baseline. Kept as the last expression of the cell so the
# notebook still echoes the same written path as before.
joblib.dump(lin_reg, "models/linear_regression_model.pkl")

['models/linear_regression_model.pkl']