# Entrenamiento y selección de modelo


Este cuaderno carga los conjuntos preprocesados, entrena múltiples algoritmos y almacena el mejor modelo junto con el reporte de métricas.


In [7]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Locate config.json: first in the current working directory, otherwise two
# levels up (the case when the notebook runs from a nested folder).
CONFIG_PATH = Path("config.json")
if not CONFIG_PATH.exists():
    CONFIG_PATH = Path("../../config.json")

# Resolve in BOTH branches so that project_root (and every path derived from
# it below) is absolute and does not depend on later changes of the working
# directory. Previously only the fallback branch was resolved.
CONFIG_PATH = CONFIG_PATH.resolve()
if not CONFIG_PATH.is_file():
    # Same exception type that open() would raise, but naming both places
    # that were searched.
    raise FileNotFoundError(
        f"config.json no encontrado en {Path.cwd()} ni en {CONFIG_PATH}"
    )

with CONFIG_PATH.open(encoding="utf-8") as cfg_file:
    config = json.load(cfg_file)

# Every entry under config["paths"] is joined onto the folder that holds
# config.json.
project_root = CONFIG_PATH.parent
store_dir = project_root / config["paths"]["model_store"]
store_dir.mkdir(parents=True, exist_ok=True)

train_artifact_path = project_root / config["paths"]["train_set"]
test_artifact_path = project_root / config["paths"]["test_set"]
print(f"Train artifact: {train_artifact_path}")
print(f"Test artifact: {test_artifact_path}")

# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load artifacts produced by this project's own pipeline.
train_artifact = joblib.load(train_artifact_path)
test_artifact = joblib.load(test_artifact_path)

# Each artifact is indexed by "X" (features) and "y" (target); "feature_names"
# is optional on the train artifact and yields None when absent.
X_train, y_train = train_artifact["X"], train_artifact["y"]
X_test, y_test = test_artifact["X"], test_artifact["y"]
feature_names = train_artifact.get("feature_names")
print(f"Formas -> X_train: {X_train.shape}, X_test: {X_test.shape}")


Train artifact: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\model_store\train_set.pkl
Test artifact: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\model_store\test_set.pkl
Formas -> X_train: (800, 19), X_test: (200, 19)


In [8]:
def build_models(random_state: int):
    """Create the candidate regressors, keyed by a short identifier.

    Args:
        random_state: Seed forwarded to every estimator that accepts one
            (decision tree, random forest and XGBoost).

    Returns:
        A dict mapping model name to a fresh, unfitted estimator. Insertion
        order is linear_regression, decision_tree, random_forest, xgboost.
    """
    candidates = {}

    candidates["linear_regression"] = LinearRegression()

    tree_params = {"max_depth": 8, "min_samples_split": 4}
    candidates["decision_tree"] = DecisionTreeRegressor(
        random_state=random_state, **tree_params
    )

    forest_params = {"n_estimators": 300, "max_depth": 10, "n_jobs": -1}
    candidates["random_forest"] = RandomForestRegressor(
        random_state=random_state, **forest_params
    )

    boosting_params = {
        "objective": "reg:squarederror",
        "n_estimators": 400,
        "learning_rate": 0.05,
        "max_depth": 6,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 1.0,
    }
    candidates["xgboost"] = XGBRegressor(
        random_state=random_state, **boosting_params
    )

    return candidates

def _score_predictions(y_true, y_pred):
    """Return the R2, MAE and RMSE of *y_pred* against *y_true* as a dict."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return {"r2": r2, "mae": mae, "rmse": rmse}


# One row of metrics per model, plus the running winner (highest R2 so far).
results = []
best_model, best_model_name = None, None
best_score, best_metrics = float("-inf"), {}

# NOTE(review): the winner is chosen with the same test split whose metrics
# are reported, so the reported score of the selected model is optimistic.
models = build_models(config["training"]["random_state"])
for name, model in models.items():
    print(f"Entrenando {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    metrics = _score_predictions(y_test, preds)
    results.append(dict(model=name, **metrics))

    # Strict ">" keeps the earliest model on ties; a NaN score never wins.
    if not (metrics["r2"] > best_score):
        continue
    best_score, best_model, best_model_name, best_metrics = (
        metrics["r2"],
        model,
        name,
        metrics,
    )

print(f"\nMejor modelo: {best_model_name} (R2={best_score:.4f})")


# --- Persist the winning model together with its metadata -----------------
paths_cfg = config["paths"]

best_model_path = project_root.joinpath(paths_cfg["best_model"])
best_model_path.parent.mkdir(parents=True, exist_ok=True)
best_payload = dict(
    model_name=best_model_name,
    model=best_model,
    feature_names=feature_names,
    metrics=best_metrics,
)
joblib.dump(best_payload, best_model_path)
print(f"Modelo almacenado en {best_model_path}")

# --- Write the metrics of every model, best R2 first, as CSV ---------------
metrics_df = pd.DataFrame(results)
metrics_df = metrics_df.sort_values("r2", ascending=False)
metrics_report_path = project_root.joinpath(paths_cfg["metrics_report"])
metrics_report_path.parent.mkdir(parents=True, exist_ok=True)
metrics_df.to_csv(metrics_report_path, encoding="utf-8", index=False)
print(f"Reporte de métricas guardado en {metrics_report_path}")


Entrenando linear_regression...
Entrenando decision_tree...
Entrenando random_forest...
Entrenando xgboost...

Mejor modelo: linear_regression (R2=0.8804)
Modelo almacenado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\model_store\best_model.pkl
Reporte de métricas guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\reports\model_metrics.csv


In [9]:
# Show the per-model metrics table (metrics_df is sorted by R2, descending).
print("\nResumen de métricas obtenidas:")
# `display` is not imported in this file; presumably the built-in provided by
# the IPython/Jupyter kernel — running this as a plain script would need
# an explicit import.
display(metrics_df)




Resumen de métricas obtenidas:


Unnamed: 0,model,r2,mae,rmse
0,linear_regression,0.880433,4.214763,5.393994
2,random_forest,0.854425,4.586037,5.951812
3,xgboost,0.849092,4.676387,6.059841
1,decision_tree,0.816951,5.301368,6.674042
