In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import logging
import time
from datetime import datetime
import mlflow

In [None]:
# 1.1 MLflow connection settings
# Address of the tracking server and name of the experiment that receives
# every run logged by this notebook.
TRACKING_SERVER_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "Regression Technics for House Pricing"

mlflow.set_tracking_uri(TRACKING_SERVER_URI)

# Kept as the last expression so the cell still displays the experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

# 1. Configuración de modelos

In [None]:
# Seed for the stochastic estimators below. Without it the forest / boosting
# models (and therefore the RMSE ranking and the selected champion) can change
# on every execution. Same value as the train/test split seed used in the
# training cell of this notebook.
MODEL_RANDOM_STATE = 2025

# Candidate regressors for the model search.
# Each entry maps a model name to:
#   "model"      -> an unfitted estimator instance
#   "param_grid" -> the hyper-parameter grid explored for that estimator
model_configurations = {
    "linear_regression": {
        "model": LinearRegression(),
        "param_grid": {
            "fit_intercept": [True, False]
        }
    },

    "random_forest": {
        "model": RandomForestRegressor(random_state=MODEL_RANDOM_STATE),
        "param_grid": {
            "n_estimators": [25, 50, 100],
            "max_depth": [None, 5, 10]
        }
    },

    "gradient_boosting": {
        "model": GradientBoostingRegressor(random_state=MODEL_RANDOM_STATE),
        "param_grid": {
            "learning_rate": [0.01, 0.1, 0.5],
            "n_estimators": [25, 50, 100]
        }
    }
}

# 2. Configuración para Logging

In [None]:
# Root logger setup: records are appended to ml_system.log as
# "timestamp, level, message" rows, with timestamps at minute resolution.
logging.basicConfig(
    level=logging.INFO,
    # Destination
    filename="ml_system.log",
    filemode="a",
    encoding="utf-8",
    # Record layout ("{"-style placeholders)
    style="{",
    format="{asctime}, {levelname}, {message}",
    datefmt="%Y-%m-%d %H:%M",
)

# 3. Entrenamiento y Selección del modelo ganador

In [None]:
# Load the interim training table and separate the features from the target.
dataset = pd.read_csv('../data/interim/proc_data_train.csv')
X = dataset.drop("SalePrice", axis=1)
y = dataset['SalePrice']

# 70/30 hold-out split; the fixed seed keeps the partition stable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=2025, test_size=0.3)

# results[model_name] -> {"rmse", "best_param", "best_model", "run_id"}
results = {}
logging.info("Iniciando Entrenamiento, ENTRENAMIENTO")
start = time.time()

# The parent run is named after the moment the training session started.
current_datetime = datetime.now()
formatted_time = current_datetime.strftime("%Y-%m-%d %H:%M:%S")

with mlflow.start_run(run_name=formatted_time):
    for model_name, model in model_configurations.items():
        base_model = model['model']
        param_grid = model["param_grid"]

        # One nested run per candidate. The run is opened *before* the results
        # entry is built so that "run_id" identifies this model's own run;
        # reading mlflow.active_run() outside the nested block would return the
        # parent run id for every model.
        with mlflow.start_run(run_name=model_name, nested=True) as child_run:
            grid_search_config = GridSearchCV(
                estimator=base_model,
                param_grid=param_grid,
                cv=5,
                scoring="neg_mean_squared_error"
            )
            grid_search_config.fit(X_train, y_train)
            best_model = grid_search_config.best_estimator_
            params = grid_search_config.best_params_

            # Hold-out RMSE of the best estimator found by the grid search.
            preds = best_model.predict(X_test)
            rmse_mean = np.round(np.sqrt(mean_squared_error(y_test, preds)), 2)

            results[model_name] = {
                "rmse": rmse_mean,
                "best_param": params,
                "best_model": best_model,
                "run_id": child_run.info.run_id
            }
            print(f"RMSE del modelo {model_name}: {rmse_mean}, ENTRENAMIENTO")
            logging.info(f"RMSE del modelo {model_name}: {rmse_mean}, ENTRENAMIENTO")

            # Record the performance metric and chosen hyper-parameters in MLflow.
            mlflow.log_metric("RMSE", rmse_mean)
            mlflow.log_params(params)
            mlflow.set_tag("stage", "challenger")

# No explicit mlflow.end_run() is needed: leaving the `with` block closes the run.

finish = time.time()
logging.info(f"Tiempo de entrenamiento {finish-start}, ENTRENAMIENTO")

In [None]:
# Leaderboard: one row per trained model, best (lowest) hold-out RMSE first.
leaderboard_rows = list(
    {"model": name, "rmse": entry["rmse"]} for name, entry in results.items()
)
df_results = pd.DataFrame(leaderboard_rows)
df_results = df_results.sort_values("rmse")
df_results

In [None]:
# df_results is sorted by ascending RMSE, so its first row is the winner.
champion_model_name = df_results.iloc[0]['model']
champion_model = results[champion_model_name]["best_model"]
champion_params = results[champion_model_name]["best_param"]
champion_rmse = results[champion_model_name]["rmse"]
champion_run_id = results[champion_model_name]["run_id"]

from mlflow.tracking import MlflowClient
client = MlflowClient()

# Promote the recorded run: overwrite its "stage" tag and attach a summary
# of the winning model.
client.set_tag(champion_run_id, "stage", "champion")
client.set_tag(champion_run_id, "champ_rmse", champion_rmse)
client.set_tag(champion_run_id, "champ_name", champion_model_name)
client.set_tag(champion_run_id, "champ_params", str(champion_params))

# Re-open the champion's run so the model artifact is stored next to its tags.
# Calling log_model with no active run would not attach the artifact to that
# run, and the fluent API may open an implicit run that is never closed.
with mlflow.start_run(run_id=champion_run_id):
    champion_model_info = mlflow.sklearn.log_model(
        sk_model=champion_model,
        artifact_path=f"champ_model_{champion_model_name}",
        input_example=X_train.iloc[:1]
    )

# Display the logging result, as the bare log_model call did before.
champion_model_info

# 4. Agregamos el modelo ganador al Pipeline

In [None]:
# (step name, estimator) pair describing the winning model.
model = (champion_model_name, champion_model)

# Load the pipeline saved under the "data_pre_proc" name and add the winning
# model as its last step.
# NOTE(review): joblib.load unpickles the file; only load artifacts produced
# by this project.
house_prices_pipeline = joblib.load('../models/house_prices_data_pre_proc_pipeline.pkl')
house_prices_pipeline.steps.append(model)

house_prices_pipeline

### Reentrenamos el pipeline con el modelo ganador

In [None]:
# Retrain on the raw training file: the complete pipeline (loaded steps plus
# the champion model) is fitted end to end here.
dataset = pd.read_csv('../data/raw/train.csv')

# Cast these columns to object dtype.
# NOTE(review): presumably so the pipeline treats them as categorical - confirm
# against the preprocessing steps.
dataset = dataset.astype({
    'MSSubClass': 'O',
    'GarageCars': 'O',
    'BsmtFullBath': 'O',
})

X = dataset.drop(["Id", "SalePrice"], axis=1)
y = dataset['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=2028
)
house_prices_pipeline.fit(X_train, y_train)

# Validate the assembled pipeline on the hold-out rows before it is persisted;
# previously the split was created but X_test / y_test were never used.
pipeline_preds = house_prices_pipeline.predict(X_test)
pipeline_rmse = np.round(np.sqrt(mean_squared_error(y_test, pipeline_preds)), 2)
print(f"RMSE del pipeline final: {pipeline_rmse}, ENTRENAMIENTO")
logging.info(f"RMSE del pipeline final: {pipeline_rmse}, ENTRENAMIENTO")

# Keep the fitted pipeline as the cell output, as the bare fit() call did.
house_prices_pipeline

In [None]:
# Persist the fitted end-to-end pipeline; the call's return value is left as
# the cell output.
joblib.dump(
    value=house_prices_pipeline,
    filename='../models/house_prices_pipeline.pkl',
)