# **Maestría en Inteligencia Artificial Aplicada**

## **Curso: Proyecto Integrador**

### Tecnológico de Monterrey

### Profesores: Dra. Grettel Barceló Alonso y Dr. Luis Eduardo Falcón Morales

## Avance III de Proyecto

## Modelo Base

## Integrantes del Equipo:
### - Erika Cardona Rojas            A01749170
### - Miriam Bönsch                  A01330346
### - Mardonio Manuel Román Ramírez  A01795265

In [None]:
import time
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneOut, RepeatedKFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Librerias de modelos
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

import yaml

# Load the project configuration from YAML; the training cell reads the
# 'models_and_params' section from it.
config_path = "../config.yaml"
with open(config_path, mode="r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [None]:
# Load the dataset used for modelling from its Excel workbook.
dataset_path = r"../Data/DF_Pred_4.xlsx"
df = pd.read_excel(dataset_path)

# Filtrado De Variables

Con base en nuestra entrega previa, se utilizarán las variables más importantes identificadas mediante RFECV.

In [None]:
def _top_ranked_features(ranking_path):
    """Return the 'Feature' values of the rows whose 'Ranking' equals 1.

    The workbook at ``ranking_path`` is read with ``pd.read_excel`` and must
    contain the columns 'Ranking' and 'Feature'.
    """
    ranking_table = pd.read_excel(ranking_path)
    is_top_ranked = ranking_table['Ranking'] == 1
    return ranking_table.loc[is_top_ranked, 'Feature']


# Top-ranked features according to the first RFECV workbook
selected_features_1 = _top_ranked_features('../Entregables/UdeBarcelona/RFECV.xlsx')

# Top-ranked features according to the second RFECV workbook (Lasso)
selected_features_2 = _top_ranked_features('../Entregables/UdeBarcelona/RFECV_Lasso.xlsx')

In [None]:
# Union of the features selected by both RFECV rankings, de-duplicated while
# keeping first-appearance order. Going through set() would make the resulting
# column order depend on string hash randomization, so it could change between
# interpreter sessions and make the fixed-seed experiments below harder to
# reproduce.
unique_selected_features = (
    pd.concat([selected_features_1, selected_features_2])
      .drop_duplicates()
      .tolist()
)

# Entrenamiento de Modelos

> ### Creación de DF X e Y

In [None]:
# Feature matrix limited to the selected columns, plus the regression target.
target_column = 'delta_bdnf_Int'
X_full = df.loc[:, unique_selected_features]
y = df[target_column]

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=42,
)

> ### Entrenamiento con validación cruzada repetida (RepeatedKFold); LOOCV como alternativa opcional

In [None]:
# =========================================
# CV CONFIGURATION
# =========================================
# Repeated K-fold: 7 splits x 5 repeats with a fixed seed. The original note
# describes this as a cheaper alternative to leave-one-out.
cv_strategy = RepeatedKFold(
    n_splits=7,
    n_repeats=5,
    random_state=42
)

# For pure leave-one-out instead:
# cv_strategy = LeaveOneOut()

# =========================================
# LOGGING
# =========================================
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# =========================================
# MODEL REGISTRY
# =========================================
# Maps the class name given in the configuration to the estimator class, built
# from the regressors imported at the top of the notebook.
# NOTE(review): assumes the 'class' entries in config.yaml are the scikit-learn
# class names (e.g. "Ridge") — confirm against the config file.
MODEL_REGISTRY = {
    estimator_cls.__name__: estimator_cls
    for estimator_cls in (
        Ridge,
        Lasso,
        ElasticNet,
        SVR,
        KNeighborsRegressor,
        DecisionTreeRegressor,
        MLPRegressor,
    )
}

# One summary row per model, and the refitted best pipeline per model name.
results = []
best_estimators = {}

for name, model_config in config['models_and_params'].items():

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    logger.info(f"Optimizando modelo: {name}")

    # -------- Read this model's configuration ----------
    model_class_name = model_config['model']['class']
    # An empty `params:` entry in YAML loads as None; treat it as "no arguments".
    model_params = model_config['model'].get('params') or {}
    param_grid = model_config['param_grid']

    if model_class_name not in MODEL_REGISTRY:
        raise KeyError(
            f"Modelo '{model_class_name}' no registrado en MODEL_REGISTRY; "
            f"disponibles: {sorted(MODEL_REGISTRY)}"
        )
    model = MODEL_REGISTRY[model_class_name](**model_params)

    # -------- Scaling + model pipeline ----------
    # The scaler lives inside the pipeline, so it is refitted on the training
    # part of every CV fold. memory=None means no transformer caching; pass a
    # directory path to enable it.
    pipeline = Pipeline(
        steps=[
            ('scaler', StandardScaler()),
            ('model', model)
        ],
        memory=None
    )

    # -------- Randomized search ----------
    # Cap the number of sampled candidates at the grid size. int() keeps n_iter
    # a plain Python int: np.prod returns a NumPy scalar (a float for an empty
    # grid).
    grid_size = int(np.prod([len(values) for values in param_grid.values()]))
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_grid,
        n_iter=min(40, grid_size),
        cv=cv_strategy,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        random_state=42,
        verbose=0
    )

    search.fit(X_train, y_train)

    best_pipeline = search.best_estimator_
    best_estimators[name] = best_pipeline

    # -------- Evaluation on the held-out test set ----------
    y_pred = best_pipeline.predict(X_test)

    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Strip the pipeline step prefix so the report shows bare parameter names.
    clean_params = {
        k.replace('model__', ''): v
        for k, v in search.best_params_.items()
    }

    elapsed = time.perf_counter() - start_time

    results.append({
        'Modelo': name,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Mejores Hiperparámetros': clean_params,
        'Tiempo (s)': round(elapsed, 2)
    })

    logger.info(f"{name} terminado en {elapsed:.2f} segundos")


# =========================================
# Results table, best (lowest test RMSE) first
# =========================================
results_df = (
    pd.DataFrame(results)
      .sort_values("RMSE")
      .reset_index(drop=True)
)

results_df