In [1]:
# Indicador para saber si se está trabajando en Google Drive
is_Drive = False

# Ruta base donde se guardarán o cargarán los archivos
base_path = ''

# Si estamos trabajando en Google Drive, ejecutar este bloque
if (is_Drive):
    from google.colab import drive, files # type: ignore
    drive.mount('/content/drive/', force_remount=True)
    base_path = '/content/drive/MyDrive/Machine_Learning_Para_Sistemas_Inteligentes/'

## Load the data


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Para gráficos estadísticos
from jinja2 import Environment, BaseLoader # Para generación de informes
import os
import re

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.dummy import DummyRegressor

In [3]:
print("Cargando datos...")
df = pd.read_csv(base_path + "train.csv")
df_raw = df.copy()
print(df.shape)
display(df.head())

Cargando datos...
(16013, 15)


Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,30814163,"3 bedrooms, 2 bathrooms. Patio, terrace and BBQ!",157770305,Sara,Constitucion,-34.62142,-58.37754,Entire home/apt,2212,1,24,10-11-2019,2.19,4,337
1,32539509,Studio with balcony and gim @Palermo Hollywood,16133446,Luis & Florencia,Palermo,-34.57949,-58.43199,Entire home/apt,2691,2,0,,,52,179
2,36262352,Nice Niceto. Der Wohnung in Buenos aires,257784804,Diego,Palermo,-34.58298,-58.44265,Entire home/apt,1315,2,8,06-10-2019,1.64,1,22
3,1147359,"Soho Artist Studio, breakfast",4215940,Lilian,Palermo,-34.58937,-58.43274,Entire home/apt,2750,1,9,29-01-2019,0.12,6,343
4,26470465,Perfect get-away in Buenos Aires,166597104,Mariangeles,Retiro,-34.59448,-58.37936,Entire home/apt,1076,1,6,10-05-2019,0.42,2,339


In [4]:
print(df.info())
print(df.describe())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16013 entries, 0 to 16012
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              16013 non-null  int64  
 1   name                            16005 non-null  object 
 2   host_id                         16013 non-null  int64  
 3   host_name                       16000 non-null  object 
 4   neighbourhood                   16013 non-null  object 
 5   latitude                        16013 non-null  float64
 6   longitude                       16013 non-null  float64
 7   room_type                       16013 non-null  object 
 8   price                           16013 non-null  int64  
 9   minimum_nights                  16013 non-null  int64  
 10  number_of_reviews               16013 non-null  int64  
 11  last_review                     11463 non-null  object 
 12  reviews_per_month               

In [5]:
# --- Tratamiento de valores faltantes ---
# imputar reviews_per_month con 0 si está vacío
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# cambiamos last_review a datetime y creamos feature temporal days_since_last_review
df["last_review"] = pd.to_datetime(df["last_review"], errors="coerce")
df["days_since_last_review"] = (pd.Timestamp("today") - df["last_review"]).dt.days
df["days_since_last_review"] = df["days_since_last_review"].fillna(df["days_since_last_review"].max())

# --- Gestión de outliers ---
numeric_outlier_cols = [
    "price",
    "minimum_nights",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
    "days_since_last_review"
]

for col in numeric_outlier_cols:
    if col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        # Recortamos los valores extremos al rango [lower, upper]
        df[col] = df[col].clip(lower=lower, upper=upper)

# --- Eliminación de columnas irrelevantes ---
# eliminamos las columnas id name, host_name y host_id
cols_to_drop = ["id", "name", "host_name", "host_id"]
df = df.drop(columns=cols_to_drop, errors="ignore")

# --- Transformación de variables ---
# log transform al precio para estabilizar varianza
df["price"] = np.log1p(df["price"])


### Gráfica de los Outliers

In [6]:
# --- Variante IQR ---
df_iqr = df.copy()

df_iqr["price"] =  np.expm1(df["price"])  # revertimos log transform para calcular IQR

# --- Variante sin tratamiento de outliers ---
df_no_outliers = df_raw.copy()

# imputar reviews_per_month con 0 si está vacío
df_no_outliers["reviews_per_month"] = df_no_outliers["reviews_per_month"].fillna(0)

# cambiamos last_review a datetime
df_no_outliers["last_review"] = pd.to_datetime(df_no_outliers["last_review"], errors="coerce")

# cambiamos last_review a datetime y creamos feature temporal days_since_last_review
df_no_outliers["days_since_last_review"] = (pd.Timestamp("today") - df_no_outliers["last_review"]).dt.days
df_no_outliers["days_since_last_review"] = df_no_outliers["days_since_last_review"].fillna(
    df_no_outliers["days_since_last_review"].max()
)

# --- Variante Winsorized ---
df_wins = df_raw.copy()
df_wins["days_since_last_review"] = df_no_outliers["days_since_last_review"]

for col in numeric_outlier_cols:
    if col in df_wins.columns:
        lower = df_wins[col].quantile(0.01)
        upper = df_wins[col].quantile(0.99)
        df_wins[col] = df_wins[col].clip(lower=lower, upper=upper)


In [7]:
def smart_format(v):
    """Formatea v según el patrón solicitado."""

    if pd.isna(v):
        return ""

    s = f"{v:.4f}"       # siempre 4 decimales para empezar
    if "." not in s:
        return s

    entero, dec = s.split(".")

    # Caso 0000 → sin decimales
    if dec == "0000":
        return entero

    # Caso abcd → elegir según los ceros finales
    # dec = a b c d
    if dec.endswith("000"):
        return entero + "." + dec[:1]  # 1 decimal
    if dec.endswith("00"):
        return entero + "." + dec[:2]  # 2 decimales
    if dec.endswith("0"):
        return entero + "." + dec[:3]  # 3 decimales

    # Si ninguno de los casos: todos los decimales significativos
    return entero + "." + dec

def latex_escape_header(s: str) -> str:
    """Escapa encabezados y agrega allowbreak en nombres largos."""
    s = str(s)
    s = s.replace("_", r"\_\allowbreak ")
    return r"\texttt{" + s + "}"

def latex_escape_row_label(s: str) -> str:
    """Escapa etiquetas de fila."""
    return str(s).replace("_", r"\_").replace("%", r"\%")

def generar_tabla_descriptiva_latex(
    df,
    cols,
    percentiles,
    output_path,
    filename,
    caption,
    label,
    incluir_varianza=True
):
    """
    Genera una tabla LaTeX en formato ajustado (adjustbox + booktabs)
    basada en describe() + varianza.
    """

    os.makedirs(output_path, exist_ok=True)

    # --- describe ---
    desc = df[cols].describe(percentiles=percentiles)

    # --- varianza ---
    if incluir_varianza:
        variance = df[cols].var()
        variance.name = "variance"
        desc = pd.concat([desc, variance.to_frame().T])

    # --- columnas ---
    cols_escaped = [latex_escape_header(c) for c in desc.columns]

    # --- filas ---
    rows = []
    for idx in desc.index:
        row_label = latex_escape_row_label(idx)
        values = []
        for c in desc.columns:
            v = desc.loc[idx, c]
            if pd.isna(v):
                values.append("")
            else:
                values.append(smart_format(v))
        rows.append((row_label, values))

    # --- template Jinja2 CORREGIDO ---
    template_str = r"""
\begin{table}[H]
    \centering
    \setlength{\tabcolsep}{3pt}
    \begin{scriptsize}
    \begin{adjustbox}{width=\textwidth}
    \begin{tabular}{l{% for _ in cols %}r{% endfor %}}
        \toprule
        & {% for col in cols %}{{ col }}{% if not loop.last %} & {% endif %}{% endfor %} \\
        \midrule
        {% for row_label, values in rows -%}
        {{ row_label }} & {% for val in values %}{{ val }}{% if not loop.last %} & {% endif %}{% endfor %} \\
        {% endfor %}
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{scriptsize}
    \caption{ {{ caption }} }
    \label{ {{ label }} }
\end{table}
"""

    env = Environment(loader=BaseLoader(), autoescape=False)
    template = env.from_string(template_str)

    latex_table = template.render(
        cols=cols_escaped,
        rows=rows,
        caption=caption,
        label=label,
    )

    latex_table = re.sub(r'\\label\{\s*(.*?)\s*\}', r'\\label{\1}', latex_table)

    filepath = os.path.join(output_path, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(latex_table)

    print("Archivo LaTeX generado:", filepath)


In [49]:
generar_tabla_descriptiva_latex(
    df=df_no_outliers,
    cols=numeric_outlier_cols,
    percentiles=[0, 0.15, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99],
    output_path="Informe/tex/tables",
    filename="df_no_outliers_desc.tex",
    caption=r"Estadísticos descriptivos de las variables numéricas (df\_no\_outliers).",
    label=r"tab:df-no-outliers-desc"
)

generar_tabla_descriptiva_latex(
    df=df,
    cols=numeric_outlier_cols,
    percentiles=[0, 0.15, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99],
    output_path="Informe/tex/tables",
    filename="df_iqr_desc.tex",
    caption=r"Estadísticos descriptivos de las variables numéricas tras el tratamiento de outliers con IQR (además de aplicar \lstinline[style=python]{np.expm1} a \texttt{price}).",
    label=r"tab:df-iqr-desc-log"
)

generar_tabla_descriptiva_latex(
    df=df_iqr,
    cols=numeric_outlier_cols,
    percentiles=[0, 0.15, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99],
    output_path="Informe/tex/tables",
    filename="df_iqr_desc_no_log.tex",
    caption=r"Estadísticos descriptivos de las variables numéricas tras el tratamiento de outliers con IQR sin aplicar transformaciones adicionales.",
    label=r"tab:df-iqr-desc-no-log"
)


Archivo LaTeX generado: Informe/tex/tables/df_no_outliers_desc.tex
Archivo LaTeX generado: Informe/tex/tables/df_iqr_desc.tex
Archivo LaTeX generado: Informe/tex/tables/df_iqr_desc_no_log.tex


In [9]:
# Crear carpeta si no existe
output_dir = "Informe/img/histograma"
os.makedirs(output_dir, exist_ok=True)

for col in numeric_outlier_cols:
    plt.figure(figsize=(6,4))
    sns.histplot(df_no_outliers[col], bins=50, kde=True)
    plt.title(f"Histograma de {col}")

    # ruta del archivo
    filename = f"{col}_histograma_bins50.png"
    filepath = os.path.join(output_dir, filename)

    # guardar figura
    plt.savefig(filepath, dpi=300, bbox_inches="tight")
    plt.close()   # cerrar figura para evitar acumulación

    print(f"Guardado: {filepath}")

Guardado: Informe/img/histograma/price_histograma_bins50.png
Guardado: Informe/img/histograma/minimum_nights_histograma_bins50.png
Guardado: Informe/img/histograma/number_of_reviews_histograma_bins50.png
Guardado: Informe/img/histograma/reviews_per_month_histograma_bins50.png
Guardado: Informe/img/histograma/calculated_host_listings_count_histograma_bins50.png
Guardado: Informe/img/histograma/availability_365_histograma_bins50.png
Guardado: Informe/img/histograma/days_since_last_review_histograma_bins50.png


In [10]:
def plot_percentiles_all_methods(
    numeric_cols,
    dfs,
    labels,
    percentiles=None,
    output_dir="Informe/img/percentiles"
):
    """
    Genera y guarda gráficos de percentiles para cada columna y cada método.
    El archivo se guarda como:
    img/percentiles/{col}_percentiles_max=YYY_min=XXX.png
    donde YYY y XXX son los percentiles máximo y mínimo evaluados.
    """

    # Crear carpeta si no existe
    os.makedirs(output_dir, exist_ok=True)

    if percentiles is None:
        percentiles = [90, 95, 97, 98, 99, 99.5, 99.9]

    q = [p / 100.0 for p in percentiles]

    for col in numeric_cols:

        if not any(col in df.columns for df in dfs):
            continue

        plt.figure(figsize=(10, 6))

        for df_tmp, label in zip(dfs, labels):

            if col not in df_tmp.columns:
                continue

            series = df_tmp[col].dropna()
            if series.empty:
                continue

            p_vals = series.quantile(q)
            plt.plot(percentiles, p_vals.values, marker="o", label=label)

        # Etiquetas
        plt.xlabel("Percentil")
        plt.ylabel(col)
        plt.title(f"Comparación de métodos en percentiles de {col}")
        plt.xticks(percentiles, [f"{p}%" for p in percentiles])
        plt.grid(True)
        plt.legend()
        plt.tight_layout()

        # Crear filename con el percentil máximo y mínimo (no los valores)
        max_p = max(percentiles)
        min_p = min(percentiles)

        filename = f"{col}_percentiles_max={max_p}_min={min_p}.png"
        # Si quisieras formato flotante fijo:
        # filename = f"{col}_percentiles_max={max_p:.1f}_min={min_p:.1f}.png"

        filepath = os.path.join(output_dir, filename)

        plt.savefig(filepath, dpi=300, bbox_inches="tight")
        plt.close()

        print(f"Guardado: {filepath}")


In [11]:
dfs = [df_no_outliers, df_wins, df_iqr]
labels = ["Original (sin tratamiento de outliers)", "Winsorize (1–99%)", "IQR"]

percentiles = [0,10, 25, 50, 75, 90, 95, 99]
plot_percentiles_all_methods(numeric_outlier_cols, dfs, labels, percentiles=percentiles)

plot_percentiles_all_methods(numeric_outlier_cols, dfs, labels, percentiles=[95, 96, 97, 98, 99, 99.5, 100])

plot_percentiles_all_methods(numeric_outlier_cols, dfs, labels, percentiles=[0, 0.5 , 1, 2, 3, 4, 5, 10 ,12])


Guardado: Informe/img/percentiles/price_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/minimum_nights_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/number_of_reviews_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/reviews_per_month_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/calculated_host_listings_count_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/availability_365_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/days_since_last_review_percentiles_max=99_min=0.png
Guardado: Informe/img/percentiles/price_percentiles_max=100_min=95.png
Guardado: Informe/img/percentiles/minimum_nights_percentiles_max=100_min=95.png
Guardado: Informe/img/percentiles/number_of_reviews_percentiles_max=100_min=95.png
Guardado: Informe/img/percentiles/reviews_per_month_percentiles_max=100_min=95.png
Guardado: Informe/img/percentiles/calculated_host_listings_count_percentiles_max=100_min=95.png
Guardado: Informe/

## Split the data

Split the DataFrame into training and validation sets (70% for training, 30% for validation).


In [12]:
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
print("Tamaños:", X_train.shape, X_val.shape)

Tamaños: (11209, 11) (4804, 11)


## Define models

In [13]:
# Identificamos variables numéricas y categóricas
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object"]).columns

# Definimos transformaciones
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ]
)

In [14]:
models = {
    "Dummy": DummyRegressor(strategy="mean"),
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(random_state=42),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "NeuralNetwork": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
}

## Train the models


In [36]:
# Exportador a LaTeX de los modelos

def export_df_to_latex(
    df,
    filename,
    caption,
    label,
    output_dir="Informe/tex/tables",
    float_format="%.4f",
    bigTable=False
):
    """
    Exporta un DataFrame a una tabla LaTeX con formato ajustado (adjustbox + booktabs).
    
    Parámetros
    ----------
    df : pd.DataFrame
        DataFrame a exportar.
    filename : str
        Nombre del archivo .tex a generar.
    caption : str
        Texto del caption LaTeX.
    label : str
        Label LaTeX para referencias cruzadas.
    output_dir : str
        Carpeta de salida donde se guardará el archivo .tex.
    float_format : str
        Formato para los valores numéricos.
    scriptsize : bool
        Si True, envuelve la tabla en un entorno scriptsize.
    """

    # Crear carpeta si no existe
    os.makedirs(output_dir, exist_ok=True)

    latex_path = os.path.join(output_dir, filename)

    # Convertir DataFrame a LaTeX (estructura interna)
    # escape=True asegura que caracteres como _ se conviertan en \\_
    latex_table = df.to_latex(
        index=False,
        float_format=float_format,
        caption=None,
        label=None,
        longtable=False,
        escape=True,
    )

    if not bigTable:
       latex_wrapped = rf"""
\begin{{table}}[H]
\centering
{latex_table}
\caption{{{caption}}}
\label{{{label}}}
\end{{table}}
"""
    else: 
        # Envolver en entorno table + adjustbox
        start_scriptsize = "    \\begin{scriptsize}\n"
        end_scriptsize = "    \\end{scriptsize}\n"

        latex_wrapped = rf"""
\begin{{table}}[H]
    \centering
    \setlength{{\tabcolsep}}{{4pt}}
    {start_scriptsize}\begin{{adjustbox}}{{width=\textwidth}}
    {latex_table}
    \end{{adjustbox}}
{end_scriptsize}    \caption{{{caption}}}
    \label{{{label}}}
\end{{table}}
"""

    # Guardar archivo
    with open(latex_path, "w", encoding="utf-8") as f:
        f.write(latex_wrapped)

    print(f"Archivo LaTeX generado: {latex_path}")


In [58]:
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    results.append({"Modelo": name, "rmse": rmse, "MSE": mse, "MAE": mae, "R2": r2})
    print(f"{name} -> rmse: {rmse:.3f}, MSE: {mse:.3f}, MAE: {mae:.3f}, R2: {r2:.3f}")

results_df = pd.DataFrame(results)
display(results_df)

export_df_to_latex(
    results_df,
    output_dir="Informe/tex/tables",
    filename="model_results_final.tex",
    caption="Resultados del primer barrido de modelos sobre la partición de validación (70/30).",
    label="tab:model-results-first-barrido"
)


Dummy -> rmse: 0.683, MSE: 0.467, MAE: 0.545, R2: -0.000
LinearRegression -> rmse: 0.544, MSE: 0.296, MAE: 0.420, R2: 0.366
DecisionTree -> rmse: 0.730, MSE: 0.533, MAE: 0.548, R2: -0.142
RandomForest -> rmse: 0.517, MSE: 0.268, MAE: 0.397, R2: 0.427
GradientBoosting -> rmse: 0.521, MSE: 0.271, MAE: 0.407, R2: 0.419
NeuralNetwork -> rmse: 0.539, MSE: 0.291, MAE: 0.420, R2: 0.378


Unnamed: 0,Modelo,rmse,MSE,MAE,R2
0,Dummy,0.683368,0.466991,0.545486,-8.1e-05
1,LinearRegression,0.54409,0.296034,0.420459,0.366032
2,DecisionTree,0.730309,0.533352,0.548152,-0.142194
3,RandomForest,0.517476,0.267781,0.397447,0.426536
4,GradientBoosting,0.521024,0.271466,0.407025,0.418644
5,NeuralNetwork,0.539006,0.290528,0.420262,0.377823


Archivo LaTeX generado: Informe/tex/tables/model_results_final.tex


## Evaluate the models

Evaluate the performance of models on the validation data using appropriate metrics.


In [17]:
#Regresión Lasso
lasso_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Lasso(max_iter=10000, random_state=42))
])

# Definimos los valores de alpha a probar (regularización)
param_grid_lasso = {
    "model__alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

# Buscamos la mejor combinación
grid_lasso = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=param_grid_lasso,
    scoring="neg_root_mean_squared_error",  # usamos rmse
    cv=5,
    n_jobs=-1
)

grid_lasso.fit(X_train, y_train)

# Resultados
print("Mejor alpha encontrado:", grid_lasso.best_params_)
print("Mejor rmse (CV):", -grid_lasso.best_score_)

# Tabla de resultados de GridSearch para Lasso
lasso_cv = pd.DataFrame(grid_lasso.cv_results_)

lasso_cv["rmse_mean"] = -lasso_cv["mean_test_score"]
lasso_cv["rmse_std"] = lasso_cv["std_test_score"]
lasso_cv["MSE_mean"] = lasso_cv["rmse_mean"] ** 2

lasso_summary = lasso_cv[[
    "param_model__alpha",
    "rmse_mean",
    "rmse_std",
    "MSE_mean"
]].sort_values(by="rmse_mean")

display(lasso_summary)

# Métricas con los mejores hiperparámetros
best_lasso = grid_lasso.best_estimator_
y_pred_lasso = best_lasso.predict(X_val)

mse_lasso = mean_squared_error(y_val, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
mae_lasso = mean_absolute_error(y_val, y_pred_lasso)
r2_lasso = r2_score(y_val, y_pred_lasso)

print(f"Lasso (mejor modelo) -> rmse: {rmse_lasso:.3f}, MSE: {mse_lasso:.3f}, "
      f"MAE: {mae_lasso:.3f}, R2: {r2_lasso:.3f}")

Mejor alpha encontrado: {'model__alpha': 0.001}
Mejor rmse (CV): 0.5424295138653626


Unnamed: 0,param_model__alpha,rmse_mean,rmse_std,MSE_mean
0,0.001,0.54243,0.010499,0.29423
1,0.01,0.559584,0.011475,0.313134
2,0.1,0.634464,0.008338,0.402544
3,1.0,0.682922,0.007407,0.466383
4,10.0,0.682922,0.007407,0.466383
5,100.0,0.682922,0.007407,0.466383


Lasso (mejor modelo) -> rmse: 0.543, MSE: 0.295, MAE: 0.421, R2: 0.369


In [50]:
# ===== Tabla 1: Resultados de GridSearch =====
export_df_to_latex(
    df=lasso_summary,
    filename="Lasso_gridsearch_results.tex",
    caption="Resultados del GridSearchCV para la regresión Lasso.",
    label="tab:lasso-gridsearch"
)

Archivo LaTeX generado: Informe/tex/tables/Lasso_gridsearch_results.tex


In [19]:
#Regularización (Ridge)
ridge_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Ridge(max_iter=10000, random_state=42))
  ])

param_grid_ridge = {
    "model__alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

grid_ridge = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=param_grid_ridge,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1
)

grid_ridge.fit(X_train, y_train)

print("Mejor alpha:", grid_ridge.best_params_)
print("Mejor rmse:", -grid_ridge.best_score_)

ridge_cv = pd.DataFrame(grid_ridge.cv_results_)
ridge_cv["rmse_mean"] = -ridge_cv["mean_test_score"]
ridge_cv["rmse_std"] = ridge_cv["std_test_score"]
ridge_cv["MSE_mean"] = ridge_cv["rmse_mean"] ** 2

ridge_summary = ridge_cv[[
    "param_model__alpha",
    "rmse_mean",
    "rmse_std",
    "MSE_mean"
]].sort_values(by="rmse_mean")

display(ridge_summary)

best_ridge = grid_ridge.best_estimator_
y_pred_ridge = best_ridge.predict(X_val)

mse_ridge = mean_squared_error(y_val, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
mae_ridge = mean_absolute_error(y_val, y_pred_ridge)
r2_ridge = r2_score(y_val, y_pred_ridge)

print(f"Ridge (mejor modelo) -> rmse: {rmse_ridge:.3f}, MSE: {mse_ridge:.3f}, "
      f"MAE: {mae_ridge:.3f}, R2: {r2_ridge:.3f}")

Mejor alpha: {'model__alpha': 0.001}
Mejor rmse: 0.5408449660432584


Unnamed: 0,param_model__alpha,rmse_mean,rmse_std,MSE_mean
0,0.001,0.540845,0.011323,0.292513
1,0.01,0.540846,0.011319,0.292514
2,0.1,0.540852,0.011297,0.292521
3,1.0,0.540987,0.011174,0.292667
4,10.0,0.541496,0.010927,0.293218
5,100.0,0.544347,0.010918,0.296314


Ridge (mejor modelo) -> rmse: 0.544, MSE: 0.296, MAE: 0.420, R2: 0.366


In [51]:
export_df_to_latex(
    df=ridge_summary,
    filename="Ridge_gridsearch_results.tex",
    caption="Resultados del GridSearchCV para regresión Ridge.",
    label="tab:ridge-gridsearch"
)

Archivo LaTeX generado: Informe/tex/tables/Ridge_gridsearch_results.tex


In [21]:
optimized_models = {}

# --- Decision Tree ---
dt_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", DecisionTreeRegressor(random_state=42))
])

param_grid_dt = {
    "model__max_depth": [1, 2, 3, 5, 10, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

grid_dt = GridSearchCV(
    estimator=dt_pipe,
    param_grid=param_grid_dt,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1
)

grid_dt.fit(X_train, y_train)
optimized_models["DecisionTree"] = grid_dt.best_estimator_
print(f"✅ DecisionTree - mejor params: {grid_dt.best_params_}, rmse: {-grid_dt.best_score_:.3f}")

dt_cv = pd.DataFrame(grid_dt.cv_results_)
dt_cv["rmse_mean"] = -dt_cv["mean_test_score"]
dt_cv["rmse_std"] = dt_cv["std_test_score"]
dt_cv["MSE_mean"] = dt_cv["rmse_mean"] ** 2

dt_summary = dt_cv[[
    "param_model__max_depth",
    "param_model__min_samples_split",
    "param_model__min_samples_leaf",
    "rmse_mean",
    "rmse_std",
    "MSE_mean"
]].sort_values(by="rmse_mean")

display(dt_summary.head(10))  # por ejemplo, los 10 mejores

best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(X_val)

mse_dt = mean_squared_error(y_val, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
mae_dt = mean_absolute_error(y_val, y_pred_dt)
r2_dt = r2_score(y_val, y_pred_dt)

print(
    f"DecisionTree (mejor modelo en validación) -> "
    f"rmse: {rmse_dt:.3f}, MSE: {mse_dt:.3f}, MAE: {mae_dt:.3f}, R2: {r2_dt:.3f}"
)

✅ DecisionTree - mejor params: {'model__max_depth': 5, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2}, rmse: 0.552


Unnamed: 0,param_model__max_depth,param_model__min_samples_split,param_model__min_samples_leaf,rmse_mean,rmse_std,MSE_mean
34,5,5,4,0.551545,0.012454,0.304202
33,5,2,4,0.551545,0.012454,0.304202
30,5,2,2,0.55161,0.012457,0.304274
31,5,5,2,0.55161,0.012457,0.304274
32,5,10,2,0.551624,0.012244,0.304289
35,5,10,4,0.551739,0.012281,0.304416
27,5,2,1,0.551871,0.012227,0.304562
28,5,5,1,0.551871,0.012227,0.304562
29,5,10,1,0.552166,0.011864,0.304888
44,10,10,4,0.564703,0.012516,0.318889


DecisionTree (mejor modelo en validación) -> rmse: 0.545, MSE: 0.297, MAE: 0.426, R2: 0.363


In [52]:
export_df_to_latex(
    df = dt_summary,   # solo los 10 mejores, o usa dt_summary completo
    filename = "DecisionTreeRegressor_gridsearch_results.tex",
    caption = "Resultados del GridSearchCV para el modelo Decision Tree Regressor.",
    label = "tab:decisiontreeregressor-gridsearch",
    bigTable = True
)

Archivo LaTeX generado: Informe/tex/tables/DecisionTreeRegressor_gridsearch_results.tex


In [23]:
# --- Random Forest ---
rf_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

param_grid_rf = {
    "model__n_estimators": [1, 50, 100, 200],
    "model__max_depth": [1, 3, 5, 10, 20, None],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2]
}

grid_rf = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid_rf,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1
)

grid_rf.fit(X_train, y_train)
optimized_models["RandomForest"] = grid_rf.best_estimator_
print(f"✅ RandomForest - mejor params: {grid_rf.best_params_}, rmse: {-grid_rf.best_score_:.3f}")

rf_cv = pd.DataFrame(grid_rf.cv_results_)
rf_cv["rmse_mean"] = -rf_cv["mean_test_score"]
rf_cv["rmse_std"] = rf_cv["std_test_score"]
rf_cv["MSE_mean"] = rf_cv["rmse_mean"] ** 2

rf_summary = rf_cv[[
    "param_model__n_estimators",
    "param_model__max_depth",
    "param_model__min_samples_split",
    "param_model__min_samples_leaf",
    "rmse_mean",
    "rmse_std",
    "MSE_mean"
]].sort_values(by="rmse_mean")

display(rf_summary.head(10))

best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_val)

mse_rf = mean_squared_error(y_val, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_val, y_pred_rf)
r2_rf = r2_score(y_val, y_pred_rf)

print(
    f"RandomForest (mejor modelo en validación) -> "
    f"rmse: {rmse_rf:.3f}, MSE: {mse_rf:.3f}, MAE: {mae_rf:.3f}, R2: {r2_rf:.3f}"
)

✅ RandomForest - mejor params: {'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 5, 'model__n_estimators': 200}, rmse: 0.524


Unnamed: 0,param_model__n_estimators,param_model__max_depth,param_model__min_samples_split,param_model__min_samples_leaf,rmse_mean,rmse_std,MSE_mean
63,200,10,5,2,0.524255,0.01113,0.274843
59,200,10,2,2,0.524268,0.011111,0.274857
51,200,10,2,1,0.524281,0.011328,0.274871
55,200,10,5,1,0.524292,0.011292,0.274882
50,100,10,2,1,0.524481,0.011583,0.275081
54,100,10,5,1,0.524535,0.011561,0.275137
58,100,10,2,2,0.52454,0.011248,0.275142
62,100,10,5,2,0.524571,0.011224,0.275174
61,50,10,5,2,0.52513,0.011191,0.275761
57,50,10,2,2,0.525177,0.011091,0.275811


RandomForest (mejor modelo en validación) -> rmse: 0.513, MSE: 0.263, MAE: 0.398, R2: 0.437


In [53]:
export_df_to_latex(
    df = rf_summary, 
    filename = "RandomForestRegressor_gridsearch_results.tex",
    caption = "Resultados del GridSearchCV para el modelo Random Forest Regressor.",
    label = "tab:randomforestregressor-gridsearch",
    bigTable = True
)

Archivo LaTeX generado: Informe/tex/tables/RandomForestRegressor_gridsearch_results.tex


In [25]:
# --- Gradient Boosting ---
gb_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(random_state=42))
])

param_grid_gb = {
    "model__n_estimators": [50, 100, 200],
    "model__learning_rate": [0.05, 0.1, 0.2],
    "model__max_depth": [1, 2, 3, 5, 7],
    "model__min_samples_split": [2, 5]
}

grid_gb = GridSearchCV(
    estimator=gb_pipe,
    param_grid=param_grid_gb,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1
)

grid_gb.fit(X_train, y_train)
optimized_models["GradientBoosting"] = grid_gb.best_estimator_
print(f"✅ GradientBoosting - mejor params: {grid_gb.best_params_}, rmse: {-grid_gb.best_score_:.3f}")

gb_cv = pd.DataFrame(grid_gb.cv_results_)
gb_cv["rmse_mean"] = -gb_cv["mean_test_score"]
gb_cv["rmse_std"] = gb_cv["std_test_score"]
gb_cv["MSE_mean"] = gb_cv["rmse_mean"] ** 2

gb_summary = gb_cv[[
    "param_model__n_estimators",
    "param_model__learning_rate",
    "param_model__max_depth",
    "param_model__min_samples_split",
    "rmse_mean",
    "rmse_std",
    "MSE_mean"
]].sort_values(by="rmse_mean")

display(gb_summary.head(10))

best_gb = grid_gb.best_estimator_
y_pred_gb = best_gb.predict(X_val)

mse_gb = mean_squared_error(y_val, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
mae_gb = mean_absolute_error(y_val, y_pred_gb)
r2_gb = r2_score(y_val, y_pred_gb)

print(
    f"GradientBoosting (mejor modelo en validación) -> "
    f"rmse: {rmse_gb:.3f}, MSE: {mse_gb:.3f}, MAE: {mae_gb:.3f}, R2: {r2_gb:.3f}"
)


✅ GradientBoosting - mejor params: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__min_samples_split': 5, 'model__n_estimators': 200}, rmse: 0.521


Unnamed: 0,param_model__n_estimators,param_model__learning_rate,param_model__max_depth,param_model__min_samples_split,rmse_mean,rmse_std,MSE_mean
23,200,0.05,5,5,0.521062,0.009689,0.271506
20,200,0.05,5,2,0.521295,0.010001,0.271749
49,100,0.1,5,2,0.521509,0.009175,0.271972
28,100,0.05,7,5,0.522077,0.009619,0.272565
52,100,0.1,5,5,0.522271,0.008837,0.272767
50,200,0.1,5,2,0.522347,0.008686,0.272846
25,100,0.05,7,2,0.522548,0.009772,0.273056
29,200,0.05,7,5,0.522761,0.009757,0.273279
53,200,0.1,5,5,0.522821,0.009115,0.273341
76,100,0.2,3,5,0.522856,0.009973,0.273378


GradientBoosting (mejor modelo en validación) -> rmse: 0.513, MSE: 0.263, MAE: 0.400, R2: 0.436


In [54]:
export_df_to_latex(
    df = rf_summary,   # solo los 10 mejores, o usa rf_summary completo
    filename = "GradientBoostingRegressor_gridsearch_results.tex",
    caption = "Resultados del GridSearchCV para el modelo Gradient Boosting.",
    label = "tab:GradientBoostingRegressor-gridsearch",
    bigTable = True
)

Archivo LaTeX generado: Informe/tex/tables/GradientBoostingRegressor_gridsearch_results.tex


In [27]:
# --- Neural Network (MLP) ---
mlp_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", MLPRegressor(max_iter=500, random_state=42, early_stopping=True))
])

param_grid_mlp = {
    "model__hidden_layer_sizes": [(64,), (128,64), (64,32)],
    "model__alpha": [0.0001, 0.001, 0.01],
    "model__learning_rate_init": [0.001, 0.01]
}

grid_mlp = GridSearchCV(
    estimator=mlp_pipe,
    param_grid=param_grid_mlp,
    scoring="neg_root_mean_squared_error",
    cv=3,
    n_jobs=-1
)

grid_mlp.fit(X_train, y_train)
optimized_models["NeuralNetwork"] = grid_mlp.best_estimator_
print(f"✅ NeuralNetwork - mejor params: {grid_mlp.best_params_}, rmse: {-grid_mlp.best_score_:.3f}")

mlp_cv = pd.DataFrame(grid_mlp.cv_results_)
mlp_cv["rmse_mean"] = -mlp_cv["mean_test_score"]
mlp_cv["rmse_std"] = mlp_cv["std_test_score"]
mlp_cv["MSE_mean"] = mlp_cv["rmse_mean"] ** 2

mlp_summary = mlp_cv[[
    "param_model__hidden_layer_sizes",
    "param_model__alpha",
    "param_model__learning_rate_init",
    "rmse_mean",
    "rmse_std",
    "MSE_mean"
]].sort_values(by="rmse_mean")

display(mlp_summary.head(10))

best_mlp = grid_mlp.best_estimator_
y_pred_mlp = best_mlp.predict(X_val)

mse_mlp = mean_squared_error(y_val, y_pred_mlp)
rmse_mlp = np.sqrt(mse_mlp)
mae_mlp = mean_absolute_error(y_val, y_pred_mlp)
r2_mlp = r2_score(y_val, y_pred_mlp)

print(
    f"NeuralNetwork (mejor modelo en validación) -> "
    f"rmse: {rmse_mlp:.3f}, MSE: {mse_mlp:.3f}, MAE: {mae_mlp:.3f}, R2: {r2_mlp:.3f}"
)

✅ NeuralNetwork - mejor params: {'model__alpha': 0.01, 'model__hidden_layer_sizes': (64,), 'model__learning_rate_init': 0.001}, rmse: 0.541


Unnamed: 0,param_model__hidden_layer_sizes,param_model__alpha,param_model__learning_rate_init,rmse_mean,rmse_std,MSE_mean
12,"(64,)",0.01,0.001,0.541132,0.008048,0.292824
0,"(64,)",0.0001,0.001,0.541264,0.007601,0.292967
6,"(64,)",0.001,0.001,0.541546,0.007332,0.293272
13,"(64,)",0.01,0.01,0.543828,0.00951,0.295749
4,"(64, 32)",0.0001,0.001,0.544007,0.008164,0.295944
10,"(64, 32)",0.001,0.001,0.544008,0.008252,0.295945
16,"(64, 32)",0.01,0.001,0.544228,0.008017,0.296184
8,"(128, 64)",0.001,0.001,0.544758,0.008063,0.296762
2,"(128, 64)",0.0001,0.001,0.544838,0.007734,0.296848
7,"(64,)",0.001,0.01,0.545197,0.010686,0.29724


NeuralNetwork (mejor modelo en validación) -> rmse: 0.535, MSE: 0.286, MAE: 0.415, R2: 0.388


In [55]:
export_df_to_latex(
    df = rf_summary,  
    filename = "NeuralNetwork_gridsearch_results.tex",
    caption = "Resultados del GridSearchCV para el modelo Neural Network.",
    label = "tab:NeuralNetwork-gridsearch",
    bigTable = True
)

Archivo LaTeX generado: Informe/tex/tables/NeuralNetwork_gridsearch_results.tex


In [56]:
all_models = {
    "Dummy": models["Dummy"],
    "LinearRegression": models["LinearRegression"],
    "LinearRegression_Lasso": grid_lasso.best_estimator_,
    "LinearRegression_Ridge": grid_ridge.best_estimator_,
    **optimized_models
}

In [30]:
print("Ejecutando validación cruzada (Cross-Validation) en todos los modelos...")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []

for name, model in all_models.items():
    print(f"→ {name}")

    # Si el modelo ya es un Pipeline (best_estimator_), lo usamos tal cual.
    # Si no, lo envolvemos con el preprocessor.
    pipe = model if isinstance(model, Pipeline) else Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # rmse NEGATIVO (sklearn lo devuelve negativo por convención de maximizar)
    scores_rmse_neg = cross_val_score(
        pipe, X, y,
        scoring="neg_root_mean_squared_error",
        cv=kf,
        n_jobs=-1
    )

    # Convertimos a rmse positivo
    scores_rmse = -scores_rmse_neg

    # A partir de rmse podemos obtener MSE por fold
    scores_mse = scores_rmse ** 2

    # MAE (negativo)
    scores_mae_neg = cross_val_score(
        pipe, X, y,
        scoring="neg_mean_absolute_error",
        cv=kf,
        n_jobs=-1
    )
    scores_mae = -scores_mae_neg

    # R2
    scores_r2 = cross_val_score(
        pipe, X, y,
        scoring="r2",
        cv=kf,
        n_jobs=-1
    )

    cv_results.append({
        "Modelo": name,
        "rmse_mean": np.mean(scores_rmse),
        "rmse_std": np.std(scores_rmse),
        "MSE_mean": np.mean(scores_mse),
        "MAE_mean": np.mean(scores_mae),
        "R2_mean": np.mean(scores_r2)
    })

cv_results_df = pd.DataFrame(cv_results).sort_values(by="rmse_mean")
display(cv_results_df)


Ejecutando validación cruzada (Cross-Validation) en todos los modelos...
→ Dummy
→ LinearRegression
→ LinearRegression_Lasso
→ LinearRegression_Ridge
→ DecisionTree
→ RandomForest
→ GradientBoosting
→ NeuralNetwork


Unnamed: 0,Modelo,rmse_mean,rmse_std,MSE_mean,MAE_mean,R2_mean
6,GradientBoosting,0.516694,0.006335,0.267013,0.402559,0.427397
5,RandomForest,0.517135,0.007516,0.267485,0.40145,0.426432
7,NeuralNetwork,0.534196,0.00587,0.2854,0.41679,0.387854
3,LinearRegression_Ridge,0.541587,0.005775,0.29335,0.421807,0.370851
1,LinearRegression,0.541591,0.005762,0.293354,0.421811,0.370842
2,LinearRegression_Lasso,0.542477,0.005674,0.294314,0.423378,0.368782
4,DecisionTree,0.548191,0.006997,0.300562,0.428031,0.355438
0,Dummy,0.683111,0.007253,0.466694,0.546833,-0.000754


In [59]:
export_df_to_latex(
    df=cv_results_df,
    filename="cv_all_models_results.tex",
    caption=r"Resultados de validación cruzada (5-fold) para todos los modelos.",
    label=r"tab:cv-all-models",
    bigTable=True
)

Archivo LaTeX generado: Informe/tex/tables/cv_all_models_results.tex


## Load the test data and save results

In [32]:
test_df = pd.read_csv(base_path + "test.csv")
display(test_df.head())

# Mismas transformaciones que en train
test_df["reviews_per_month"] = test_df["reviews_per_month"].fillna(0)
test_df["last_review"] = pd.to_datetime(test_df["last_review"], errors="coerce")
test_df["days_since_last_review"] = (pd.Timestamp("today") - test_df["last_review"]).dt.days
test_df["days_since_last_review"] = test_df["days_since_last_review"].fillna(df["days_since_last_review"].max())

if is_Drive:
    pred_folder = os.path.join(base_path, "pred")
    os.makedirs(pred_folder, exist_ok=True)
else:
    pred_folder = "pred"
    os.makedirs(pred_folder, exist_ok=True)

for name, model in all_models.items():
    print(f"Entrenando y guardando predicciones para {name}...")
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)]) if not isinstance(model, Pipeline) else model

    pipe.fit(X, y)
    test_pred = pipe.predict(test_df.drop(columns=["id"], errors="ignore"))
    test_pred = np.expm1(test_pred)  # revertimos log1p

    submission = pd.DataFrame({
        "id": test_df["id"],
        "price": test_pred
    })

    filename = f"pred_{name}.csv"
    save_path = os.path.join(pred_folder, filename)
    submission.to_csv(save_path, index=False)
    print(f"Guardado: {filename}")

print("Todas las predicciones fueron generadas y guardadas correctamente.")

Unnamed: 0,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,15843708,Monoambiente en Barrio Norte,19787638,Pablo,Recoleta,-34.59053,-58.40898,Entire home/apt,2,26,03-11-2019,0.9,1,74
1,9735218,Busco Roomate :),38507726,Angela,Palermo,-34.58633,-58.41312,Private room,1,0,,,1,0
2,35682605,Betty´s home,100972248,Cecilia,Balvanera,-34.59979,-58.3934,Entire home/apt,3,4,03-11-2019,2.67,5,270
3,9473906,Lovely Studio in Palermo,25602761,Mariano,Recoleta,-34.59395,-58.41423,Entire home/apt,3,43,21-11-2019,0.89,1,142
4,34155238,Cozy and comfortable apartment in Belgrano.,128766227,Maite,Colegiales,-34.56911,-58.45162,Entire home/apt,2,13,10-11-2019,2.12,1,241


Entrenando y guardando predicciones para Dummy...
Guardado: pred_Dummy.csv
Entrenando y guardando predicciones para LinearRegression...
Guardado: pred_LinearRegression.csv
Entrenando y guardando predicciones para LinearRegression_Lasso...
Guardado: pred_LinearRegression_Lasso.csv
Entrenando y guardando predicciones para LinearRegression_Ridge...
Guardado: pred_LinearRegression_Ridge.csv
Entrenando y guardando predicciones para DecisionTree...
Guardado: pred_DecisionTree.csv
Entrenando y guardando predicciones para RandomForest...
Guardado: pred_RandomForest.csv
Entrenando y guardando predicciones para GradientBoosting...
Guardado: pred_GradientBoosting.csv
Entrenando y guardando predicciones para NeuralNetwork...
Guardado: pred_NeuralNetwork.csv
Todas las predicciones fueron generadas y guardadas correctamente.
