In [7]:
import pandas as pd
import os

df = pd.read_csv('../data/03_casos/df_con_variables_sub_reg_nt_50-all.csv')
df


Unnamed: 0,departamento,provincia,distrito,ano,semana,sub_reg_nt,ubigeo,ira_no_neumonia,neumonias_men5,neumonias_60mas,...,lag_3_semanas_defunciones_men5,lag_4_semanas_defunciones_men5,lag_5_semanas_defunciones_men5,lag_6_semanas_defunciones_men5,lag_1_semanas_defunciones_60mas,lag_2_semanas_defunciones_60mas,lag_3_semanas_defunciones_60mas,lag_4_semanas_defunciones_60mas,lag_5_semanas_defunciones_60mas,lag_6_semanas_defunciones_60mas
0,LIMA,YAUYOS,ALLAUCA,2021,47,50,151003,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,LIMA,YAUYOS,ALLAUCA,2021,52,50,151003,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LIMA,YAUYOS,ALLAUCA,2022,43,50,151003,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,LIMA,YAUYOS,ALLAUCA,2022,44,50,151003,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,LIMA,YAUYOS,ALLAUCA,2023,14,50,151003,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54294,LIMA,YAUYOS,YAUYOS,2022,37,50,151001,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54295,LIMA,YAUYOS,YAUYOS,2023,10,50,151001,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54296,LIMA,CA�ETE,ZU�IGA,2005,24,50,150516,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54297,LIMA,CA�ETE,ZU�IGA,2006,44,50,150516,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Target
y = df["hospitalizados_men5"]

# Features (drop target + identifiers that leak info)
X = df.drop(columns=["hospitalizados_men5", "ubigeo"])

# Define categorical and numeric features
categorical = ["departamento", "provincia", "distrito", "mes",
               "tiempo_mes", "tiempo_mes_anterior", "tiempo_estacion"]
numeric = [col for col in X.columns if col not in categorical]

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric)
    ]
)

# Model (Random Forest)
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

# Pipeline
pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)])

# Train/Test split by time
train = df[df["ano"] < 2017]
test = df[(df["ano"] >= 2017) & (df["ano"] < 2021)]

X_train, y_train = train.drop(columns=["hospitalizados_men5", "ubigeo"]), train["hospitalizados_men5"]
X_test, y_test = test.drop(columns=["hospitalizados_men5", "ubigeo"]), test["hospitalizados_men5"]

# Fit
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)

# Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")


MAE: 0.3320
RMSE: 0.7659
R²: 0.8035


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# =======================
# Target
# =======================
y = df["hospitalizados_men5"]

# Features (drop target + identifiers that leak info)
X = df.drop(columns=["hospitalizados_men5", "ubigeo"])

# Lag features
lag_features = [
    "lag_1_semanas_neumonias_men5",
    "lag_1_semanas_hospitalizados_men5",
    "lag_2_semanas_neumonias_men5",
    "lag_3_semanas_neumonias_men5",
    "lag_2_semanas_hospitalizados_men5",
    "lag_4_semanas_neumonias_men5",
    "ira_no_neumonia",
    "lag_2_semanas_ira_no_neumonia",
    "lag_semana_anterior_ira_no_neumonia",
    "lag_3_semanas_hospitalizados_men5",
    "lag_3_semanas_ira_no_neumonia",
    "lag_5_semanas_neumonias_men5",
    "lag_4_semanas_hospitalizados_men5"
]

# Define categorical and numeric features
categorical = ["departamento", "provincia", "distrito", "mes",
               "tiempo_mes", "tiempo_mes_anterior", "tiempo_estacion"]

numeric = [col for col in X.columns if col not in categorical]
numeric = list(set(numeric + lag_features))  # aseguramos que los lags estén incluidos

# =======================
# Preprocessor
# =======================
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric)
    ]
)

# =======================
# Model (Random Forest)
# =======================
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

# =======================
# Pipeline
# =======================
pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)])

# =======================
# Train/Test split by time
# =======================
train = df[df["ano"] < 2017]
test = df[(df["ano"] >= 2017) & (df["ano"] < 2021)]

X_train, y_train = train.drop(columns=["hospitalizados_men5", "ubigeo"]), train["hospitalizados_men5"]
X_test, y_test = test.drop(columns=["hospitalizados_men5", "ubigeo"]), test["hospitalizados_men5"]

# =======================
# Fit
# =======================
pipeline.fit(X_train, y_train)

# =======================
# Evaluate
# =======================
y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")


MAE: 0.3321
RMSE: 0.7660
R²: 0.8034
