In [50]:
import csv

In [51]:
import pandas as pd

# Load the raw training features; the first CSV row supplies the column names
# and fields are comma-separated (both are the pandas defaults).
df_train = pd.read_csv("dengue_features_train.csv")

# Preview the first rows; as the last expression it is rendered as the cell output.
df_train.head()

# The matching labels file (dengue_labels_train.csv) is not loaded in this cell.

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import sys

def load_and_merge_data(features_path, labels_path):
    """
    Read the feature and label CSV files and join them into one DataFrame.

    Args:
        features_path: Path of the features CSV; its 'week_start_date' column
            is parsed as dates.
        labels_path: Path of the labels CSV.

    Returns:
        The rows of both files joined on 'city', 'year' and 'weekofyear',
        ordered by city and then by week start date, or None when either
        file cannot be found (a message listing both paths is printed).
    """
    join_keys = ['city', 'year', 'weekofyear']

    # Only the two reads can raise FileNotFoundError, so the guard is kept narrow.
    try:
        features = pd.read_csv(features_path, parse_dates=['week_start_date'])
        labels = pd.read_csv(labels_path)
    except FileNotFoundError:
        report = (
            "Error: No se encontraron los archivos en las rutas especificadas:",
            f"- {features_path}",
            f"- {labels_path}",
        )
        for line in report:
            print(line)
        return None

    joined = features.merge(labels, on=join_keys)
    return joined.sort_values(by=['city', 'week_start_date'])

def create_preprocessor(df):
    """
    Build the (unfitted) column-wise preprocessing step for the model.

    Args:
        df: DataFrame whose dtypes decide which columns count as numeric
            features. It is only inspected, never modified.

    Returns:
        A ColumnTransformer that
        - median-imputes and standardises every numeric column of `df` except
          'year', 'weekofyear' and 'total_cases',
        - one-hot encodes 'city' (categories unseen at fit time are ignored),
        - passes every other column through untouched.
    """
    # Numeric columns that must not go through the imputation/scaling branch.
    not_scaled = {'year', 'weekofyear', 'total_cases'}
    numeric_features = [
        name
        for name in df.select_dtypes(include=np.number).columns
        if name not in not_scaled
    ]
    categorical_features = ['city']

    # Numeric branch: fill gaps with the median, then standardise.
    impute_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical branch: one-hot encoding only.
    encode = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    return ColumnTransformer(
        transformers=[
            ('num', impute_then_scale, numeric_features),
            ('cat', encode, categorical_features),
        ],
        # Columns not listed above are kept as they are.
        remainder='passthrough',
    )

def _has_nan(matrix):
    """Return True when a dense array or a SciPy sparse matrix holds a NaN."""
    if hasattr(matrix, "tocsr"):
        # A ColumnTransformer can return a sparse matrix, which np.isnan does
        # not accept directly; only the explicitly stored values can be NaN.
        return bool(np.isnan(matrix.tocsr().data).any())
    return bool(np.isnan(matrix).any())


def preprocess_and_split(df, preprocessor, target_col='total_cases', test_size=0.2, random_state=42):
    """
    Derive calendar features, log-transform the target, split and preprocess.

    The input DataFrame is left untouched: the calendar columns are added to a
    derived frame, so the caller's frame looks the same before and after the
    call and a preprocessor built from it does not depend on call order.

    NOTE(review): the split shuffles rows, so test weeks are interleaved with
    training weeks. Confirm this is intended for time-ordered data.

    Args:
        df: DataFrame holding a datetime 'week_start_date' column, the target
            column and the feature columns.
        preprocessor: Object exposing fit(X) and transform(X); it is fitted on
            the training part only and then applied to both parts.
        target_col: Name of the column to predict.
        test_size: Forwarded to train_test_split.
        random_state: Forwarded to train_test_split.

    Returns:
        (X_train_processed, X_test_processed, y_train, y_test) where the y
        values are log1p of the target column.
    """
    # Calendar features derived from the week start date, added without
    # writing into the caller's DataFrame.
    week_start = df['week_start_date']
    enriched = df.assign(
        month=week_start.dt.month,
        day=week_start.dt.day,
        day_of_week=week_start.dt.dayofweek,
    )

    # log1p reduces the skew of the target and is defined for zero counts.
    y = np.log1p(enriched[target_col])

    # The raw date and the target are not model inputs.
    X = enriched.drop(columns=[target_col, 'week_start_date'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True
    )

    # Fit on the training part only so test statistics do not leak into it.
    preprocessor.fit(X_train)
    X_train_processed = preprocessor.transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Sanity check: imputation should have removed every missing value.
    if _has_nan(X_train_processed) or _has_nan(X_test_processed):
        print("Advertencia: NaNs detectados después del preprocesamiento.")

    return X_train_processed, X_test_processed, y_train, y_test

def train_predict_evaluate(X_train, X_test, y_train, y_test):
    """
    Tune a random forest with a grid search, predict the test set and report MAE.

    The targets are expected on the log1p scale; predictions and test targets
    are mapped back with expm1 before the error is computed. Everything is
    reported through print and nothing is returned.

    Args:
        X_train: Preprocessed training features.
        X_test: Preprocessed test features.
        y_train: Training targets on the log1p scale.
        y_test: Test targets on the log1p scale.
    """
    # Hyperparameter combinations explored by the grid search.
    search_space = {
        'n_estimators': [100, 200],
        'max_depth': [10, 15, 20],
        'min_samples_leaf': [1, 5, 10],
        'max_features': ['sqrt', 'log2'],
    }
    forest = RandomForestRegressor(random_state=42, n_jobs=-1)
    search = GridSearchCV(
        estimator=forest,
        param_grid=search_space,
        cv=3,
        scoring='neg_mean_absolute_error',
        verbose=1,
    )

    print("Iniciando GridSearchCV para optimización de hiperparámetros...")
    search.fit(X_train, y_train)
    print("Mejores parámetros encontrados:", search.best_params_)

    # The best estimator comes out of the search already fitted.
    best_forest = search.best_estimator_
    print("Entrenando RandomForestRegressor con los mejores parámetros...")
    log_scale_preds = best_forest.predict(X_test)

    # Back to case counts: undo log1p, round to whole cases, never below zero.
    case_preds = np.maximum(0, np.expm1(log_scale_preds).round().astype(int))
    observed_cases = np.expm1(y_test)

    mae = mean_absolute_error(observed_cases, case_preds)
    print(f'\nMean Absolute Error (MAE): {mae:.4f}')

    # Descriptive statistics of the observed counts put the MAE in context.
    print("\nContexto para el MAE (Estadísticas de 'total_cases' en el conjunto de prueba):")
    summary = pd.Series(observed_cases).describe()
    report_rows = (
        ("- Media:      ", 'mean'),
        ("- Mediana:    ", '50%'),
        ("- Desv. Est.: ", 'std'),
        ("- Mínimo:     ", 'min'),
        ("- Máximo:     ", 'max'),
    )
    for label, key in report_rows:
        print(f"{label}{summary[key]:.4f}")

    # A relative error is undefined when the mean count is exactly zero.
    if summary['mean'] == 0:
        print("\n- No se puede calcular MAE como % de la Media (la media es 0).")
    else:
        relative_mae = (mae / summary['mean']) * 100
        print(f"\n- MAE como % de la Media: {relative_mae:.2f}%")

# --- Main flow ---
if __name__ == "__main__":
    FEATURES_FILE = 'dengue_features_train.csv'
    LABELS_FILE = 'dengue_labels_train.csv'

    # 1. Load both CSV files and join them into a single frame.
    merged_df = load_and_merge_data(FEATURES_FILE, LABELS_FILE)

    # Guard clause: nothing below can run without the data.
    if merged_df is None:
        print("Proceso detenido debido a error en la carga de archivos.")
        sys.exit(1)

    # 2. Build the preprocessor from the feature columns only
    #    (target and raw date removed).
    preprocessor = create_preprocessor(
        merged_df.drop(columns=['total_cases', 'week_start_date'])
    )

    # 3. Derive features, split, and fit/apply the preprocessor.
    X_train, X_test, y_train, y_test = preprocess_and_split(merged_df, preprocessor)

    # 4. Tune, predict and report the error.
    train_predict_evaluate(X_train, X_test, y_train, y_test)



Iniciando GridSearchCV para optimización de hiperparámetros...
Fitting 3 folds for each of 36 candidates, totalling 108 fits
Mejores parámetros encontrados: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100}
Entrenando RandomForestRegressor con los mejores parámetros...

Mean Absolute Error (MAE): 13.2808

Contexto para el MAE (Estadísticas de 'total_cases' en el conjunto de prueba):
- Media:      25.4829
- Mediana:    13.0000
- Desv. Est.: 41.7328
- Mínimo:     0.0000
- Máximo:     410.0000

- MAE como % de la Media: 52.12%


  _data = np.array(data, dtype=dtype, copy=copy,
