# Import Required Libraries
Import all necessary libraries, including pandas, numpy, sklearn, and others.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import sys

# Define Utility Functions
Define the helper functions `load_and_merge_data`, `create_preprocessor`, `preprocess_and_split`, and `train_predict_evaluate` used by the main workflow below.

In [2]:
# Define the `load_and_merge_data` function
def load_and_merge_data(features_path, labels_path):
    """
    Read the feature and label CSV files and join them into one DataFrame.

    The features file is read with `week_start_date` parsed as dates. Both
    frames are merged on `city`, `year` and `weekofyear`, and the result is
    ordered by `city` and then by `week_start_date`.

    Returns the merged DataFrame, or None (after printing the error) when a
    file cannot be found.
    """
    join_keys = ['city', 'year', 'weekofyear']
    sort_keys = ['city', 'week_start_date']

    try:
        features = pd.read_csv(features_path, parse_dates=['week_start_date'])
        labels = pd.read_csv(labels_path)
    except FileNotFoundError as e:
        # Report the missing file and signal the failure to the caller with None
        print(f"Error: {e}")
        return None

    return features.merge(labels, on=join_keys).sort_values(by=sort_keys)

# Define the `create_preprocessor` function
def create_preprocessor(df):
    """
    Build the ColumnTransformer that prepares numeric and categorical columns.

    Numeric columns of `df` (except `year`, `weekofyear` and `total_cases`)
    are median-imputed and standardized; `city` is one-hot encoded, ignoring
    categories not seen during fitting. Every other column is passed through
    untouched.

    Returns the unfitted ColumnTransformer.
    """
    excluded = ['year', 'weekofyear', 'total_cases']
    numeric_features = [
        name
        for name in df.select_dtypes(include=np.number).columns.tolist()
        if name not in excluded
    ]
    categorical_features = ['city']

    # Numeric branch: fill gaps with the median, then scale
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    # Categorical branch: one-hot encoding only
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_steps = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
    return ColumnTransformer(transformers=column_steps, remainder='passthrough')

# Define the `preprocess_and_split` function
def preprocess_and_split(df, preprocessor, target_col='total_cases', test_size=0.2, random_state=42):
    """
    Add calendar features, log-transform the target and split into train/test sets.

    The input DataFrame is left unmodified: the calendar columns are added to
    a new feature frame rather than written back into `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime `week_start_date` column and `target_col`.
    preprocessor : transformer
        Object exposing `fit_transform` / `transform`; it is fitted in place
        on the training split only.
    target_col : str
        Name of the target column; it is transformed with `log1p`.
    test_size : float
        Forwarded to `train_test_split`.
    random_state : int
        Forwarded to `train_test_split`.

    Returns
    -------
    tuple
        (X_train_processed, X_test_processed, y_train, y_test), where the
        targets are on the log1p scale.

    NOTE(review): the split is shuffled, so neighbouring weeks can land in
    both the training and the test set. Confirm this is acceptable for
    time-ordered data before trusting the reported score.
    """
    week_start = df['week_start_date']

    # drop() returns a new frame and assign() appends the calendar columns to
    # it, so the caller's DataFrame never gains the derived columns.
    X = df.drop(columns=[target_col, 'week_start_date']).assign(
        month=week_start.dt.month,
        day=week_start.dt.day,
        day_of_week=week_start.dt.dayofweek,
    )

    # Model the target on the log1p scale
    y = np.log1p(df[target_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True
    )

    # Fit on the training split only, then reuse the fitted transformer on the test split
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    return X_train_processed, X_test_processed, y_train, y_test

# Define the `train_predict_evaluate` function
def train_predict_evaluate(X_train, X_test, y_train, y_test):
    """
    Tune a RandomForestRegressor with grid search, predict on the test set and report the MAE.

    `y_train` and `y_test` are treated as log1p-transformed counts: both the
    predictions and `y_test` are mapped back with `expm1` before scoring.

    Parameters
    ----------
    X_train, X_test : array-like
        Preprocessed feature matrices.
    y_train, y_test : array-like
        Targets on the log1p scale.

    Returns
    -------
    tuple
        (model, mae): the best estimator found by the grid search and the
        mean absolute error on the original scale. Earlier versions returned
        nothing, so callers that ignore the result are unaffected.
    """
    # Hyperparameter grid explored by the search
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 15, 20],
        'min_samples_leaf': [1, 5, 10],
        'max_features': ['sqrt', 'log2']
    }

    # 3-fold cross-validated search scored by (negated) mean absolute error
    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
        param_grid=param_grid,
        cv=3,
        scoring='neg_mean_absolute_error',
        verbose=1
    )

    grid_search.fit(X_train, y_train)
    model = grid_search.best_estimator_

    # Undo the log1p transform, round to whole cases and clip at zero
    predictions_log = model.predict(X_test)
    predictions = np.maximum(0, np.expm1(predictions_log).round().astype(int))
    y_test_original = np.expm1(y_test)

    mae = mean_absolute_error(y_test_original, predictions)
    print(f'\nMean Absolute Error (MAE): {mae:.4f}')

    # Put the error in context with the distribution of the test targets
    stats = pd.Series(y_test_original).describe()
    print("\nEstadísticas de 'total_cases' en el conjunto de prueba:")
    print(stats)
    print(f"\nMAE como % de la Media: {(mae / stats['mean']) * 100:.2f}%")

    # Hand the tuned model and its score back so they can be reused or inspected
    return model, mae

# Main Workflow - Load and Merge Data
Load the datasets using the `load_and_merge_data` function and display the first rows of the merged DataFrame.

In [3]:
# Locations of the training features and labels
FEATURES_FILE = 'dengue_features_train.csv'
LABELS_FILE = 'dengue_labels_train.csv'

merged_df = load_and_merge_data(FEATURES_FILE, LABELS_FILE)

# Preview the first rows, or report that loading failed
if merged_df is None:
    print("Error in loading data.")
else:
    display(merged_df.head())

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
936,iq,2000,26,2000-07-01,0.192886,0.132257,0.340886,0.2472,25.41,296.74,...,92.418571,25.41,16.651429,8.928571,26.4,10.775,32.5,20.7,3.0,0
937,iq,2000,27,2000-07-08,0.216833,0.2761,0.289457,0.241657,60.61,296.634286,...,93.581429,60.61,16.862857,10.314286,26.9,11.566667,34.0,20.8,55.6,0
938,iq,2000,28,2000-07-15,0.176757,0.173129,0.204114,0.128014,55.52,296.415714,...,95.848571,55.52,17.12,7.385714,26.8,11.466667,33.0,20.7,38.1,0
939,iq,2000,29,2000-07-22,0.227729,0.145429,0.2542,0.200314,5.6,295.357143,...,87.234286,5.6,14.431429,9.114286,25.766667,10.533333,31.5,14.7,30.0,0
940,iq,2000,30,2000-07-29,0.328643,0.322129,0.254371,0.361043,62.76,296.432857,...,88.161429,62.76,15.444286,9.5,26.6,11.48,33.3,19.1,4.0,0


# Main Workflow - Create Preprocessor
Create the preprocessor using the `create_preprocessor` function and display its configuration.

In [4]:
# Build the preprocessor from the feature columns only (target and date removed)
if merged_df is None:
    print("Merged DataFrame is not available.")
else:
    preprocessor = create_preprocessor(
        merged_df.drop(columns=['total_cases', 'week_start_date'])
    )
    # Show how the transformer is configured
    print(preprocessor)

ColumnTransformer(remainder='passthrough',
                  transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
                                  'precipitation_amt_mm',
                                  'reanalysis_air_temp_k',
                                  'reanalysis_avg_temp_k',
                                  'reanalysis_dew_point_temp_k',
                                  'reanalysis_max_air_temp_k',
                                  'reanalysis_min_air_temp_k',
                                  'reanalysis_precip_amt_kg_per_m2',
                                  'reanalysis_relative_humidity_percent',
                                  'reanalysis_sat_precip_amt_mm',
                                  'r

# Main Workflow - Preprocess and Split Data
Preprocess the data and split it into training and testing sets using the `preprocess_and_split` function.

In [5]:
# Engineer features, split into train/test and apply the preprocessor
if merged_df is None:
    print("Merged DataFrame is not available for preprocessing and splitting.")
else:
    X_train, X_test, y_train, y_test = preprocess_and_split(merged_df, preprocessor)
    # Report the size of each resulting split
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")

X_train shape: (1164, 27)
X_test shape: (292, 27)
y_train shape: (1164,)
y_test shape: (292,)


# Main Workflow - Train, Predict, and Evaluate
Train the model, make predictions, and evaluate its performance using the `train_predict_evaluate` function.

In [6]:
# Fit the model on the training split and report its error on the test split
if merged_df is None:
    print("Data is not available for training, prediction, and evaluation.")
else:
    train_predict_evaluate(X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 36 candidates, totalling 108 fits

Mean Absolute Error (MAE): 13.2808

Estadísticas de 'total_cases' en el conjunto de prueba:
count    292.000000
mean      25.482877
std       41.732783
min        0.000000
25%        5.000000
50%       13.000000
75%       28.250000
max      410.000000
Name: total_cases, dtype: float64

MAE como % de la Media: 52.12%
