# Instalación de las dependencias

In [None]:
!pip3 install --upgrade pip
!pip3 install -r requirements.txt

# Carga de utilidades personales

In [3]:
import requests  # HTTP client used to download the helper module below.

# Raw URL of the helper module in a GitHub Gist (the URL embeds a specific revision hash).
url = 'https://gist.githubusercontent.com/JMartinArocha/79e6f5c94ab6a8d3f0b2f57296395e76/raw/3d60d10fb336eb870cb03535929502bc8234abc9/ml_utilities.py'

# Download the file. The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() aborts on an HTTP error so that an error page is never written to disk
# and then imported as if it were Python source.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Save the module next to the notebook so it can be imported.
# The encoding is explicit so the result does not depend on the platform default.
with open('ml_utilities.py', 'w', encoding='utf-8') as f:
    f.write(r.text)

# NOTE(review): this executes code fetched over the network; keep the URL pinned to a
# reviewed revision.
import ml_utilities

# Carga de las librerías necesarias

In [4]:
from __future__ import print_function
from __future__ import division

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# just for the sake of this blog post!
from warnings import filterwarnings
filterwarnings('ignore')

# Usar GitHub para la importación del dataset

In [None]:
# URLs of the data hosted on GitHub
url_features_train = 'https://raw.githubusercontent.com/JMartinArocha/MasterBigData/main/Dengue/data/dengue_features_train.csv'
url_labels_train = 'https://raw.githubusercontent.com/JMartinArocha/MasterBigData/main/Dengue/data/dengue_labels_train.csv'
url_features_test = 'https://raw.githubusercontent.com/JMartinArocha/MasterBigData/main/Dengue/data/dengue_features_test.csv'  # defined but not loaded in this cell

# Load the data directly from GitHub.
# The first three CSV columns become a three-level index; its first level holds the
# 'sj' / 'iq' keys used for the per-city selection below.
train_features = pd.read_csv(url_features_train, index_col=[0,1,2])
train_labels = pd.read_csv(url_labels_train, index_col=[0,1,2])

# Separate data for San Juan.
# The feature frames are copied because later cells add columns to them; working on an
# independent copy avoids modifying (or warning about) a slice of `train_features`.
sj_train_features = train_features.loc['sj'].copy()
sj_train_labels = train_labels.loc['sj']

# Separate data for Iquitos
iq_train_features = train_features.loc['iq'].copy()
iq_train_labels = train_labels.loc['iq']

# Initial inspection of the four per-city datasets using the custom utility function.
# The second call previously received the combined `train_labels`; it now inspects the
# San Juan labels so that all four calls look at per-city frames.
ml_utilities.df_look(sj_train_features)
ml_utilities.df_look(sj_train_labels)
ml_utilities.df_look(iq_train_features)
ml_utilities.df_look(iq_train_labels)


# Preparación de los datos

In [14]:
# Attach the target column to each city's feature frame (values are aligned on the index),
# so features and target can be analysed and modelled together.
sj_train_features = sj_train_features.assign(total_cases=sj_train_labels.total_cases)
iq_train_features = iq_train_features.assign(total_cases=iq_train_labels.total_cases)

# Forward-fill missing values: each gap takes the previous row's value in the same column.
# One pass after the merge gives the same result as the former two `fillna(method='ffill')`
# passes (before and after the merge), because forward fill works column by column.
# `.ffill()` replaces the deprecated `fillna(method=...)` form and does not mutate in place.
sj_train_features = sj_train_features.ffill()
iq_train_features = iq_train_features.ffill()

# Normalize the datasets using the custom utility function.
# NOTE(review): `total_cases` is already part of the frame here, so the target may be
# rescaled as well, depending on what `normalize_dataset` does - confirm.
# NOTE(review): this cell rebinds its own inputs, so running it twice normalizes twice.
sj_train_features = ml_utilities.normalize_dataset(sj_train_features)
iq_train_features = ml_utilities.normalize_dataset(iq_train_features)


# Métodos gráficos para la selección de características

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Graphical methods for feature selection: one correlation heatmap per city.

# 'week_start_date' is removed first; it is treated as non-numeric and is not part of the
# correlation analysis.
_date_column = ["week_start_date"]
sj_correlations = sj_train_features.drop(columns=_date_column).corr()
iq_correlations = iq_train_features.drop(columns=_date_column).corr()

# San Juan heatmap. sns.heatmap returns the axes it drew on, so the title is set on it.
sj_corr_heat = sns.heatmap(data=sj_correlations)
sj_corr_heat.set_title('San Juan Variable Correlations')
plt.show()

# Iquitos heatmap, built the same way.
iq_corr_heat = sns.heatmap(data=iq_correlations)
iq_corr_heat.set_title('Iquitos Variable Correlations')
plt.show()


In [None]:
# San Juan: rank the features by their correlation with total dengue cases.
sj_case_correlations = (
    sj_correlations['total_cases']
    .drop('total_cases')             # exclude the target's correlation with itself
    .sort_values(ascending=False)    # order by correlation value, highest first
)

# Horizontal bar chart. The title names the city so the figure stands on its own next to
# the otherwise identical Iquitos chart.
sj_ranking_ax = sj_case_correlations.plot.barh()
sj_ranking_ax.set_title('San Juan: Feature Correlation with total_cases')
sj_ranking_ax.set_xlabel('Correlation with total_cases')
plt.show()

In [None]:
# Iquitos: rank the features by their correlation with total dengue cases.
iq_case_correlations = (
    iq_correlations['total_cases']
    .drop('total_cases')             # exclude the target's correlation with itself
    .sort_values(ascending=False)    # order by correlation value, highest first
)

# Horizontal bar chart. The title names the city so the figure stands on its own next to
# the otherwise identical San Juan chart.
iq_ranking_ax = iq_case_correlations.plot.barh()
iq_ranking_ax.set_title('Iquitos: Feature Correlation with total_cases')
iq_ranking_ax.set_xlabel('Correlation with total_cases')
plt.show()

In [18]:
# Pair plots for both cities, coloured by 'total_cases'.
# 'week_start_date' is dropped beforehand because it is non-numeric and is not plotted.
sj_pairplot_data = sj_train_features.drop(columns=['week_start_date'])
iq_pairplot_data = iq_train_features.drop(columns=['week_start_date'])

ml_utilities.generate_pairwise_pairplots(sj_pairplot_data, hue='total_cases')
ml_utilities.generate_pairwise_pairplots(iq_pairplot_data, hue='total_cases')

# Métodos no gráficos para la selección de características

In [None]:
# Import the univariate feature-selection tools used in this cell
from sklearn.feature_selection import SelectKBest, f_regression

# San Juan training frame (features plus the 'total_cases' target)
df = sj_train_features

# Separate the features and the target variable
X = df.drop(columns=['total_cases','week_start_date'])  # 'week_start_date' is non-numeric
y = df['total_cases']  # Target variable: total cases of dengue

# Score each feature against the target and keep the best 'k'.
# f_regression is used because the target is numeric and the models trained later are
# regressors; f_classif (used previously) is an ANOVA test for class labels and would
# treat every distinct target value as its own class.
selector = SelectKBest(f_regression, k=5)  # 'k=5' can be adjusted as needed

# Fit the selector and reduce the dataset to the selected features
X_new = selector.fit_transform(X, y)

# get_support() is a boolean mask over X's columns; use it to recover the selected names
KBest_selected_features = X.columns[selector.get_support()]

# Print the names of the selected features
print(KBest_selected_features)

# Split de datos Train, Test y Validation

In [20]:
# Columns kept for modeling: four predictor columns plus the target.
features = [
    'reanalysis_specific_humidity_g_per_kg',
    'reanalysis_dew_point_temp_k',
    'station_avg_temp_c',
    'station_min_temp_c',
    'total_cases',
]

# Restrict the San Juan frame to those columns
df = sj_train_features[features]

# Target (y) and predictors (X)
y = df['total_cases']
X = df.drop(columns=['total_cases'])

# First split: hold out 20% of the rows as the test set.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)

# Second split: take 25% of the remaining rows as the validation set
# (0.25 * 80% = 20% of all rows, leaving 60% for training).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=split_seed
)

# Result: three disjoint sets for training, validation and testing

# The result is three sets for training, validation, and testing purposes

# Entrenamiento - RandomForestRegressor, Cross validation, GridSearch y RandomSearch

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

# Base RandomForestRegressor; the seed is fixed so repeated runs give the same forests
model = RandomForestRegressor(random_state=42)

# Hyperparameter space for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    # Number of features to consider at every split. None means "all features"; it replaces
    # 'auto', which meant the same for regressors and is rejected by newer scikit-learn.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [4, 6, 8]  # Maximum number of levels in tree
}

# Exhaustive search over the grid with 5-fold cross-validation
rf_grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
best_hyperparameters_GS = rf_grid_search.best_params_  # Best parameters found by GridSearchCV

# GridSearchCV refits the best configuration on the whole training set by default, so that
# fitted forest is reused instead of training the same configuration a second time.
model_RandomForestRegressor_GS = rf_grid_search.best_estimator_

# Feature names (the target is not part of X_train)
feature_names = X_train.columns.tolist()
# Feature importance visualization for the model tuned with GridSearchCV
importances_GS = model_RandomForestRegressor_GS.feature_importances_
ml_utilities.plot_feature_importance(importances_GS, feature_names, 'Feature Importance of Random Forest Regressor (GridSearchCV)')

In [None]:
# Same search space as the grid search, spelled out here so this cell does not depend on
# whichever `param_grid` / `model` objects currently live in the kernel.
param_distributions = {
    'n_estimators': [100, 200, 300],  # Number of trees
    # None means "all features"; it replaces 'auto', which newer scikit-learn rejects.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [4, 6, 8]  # Maximum number of levels in tree
}

# RandomizedSearchCV for hyperparameter tuning.
# NOTE: the space holds 3 * 3 * 3 = 27 combinations and n_iter=27, so every combination is
# tried; lower n_iter or use distributions to get a genuinely random search.
rf_random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_distributions,
    n_iter=27,
    cv=5,
    random_state=42,  # reproducible sampling, as in the other randomized searches
)
rf_random_search.fit(X_train, y_train)
best_hyperparameters_RS = rf_random_search.best_params_  # Best parameters found by RandomizedSearchCV

# The search refits the best configuration on the whole training set by default;
# reuse that fitted forest instead of training it again.
model_RandomForestRegressor_RS = rf_random_search.best_estimator_

# Feature names (the target is not part of X_train)
feature_names = X_train.columns.tolist()

# Importances, plus the feature names ordered from most to least important
importances_RS = model_RandomForestRegressor_RS.feature_importances_
indices_RS = np.argsort(importances_RS)[::-1]
sorted_names_RS = [feature_names[i] for i in indices_RS]

ml_utilities.plot_feature_importance(importances_RS, feature_names, 'Feature Importance of Random Forest Regressor (RandomizedSearchCV)')

# Entrenamiento - DecisionTreeRegressor, GridSearch y RandomSearch

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

# Switch to a DecisionTreeRegressor; the seed is fixed so repeated runs give the same trees
model = DecisionTreeRegressor(random_state=42)

# Hyperparameter space for DecisionTreeRegressor
param_grid = {
    # None means "all features"; it replaces 'auto', which meant the same for regressors
    # and is rejected by newer scikit-learn.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [4, 6, 8, None]  # None adds the option of not limiting the depth
}

# Grid search with 5-fold cross-validation for hyperparameter tuning
model_DecisionTreeRegressor_GS = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
model_DecisionTreeRegressor_GS.fit(X_train, y_train)

# Get and show the best hyperparameters found by the grid search
best_hyperparameters = model_DecisionTreeRegressor_GS.best_params_
print("Best Hyperparameters:")
print(best_hyperparameters)

# Predict on the test set with the tuned model
y_pred = model_DecisionTreeRegressor_GS.predict(X_test)

# Feature names (the target is not part of X_train)
feature_names = X_train.columns.tolist()
# Feature importance visualization for the best tree found by GridSearchCV
importances_GS = model_DecisionTreeRegressor_GS.best_estimator_.feature_importances_
ml_utilities.plot_feature_importance(importances_GS, feature_names, 'Feature Importance of Decision Tree Regressor (GridSearchCV)')



In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
import numpy as np
import matplotlib.pyplot as plt

# Base DecisionTreeRegressor; the seed is fixed so repeated runs give the same trees
model = DecisionTreeRegressor(random_state=42)

# Hyperparameter distributions for a more flexible search
param_distributions = {
    # None means "all features". 'auto' was dropped: for regressors it was only an alias
    # of None, and newer scikit-learn rejects it.
    'max_features': ['sqrt', 'log2', None],
    'max_depth': sp_randint(3, 20),          # integers 3..19 (upper bound is exclusive)
    'min_samples_split': sp_randint(2, 11),  # integers 2..10
    'min_samples_leaf': sp_randint(1, 11)    # integers 1..10
}

# Randomized search for the best hyperparameters
model_DecisionTreeRegressor_RS = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, random_state=42)
model_DecisionTreeRegressor_RS.fit(X_train, y_train)
# Extract and display the best hyperparameters
best_hyperparameters_RS = model_DecisionTreeRegressor_RS.best_params_
print("Best Hyperparameters from RandomizedSearchCV:")
print(best_hyperparameters_RS)

# Predict on the test set with the tuned model
y_pred = model_DecisionTreeRegressor_RS.predict(X_test)

# The search object itself has no feature_importances_; they are read from the refitted
# best tree (best_estimator_).
importances = model_DecisionTreeRegressor_RS.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]               # positions from most to least important
sorted_names = [X_train.columns[i] for i in indices]  # feature names in that order

# Feature names (the target is not part of X_train)
feature_names = X_train.columns.tolist()
# Feature importance visualization for the best tree found by RandomizedSearchCV
importances_RS = importances
ml_utilities.plot_feature_importance(importances_RS, feature_names, 'Feature Importance of Decision Tree Regressor (RandomizedSearchCV)')



# Entrenamiento - GradientBoostingRegressor, GridSearch y RandomSearch

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt

# Base GradientBoostingRegressor; the seed is fixed so repeated runs give the same model
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter space for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    # None means "all features"; it replaces 'auto', which meant the same for this
    # regressor and is rejected by newer scikit-learn.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2]
}

# GridSearchCV with 5-fold cross-validation (3 * 3 * 3 * 3 = 81 combinations)
model_GradientBoostingRegressor_GS = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
model_GradientBoostingRegressor_GS.fit(X_train, y_train)

# Display the best hyperparameters found by GridSearchCV
best_hyperparameters = model_GradientBoostingRegressor_GS.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Predict on the test set with the tuned model
y_pred = model_GradientBoostingRegressor_GS.predict(X_test)


# Feature names (the target is not part of X_train)
feature_names = X_train.columns.tolist()
# Feature importance visualization for the best model found by GridSearchCV
importances_GS = model_GradientBoostingRegressor_GS.best_estimator_.feature_importances_
ml_utilities.plot_feature_importance(importances_GS, feature_names, 'Feature Importance of Gradient Boosting Regressor (GridSearchCV)')


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint, uniform
import numpy as np
import matplotlib.pyplot as plt

# Base GradientBoostingRegressor; the seed is fixed so repeated runs give the same model
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter distributions rather than a fixed hyperparameter grid
param_distributions = {
    'n_estimators': sp_randint(100, 400),  # integers 100..399 (upper bound is exclusive)
    # None means "all features"; it replaces 'auto', which meant the same for this
    # regressor and is rejected by newer scikit-learn.
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': sp_randint(3, 10),  # integers 3..9 (upper bound is exclusive)
    # scipy's uniform(loc, scale) covers [loc, loc + scale]; scale=0.19 gives the intended
    # [0.01, 0.2] range (the previous scale of 0.2 reached 0.21).
    'learning_rate': uniform(0.01, 0.19)
}

# Hyperparameter tuning with RandomizedSearchCV (100 sampled configurations, 5-fold CV)
model_GradientBoostingRegressor_RS = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, random_state=42)
model_GradientBoostingRegressor_RS.fit(X_train, y_train)

# Display the best hyperparameters found by RandomizedSearchCV
best_hyperparameters = model_GradientBoostingRegressor_RS.best_params_
print("Best Hyperparameters:", best_hyperparameters)

# Predict on the test set with the optimized model
y_pred = model_GradientBoostingRegressor_RS.predict(X_test)


# Feature names (the target is not part of X_train)
feature_names = X_train.columns.tolist()
# Feature importance visualization for the best model found by RandomizedSearchCV
importances_RS = model_GradientBoostingRegressor_RS.best_estimator_.feature_importances_
ml_utilities.plot_feature_importance(importances_RS, feature_names, 'Feature Importance of Gradient Boosting Regressor (RandomizedSearchCV)')


# Uso de gráficos para obtener comparativas en el entrenamiento y ayudar a entender la precisión de los resultados

## Comparación de métricas

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt

# Models to compare, keyed by the label shown in the report and the plots
models = {
    'Random Forest Regresor - GridSearch': model_RandomForestRegressor_GS,
    'Random Forest Regresor - RandomSearch': model_RandomForestRegressor_RS,
    'GradientBoostingRegressor - GridSearch': model_GradientBoostingRegressor_GS,
    'GradientBoostingRegressor - RandomSearch': model_GradientBoostingRegressor_RS,
    'DecisionTreeRegressor - GridSearch': model_DecisionTreeRegressor_GS,
    'DecisionTreeRegressor - RandomSearch': model_DecisionTreeRegressor_RS,
}

models_metrics = {}
# Generate and collect metrics for each model.
# The loop variable is deliberately not called `model`, so the `model` object defined by
# the training cells is not overwritten.
for name, candidate in models.items():
    # Some entries are plain regressors and others are whole GridSearchCV /
    # RandomizedSearchCV objects. Evaluate the tuned regressor (best_estimator_) in both
    # cases so all rows are comparable and the hyperparameter search is not repeated.
    # NOTE(review): assumes the report helper cross-validates the estimator it is given
    # (cv=5 is passed) - confirm.
    estimator = getattr(candidate, 'best_estimator_', candidate)
    metrics = ml_utilities.generate_regresion_evaluation_report(estimator, X, y, cv=5)
    models_metrics[name] = metrics

# Print collected metrics for comparison
for model_name, metrics in models_metrics.items():
    print(f"Metrics for {model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")
    print()  # Blank line between models for readability

# Plot every metric side by side for all models
ml_utilities.plot_all_metrics_comparisons(models_metrics, scale_factor=100)


## Predicción vs Actual

In [None]:
def plot_predictions_vs_actual_(X_train, X_test, y_train, y_test, model,
                                n_train=100, n_test=50, random_state=None):
    """
    Plot predicted versus actual dengue cases for the training and test sets.

    A random subset of each set is drawn so that large sets stay readable.

    Parameters:
    - X_train: Features of the training set.
    - X_test: Features of the test set.
    - y_train: Actual target values of the training set.
    - y_test: Actual target values of the test set.
    - model: Fitted model exposing ``predict``.
    - n_train: Maximum number of training rows to plot (default 100).
    - n_test: Maximum number of test rows to plot (default 50).
    - random_state: Optional seed forwarded to ``DataFrame.sample`` for a reproducible
      subset; ``None`` keeps the sampling random.

    Returns nothing; the figure is shown with ``plt.show()``.
    """

    # Get the predictions for both sets before any figure is created
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Pair actual and predicted values, then sample. The sample size is capped at the
    # number of available rows, because sampling more rows than exist raises a ValueError.
    df_train_pred = pd.DataFrame({'Actual': y_train, 'Predicted': train_predictions})
    df_train_pred = df_train_pred.sample(n=min(n_train, len(df_train_pred)), random_state=random_state)
    df_test_pred = pd.DataFrame({'Actual': y_test, 'Predicted': test_predictions})
    df_test_pred = df_test_pred.sample(n=min(n_test, len(df_test_pred)), random_state=random_state)

    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))

    # One panel per set: training on top, test below
    panels = (
        (axes[0], df_train_pred, "Training Set: Predicted vs Actual Cases"),
        (axes[1], df_test_pred, "Test Set: Predicted vs Actual Cases"),
    )
    for ax, frame, title in panels:
        frame.plot(ax=ax, marker='o', linestyle='-', markersize=5, alpha=0.6)
        ax.set_title(title)
        ax.legend(["Actual", "Predicted"])

    plt.suptitle("Dengue Predicted Cases vs. Actual Cases", fontsize=16)
    # Leave room at the top of the figure for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Plot predictions against actual values for every tuned model.
for search_label, fitted_model in (
    ('Grid Search', model_RandomForestRegressor_GS),
    ('Random Search', model_RandomForestRegressor_RS),
    ('Grid Search', model_GradientBoostingRegressor_GS),
    ('Random Search', model_GradientBoostingRegressor_RS),
    ('Grid Search', model_DecisionTreeRegressor_GS),
    ('Random Search', model_DecisionTreeRegressor_RS),
):
    # Some entries are GridSearchCV / RandomizedSearchCV objects, whose own class name says
    # nothing about the model. Print the class of the underlying regressor instead.
    regressor = getattr(fitted_model, 'best_estimator_', fitted_model)
    print(f'{type(regressor).__name__} {search_label}')
    plot_predictions_vs_actual_(X_train, X_test, y_train, y_test, fitted_model)




## Predicción vs Validación

In [None]:
def plot_predictions_vs_validation(X_train, X_val, y_train, y_val, model,
                                   n_train=100, n_val=50, random_state=None):
    """
    Plot predicted versus actual dengue cases for the training and validation sets.

    A random subset of each set is drawn so that large sets stay readable.

    Parameters:
    - X_train: Features of the training set.
    - X_val: Features of the validation set.
    - y_train: Actual target values of the training set.
    - y_val: Actual target values of the validation set.
    - model: Fitted model exposing ``predict``.
    - n_train: Maximum number of training rows to plot (default 100).
    - n_val: Maximum number of validation rows to plot (default 50).
    - random_state: Optional seed forwarded to ``DataFrame.sample`` for a reproducible
      subset; ``None`` keeps the sampling random.

    Returns nothing; the figure is shown with ``plt.show()``.
    """

    # Get the predictions for both sets before any figure is created
    train_predictions = model.predict(X_train)
    val_predictions = model.predict(X_val)

    # Pair actual and predicted values, then sample. The sample size is capped at the
    # number of available rows, because sampling more rows than exist raises a ValueError.
    df_train_pred = pd.DataFrame({'Actual': y_train, 'Predicted': train_predictions})
    df_train_pred = df_train_pred.sample(n=min(n_train, len(df_train_pred)), random_state=random_state)
    df_val_pred = pd.DataFrame({'Actual': y_val, 'Predicted': val_predictions})
    df_val_pred = df_val_pred.sample(n=min(n_val, len(df_val_pred)), random_state=random_state)

    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))

    # One panel per set: training on top, validation below
    panels = (
        (axes[0], df_train_pred, "Training Set: Predicted vs Actual Cases"),
        (axes[1], df_val_pred, "Validation Set: Predicted vs Actual Cases"),
    )
    for ax, frame, title in panels:
        frame.plot(ax=ax, marker='o', linestyle='-', markersize=5, alpha=0.6)
        ax.set_title(title)
        ax.legend(["Actual", "Predicted"])

    plt.suptitle("Dengue Predicted Cases: Training vs. Validation Sets", fontsize=16)
    # Leave room at the top of the figure for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Plot predictions against the validation set for every tuned model.
for search_label, fitted_model in (
    ('Grid Search', model_RandomForestRegressor_GS),
    ('Random Search', model_RandomForestRegressor_RS),
    ('Grid Search', model_GradientBoostingRegressor_GS),
    ('Random Search', model_GradientBoostingRegressor_RS),
    ('Grid Search', model_DecisionTreeRegressor_GS),
    ('Random Search', model_DecisionTreeRegressor_RS),
):
    # Some entries are GridSearchCV / RandomizedSearchCV objects, whose own class name says
    # nothing about the model. Print the class of the underlying regressor instead.
    regressor = getattr(fitted_model, 'best_estimator_', fitted_model)
    print(f'{type(regressor).__name__} {search_label}')
    plot_predictions_vs_validation(X_train, X_val, y_train, y_val, fitted_model)
