In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib

In [3]:
# CSV file that is read into the `data` frame used by the rest of the notebook.
file_path = "dengue_data1.csv"

# Row position of the train/test boundary: rows before it are used for
# training, rows from it onwards for testing.
split_index = 143

In [5]:
# Load the whole CSV into one DataFrame; the feature and target columns are
# selected from it in the next cell.
data = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Predictor columns, listed as (variable, variable_1) pairs.
# NOTE(review): the `_1` suffix presumably marks a lagged copy of the
# variable — confirm against the data dictionary.
features = [
    'pr', 'pr_1',
    'tas', 'tas_1',
    'tasmax', 'tasmax_1',
    'tasmin', 'tasmin_1',
    'deltemp', 'deltemp_1',
    'dengue_befor', 'dengue_befor_1',
]
X, y = data[features], data['dengue_incidence']

# Positional (unshuffled) split: the first `split_index` rows train the model,
# the remaining rows are held out for testing.
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]

In [9]:
# Hyper-parameter grid handed to GridSearchCV. Every entry except
# `max_features` has a single candidate, so the search compares exactly two
# configurations.
# NOTE(review): the integer 1 asks scikit-learn for ONE feature per split;
# the float 1.0 would mean "all features" — confirm the int is intended.
param_grid = dict(
    n_estimators=[6],
    max_depth=[100],
    min_samples_split=[2],
    min_samples_leaf=[1],
    max_features=[1, 'sqrt'],
)

In [11]:
def r_squared(y_true, y_pred):
    """Return the squared Pearson correlation between two arrays of data.

    Note that this is the square of the correlation coefficient, not the
    coefficient of determination (1 - SS_res / SS_tot): it is always within
    [0, 1] and is insensitive to a constant offset or scale error in the
    predictions.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Squared correlation. ``0.0`` is returned when either input has zero
        variance (e.g. a constant prediction), where the correlation is
        undefined; this avoids a NaN that would break later sorting/ranking.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    true_dev = y_true - np.mean(y_true)
    pred_dev = y_pred - np.mean(y_pred)

    # Numerator of Pearson's r: sum of products of the centred values.
    co_deviation = np.sum(pred_dev * true_dev)
    # Denominator of Pearson's r: geometric mean of the two sums of squares.
    scale = (np.sum(pred_dev ** 2) * np.sum(true_dev ** 2)) ** 0.5

    if scale == 0:
        # At least one series is constant: 0/0 would yield NaN.
        return 0.0
    return (co_deviation / scale) ** 2

In [13]:
def train_and_evaluate_model(X_train, y_train, X_test, y_test, param_grid, random_state=None):
    """Grid-search a random forest on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and target used only for the returned metrics.
    param_grid : dict
        Hyper-parameter grid forwarded to ``GridSearchCV``.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestRegressor``. The default ``None``
        keeps the previous behaviour (a different forest on every call);
        pass an integer to make a run reproducible.

    Returns
    -------
    tuple
        ``(grid_search, r2, mae)``: the fitted ``GridSearchCV`` object, the
        value of this notebook's ``r_squared`` helper on the test
        predictions, and the mean absolute error on the test predictions.
    """
    forest = RandomForestRegressor(random_state=random_state)
    # NOTE(review): candidates are ranked by MAPE, which is unstable when the
    # target contains zeros or near-zeros — confirm that cannot happen here.
    # NOTE(review): cv=5 uses contiguous, unshuffled folds; if the rows are
    # time-ordered, a forward-chaining splitter may be more appropriate.
    grid_search = GridSearchCV(
        estimator=forest,
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring='neg_mean_absolute_percentage_error',
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration as refit by GridSearchCV.
    predictions = grid_search.best_estimator_.predict(X_test)
    r2 = r_squared(y_test, predictions)  # Using our custom r_squared function
    mae = mean_absolute_error(y_test, predictions)

    return grid_search, r2, mae

In [15]:
# Sentinel "worst possible" scores and a zeroed iteration counter.
r2_best = -np.inf
mae_best = np.inf
iteration = 0

# Create a directory to store all results; the timestamp suffix gives each
# execution of this cell its own folder (to one-second resolution).
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_dir = "dengue_model_results_" + timestamp
os.makedirs(results_dir, exist_ok=True)

In [17]:
best_models = []

# Define max iterations
max_iterations = 100  # Run exactly 100 iterations

iteration = 0

# Each pass refits the grid search from scratch. The forest is created without
# a seed inside train_and_evaluate_model, so every pass yields a different
# model and different test scores.
for iteration in range(1, max_iterations + 1):
    model, r2, mae = train_and_evaluate_model(X_train, y_train, X_test, y_test, param_grid)

    print(f"Iteration {iteration}: R2 = {r2}, MAE = {mae}")

    # Store each model regardless of performance
    best_models.append({
        'iteration': iteration,
        'model': model.best_estimator_,
        'R2': r2,
        'MAE': mae,
        'best_params': model.best_params_
    })

# Sort the models by R² (descending) and MAE (ascending) and select top 10.
# A NaN R² is mapped to +inf so it ranks last; comparing NaN directly would
# leave the sort order undefined.
# NOTE(review): the ranking uses scores computed on X_test / y_test, so the
# scores of the selected models are optimistically biased — consider ranking
# on a separate validation split.
best_models = sorted(
    best_models,
    key=lambda entry: (np.inf if np.isnan(entry['R2']) else -entry['R2'], entry['MAE']),
)[:10]

# Save the top 10 models to disk and collect one summary row per model
summary_rows = []
for i, best_model in enumerate(best_models):
    model_filename = f"{results_dir}/best_model_iteration_{best_model['iteration']}.pkl"
    joblib.dump(best_model['model'], model_filename)
    print(f"Model {i+1} saved to {model_filename}")

    summary_rows.append({
        'Iteration': best_model['iteration'],
        'R2': best_model['R2'],
        'MAE': best_model['MAE'],
        'Best_Parameters': str(best_model['best_params'])
    })

# Write the summary workbook once. Overwriting (instead of reading the file
# back and appending per model) keeps the cell idempotent: re-running it in
# the same kernel no longer duplicates rows in the existing file.
summary_filename = f"{results_dir}/model_summary.xlsx"
summary = pd.DataFrame(summary_rows, columns=['Iteration', 'R2', 'MAE', 'Best_Parameters'])
summary.to_excel(summary_filename, index=False)

print(f"Top 10 models and summaries updated in {results_dir}")

Iteration 1: R2 = 0.33974948961387397, MAE = 3.692083333333334
New model added to top 10. Current best R2 = 0.33974948961387397, MAE = 3.692083333333334
Iteration 2: R2 = 0.5009595173376393, MAE = 3.836527777777777
New model added to top 10. Current best R2 = 0.5009595173376393, MAE = 3.836527777777777
Iteration 3: R2 = 0.2905339892019182, MAE = 2.981145833333333
New model added to top 10. Current best R2 = 0.5009595173376393, MAE = 3.836527777777777
Iteration 4: R2 = 0.5891233949757807, MAE = 3.911423611111111
New model added to top 10. Current best R2 = 0.5891233949757807, MAE = 3.911423611111111
Iteration 5: R2 = 0.45622074882898417, MAE = 4.189895833333334
New model added to top 10. Current best R2 = 0.5891233949757807, MAE = 3.911423611111111
Iteration 6: R2 = 0.350876808075864, MAE = 3.4984375
New model added to top 10. Current best R2 = 0.5891233949757807, MAE = 3.911423611111111
Iteration 7: R2 = 0.24294853757508952, MAE = 2.9464236111111113
New model added to top 10. Current b

In [None]:
print("Model training completed.")
# r2_best / mae_best are only ever initialised to -inf / inf and are never
# reassigned by the training loop, so take the best scores from the ranked
# list instead (best_models is sorted best-first).
if best_models:
    r2_best, mae_best = best_models[0]['R2'], best_models[0]['MAE']
print(f"Best model: R2 = {r2_best}, MAE = {mae_best}")
print(f"All results saved in directory: {results_dir}")