## **Predicting Disease Spread: Model for Iquitos**

### **Imports**

In [1]:
import sys
import os

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(".."))
from src.data_setup import load_data, split_cities
from src.model import (
    ModelTrainer,
    supporting_model,
    predict_iteratively_and_save,
    # predict_iteratively_and_save_city
)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor


### **Data**

In [2]:
# Load the training split (features and labels), then split each of them
# into its two per-city parts (the `sj_*` and `iq_*` frames).
features_train, labels_train = load_data(train=True)
sj_features_train, iq_features_train = split_cities(features_train)
sj_labels_train, iq_labels_train = split_cities(labels_train)

In [3]:
# Load the test split (train=False) and split its features per city.
features_test, labels_test = load_data(train=False)
sj_features_test, iq_features_test = split_cities(features_test)

#### Drop the only non-numeric (and irrelevant) column: `week_start_date`

In [4]:
# Remove the 'week_start_date' column from the Iquitos train and test features.
iq_features_train_num = iq_features_train.drop(columns=['week_start_date'])
iq_features_test_num = iq_features_test.drop(columns=['week_start_date'])
# This frame is read from a CSV on disk rather than computed in this notebook.
# NOTE(review): presumably it holds the test features with the 3 previous-case
# columns added by a supporting model -- confirm how the file is produced.
iq_features_test_added_prev_3 = pd.read_csv("../data/processed/iq_features_test_added_prev_3.csv")

### **Previous cases as predictions of another model**

#### Best supporting model choice 

In [6]:
# Candidate supporting models.
# Estimators that draw random numbers while fitting get a fixed seed so that a
# fresh "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42

knn = KNeighborsRegressor()
rf = RandomForestRegressor(random_state=RANDOM_STATE)
en = ElasticNet()
svr = SVR()
boosting = GradientBoostingRegressor(random_state=RANDOM_STATE)
mlp = MLPRegressor(random_state=RANDOM_STATE)

In [10]:
# Hyperparameter search spaces, one per estimator family.
# Each grid maps an estimator keyword argument to the list of values to try.

# KNeighborsRegressor
PARAM_GRID_KNN = dict(
    n_neighbors=[3, 5, 7, 9, 13, 17],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# RandomForestRegressor
PARAM_GRID_RF = dict(
    n_estimators=[500],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# GradientBoostingRegressor
PARAM_GRID_BOOSTING = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# ElasticNet
PARAM_GRID_EN = dict(
    alpha=[0.01, 0.1, 1.0, 10.0, 100.0],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99],
)

# SVR
PARAM_GRID_SVR = dict(
    C=[0.1, 1, 10, 100],
    epsilon=[0.01, 0.1, 0.2, 0.5],
    kernel=['linear', 'rbf', 'poly'],
)

# MLPRegressor
PARAM_GRID_MLP = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50)],
    activation=['relu', 'tanh'],
    solver=['adam', 'sgd'],
    learning_rate=['constant', 'adaptive'],
    alpha=[0.0001, 0.001, 0.01],
)

In [8]:
# Shared settings passed to every ModelTrainer below.
K_PREV = 3                    # passed as `k_prev_targets` (and as `k_prev` to the iterative predictor)
IMPUTATION_METHOD = "linear"  # passed as `imputation_method`
TOP_N_FEATURES = 10           # passed as `top_n_features`
CORR_METHOD = "pearson"       # passed as `corr_method`
SCALER = "standard"           # passed as `scaling_method`

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence every warning for the search below (the second filter names
# convergence warnings explicitly, as in the original cell).
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Keep each candidate estimator next to its own parameter grid, then unzip
# into the two parallel lists that `supporting_model` expects.
supporting_candidates = [
    (knn, PARAM_GRID_KNN),
    (rf, PARAM_GRID_RF),
    (en, PARAM_GRID_EN),
    (svr, PARAM_GRID_SVR),
    (boosting, PARAM_GRID_BOOSTING),
    (mlp, PARAM_GRID_MLP),
]
supporting_estimators = [estimator for estimator, _ in supporting_candidates]
supporting_grids = [grid for _, grid in supporting_candidates]

supp = supporting_model(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_num,
    supporting_estimators,
    supporting_grids,
)

Summary of the best supporting model choice: 
| Model                     | Score     |
|---------------------------|-----------|
| KNeighborsRegressor       | 6.9542    |
| RandomForestRegressor     | 7.0734    |
| ElasticNet                | 6.6470    |
| SVR                       | 6.1562  |
| GradientBoostingRegressor | 6.7217  |
| **MLPRegressor**              | 6.6006  |

In [None]:
# Take the test feature frame returned by the supporting-model step (its "X_test" entry).
iq_features_test_added_prev = supp["X_test"]

In [10]:
# Show the first row to inspect the columns, including the *_prev_cases ones.
iq_features_test_added_prev.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,ndvi_ne_ndvi_nw_sum,ndvi_ne_ndvi_se_diff,ndvi_ne_ndvi_se_prod,ndvi_ne_ndvi_se_quot,ndvi_ne_ndvi_se_sum,ndvi_ne_ndvi_sw_diff,ndvi_ne_ndvi_sw_prod,1_prev_cases,2_prev_cases,3_prev_cases
year,weekofyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2010,26,0.183783,0.1425,0.225129,0.150214,82.29,297.648571,299.214286,296.118571,307.9,293.7,...,0.326283,-0.041345,0.041375,0.816348,0.408912,0.033569,0.027607,4.0,1.0,1.0


#### Models

In [9]:
# K-nearest-neighbours candidate, configured with the shared settings.
model_trainer_knn = ModelTrainer(
    model=KNeighborsRegressor(),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_KNN,
)

model_trainer_knn.fit(iq_features_train_num, iq_labels_train)
model_trainer_knn.transform(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_added_prev_3,
)

--- Model KNeighborsRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -5.2750,-5.2903,-5.1635,-5.1857,-5.2177,-5.1968,-5.1204,-5.1116,-5.1852,-5.1637,-5.0692,-5.0589,-5.1436,-5.1231,-4.9577,-4.9471,-5.1033,-5.0824,-5.0223,-4.9988,-5.1486,-5.1231,-5.0687,-5.0406
Mean: -5.1165


In [11]:
# Gradient-boosting candidate, configured with the shared settings.
# A fixed random_state makes the CV scores reproducible across re-runs.
model_trainer_boosting = ModelTrainer(
    model=GradientBoostingRegressor(random_state=42),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_BOOSTING,
)

model_trainer_boosting.fit(iq_features_train_num, iq_labels_train)
model_trainer_boosting.transform(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_added_prev_3,
)

--- Model GradientBoostingRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -4.5545,-4.6486,-4.6236,-4.6483,-4.7125,-4.5738,-4.6057,-4.6083,-4.6320,-4.6349,-4.5905,-4.5554,-4.6428,-4.5001,-4.3461,-4.5405,-4.6378,-4.6471,-4.7334,-4.5261,-4.7032,-4.1754,-4.3756,-4.8540,-4.5405,-4.4861,-4.1266,-4.3684,-4.2300,-4.6146,-4.4579,-4.5479,-4.1706,-4.7399,-4.5431,-4.5091,-4.3073,-4.4401,-4.4959,-4.6372,-4.6578,-4.3623,-4.4889,-4.1865,-4.4118,-4.4870,-4.4321,-4.2922,-4.4716,-4.4503,-4.5047,-4.6361,-4.5088,-4.3098,-4.5468,-4.4328,-4.3909,-4.6149,-4.5630,-4.5276,-4.5897,-4.4342,-4.6062,-4.6332,-4.4420,-4.4669,-4.7533,-4.5417,-4.8002,-4.4112,-4.4188,-4.5671,-4.3205,-4.2985,-4.4064,-4.4470,-4.6799,-4.1070,-4.4682,-4.4110,-4.4764,-4.5771,-4.4420,-4.4474,-4.3640,-4.5324,-4.5257,-4.6395,-4.2743,-4.3369,-4.6819,-4.4054,-4.4241,-4.5407,-4.5694,-4.3895,-4.8658,-4.6856,-4.3705,-4.2932
Mean: -4.5021


In [12]:
# Random-forest candidate, configured with the shared settings.
# A fixed random_state makes the CV scores (and the predictions written to the
# submission files further down) reproducible across re-runs.
model_trainer_rf = ModelTrainer(
    model=RandomForestRegressor(random_state=42),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_RF,
)

model_trainer_rf.fit(iq_features_train_num, iq_labels_train)
model_trainer_rf.transform(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_added_prev_3,
)

--- Model RandomForestRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -4.0896,-4.0675,-4.0903,-4.0449,-4.0416,-4.0414,-3.9733,-3.9700,-3.9557,-4.1215,-4.1019,-4.0874,-4.0247,-4.0344,-4.0368,-3.9618,-3.9554,-3.9982,-4.1165,-4.1041,-4.1127,-4.0644,-4.0383,-4.0608,-3.9533,-3.9752,-3.9831,-4.1288,-4.0919,-4.0877,-4.0435,-4.0759,-4.0345,-3.9728,-3.9703,-3.9730
Mean: -4.0384


In [13]:
# ElasticNet candidate, configured with the shared settings.
model_trainer_en = ModelTrainer(
    model=ElasticNet(),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_EN,
)

model_trainer_en.fit(iq_features_train_num, iq_labels_train)
model_trainer_en.transform(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_added_prev_3,
)

--- Model ElasticNet ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


CV results: -4.3088,-4.3104,-4.3130,-4.3174,-4.3239,-4.3259,-4.3277,-4.1523,-4.1437,-4.1441,-4.1380,-4.1349,-4.1339,-4.1334,-4.4584,-4.3844,-4.3367,-4.2616,-4.1656,-4.1335,-4.1033,-6.0725,-6.2659,-6.4972,-6.7049,-6.8483,-6.8483,-6.8483,-6.8483,-6.8483,-6.8483,-6.8483,-6.8483,-6.8483,-6.8483
Mean: -5.2307


In [14]:
# Support-vector-regression candidate, configured with the shared settings.
model_trainer_svr = ModelTrainer(
    model=SVR(),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_SVR,
)

model_trainer_svr.fit(iq_features_train_num, iq_labels_train)
model_trainer_svr.transform(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_added_prev_3,
)

--- Model SVR ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -3.8758,-5.9715,-5.9276,-3.8740,-5.9757,-5.9359,-3.8698,-5.9797,-5.9375,-3.8752,-5.9904,-5.9519,-3.8546,-4.8628,-5.4843,-3.8448,-4.8641,-5.4803,-3.8424,-4.8652,-5.4797,-3.8267,-4.8898,-5.4805,-3.9073,-4.2092,-5.6565,-3.9018,-4.2098,-5.6459,-3.8937,-4.2115,-5.6324,-3.8683,-4.2273,-5.5955,-3.9100,-5.0378,-8.3716,-3.9031,-5.0302,-8.3267,-3.8961,-5.0221,-8.2788,-3.8866,-5.0042,-8.1633
Mean: -5.0777


In [15]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence warnings for the MLP grid search (convergence warnings in particular).
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Multi-layer-perceptron candidate, configured with the shared settings.
# A fixed random_state (weight initialisation, shuffling) makes the CV scores
# reproducible across re-runs.
model_trainer_mlp = ModelTrainer(
    model=MLPRegressor(random_state=42),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_MLP,
)

model_trainer_mlp.fit(iq_features_train_num, iq_labels_train)
model_trainer_mlp.transform(
    iq_features_train_num,
    iq_labels_train,
    iq_features_test_added_prev_3,
)

--- Model MLPRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -4.5187,-4.8462,-4.5729,-4.9247,-4.5173,-4.7707,-4.3917,-4.8133,-4.8264,-4.8509,-4.6494,-4.9822,-4.7210,-4.8118,-4.8893,-4.9915,-4.5840,-4.9267,-4.5804,-4.8674,-4.4134,-4.7166,-4.4481,-4.7977,-4.7731,-4.7803,-4.5709,-5.2355,-4.7638,-4.8533,-4.7879,-5.0048,-4.5172,-4.8904,-4.4915,-4.8849,-4.4705,-4.8854,-4.5027,-4.9448,-4.7366,-4.9062,-4.8148,-5.0268,-4.7456,-4.8362,-4.6442,-4.8272,-3.9964,-4.6729,-4.0500,-4.7422,-3.9892,-4.5801,-4.0147,-4.6826,-4.2112,-5.4240,-4.2477,-5.6387,-4.3226,-5.2173,-4.3177,-5.3462,-3.9903,-4.6898,-3.9837,-4.6772,-3.9368,-4.6403,-3.9854,-4.6986,-4.3510,-5.6738,-4.3035,-5.7443,-4.2686,-5.2730,-4.2480,-5.2784,-3.9654,-4.7051,-3.9734,-4.8125,-3.9454,-4.6581,-3.9171,-4.7469,-4.3489,-5.7421,-4.2405,-5.5001,-4.3441,-5.2779,-4.3620,-5.3636
Mean: -4.6813


Summary:   

| Model                       | Score   |
|-----------------------------|---------|
| KNeighborsRegressor         | 5.1165 |
| GradientBoostingRegressor   | 4.5021 |
| **RandomForestRegressor**   | 4.0384 |
| ElasticNet                  | 5.2307 |
| SVR                         | 5.0777 |
| MLPRegressor                | 4.6813 |

In [31]:
# Ensure the test dataset has the same feature columns as the training dataset
iq_features_test_filtered = iq_features_test_added_prev_3.drop(columns=['year', 'weekofyear'], errors='ignore')

# Make predictions
preds = model_trainer_rf.predict(iq_features_test_filtered)

# Assemble the Iquitos rows; year / weekofyear come from the two index levels
# of the original test features.
submission = pd.DataFrame()
submission['year'] = iq_features_test.index.get_level_values(0)
submission['weekofyear'] = iq_features_test.index.get_level_values(1)
submission['city'] = 'iq'
submission['total_cases'] = preds.round().astype(int)  # Round predictions and convert to integer

# Reorder columns to match the required format
submission = submission[['city', 'year', 'weekofyear', 'total_cases']]

# Left-join the predictions onto the official submission template so that every
# template row is kept, in template order.
submission_format = pd.read_csv("../data/submission_format.csv")
submission_format = submission_format.set_index(['city', 'year', 'weekofyear']).drop(columns=['total_cases']).join(
    submission.set_index(['city', 'year', 'weekofyear']), how='left'
).reset_index()

# Template rows without a prediction get 0; ensure total_cases is an integer
submission_format['total_cases'] = submission_format['total_cases'].fillna(0).astype(int)

submission_format.to_csv("../data/predictions/prev_as_pred_rf_0204.csv", index=False)

# Kept as the last expression of the cell so the preview is actually rendered.
submission_format.head()

### **Previous cases as iterative predictions**

Model Random Forest

In [46]:
def predict_iteratively_and_save_city(model_trainer,
                                 model_name: str,
                                 X_test_raw: pd.DataFrame,
                                 y_train: np.ndarray,
                                 k_prev: int,
                                 submission_format_path: str,
                                 output_path: str,
                                 city: str,
                                 ) -> None:
    """Predict one city's test rows one at a time, feeding predictions back as lags.

    The last ``k_prev`` training targets seed the lag window. For every test
    row (in order) the lag features are overwritten with the current window,
    the row is scaled and predicted, and the prediction is pushed into the
    window for the next row. The rounded predictions are then written to
    ``output_path`` in the layout of the submission-format file, restricted to
    ``city``.

    Parameters
    ----------
    model_trainer
        Object exposing ``get_model()``, ``get_scaler()`` and ``get_X_train()``;
        the columns of ``get_X_train()`` define which features (and in which
        order) are passed to the scaler and the model.
    model_name : str
        Only used in the confirmation message.
    X_test_raw : pd.DataFrame
        Test features; must contain ``year`` and ``weekofyear`` columns plus
        every non-lag column the model expects. It is not modified.
    y_train : np.ndarray
        Training targets; flattened, so ``(n,)`` and ``(n, 1)`` both work.
    k_prev : int
        Number of previous targets used as lag features.
    submission_format_path : str
        CSV with ``city``, ``year``, ``weekofyear`` (and ``total_cases``) columns.
    output_path : str
        Destination CSV.
    city : str
        Value of the ``city`` column to keep.
    """
    X_test = X_test_raw.copy()

    model = model_trainer.get_model()
    scaler = model_trainer.get_scaler()
    columns_expected = model_trainer.get_X_train().columns.tolist()
    expected_names = set(columns_expected)

    # Flatten first: ``DataFrame.values`` is 2-D, which would otherwise put
    # one-element arrays (not scalars) into the lag features.
    y_prev = np.asarray(y_train).ravel()[-k_prev:]

    predictions = []
    for i in range(len(X_test)):
        row = X_test.iloc[i].copy()
        for j in range(1, k_prev + 1):
            lag_value = y_prev[-j]
            row[f'prev_{j}'] = lag_value
            # The feature frames in this notebook name their lag columns
            # '<j>_prev_cases'. Update that column too when the model uses it;
            # otherwise the fed-back prediction never reaches the model.
            # NOTE(review): assumes '<j>_prev_cases' is the target j steps back
            # (1 = most recent) -- confirm against ModelTrainer.
            alt_name = f'{j}_prev_cases'
            if alt_name in expected_names:
                row[alt_name] = lag_value
        row_scaled = pd.DataFrame(scaler.transform([row[columns_expected]]), columns=columns_expected)
        y_new = model.predict(row_scaled)[0]
        predictions.append(y_new)
        # Slide the window: drop the oldest value, append the newest prediction.
        y_prev = np.append(y_prev[1:], y_new)

    # Key each prediction by (year, weekofyear) before merging, so rows are
    # matched by key rather than by position in the merged frame.
    predicted = X_test[['year', 'weekofyear']].copy()
    predicted['total_cases'] = np.round(predictions).astype(int)

    submission_format = pd.read_csv(submission_format_path)
    submission_format = submission_format[submission_format['city'] == city]
    submission_format = submission_format.drop(columns=['total_cases'], errors='ignore').merge(
        predicted,
        on=['year', 'weekofyear'],
        how='inner'
    )
    submission_format.to_csv(output_path, index=False)
    print(f"Saved predictions for {model_name} to {output_path}")
    print(f"Saved predictions for {model_name} to {output_path}")

In [47]:
# Run the iterative prediction for Iquitos with the tuned random-forest trainer
# and write the result to the per-city predictions folder.
# NOTE(review): `iq_labels_train.values` is 2-D if the labels are a DataFrame --
# confirm the function receives the shape it expects.
predict_iteratively_and_save_city(
    model_trainer_rf,
    model_name="rf",
    X_test_raw=iq_features_test_added_prev_3,
    y_train=iq_labels_train.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/iq/prev_as_iterative_rf.csv",
    city="iq",
)



Saved predictions for rf to ../data/predictions/iq/prev_as_iterative_rf.csv




### **Previous cases as iterative predictions**

### **Dedicated library**