## **Predicting Disease Spread: Model for both cities**

### **Imports**

In [1]:
import sys
import os

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(".."))
from src.data_setup import load_data, split_cities
from src.model import (
    ModelTrainer,
    supporting_model
)

from src.feature_engineering import (
    NAImputer,
)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### **Data**

In [2]:
features_train, labels_train = load_data(train=True)
sj_features_train, iq_features_train = split_cities(features_train)
sj_labels_train, iq_labels_train = split_cities(labels_train)

In [3]:
features_test, labels_test = load_data(train=False)
sj_features_test, iq_features_test = split_cities(features_test)

#### Drop only non-numeric, irrelevant column

In [4]:
iq_features_train_num = iq_features_train.drop(columns=['week_start_date'])
iq_features_test_num = iq_features_test.drop(columns=['week_start_date'])
sj_features_train_num = sj_features_train.drop(columns=['week_start_date'])
sj_features_test_num = sj_features_test.drop(columns=['week_start_date'])

#### Join datasets into one

In [5]:
train_data = pd.concat([sj_features_train_num, iq_features_train_num])
train_data['is_iquitos'] = train_data.index.get_level_values(0).isin(iq_features_train_num.index.get_level_values(0))

train_labels = pd.concat([sj_labels_train, iq_labels_train])
test_data = pd.concat([sj_features_test_num, iq_features_test_num])
test_data['is_iquitos'] = test_data.index.get_level_values(0).isin(iq_features_test_num.index.get_level_values(0))

test_data_added_prev = pd.read_csv(
    os.path.join("..", "data", "processed", "features_test_added_prev_3.csv")
)

### **Prev cases as predictions of another model**

#### Best supporting model choice 

In [6]:
knn = KNeighborsRegressor()
rf = RandomForestRegressor()
en = ElasticNet()
svr = SVR()
boosting = GradientBoostingRegressor()
mlp = MLPRegressor()

In [7]:
# Parameter grid for KNeighborsRegressor
PARAM_GRID_KNN = {
    'n_neighbors': [3, 5, 7, 9, 13, 17],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Parameter grid for RandomForestRegressor
PARAM_GRID_RF = {
    'n_estimators': [500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Parameter grid for GradientBoostingRegressor
PARAM_GRID_BOOSTING = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Parameter grid for ElasticNet
PARAM_GRID_EN = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99]
}

# Parameter grid for SVR
PARAM_GRID_SVR = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'kernel': ['linear', 'rbf', 'poly']
}

# Parameter grid for MLPRegressor
PARAM_GRID_MLP = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01]
}

#### Settings

In [8]:
K_PREV = 3
IMPUTATION_METHOD = "linear"
TOP_N_FEATURES = 10
CORR_METHOD = "pearson"
SCALER = "standard"

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

supp = supporting_model(
    train_data,
    train_labels,
    test_data,
    [knn, rf, en, svr, boosting, mlp],
    [PARAM_GRID_KNN, PARAM_GRID_RF, PARAM_GRID_EN, PARAM_GRID_SVR, PARAM_GRID_BOOSTING, PARAM_GRID_MLP],
)

Summary of the best supporting model choice: 
| Model                     | Score     |
|---------------------------|-----------|
| KNeighborsRegressor       | 21.0264    |
| RandomForestRegressor     | 20.2423   |
| ElasticNet                | 19.4159    |
| **SVR**                   | 16.9356  |
| GradientBoostingRegressor | 20.5962  |
| MLPRegressor              | 18.6864  |

In [34]:
features_test_added_prev = supp["X_test"]

In [35]:
features_test_added_prev.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,ndvi_ne_ndvi_nw_sum,ndvi_ne_ndvi_se_diff,ndvi_ne_ndvi_se_prod,ndvi_ne_ndvi_se_quot,ndvi_ne_ndvi_se_sum,ndvi_ne_ndvi_sw_diff,ndvi_ne_ndvi_sw_prod,1_prev_cases,2_prev_cases,3_prev_cases
year,weekofyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2008,18,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,298.55,294.527143,301.1,296.4,...,-0.0378,-0.121629,-0.001942,-0.18398,0.083829,-0.1101,-0.001724,4.0,1.0,1.0


#### Models

In [9]:
model_trainer_knn = ModelTrainer(model=KNeighborsRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_KNN)

model_trainer_knn.fit(train_data, train_labels)
model_trainer_knn.transform(train_data, train_labels, test_data_added_prev)

--- Model KNeighborsRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -13.3172,-13.2379,-12.4259,-12.3635,-12.9029,-12.8300,-12.3177,-12.2165,-12.6692,-12.5994,-12.2304,-12.1160,-12.7643,-12.6668,-12.0932,-11.9941,-12.9591,-12.8167,-12.2909,-12.1404,-13.1663,-13.0198,-12.4686,-12.2888
Mean: -12.5790


In [10]:
model_trainer_boosting = ModelTrainer(model=GradientBoostingRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_BOOSTING)

model_trainer_boosting.fit(train_data, train_labels)
model_trainer_boosting.transform(train_data, train_labels, test_data_added_prev)

--- Model GradientBoostingRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -12.1594,-12.1581,-7.9062,-7.9303,-8.1776,-8.0985,-7.9941,-7.8463,-12.2095,-8.1343,-11.8623,-8.0659,-8.2318,-8.1546,-7.8366,-7.8042,-8.1579,-12.1497,-12.1192,-7.9606,-11.9270,-8.6565,-8.2246,-12.0471,-8.1179,-8.3570,-8.7660,-8.5070,-7.7135,-7.8491,-8.0032,-8.2440,-8.6482,-12.0570,-12.1609,-7.7987,-7.5942,-7.9941,-8.1579,-7.9521,-11.8987,-7.8680,-8.0599,-8.6492,-7.9763,-8.8310,-8.3324,-8.0102,-7.7517,-7.7893,-8.0469,-7.8434,-7.6871,-7.6873,-9.0317,-8.0709,-8.0835,-8.1026,-7.8736,-8.0284,-7.8566,-7.7097,-7.6690,-8.2713,-8.0397,-8.0869,-12.0431,-7.7909,-8.3605,-7.9619,-8.2166,-8.2890,-8.4892,-8.2196,-7.9366,-7.6618,-7.9423,-7.9712,-8.3199,-7.7452,-7.9426,-8.1551,-8.0609,-7.8456,-8.0244,-7.8863,-8.7657,-12.2166,-7.7766,-8.9431,-11.9063,-7.9704,-7.8074,-8.1583,-11.8625,-7.9431,-8.0562,-8.3579,-8.5114,-8.0127
Mean: -8.6814


In [11]:
model_trainer_rf = ModelTrainer(model=RandomForestRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_RF)

model_trainer_rf.fit(train_data, train_labels)
model_trainer_rf.transform(train_data, train_labels, test_data_added_prev)

--- Model RandomForestRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -7.5146,-7.4914,-7.4514,-7.4356,-7.4420,-7.3651,-7.4046,-7.3835,-7.3797,-7.5075,-7.4915,-7.4341,-7.4524,-7.4246,-7.3967,-7.3578,-7.3659,-7.3692,-7.5074,-7.5027,-7.4827,-7.4359,-7.3938,-7.3820,-7.3822,-7.3710,-7.3840,-7.5210,-7.4614,-7.4817,-7.4427,-7.4001,-7.3996,-7.3857,-7.3938,-7.3745
Mean: -7.4269


In [12]:
model_trainer_en = ModelTrainer(model=ElasticNet(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_EN)

model_trainer_en.fit(train_data, train_labels)
model_trainer_en.transform(train_data, train_labels, test_data_added_prev)

--- Model ElasticNet ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.


  model = cd_fast.enet_coordinate_descent(


CV results: -6.7407,-6.7437,-6.7486,-6.7564,-6.7676,-6.7711,-6.7738,-7.1066,-6.9852,-6.8531,-6.7281,-6.6368,-6.6256,-6.6243,-9.8811,-9.2966,-8.6911,-7.9765,-7.0770,-6.8241,-6.6084,-18.9276,-18.6777,-17.9570,-16.8038,-14.1951,-12.9504,-11.4602,-24.2179,-24.4549,-24.6924,-24.6927,-24.6927,-24.6927,-24.6927
Mean: -12.4093


In [13]:
model_trainer_svr = ModelTrainer(model=SVR(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_SVR)

model_trainer_svr.fit(train_data, train_labels)
model_trainer_svr.transform(train_data, train_labels, test_data_added_prev)

--- Model SVR ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -7.5634,-19.0435,-18.7897,-7.5694,-19.0466,-18.7853,-7.5611,-19.0431,-18.7750,-7.5572,-19.0525,-18.7801,-6.5957,-14.6085,-14.0614,-6.5941,-14.6230,-14.0474,-6.5926,-14.6292,-14.0348,-6.5925,-14.6424,-14.0265,-6.5614,-10.4987,-12.5440,-6.5612,-10.4922,-12.5369,-6.5557,-10.4939,-12.5298,-6.5464,-10.5074,-12.5228,-6.5457,-9.3787,-13.5530,-6.5520,-9.3732,-13.5340,-6.5468,-9.3660,-13.5140,-6.5416,-9.3485,-13.4832
Mean: -11.6396


In [14]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

model_trainer_mlp = ModelTrainer(model=MLPRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_MLP)

model_trainer_mlp.fit(train_data, train_labels)
model_trainer_mlp.transform(train_data, train_labels, test_data_added_prev)

--- Model MLPRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -8.2793,-7.4172,-8.3488,-7.4165,-8.1371,-7.4874,-8.1464,-7.5657,-7.5292,-7.8777,-7.5101,-8.2079,-7.8705,-8.2572,-7.5299,-8.4957,-8.3290,-7.3808,-8.3376,-7.5159,-7.9992,-7.4936,-8.0778,-7.4766,-7.7894,-9.3422,-7.5637,-8.5272,-7.7587,-8.3384,-7.6335,-8.2327,-8.3658,-7.2367,-8.2649,-7.1746,-7.9572,-7.5450,-8.0269,-7.3485,-7.7956,-7.6673,-7.8680,-8.1518,-7.5062,-7.9279,-7.9106,-8.1649,-10.7150,-12.7497,-10.6840,-12.1953,-9.5862,-11.1337,-9.5142,-11.1723,-11.2024,-11.7887,-10.9883,-12.2587,-11.0335,-11.3566,-11.0836,-11.7366,-10.6163,-12.5994,-10.6943,-12.9991,-9.6513,-11.4736,-9.6270,-11.3037,-11.2320,-11.6465,-11.0807,-11.1273,-11.2100,-11.4899,-11.1915,-11.2750,-10.6434,-13.0336,-10.7041,-12.3313,-9.5623,-11.2755,-9.6219,-11.4029,-11.0832,-10.8762,-11.1332,-11.4605,-11.1577,-11.2644,-11.2269,-11.5698
Mean: -9.5370


Summary:   

| Model                       | Score   |
|-----------------------------|---------|
| KNeighborsRegressor         | 12.5790 |
| GradientBoostingRegressor   | 8.6801 |
| **RandomForestRegressor**   | 7.4366  |
| ElasticNet                  | 12.4903 |
| SVR                         | 11.6396 |
| MLPRegressor                | 9.5044 |

#### Stacking regressor built from best 3 models 

In [None]:
train_data_trans = model_trainer_knn.get_X_train()

stacking_regressor = StackingRegressor(
    estimators=[
        ('mlp_regressor', model_trainer_mlp.get_model()),
        ('random_forest', model_trainer_rf.get_model()),
        ('gradient_boosting', model_trainer_boosting.get_model())
    ],
    final_estimator=GradientBoostingRegressor()
)

print("Fitting stacking regressor.")
stacking_regressor.fit(train_data_trans, train_labels.values.ravel())

print("Cross-validating stacking regressor.")
cv_scores = cross_val_score(
    stacking_regressor, train_data_trans, train_labels.values.ravel(), cv=5, scoring='neg_mean_absolute_error'
)

mean_cv_score = -cv_scores.mean()
std_cv_score = cv_scores.std()

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean Absolute Error: {mean_cv_score:.4f}")
print(f"Cross-Validation Standard Deviation: {std_cv_score:.4f}")

Fitting stacking regressor.
Cross-validating stacking regressor.
Cross-Validation Scores: [-17.51530333  -8.5498158   -5.44374855  -4.82402628  -4.87888811]
Cross-Validation Mean Squared Error: 8.2424
Cross-Validation Standard Deviation: 4.8355


#### Making predictions with the best models

In [27]:
y_pred = stacking_regressor.predict(test_data_added_prev)

# Create a DataFrame for submission
submission = pd.DataFrame()
submission['year'] = test_data.index.get_level_values(0)
submission['weekofyear'] = test_data.index.get_level_values(1)
submission['city'] = test_data_added_prev['is_iquitos'].apply(lambda x: 'iq' if x else 'sj')
submission['total_cases'] = y_pred.round().astype(int)  # Round predictions and convert to integer

# Reorder columns to match the required format
submission = submission[['city', 'year', 'weekofyear', 'total_cases']]


# Display the first few rows of the submission DataFrame
submission.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,327
1,sj,2008,19,311
2,sj,2008,20,311
3,sj,2008,21,311
4,sj,2008,22,311


In [None]:
y_pred = model_trainer_boosting.get_model().predict(test_data_added_prev)
submission_format = pd.read_csv("../data/submission_format.csv")
assert len(y_pred) == len(submission_format), "Length of y_pred does not match submission_format"
submission_format['total_cases'] = y_pred.round().astype(int)
submission_format.head()
submission_format.to_csv("../data/predictions/prev_as_pred_boosting_2703.csv", index=False)

### **Previous cases as iterative predictions**

### **Dedicated library**