## **Predicting Disease Spread: Model for both cities**

### **Imports**

In [1]:
import sys
import os

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(".."))
from src.data_setup import load_data, split_cities
from src.model import (
    ModelTrainer,
    supporting_model,
    predict_iteratively_and_save,
    DummyTrainer
)

from src.feature_engineering import (
    NAImputer,
)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### **Data**

In [2]:
features_train, labels_train = load_data(train=True)
sj_features_train, iq_features_train = split_cities(features_train)
sj_labels_train, iq_labels_train = split_cities(labels_train)

In [3]:
features_test, labels_test = load_data(train=False)
sj_features_test, iq_features_test = split_cities(features_test)

#### Drop only non-numeric, irrelevant column

In [4]:
iq_features_train_num = iq_features_train.drop(columns=['week_start_date'])
iq_features_test_num = iq_features_test.drop(columns=['week_start_date'])
sj_features_train_num = sj_features_train.drop(columns=['week_start_date'])
sj_features_test_num = sj_features_test.drop(columns=['week_start_date'])

#### Join datasets into one

In [5]:
train_data = pd.concat([sj_features_train_num, iq_features_train_num])
train_data['is_iquitos'] = train_data.index.get_level_values(0).isin(iq_features_train_num.index.get_level_values(0))

train_labels = pd.concat([sj_labels_train, iq_labels_train])
test_data = pd.concat([sj_features_test_num, iq_features_test_num])
test_data['is_iquitos'] = test_data.index.get_level_values(0).isin(iq_features_test_num.index.get_level_values(0))

test_data_added_prev = pd.read_csv(
    os.path.join("..", "data", "processed", "features_test_added_prev_3.csv")
)

### **Prev cases as predictions of another model**

#### Best supporting model choice 

In [6]:
knn = KNeighborsRegressor()
rf = RandomForestRegressor()
en = ElasticNet()
svr = SVR()
boosting = GradientBoostingRegressor()
mlp = MLPRegressor()

In [7]:
# Parameter grid for KNeighborsRegressor
PARAM_GRID_KNN = {
    'n_neighbors': [3, 5, 7, 9, 13, 17],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Parameter grid for RandomForestRegressor
PARAM_GRID_RF = {
    'n_estimators': [500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Parameter grid for GradientBoostingRegressor
PARAM_GRID_BOOSTING = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Parameter grid for ElasticNet
PARAM_GRID_EN = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99]
}

# Parameter grid for SVR
PARAM_GRID_SVR = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'kernel': ['linear', 'rbf', 'poly']
}

# Parameter grid for MLPRegressor
PARAM_GRID_MLP = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
    'alpha': [0.0001, 0.001, 0.01]
}

#### Settings

In [8]:
K_PREV = 3
IMPUTATION_METHOD = "linear"
TOP_N_FEATURES = 10
CORR_METHOD = "pearson"
SCALER = "standard"

In [9]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

supp = supporting_model(
    train_data,
    train_labels,
    test_data,
    [knn, rf, en, svr, boosting, mlp],
    [PARAM_GRID_KNN, PARAM_GRID_RF, PARAM_GRID_EN, PARAM_GRID_SVR, PARAM_GRID_BOOSTING, PARAM_GRID_MLP],
)

Choosing supporting model.
Evaluating 6 models.
Evaluating model: KNeighborsRegressor
Best score for KNeighborsRegressor: -21.0264
Evaluating model: RandomForestRegressor
Best score for RandomForestRegressor: -20.3599
Evaluating model: ElasticNet


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best score for ElasticNet: -19.4159
Evaluating model: SVR
Best score for SVR: -16.9356
Evaluating model: GradientBoostingRegressor
Best score for GradientBoostingRegressor: -20.5889
Evaluating model: MLPRegressor




Best score for MLPRegressor: -18.7366
Best model: None with score: -16.9356
First 5 predictions: [33.17636419 33.08784927 36.67912139 39.18084007 46.94562615]
preparing test dataset.


Summary of the best supporting model choice: 
| Model                     | Score     |
|---------------------------|-----------|
| KNeighborsRegressor       | 21.0264    |
| RandomForestRegressor     | 20.2423   |
| ElasticNet                | 19.4159    |
| **SVR**                   | 16.9356  |
| GradientBoostingRegressor | 20.5962  |
| MLPRegressor              | 18.6864  |

In [10]:
features_test_added_prev = supp["X_test"]

In [11]:
features_test_added_prev.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,ndvi_ne_ndvi_nw_sum,ndvi_ne_ndvi_se_diff,ndvi_ne_ndvi_se_prod,ndvi_ne_ndvi_se_quot,ndvi_ne_ndvi_se_sum,ndvi_ne_ndvi_sw_diff,ndvi_ne_ndvi_sw_prod,1_prev_cases,2_prev_cases,3_prev_cases
year,weekofyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2008,18,-0.0189,-0.0189,0.102729,0.0912,78.6,298.492857,298.55,294.527143,301.1,296.4,...,-0.0378,-0.121629,-0.001942,-0.18398,0.083829,-0.1101,-0.001724,4.0,1.0,1.0


#### Models

In [12]:
model_trainer_knn = ModelTrainer(model=KNeighborsRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_KNN)

model_trainer_knn.fit(train_data, train_labels)
model_trainer_knn.transform(train_data, train_labels, test_data_added_prev)

--- Model KNeighborsRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -13.3172,-13.2379,-12.4259,-12.3635,-12.9029,-12.8300,-12.3177,-12.2165,-12.6692,-12.5994,-12.2304,-12.1160,-12.7643,-12.6668,-12.0932,-11.9941,-12.9591,-12.8167,-12.2909,-12.1404,-13.1663,-13.0198,-12.4686,-12.2888
Mean: -12.5790


In [13]:
model_trainer_boosting = ModelTrainer(model=GradientBoostingRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_BOOSTING)

model_trainer_boosting.fit(train_data, train_labels)
model_trainer_boosting.transform(train_data, train_labels, test_data_added_prev)

--- Model GradientBoostingRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -12.1639,-12.1578,-7.8843,-7.9505,-8.1839,-8.2084,-8.0892,-7.8656,-12.2370,-8.1231,-11.8681,-8.0483,-8.1662,-8.1373,-7.9312,-7.8537,-8.1114,-12.1609,-12.1277,-7.9164,-11.9319,-8.6607,-8.1340,-12.0426,-8.0677,-8.4182,-8.7699,-8.5123,-7.7753,-7.8366,-7.9728,-8.2305,-8.6495,-12.0519,-12.1641,-7.7947,-7.5965,-8.0597,-8.1033,-7.9840,-11.9041,-7.9075,-8.0868,-8.6520,-7.9784,-8.8353,-8.3572,-7.9961,-7.8542,-7.8281,-8.0005,-7.8562,-7.7083,-7.6940,-9.0336,-8.0227,-8.0675,-8.1460,-7.8999,-8.0746,-7.8268,-7.7345,-7.6684,-8.3562,-8.0567,-8.1068,-12.0428,-7.8006,-8.3146,-7.9640,-8.2875,-8.3190,-8.4895,-8.2068,-7.9689,-7.7037,-7.9554,-7.9707,-8.3051,-7.7553,-7.9411,-8.1383,-8.0839,-7.9128,-8.0302,-7.8708,-8.7165,-12.2514,-7.7795,-8.9572,-11.9003,-7.9539,-7.7856,-8.2188,-11.8639,-7.9630,-8.0282,-8.4004,-8.5049,-8.0062
Mean: -8.6898


In [14]:
model_trainer_rf = ModelTrainer(model=RandomForestRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_RF)

model_trainer_rf.fit(train_data, train_labels)
model_trainer_rf.transform(train_data, train_labels, test_data_added_prev)

--- Model RandomForestRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -7.5364,-7.4849,-7.4426,-7.4324,-7.3937,-7.3771,-7.4021,-7.3884,-7.4189,-7.4915,-7.4926,-7.4441,-7.4214,-7.4091,-7.3771,-7.3563,-7.3610,-7.4002,-7.5154,-7.4749,-7.4797,-7.4516,-7.4362,-7.3736,-7.3932,-7.3832,-7.3918,-7.5518,-7.4325,-7.4334,-7.4518,-7.3975,-7.3702,-7.4066,-7.3768,-7.3944
Mean: -7.4262


In [15]:
model_trainer_en = ModelTrainer(model=ElasticNet(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_EN)

model_trainer_en.fit(train_data, train_labels)
model_trainer_en.transform(train_data, train_labels, test_data_added_prev)

--- Model ElasticNet ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -6.7407,-6.7437,-6.7486,-6.7564,-6.7676,-6.7711,-6.7738,-7.1066,-6.9852,-6.8531,-6.7281,-6.6368,-6.6256,-6.6243,-9.8811,-9.2966,-8.6911,-7.9765,-7.0770,-6.8241,-6.6084,-18.9276,-18.6777,-17.9570,-16.8038,-14.1951,-12.9504,-11.4602,-24.2179,-24.4549,-24.6924,-24.6927,-24.6927,-24.6927,-24.6927
Mean: -12.4093


  model = cd_fast.enet_coordinate_descent(


In [16]:
model_trainer_svr = ModelTrainer(model=SVR(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_SVR)

model_trainer_svr.fit(train_data, train_labels)
model_trainer_svr.transform(train_data, train_labels, test_data_added_prev)

--- Model SVR ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -7.5634,-19.0435,-18.7897,-7.5694,-19.0466,-18.7853,-7.5611,-19.0431,-18.7750,-7.5572,-19.0525,-18.7801,-6.5957,-14.6085,-14.0614,-6.5941,-14.6230,-14.0474,-6.5926,-14.6292,-14.0348,-6.5925,-14.6424,-14.0265,-6.5614,-10.4987,-12.5440,-6.5612,-10.4922,-12.5369,-6.5557,-10.4939,-12.5298,-6.5464,-10.5074,-12.5228,-6.5457,-9.3787,-13.5530,-6.5520,-9.3732,-13.5340,-6.5468,-9.3660,-13.5140,-6.5416,-9.3485,-13.4832
Mean: -11.6396


In [17]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

model_trainer_mlp = ModelTrainer(model=MLPRegressor(),
                             imputation_method=IMPUTATION_METHOD,
                             top_n_features=TOP_N_FEATURES,
                             corr_method=CORR_METHOD,
                             scaling_method=SCALER,
                             k_prev_targets=K_PREV,
                             param_grid=PARAM_GRID_MLP)

model_trainer_mlp.fit(train_data, train_labels)
model_trainer_mlp.transform(train_data, train_labels, test_data_added_prev)

--- Model MLPRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.




CV results: -8.5189,-7.2976,-8.4999,-7.2913,-8.0779,-7.3769,-8.0927,-7.5985,-7.8566,-7.9244,-7.5594,-8.2291,-7.6331,-7.5281,-7.6187,-8.0993,-8.4097,-7.2224,-8.4228,-7.4415,-8.1005,-7.5819,-8.1828,-7.2494,-7.6748,-7.7078,-7.5686,-8.3654,-7.4738,-7.7123,-7.5033,-7.9490,-8.3873,-7.4557,-8.4294,-7.4655,-8.0951,-7.4994,-8.0353,-7.4222,-7.4791,-8.6734,-7.5885,-8.2901,-7.6622,-8.4377,-7.4639,-8.1734,-10.7004,-11.9948,-10.7163,-12.3547,-9.5793,-11.3456,-9.5832,-11.3901,-11.2109,-11.7311,-11.2536,-11.7653,-11.1021,-11.3397,-11.0881,-11.5274,-10.7318,-12.4116,-10.6442,-12.8541,-9.6409,-11.4844,-9.5533,-10.9821,-11.0826,-11.9759,-11.1474,-11.8968,-11.1770,-11.0638,-11.2920,-12.1156,-10.5846,-12.4610,-10.7431,-12.6359,-9.5967,-11.0036,-9.5651,-11.1507,-11.1530,-11.3566,-11.1440,-11.4914,-11.1673,-11.5859,-11.0975,-11.7160
Mean: -9.5158


Summary:   

| Model                       | Score   |
|-----------------------------|---------|
| KNeighborsRegressor         | 12.5790 |
| GradientBoostingRegressor   | 8.6801 |
| **RandomForestRegressor**   | 7.4366  |
| ElasticNet                  | 12.4903 |
| SVR                         | 11.6396 |
| MLPRegressor                | 9.5044 |

#### Stacking regressor built from best 3 models 

In [18]:
train_data_trans = model_trainer_knn.get_X_train()

stacking_regressor = StackingRegressor(
    estimators=[
        ('mlp_regressor', model_trainer_mlp.get_model()),
        ('random_forest', model_trainer_rf.get_model()),
        ('gradient_boosting', model_trainer_boosting.get_model())
    ],
    final_estimator=GradientBoostingRegressor()
)

print("Fitting stacking regressor.")
stacking_regressor.fit(train_data_trans, train_labels.values.ravel())

print("Cross-validating stacking regressor.")
cv_scores = cross_val_score(
    stacking_regressor, train_data_trans, train_labels.values.ravel(), cv=5, scoring='neg_mean_absolute_error'
)

mean_cv_score = -cv_scores.mean()
std_cv_score = cv_scores.std()

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean Absolute Error: {mean_cv_score:.4f}")
print(f"Cross-Validation Standard Deviation: {std_cv_score:.4f}")

Fitting stacking regressor.
Cross-validating stacking regressor.
Cross-Validation Scores: [-16.86942471  -8.67015331  -5.05343899  -4.95712885  -4.85675014]
Cross-Validation Mean Absolute Error: 8.0814
Cross-Validation Standard Deviation: 4.6239


#### Making predictions with the best models

In [19]:
y_pred = stacking_regressor.predict(test_data_added_prev)

# Create a DataFrame for submission
submission = pd.DataFrame()
submission['year'] = test_data.index.get_level_values(0)
submission['weekofyear'] = test_data.index.get_level_values(1)
submission['city'] = test_data_added_prev['is_iquitos'].apply(lambda x: 'iq' if x else 'sj')
submission['total_cases'] = y_pred.round().astype(int)  # Round predictions and convert to integer

# Reorder columns to match the required format
submission = submission[['city', 'year', 'weekofyear', 'total_cases']]


# Display the first few rows of the submission DataFrame
submission.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,360
1,sj,2008,19,299
2,sj,2008,20,299
3,sj,2008,21,292
4,sj,2008,22,299


In [20]:
y_pred = model_trainer_boosting.get_model().predict(test_data_added_prev)
submission_format = pd.read_csv("../data/submission_format.csv")
assert len(y_pred) == len(submission_format), "Length of y_pred does not match submission_format"
submission_format['total_cases'] = y_pred.round().astype(int)
submission_format.head()
submission_format.to_csv("../data/predictions/prev_as_pred_boosting_2703.csv", index=False)

### **Previous cases as iterative predictions**

In [28]:
# gradient boosting
model_trainer_gd = ModelTrainer(
    model=GradientBoostingRegressor(),
    imputation_method=IMPUTATION_METHOD,
    top_n_features=TOP_N_FEATURES,
    corr_method=CORR_METHOD,
    scaling_method=SCALER,
    k_prev_targets=K_PREV,
    param_grid=PARAM_GRID_BOOSTING
)

model_trainer_gd.fit(train_data, train_labels)
model_trainer_gd.transform(train_data, train_labels, test_data_added_prev)   # model traning on data from prev_cases as features
model = model_trainer_gd.get_model()
scaler = model_trainer_gd.get_scaler()

predict_iteratively_and_save(
    model_trainer_gd,
    model_name="gradient_boosting",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_gb.csv"
)

--- Model GradientBoostingRegressor ---
Preprocessing started.
Preprocessing finished.
Training model.
Tunning model's hyperparameters.
CV results: -12.1690,-12.1590,-7.8985,-8.0021,-8.1306,-8.1961,-8.0369,-7.8529,-12.2144,-8.0991,-11.8668,-8.0329,-8.1227,-8.1304,-7.8841,-7.8508,-8.1513,-12.1518,-12.1073,-7.9557,-11.9210,-8.6635,-8.1807,-12.0566,-8.1667,-8.3282,-8.7527,-8.5066,-7.6684,-7.8683,-7.9580,-8.1591,-8.6434,-12.0612,-12.1664,-7.8182,-7.6069,-8.0506,-8.1293,-7.9331,-11.9009,-7.9204,-8.0525,-8.6530,-7.9502,-8.8361,-8.3407,-8.0327,-7.8850,-7.8136,-8.0320,-7.8276,-7.7016,-7.6690,-9.0299,-8.0660,-8.1648,-8.1300,-7.8280,-8.0078,-7.8450,-7.7314,-7.6625,-8.3094,-8.0662,-8.0010,-12.0464,-7.8130,-8.3888,-7.9430,-8.2829,-8.3236,-8.4890,-8.2160,-7.9429,-7.6854,-7.9709,-7.9667,-8.2982,-7.7458,-7.8629,-8.1255,-8.0671,-7.8455,-8.0581,-7.8534,-8.7636,-12.2363,-7.7688,-8.9380,-11.8996,-7.9844,-7.7683,-8.2552,-11.8507,-7.9275,-8.0636,-8.3686,-8.5095,-8.0082
Mean: -8.6838
Saved predictions for g

In [29]:
# random forest
predict_iteratively_and_save(
    model_trainer_rf,
    model_name="random_forest",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_rf.csv"
)

Saved predictions for random_forest to ../data/predictions/prev_as_iterative_rf.csv


In [30]:
# knn
predict_iteratively_and_save(
    model_trainer_knn,
    model_name="knn",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_knn.csv"
)

Saved predictions for knn to ../data/predictions/prev_as_iterative_knn.csv


In [31]:
# elastic net
predict_iteratively_and_save(
    model_trainer_en,
    model_name="elastic_net",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_en.csv"
)

Saved predictions for elastic_net to ../data/predictions/prev_as_iterative_en.csv


In [32]:
# svr
predict_iteratively_and_save(
    model_trainer_svr,
    model_name="svr",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_svr.csv"
)

Saved predictions for svr to ../data/predictions/prev_as_iterative_svr.csv


In [33]:
# mlp
predict_iteratively_and_save(
    model_trainer_mlp,
    model_name="mlp",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_mlp.csv"
)

Saved predictions for mlp to ../data/predictions/prev_as_iterative_mlp.csv


#### Stacking regressor

In [34]:
X_train_stacking = model_trainer_knn.get_X_train()
y_train_stacking = train_labels.values.ravel()
scaler_stacking = model_trainer_knn.get_scaler()

stacking_regressor = StackingRegressor(
    estimators=[
        ('mlp', model_trainer_mlp.get_model()),
        ('rf', model_trainer_rf.get_model()),
        ('gb', model_trainer_boosting.get_model())
    ],
    final_estimator=GradientBoostingRegressor()
)

print("Cross-validating stacking regressor.")
cv_scores = cross_val_score(
    stacking_regressor,
    X_train_stacking,
    y_train_stacking,
    cv=5,
    scoring='neg_mean_absolute_error'
)

mean_cv_score = -cv_scores.mean()
std_cv_score = cv_scores.std()

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Cross-Validation Mean Absolute Error: {mean_cv_score:.4f}")
print(f"Cross-Validation Standard Deviation: {std_cv_score:.4f}")

print("Fitting stacking regressor.")
stacking_regressor.fit(X_train_stacking, y_train_stacking)

Cross-validating stacking regressor.
Cross-Validation Scores: [-17.79349983  -8.8773947   -5.33796352  -4.95614806  -5.02252591]
Cross-Validation Mean Absolute Error: 8.3975
Cross-Validation Standard Deviation: 4.9216
Fitting stacking regressor.


In [35]:
model_trainer_stacking = DummyTrainer(
    model=stacking_regressor,
    X_train=X_train_stacking,
    scaler=scaler_stacking
)

In [36]:
predict_iteratively_and_save(
    model_trainer=model_trainer_stacking,
    model_name="stacking",
    X_test_raw=test_data_added_prev,
    y_train=train_labels.values,
    k_prev=K_PREV,
    submission_format_path="../data/submission_format.csv",
    output_path="../data/predictions/prev_as_iterative_stacking.csv"
)

Saved predictions for stacking to ../data/predictions/prev_as_iterative_stacking.csv


### **Dedicated library**