In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from sklearn.model_selection import ParameterGrid

In [3]:
df = pd.read_csv('final_combined_dataset.csv')


In [4]:
from sklearn.preprocessing import LabelEncoder


df['week'] = pd.to_numeric(df['week'], errors='coerce')
df['date'] = pd.to_datetime(df['date'])
# Add date_ordinal
df['date_ordinal'] = df['date'].apply(lambda x: x.toordinal())

# Extract year and month
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Add cyclic month representation
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Add cyclic week representation
df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
df['week_cos'] = np.cos(2 * np.pi * df['week'] / 52)

df["year_sin"] = np.sin(2 * np.pi * df["year"] / df["year"].max())
df["year_cos"] = np.cos(2 * np.pi * df["year"] / df["year"].max())

# df["cases_rolling_mean"] = df.groupby("geocode")["cases"].transform(lambda x: x.rolling(4).mean())


# Step 2: Create lag features
def create_lags(dataframe, group_col, target_col, lags, inplace = False):
    if isinstance(target_col, list):  # If target_col is a list of columns
        for col in target_col:
            for lag in lags:
                if inplace:
                    dataframe[target_col] = dataframe.groupby(group_col)[col].shift(lag)
                else:
                    dataframe[f'{col}_lag{lag}'] = dataframe.groupby(group_col)[col].shift(lag)
    else:  # If target_col is a single column
        for lag in lags:
            if inplace:
                dataframe[target_col] = dataframe.groupby(group_col)[target_col].shift(lag)
            else:
                dataframe[f'{target_col}_lag{lag}'] = dataframe.groupby(group_col)[target_col].shift(lag)
    return dataframe

# Lag cases by 1 and 2 weeks
data = create_lags(df, group_col='city', target_col='cases', lags=[0,1,-2])

# Lag weather-related variables by 5 and 6 weeks for each city
weather_columns = ['tempe_min', 'temp_avg', 'temp_max', 'humidity_max', 'humidity_avg', 'humidity_min',
                   'precipitation_avg_ordinary_kriging', 'precipitation_max_ordinary_kriging',
                   'precipitation_avg_regression_kriging', 'precipitation_max_regression_kriging']
data = create_lags(df, group_col='city', target_col=weather_columns, lags=[4, 5, 6])
data = data.dropna().reset_index(drop=True)
data['geocode'] = data['geocode'].astype(str)  # Ensure categorical features are in string format

# Now 'transformed_df' contains the transformed data with the correct types
train_data = data[data['date'].dt.year <= 2020]
test_data = data[data['date'].dt.year >= 2021]
pd.set_option('display.max_columns', None)

X_train = train_data.drop(columns=['cases','cases_per_100k','cases_lag-2'])
y_train = train_data[['cases_lag-2']]

X_test = test_data.drop(columns=['cases','cases_per_100k','cases_lag-2'])
y_test = test_data[['cases_lag-2']]



In [None]:
X_test

Unnamed: 0,date,week,population,tempe_min,humidity_max,humidity_avg,humidity_min,temp_avg,temp_max,city,geocode,vim,vim_monthly,precipitation_avg_ordinary_kriging,precipitation_max_ordinary_kriging,precipitation_avg_regression_kriging,precipitation_max_regression_kriging,long,lat,nearby_cases_weighted,date_ordinal,year,month,month_sin,month_cos,week_sin,week_cos,year_sin,year_cos,cases_lag0,cases_lag1,tempe_min_lag4,tempe_min_lag5,tempe_min_lag6,temp_avg_lag4,temp_avg_lag5,temp_avg_lag6,temp_max_lag4,temp_max_lag5,temp_max_lag6,humidity_max_lag4,humidity_max_lag5,humidity_max_lag6,humidity_avg_lag4,humidity_avg_lag5,humidity_avg_lag6,humidity_min_lag4,humidity_min_lag5,humidity_min_lag6,precipitation_avg_ordinary_kriging_lag4,precipitation_avg_ordinary_kriging_lag5,precipitation_avg_ordinary_kriging_lag6,precipitation_max_ordinary_kriging_lag4,precipitation_max_ordinary_kriging_lag5,precipitation_max_ordinary_kriging_lag6,precipitation_avg_regression_kriging_lag4,precipitation_avg_regression_kriging_lag5,precipitation_avg_regression_kriging_lag6,precipitation_max_regression_kriging_lag4,precipitation_max_regression_kriging_lag5,precipitation_max_regression_kriging_lag6
464,2021-01-03,202101,207044,20.571429,93.220059,71.354385,51.797334,26.159341,31.142857,angra dos reis,3300100,0.855700,0.858233,3.4918,14.5038,2.8288,14.1979,-44.319627,-23.009116,2.158316,737793,2021,1,5.000000e-01,0.866025,-0.354605,-0.935016,-3.107406e-03,0.999995,1,0.0,19.700000,19.433333,19.166667,23.607875,24.228308,26.043163,26.371429,28.457143,31.166667,92.478601,91.546282,87.389231,80.701773,72.443475,61.218134,70.075335,56.354723,44.425297,6.1941,3.3076,1.5273,22.7958,10.7464,7.8955,5.7965,1.7763,1.3727,21.6670,7.8814,7.0997
465,2021-01-10,202102,207044,21.857143,94.056203,74.070766,56.106602,26.469388,30.857143,angra dos reis,3300100,0.857531,0.858233,4.1258,11.5154,1.6147,9.1777,-44.319627,-23.009116,1.420572,737800,2021,1,5.000000e-01,0.866025,-0.464723,-0.885456,-3.107406e-03,0.999995,0,1.0,19.966667,19.700000,19.433333,24.542475,23.607875,24.228308,28.338095,26.371429,28.457143,92.049874,92.478601,91.546282,73.439776,80.701773,72.443475,59.027411,70.075335,56.354723,3.9988,6.1941,3.3076,12.8230,22.7958,10.7464,3.1300,5.7965,1.7763,10.6539,21.6670,7.8814
466,2021-01-17,202103,207044,22.285714,85.551194,58.037205,42.041434,28.803768,32.857143,angra dos reis,3300100,0.859299,0.858233,0.0000,1.0099,0.0000,0.3171,-44.319627,-23.009116,1.123414,737807,2021,1,5.000000e-01,0.866025,-0.568065,-0.822984,-3.107406e-03,0.999995,0,0.0,20.233333,19.966667,19.700000,22.868606,24.542475,23.607875,24.514286,28.338095,26.371429,92.960505,92.049874,92.478601,85.215738,73.439776,80.701773,77.664367,59.027411,70.075335,7.0943,3.9988,6.1941,31.8542,12.8230,22.7958,6.6823,3.1300,5.7965,31.5070,10.6539,21.6670
467,2021-01-24,202104,207044,21.285714,86.385208,55.524891,38.248368,29.108494,33.714286,angra dos reis,3300100,0.860919,0.858233,0.0000,0.0000,0.0025,0.0257,-44.319627,-23.009116,0.000000,737814,2021,1,5.000000e-01,0.866025,-0.663123,-0.748511,-3.107406e-03,0.999995,0,0.0,20.500000,20.233333,19.966667,23.285714,22.868606,24.542475,27.000000,24.514286,28.338095,94.018760,92.960505,92.049874,83.248160,85.215738,73.439776,67.555290,77.664367,59.027411,5.2942,7.0943,3.9988,17.1068,31.8542,12.8230,5.6054,6.6823,3.1300,17.4055,31.5070,10.6539
468,2021-01-31,202105,207044,21.000000,92.613150,75.049454,56.072542,25.506279,30.428571,angra dos reis,3300100,0.862375,0.858233,5.1522,23.7147,3.6285,21.5241,-44.319627,-23.009116,0.985127,737821,2021,1,5.000000e-01,0.866025,-0.748511,-0.663123,-3.107406e-03,0.999995,1,0.0,20.571429,20.500000,20.233333,26.159341,23.285714,22.868606,31.142857,27.000000,24.514286,93.220059,94.018760,92.960505,71.354385,83.248160,85.215738,51.797334,67.555290,77.664367,3.4918,5.2942,7.0943,14.5038,17.1068,31.8542,2.8288,5.6054,6.6823,14.1979,17.4055,31.5070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51501,2022-11-13,202246,273988,20.857143,90.883001,79.522168,61.268456,23.714286,27.857143,volta redonda,3306305,0.645087,0.642000,2.2254,9.5493,2.1986,9.4636,-44.093522,-22.509968,0.887888,738472,2022,11,-5.000000e-01,0.866025,0.822984,-0.568065,-2.449294e-16,1.000000,0,1.0,20.714286,20.285714,19.714286,23.803830,23.351190,21.847403,28.285714,27.857143,24.000000,92.496761,95.758676,96.594516,81.978423,85.265832,87.372491,63.452716,66.626779,76.540713,1.1280,0.8827,0.8149,5.8510,3.2493,3.1440,1.1517,0.8812,0.8472,6.0163,3.2454,3.2848
51502,2022-11-20,202247,273988,20.714286,93.280477,82.167320,67.014975,23.726190,27.000000,volta redonda,3306305,0.664894,0.642000,3.2501,15.0136,3.3057,17.6268,-44.093522,-22.509968,0.000000,738479,2022,11,-5.000000e-01,0.866025,0.748511,-0.663123,-2.449294e-16,1.000000,1,0.0,20.285714,20.714286,20.285714,24.448016,23.803830,23.351190,29.000000,28.285714,27.857143,92.425103,92.496761,95.758676,78.058320,81.978423,85.265832,61.497879,63.452716,66.626779,2.3012,1.1280,0.8827,9.4750,5.8510,3.2493,2.3382,1.1517,0.8812,8.8927,6.0163,3.2454
51503,2022-11-27,202248,273988,21.857143,95.875069,82.840922,61.313170,25.507143,30.857143,volta redonda,3306305,0.683798,0.642000,4.9788,17.2846,4.8787,16.7239,-44.093522,-22.509968,0.000000,738486,2022,11,-5.000000e-01,0.866025,0.663123,-0.748511,-2.449294e-16,1.000000,1,1.0,18.428571,20.285714,20.714286,21.704574,24.448016,23.803830,25.285714,29.000000,28.285714,95.788498,92.425103,92.496761,82.978925,78.058320,81.978423,66.874779,61.497879,63.452716,3.9107,2.3012,1.1280,11.4685,9.4750,5.8510,3.8144,2.3382,1.1517,10.8449,8.8927,6.0163
51504,2022-12-04,202249,273988,21.857143,92.543531,80.151192,64.110460,24.903139,28.142857,volta redonda,3306305,0.701526,0.716900,1.8253,8.1846,1.9016,8.4208,-44.093522,-22.509968,0.938861,738493,2022,12,-2.449294e-16,1.000000,0.568065,-0.822984,-2.449294e-16,1.000000,0,1.0,18.857143,18.428571,20.285714,21.597789,21.704574,24.448016,26.000000,25.285714,29.000000,96.562765,95.788498,92.425103,85.946250,82.978925,78.058320,67.027147,66.874779,61.497879,3.5073,3.9107,2.3012,15.8554,11.4685,9.4750,3.6403,3.8144,2.3382,15.6891,10.8449,8.8927


In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor

# Step 1-6: Keep the previous preprocessing steps the same (data preparation, scaling, etc.)
columns_not_to_scale = ['week_sin', 'week_cos', 'month_sin', 'month_cos', 'week', 'lat', 'long', 
                    'geocode', 'year_sin', 'year_cos']

# Fixed missing comma in selected_columns
selected_columns = [
    'cases_lag0', 'cases_lag1', 'geocode',
    'temp_avg_lag4', 'humidity_avg_lag4',
    'precipitation_max_regression_kriging_lag4',
    'week_sin', 'month_sin', 'week_cos', 'month_cos',
    'year_sin', 'year_cos','nearby_cases_weighted','lat','long'
]

# Prepare data
X_train = X_train[selected_columns]
X_test = X_test[selected_columns]

# Scaling features
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

numeric_features = [col for col in X_train.columns if X_train[col].dtype in ['float64', 'int64', 'float32', 'int32'] 
                   and col not in columns_not_to_scale]

X_train[numeric_features] = feature_scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = feature_scaler.transform(X_test[numeric_features])

# Reshaping the target variable for scaling
y_train_reshaped = y_train.values.reshape(-1, 1)
y_test_reshaped = y_test.values.reshape(-1, 1)

y_train_scaled = target_scaler.fit_transform(y_train_reshaped).ravel()
y_test_scaled = target_scaler.transform(y_test_reshaped).ravel()

from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor

# Define the parameter grid
param_grid = {
    'iterations': [500, 1000, 1500],  # Number of boosting rounds
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size
    'depth': [4, 6, 8, 10],  # Tree depth
    'l2_leaf_reg': [1, 3, 5, 7, 9],  # L2 regularization to prevent overfitting
    'bagging_temperature': [0, 1, 5, 10],  # Controls randomness in bootstrap sampling
    'random_strength': [1, 5, 10],  # Random noise added for overfitting prevention
    'border_count': [32, 64, 128],  # Number of splits for numerical features
    'boosting_type': ['Ordered', 'Plain'],  # Ordered (better for small data), Plain (faster)
    'early_stopping_rounds': [50, 100],  # Stop training if no improvement
    'verbose': [100]  # Logging frequency
}

# Initialize the CatBoost model
catboost_model = CatBoostRegressor(loss_function='RMSE', cat_features=['geocode'])

# Use RandomizedSearchCV for faster tuning (or GridSearchCV for exhaustive search)
random_search = RandomizedSearchCV(
    estimator=catboost_model,
    param_distributions=param_grid,
    n_iter=20,  # Number of different combinations to try
    scoring='neg_root_mean_squared_error',  # Optimize for RMSE
    cv=3,  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1
)

# Perform hyperparameter tuning
random_search.fit(X_train, y_train_scaled)

# Print the best parameters
print("Best Parameters:", random_search.best_params_)


# Predictions
y_train_pred_cb_scaled = catboost_model.predict(X_train)
y_test_pred_cb_scaled = catboost_model.predict(X_test)

# Inverse scaling for predictions
y_train_pred_cb = target_scaler.inverse_transform(y_train_pred_cb_scaled.reshape(-1, 1)).ravel()
y_test_pred_cb = target_scaler.inverse_transform(y_test_pred_cb_scaled.reshape(-1, 1)).ravel()

# Performance Metrics - CatBoost
train_rmse_cb = np.sqrt(mean_squared_error(y_train, y_train_pred_cb))
test_rmse_cb = np.sqrt(mean_squared_error(y_test, y_test_pred_cb))
train_r2_cb = r2_score(y_train, y_train_pred_cb)
test_r2_cb = r2_score(y_test, y_test_pred_cb)

print("\nCatBoost Model Performance:")
print("CatBoost RMSE (Train):", train_rmse_cb)
print("CatBoost RMSE (Test):", test_rmse_cb)
print("CatBoost R² (Train):", train_r2_cb)
print("CatBoost R² (Test):", test_r2_cb)

# Feature Importance - CatBoost
importance_cb = pd.DataFrame({'Feature': X_train.columns, 'Importance': catboost_model.feature_importances_})
importance_cb = importance_cb.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(12, 6))
plt.barh(importance_cb['Feature'], importance_cb['Importance'])
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.title("Feature Importance in CatBoost Model")
plt.gca().invert_yaxis()
plt.show()

# -----------------------------------------------
# Final Visualization - Actual vs Predicted
# -----------------------------------------------


unique_geocodes = X_test['geocode'].unique()

# Loop through each geocode and plot separately
for geocode in unique_geocodes:
    # Filter the data for the current geocode
    geocode_mask = X_test['geocode'] == geocode
    y_test_geocode = y_test[geocode_mask]
    y_test_pred_cb_geocode = y_test_pred_cb[geocode_mask]
    
    # Create an index for the current geocode
    indices = X_test[geocode_mask].index  # Get the row indices where the geocode is present
    
    # Plot the results for the current geocode
    plt.figure(figsize=(12, 6))
    plt.plot(indices, y_test_geocode, label="Actual Cases", color='b')
    plt.plot(indices, y_test_pred_cb_geocode, label="CatBoost Predictions", color='r')
    plt.xlabel("Test Set Index")
    plt.ylabel("Dengue Cases")
    plt.title(f"Actual vs Predicted Cases for Geocode {geocode}")
    plt.legend()
    plt.show()

Fitting 3 folds for each of 20 candidates, totalling 60 fits
0:	learn: 0.0150890	total: 73.4ms	remaining: 1m 13s
100:	learn: 0.0038697	total: 8.83s	remaining: 1m 18s
200:	learn: 0.0037481	total: 16.5s	remaining: 1m 5s
300:	learn: 0.0034702	total: 22.9s	remaining: 53.2s
400:	learn: 0.0033859	total: 28.4s	remaining: 42.5s
500:	learn: 0.0033580	total: 33.6s	remaining: 33.4s
600:	learn: 0.0033334	total: 37.4s	remaining: 24.8s
700:	learn: 0.0033300	total: 40.9s	remaining: 17.4s
800:	learn: 0.0032042	total: 44.8s	remaining: 11.1s
900:	learn: 0.0031252	total: 49.1s	remaining: 5.39s
999:	learn: 0.0029681	total: 53.3s	remaining: 0us
Best Parameters: {'verbose': 100, 'random_strength': 1, 'learning_rate': 0.05, 'l2_leaf_reg': 3, 'iterations': 1000, 'early_stopping_rounds': 100, 'depth': 4, 'border_count': 128, 'boosting_type': 'Ordered', 'bagging_temperature': 10}


CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.