In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [2]:
df = pd.read_csv('data/final_combined_dataset.csv')


In [3]:
df = df.drop(columns=['nearby_cases_weighted','city'])

In [4]:
df.head(10)

Unnamed: 0,date,week,cases,population,tempe_min,humidity_max,humidity_avg,humidity_min,temp_avg,temp_max,geocode,vim,vim_monthly,precipitation_avg_ordinary_kriging,precipitation_max_ordinary_kriging,precipitation_avg_regression_kriging,precipitation_max_regression_kriging,long,lat,cases_per_100k
0,2012-01-01,201201,32,207044,19.0,79.428571,55.514486,35.0,25.048951,29.571429,3300100,0.8505,0.852633,5.6804,18.3375,5.2038,18.7979,-44.319627,-23.009116,15.455652
1,2012-01-08,201202,40,207044,19.714286,82.285714,62.357393,47.428571,23.737513,26.571429,3300100,0.85205,0.852633,4.0716,16.5375,3.9114,16.8583,-44.319627,-23.009116,19.319565
2,2012-01-15,201203,19,207044,20.0,83.0,65.236264,45.571429,24.413187,28.714286,3300100,0.853541,0.852633,2.0555,5.05,1.7556,3.1368,-44.319627,-23.009116,9.176793
3,2012-01-22,201204,33,207044,19.285714,83.0,60.362637,43.428571,24.879121,28.857143,3300100,0.854877,0.852633,1.5416,5.7059,1.4138,5.2767,-44.319627,-23.009116,15.938641
4,2012-01-29,201205,36,207044,18.857143,80.857143,50.885924,33.142857,25.989992,30.428571,3300100,0.856021,0.852633,2.8204,9.3826,2.0057,7.8749,-44.319627,-23.009116,17.387608
5,2012-02-05,201206,39,207044,19.571429,81.571429,43.494048,22.142857,28.614927,33.571429,3300100,0.857159,0.858,3.817,25.983,3.8228,26.0478,-44.319627,-23.009116,18.836576
6,2012-02-12,201207,37,207044,20.142857,80.142857,56.778911,40.571429,25.791314,29.571429,3300100,0.858241,0.858,4.0861,29.4086,4.092,29.011,-44.319627,-23.009116,17.870598
7,2012-02-19,201208,33,207044,21.857143,74.0,51.458425,34.428571,27.253296,31.285714,3300100,0.859142,0.858,0.5962,4.0133,0.4783,3.2643,-44.319627,-23.009116,15.938641
8,2012-02-26,201209,46,207044,20.714286,80.142857,43.760989,21.285714,28.90293,34.0,3300100,0.860082,0.858,0.8143,3.8456,0.8171,3.8703,-44.319627,-23.009116,22.2175
9,2012-03-04,201210,42,207044,20.428571,71.714286,48.333639,28.428571,26.467491,30.857143,3300100,0.860933,0.861567,1.1867,5.3867,0.6647,3.8741,-44.319627,-23.009116,20.285543


In [5]:
df['week'] = pd.to_numeric(df['week'], errors='coerce')
df['date'] = pd.to_datetime(df['date'])

ADDING CYCLIC FEATURES

In [6]:
# Add date_ordinal
df['date_ordinal'] = df['date'].apply(lambda x: x.toordinal())

# Extract year and month
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Add cyclic month representation
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Add cyclic week representation
df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
df['week_cos'] = np.cos(2 * np.pi * df['week'] / 52)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52234 entries, 0 to 52233
Data columns (total 27 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   date                                  52234 non-null  datetime64[ns]
 1   week                                  52234 non-null  int64         
 2   cases                                 52234 non-null  int64         
 3   population                            52234 non-null  int64         
 4   tempe_min                             52234 non-null  float64       
 5   humidity_max                          52234 non-null  float64       
 6   humidity_avg                          52234 non-null  float64       
 7   humidity_min                          52234 non-null  float64       
 8   temp_avg                              52234 non-null  float64       
 9   temp_max                              52234 non-null  float64       
 10

In [8]:
# columns = df.columns
# print(columns)


In [9]:
train_df = df[df['date'].dt.year <= 2020]
test_df = df[df['date'].dt.year >= 2021]

print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

Training set shape: (42770, 27)
Test set shape: (9464, 27)


In [10]:
test_df

Unnamed: 0,date,week,cases,population,tempe_min,humidity_max,humidity_avg,humidity_min,temp_avg,temp_max,...,long,lat,cases_per_100k,date_ordinal,year,month,month_sin,month_cos,week_sin,week_cos
470,2021-01-03,202101,1,207044,20.571429,93.220059,71.354385,51.797334,26.159341,31.142857,...,-44.319627,-23.009116,0.482989,737793,2021,1,5.000000e-01,0.866025,-0.354605,-0.935016
471,2021-01-10,202102,0,207044,21.857143,94.056203,74.070766,56.106602,26.469388,30.857143,...,-44.319627,-23.009116,0.000000,737800,2021,1,5.000000e-01,0.866025,-0.464723,-0.885456
472,2021-01-17,202103,0,207044,22.285714,85.551194,58.037205,42.041434,28.803768,32.857143,...,-44.319627,-23.009116,0.000000,737807,2021,1,5.000000e-01,0.866025,-0.568065,-0.822984
473,2021-01-24,202104,0,207044,21.285714,86.385208,55.524891,38.248368,29.108494,33.714286,...,-44.319627,-23.009116,0.000000,737814,2021,1,5.000000e-01,0.866025,-0.663123,-0.748511
474,2021-01-31,202105,1,207044,21.000000,92.613150,75.049454,56.072542,25.506279,30.428571,...,-44.319627,-23.009116,0.482989,737821,2021,1,5.000000e-01,0.866025,-0.748511,-0.663123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52229,2022-11-27,202248,1,273988,21.857143,95.875069,82.840922,61.313170,25.507143,30.857143,...,-44.093522,-22.509968,0.364979,738486,2022,11,-5.000000e-01,0.866025,0.663123,-0.748511
52230,2022-12-04,202249,0,273988,21.857143,92.543531,80.151192,64.110460,24.903139,28.142857,...,-44.093522,-22.509968,0.000000,738493,2022,12,-2.449294e-16,1.000000,0.568065,-0.822984
52231,2022-12-11,202250,2,273988,21.000000,94.213747,79.480439,59.964030,24.597222,28.500000,...,-44.093522,-22.509968,0.729959,738500,2022,12,-2.449294e-16,1.000000,0.464723,-0.885456
52232,2022-12-18,202251,1,273988,21.000000,91.312065,84.229947,73.950180,23.184295,26.000000,...,-44.093522,-22.509968,0.364979,738507,2022,12,-2.449294e-16,1.000000,0.354605,-0.935016


In [11]:
X_train = train_df.drop(columns=['cases','geocode','cases_per_100k'])
y_train = train_df[['cases_per_100k']]

X_test = test_df.drop(columns=['cases','geocode','cases_per_100k'])
y_test = test_df[['cases_per_100k']]

In [12]:
y_train

Unnamed: 0,cases_per_100k
0,15.455652
1,19.319565
2,9.176793
3,15.938641
4,17.387608
...,...
52125,1.094938
52126,0.364979
52127,0.000000
52128,0.729959


In [13]:
X_train = train_df.drop(columns=['cases','geocode','cases_per_100k'])
y_train = train_df[['cases_per_100k']]

X_test = test_df.drop(columns=['cases','geocode','cases_per_100k'])
y_test = test_df[['cases_per_100k']]

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Exclude columns_to_scale and the target variable from scaling
columns_to_scale = ['week_sin', 'week_cos', 'month_sin', 'month_cos', 'week']
numeric_cols = [
    col for col in df.select_dtypes(include=['float64', 'int64', 'int32']).columns 
    if col not in ['cases_per_100k'] + columns_to_scale
]

# Initialize scalers
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Scale the selected numeric features
X_train[numeric_cols] = feature_scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = feature_scaler.transform(X_test[numeric_cols])

# Scale the target variable
y_train = target_scaler.fit_transform(y_train)
y_test = target_scaler.transform(y_test)

# Verify the scaling



        date    week  population  tempe_min  humidity_max  humidity_avg  \
0 2012-01-01  201201    0.029875   0.628378      0.007476      0.104881   
1 2012-01-08  201202    0.029875   0.662162      0.008410      0.147066   
2 2012-01-15  201203    0.029875   0.675676      0.008644      0.164814   
3 2012-01-22  201204    0.029875   0.641892      0.008644      0.134769   
4 2012-01-29  201205    0.029875   0.621622      0.007943      0.076346   

   humidity_min  temp_avg  temp_max       vim  ...  \
0      0.198944  0.703671  0.361022  0.954844  ...   
1      0.352113  0.647177  0.305112  0.957340  ...   
2      0.329225  0.676283  0.345048  0.959742  ...   
3      0.302817  0.696355  0.347710  0.961894  ...   
4      0.176056  0.744210  0.376997  0.963737  ...   

   precipitation_max_regression_kriging      long       lat  date_ordinal  \
0                              0.302329  0.106879  0.091828      0.000000   
1                              0.271134  0.106879  0.091828      0.002

In [15]:
X_train.head(70)


Unnamed: 0,date,week,population,tempe_min,humidity_max,humidity_avg,humidity_min,temp_avg,temp_max,vim,...,precipitation_max_regression_kriging,long,lat,date_ordinal,year,month,month_sin,month_cos,week_sin,week_cos
0,2012-01-01,201201,0.029875,0.628378,0.007476,0.104881,0.198944,0.703671,0.361022,0.954844,...,0.302329,0.106879,0.091828,0.000000,0.000,0.000000,0.500000,8.660254e-01,1.000000e+00,7.185429e-13
1,2012-01-08,201202,0.029875,0.662162,0.008410,0.147066,0.352113,0.647177,0.305112,0.957340,...,0.271134,0.106879,0.091828,0.002132,0.000,0.000000,0.500000,8.660254e-01,9.927089e-01,-1.205367e-01
2,2012-01-15,201203,0.029875,0.675676,0.008644,0.164814,0.329225,0.676283,0.345048,0.959742,...,0.050450,0.106879,0.091828,0.004264,0.000,0.000000,0.500000,8.660254e-01,9.709418e-01,-2.393157e-01
3,2012-01-22,201204,0.029875,0.641892,0.008644,0.134769,0.302817,0.696355,0.347710,0.961894,...,0.084866,0.106879,0.091828,0.006397,0.000,0.000000,0.500000,8.660254e-01,9.350162e-01,-3.546049e-01
4,2012-01-29,201205,0.029875,0.621622,0.007943,0.076346,0.176056,0.744210,0.376997,0.963737,...,0.126653,0.106879,0.091828,0.008529,0.000,0.000000,0.500000,8.660254e-01,8.854560e-01,-4.647232e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,2013-03-31,201314,0.029875,0.689189,0.013362,0.235118,0.457746,0.697292,0.342386,0.996067,...,0.208384,0.106879,0.091828,0.138593,0.125,0.181818,1.000000,6.123234e-17,4.647232e-01,-8.854560e-01
66,2013-04-07,201315,0.029875,0.682432,0.010092,0.215896,0.492958,0.637508,0.305112,0.995851,...,0.128284,0.106879,0.091828,0.140725,0.125,0.272727,0.866025,-5.000000e-01,3.546049e-01,-9.350162e-01
67,2013-04-14,201316,0.029875,0.452703,0.010840,0.182665,0.390845,0.517414,0.270501,0.995463,...,0.040135,0.106879,0.091828,0.142857,0.125,0.272727,0.866025,-5.000000e-01,2.393157e-01,-9.709418e-01
68,2013-04-21,201317,0.029875,0.391892,0.011961,0.156755,0.315141,0.507216,0.273163,0.994664,...,0.023045,0.106879,0.091828,0.144989,0.125,0.272727,0.866025,-5.000000e-01,1.205367e-01,-9.927089e-01


In [95]:
X_train

Unnamed: 0,date,week,population,tempe_min,humidity_max,humidity_avg,humidity_min,temp_avg,temp_max,vim,...,precipitation_max_regression_kriging,long,lat,date_ordinal,year,month,month_sin,month_cos,week_sin,week_cos
0,2012-01-01,201201,0.029875,0.628378,0.007476,0.104881,0.198944,0.703671,0.361022,0.954844,...,0.302329,0.106879,0.091828,0.000000,0.0,0.000000,5.000000e-01,0.866025,1.000000,7.185429e-13
1,2012-01-08,201202,0.029875,0.662162,0.008410,0.147066,0.352113,0.647177,0.305112,0.957340,...,0.271134,0.106879,0.091828,0.002132,0.0,0.000000,5.000000e-01,0.866025,0.992709,-1.205367e-01
2,2012-01-15,201203,0.029875,0.675676,0.008644,0.164814,0.329225,0.676283,0.345048,0.959742,...,0.050450,0.106879,0.091828,0.004264,0.0,0.000000,5.000000e-01,0.866025,0.970942,-2.393157e-01
3,2012-01-22,201204,0.029875,0.641892,0.008644,0.134769,0.302817,0.696355,0.347710,0.961894,...,0.084866,0.106879,0.091828,0.006397,0.0,0.000000,5.000000e-01,0.866025,0.935016,-3.546049e-01
4,2012-01-29,201205,0.029875,0.621622,0.007943,0.076346,0.176056,0.744210,0.376997,0.963737,...,0.126653,0.106879,0.091828,0.008529,0.0,0.000000,5.000000e-01,0.866025,0.885456,-4.647232e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52125,2020-11-29,202049,0.039804,0.732432,0.011321,0.265076,0.591350,0.637623,0.309172,0.682952,...,0.172175,0.168648,0.309786,0.991471,1.0,0.909091,-5.000000e-01,0.866025,-0.354605,-9.350162e-01
52126,2020-12-06,202050,0.039804,0.741892,0.011759,0.276969,0.644417,0.629001,0.293575,0.698030,...,0.360288,0.168648,0.309786,0.993603,1.0,1.000000,-2.449294e-16,1.000000,-0.464723,-8.854560e-01
52127,2020-12-13,202051,0.039804,0.751351,0.012473,0.280812,0.604158,0.650046,0.317359,0.709180,...,0.212051,0.168648,0.309786,0.995736,1.0,1.000000,-2.449294e-16,1.000000,-0.568065,-8.229839e-01
52128,2020-12-20,202052,0.039804,0.760811,0.010608,0.247515,0.575591,0.741926,0.347710,0.718140,...,0.517714,0.168648,0.309786,0.997868,1.0,1.000000,-2.449294e-16,1.000000,-0.663123,-7.485107e-01


In [16]:
X_train

Unnamed: 0,date,week,population,tempe_min,humidity_max,humidity_avg,humidity_min,temp_avg,temp_max,vim,...,precipitation_max_regression_kriging,long,lat,date_ordinal,year,month,month_sin,month_cos,week_sin,week_cos
0,2012-01-01,201201,0.029875,0.628378,0.007476,0.104881,0.198944,0.703671,0.361022,0.954844,...,0.302329,0.106879,0.091828,0.000000,0.0,0.000000,5.000000e-01,0.866025,1.000000,7.185429e-13
1,2012-01-08,201202,0.029875,0.662162,0.008410,0.147066,0.352113,0.647177,0.305112,0.957340,...,0.271134,0.106879,0.091828,0.002132,0.0,0.000000,5.000000e-01,0.866025,0.992709,-1.205367e-01
2,2012-01-15,201203,0.029875,0.675676,0.008644,0.164814,0.329225,0.676283,0.345048,0.959742,...,0.050450,0.106879,0.091828,0.004264,0.0,0.000000,5.000000e-01,0.866025,0.970942,-2.393157e-01
3,2012-01-22,201204,0.029875,0.641892,0.008644,0.134769,0.302817,0.696355,0.347710,0.961894,...,0.084866,0.106879,0.091828,0.006397,0.0,0.000000,5.000000e-01,0.866025,0.935016,-3.546049e-01
4,2012-01-29,201205,0.029875,0.621622,0.007943,0.076346,0.176056,0.744210,0.376997,0.963737,...,0.126653,0.106879,0.091828,0.008529,0.0,0.000000,5.000000e-01,0.866025,0.885456,-4.647232e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52125,2020-11-29,202049,0.039804,0.732432,0.011321,0.265076,0.591350,0.637623,0.309172,0.682952,...,0.172175,0.168648,0.309786,0.991471,1.0,0.909091,-5.000000e-01,0.866025,-0.354605,-9.350162e-01
52126,2020-12-06,202050,0.039804,0.741892,0.011759,0.276969,0.644417,0.629001,0.293575,0.698030,...,0.360288,0.168648,0.309786,0.993603,1.0,1.000000,-2.449294e-16,1.000000,-0.464723,-8.854560e-01
52127,2020-12-13,202051,0.039804,0.751351,0.012473,0.280812,0.604158,0.650046,0.317359,0.709180,...,0.212051,0.168648,0.309786,0.995736,1.0,1.000000,-2.449294e-16,1.000000,-0.568065,-8.229839e-01
52128,2020-12-20,202052,0.039804,0.760811,0.010608,0.247515,0.575591,0.741926,0.347710,0.718140,...,0.517714,0.168648,0.309786,0.997868,1.0,1.000000,-2.449294e-16,1.000000,-0.663123,-7.485107e-01


In [17]:
# Define the feature and target columns
feature_cols = [
    'humidity_avg', 'temp_avg', 'vim_monthly', 'date_ordinal',
    'precipitation_max_regression_kriging',
    'long', 'lat', 'month_sin', 'month_cos', 'week_sin', 'week_cos'
]

# target_col = 'cases_per_100k'

# Extract the features and target
X_train = X_train[feature_cols]
X_test = X_test[feature_cols]
# y_train = y_train
# y_test = y_test

# Reshape target to match LSTM output
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# weeks in the lstm memory
seq_length = 4


# Build the LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(seq_length, len(feature_cols))),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)  # Predicting a single value (cases)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

# Save the model
model.save("lstm_dengue_prediction_model.h5")

# Predict on test data
predictions = model.predict(X_test)


  super().__init__(**kwargs)


Epoch 1/50


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 11), dtype=float32). Expected shape (None, 4, 11), but input has incompatible shape (None, 11)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 11), dtype=float32)
  • training=True
  • mask=None

In [None]:
# # Define LSTM Model
# class LSTMModel(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, output_size):
#         super(LSTMModel, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, x):
#         out, _ = self.lstm(x)
#         out = self.fc(out[:, -1, :])  # Take the output of the last time step
#         return out

# # Model parameters
# input_size = len(features)
# hidden_size = 64
# num_layers = 2
# output_size = 1
# learning_rate = 0.001
# num_epochs = 50

# # Initialize model, loss, and optimizer
# model = LSTMModel(input_size, hidden_size, num_layers, output_size)
# criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# # Training Loop
# for epoch in range(num_epochs):
#     model.train()
#     epoch_loss = 0
#     for x_batch, y_batch in train_loader:
#         optimizer.zero_grad()
#         y_pred = model(x_batch)
#         loss = criterion(y_pred.squeeze(), y_batch)
#         loss.backward()
#         optimizer.step()
#         epoch_loss += loss.item()
#     print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_loader):.4f}")

# # Evaluation
# model.eval()
# predictions, actuals = [], []

# with torch.no_grad():
#     for x_batch, y_batch in test_loader:
#         y_pred = model(x_batch)
#         predictions.extend(y_pred.squeeze().tolist())
#         actuals.extend(y_batch.tolist())

# # Rescale the predictions and actuals if necessary
# mae = mean_absolute_error(actuals, predictions)
# rmse = np.sqrt(mean_squared_error(actuals, predictions))
# print(f"Mean Absolute Error (MAE): {mae:.2f}")
# print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
