LSTM MODEL DEVELOPMENT AND TRAINING

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df_clean = pd.read_parquet("cleaned_data.parquet")
df_clean.head()

Unnamed: 0,ffb_1%_oer,import,export,production,end_stock,cpo_futures,usd_myr_rate,brent_oil_futures,soybean_futures,precipitation,...,avg_humidity,lag_1,lag_3,lag_7,rolling_mean_7,rolling_mean_30,rolling_std_7,rolling_std_30,pct_change_1,pct_change_7
0,21.2,81477,1680891,1737461,3002871,2204.0,4.1075,61.65,30.74,20.6,...,88.041667,21.25,20.85,20.6,20.992857,20.533333,0.212972,0.319032,-0.002353,0.029126
1,21.3,81477,1680891,1737461,3002871,2200.0,4.096,61.89,30.48,47.5,...,90.083333,21.2,21.2,20.75,21.071429,20.576667,0.209875,0.332113,0.004717,0.026506
2,21.3,94278,1324615,1544518,3056929,2200.0,4.096,62.75,30.21,7.0,...,89.958333,21.3,21.25,20.85,21.135714,20.62,0.199404,0.339015,0.0,0.021583
3,21.3,94278,1324615,1544518,3056929,2200.0,4.096,62.75,30.21,4.7,...,90.083333,21.3,21.2,20.85,21.2,20.66,0.160728,0.346261,0.0,0.021583
4,21.3,94278,1324615,1544518,3056929,2200.0,4.096,62.75,30.21,13.2,...,89.125,21.3,21.3,20.85,21.264286,20.69,0.047559,0.361606,0.0,0.021583


Defining Target and Features + Splitting The Dataset:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

#define target and features
target_col = "ffb_1%_oer"
drop_eng_features = ["lag_3", "lag_7", "rolling_mean_30", "rolling_std_7", "rolling_std_30", "pct_change_1", "pct_change_7" ]

#drop several engineered features
features = df_clean.drop(columns=[target_col] + drop_eng_features).values 

#maintain only target column as target
target = df_clean[target_col].values.reshape(-1,1)

In [None]:
#Splitting into train-validate-test dataa
N = len(df_clean)
train_size = int(N * 0.7)   # 70% train
val_size   = int(N * 0.2)  # 20% validation
test_size  = N - train_size - val_size  # 10% test

train_df = df_clean.iloc[:train_size]
val_df   = df_clean.iloc[train_size:train_size+val_size]
test_df  = df_clean.iloc[train_size+val_size:]

In [18]:
# split first
X_train_raw = features[:train_size]
X_val_raw   = features[train_size:train_size+val_size]
X_test_raw  = features[train_size+val_size:]

y_train_raw = target[:train_size]
y_val_raw   = target[train_size:train_size+val_size]
y_test_raw  = target[train_size+val_size:]

In [19]:
# scale using only train
from sklearn.preprocessing import MinMaxScaler
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train = scaler_x.fit_transform(X_train_raw)
X_val   = scaler_x.transform(X_val_raw)
X_test  = scaler_x.transform(X_test_raw)

y_train = scaler_y.fit_transform(y_train_raw)
y_val   = scaler_y.transform(y_val_raw)
y_test  = scaler_y.transform(y_test_raw)

Configuring Time Series Generator:

In [16]:
features.shape

(2308, 13)

Determining the best loockback size

In [23]:
#Helper method to create sequences
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error


def create_sequences(features, target, lookback):
    Xs, ys = [], []
    for i in range(len(features) - lookback):
        Xs.append(features[i:i+lookback])
        ys.append(target[i+lookback])
    return np.array(Xs), np.array(ys)


#Helper method to train one LSTM and evaluate
def train_lstm(X_train, y_train, X_val, y_val, lookback, units=50, batch=32, epochs=50):
    model = Sequential([
        LSTM(units, activation='tanh', input_shape=(lookback, X_train.shape[2])),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')

    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch,
        verbose=0,
        callbacks=[es]
    )

    return model

#Experiment
def evaluate_lookbacks(X_train, y_train, X_val, y_val, X_test, y_test, lookbacks=[7,14,30,60]):
    results = {}

    for lookback in lookbacks:
        #build sequences
        X_train_seq, y_train_seq = create_sequences(X_train, y_train, lookback)
        X_val_seq, y_val_seq = create_sequences(X_val, y_val, lookback)
        X_test_seq, y_test_seq = create_sequences(X_test, y_test, lookback)

        #train baseline LSTM
        model = train_lstm(X_train_seq, y_train_seq, X_val_seq, y_val_seq, lookback)

        #evaluate on validation
        val_pred = model.predict(X_val_seq, verbose=0)
        val_rmse = np.sqrt(mean_squared_error(y_val_seq, val_pred))
        val_mae = mean_absolute_error(y_val_seq, val_pred)
        val_mape = mean_absolute_percentage_error(y_val_seq, val_pred)

        #evaluate on test
        test_pred = model.predict(X_test_seq, verbose=0)
        test_rmse = np.sqrt(mean_squared_error(y_test_seq, test_pred))
        test_mae = mean_absolute_error(y_test_seq, test_pred)
        test_mape = mean_absolute_percentage_error(y_test_seq, test_pred)

        results[lookback] = {
            "val_RMSE": val_rmse, "val_MAE": val_mae,"val_MAPE": val_mape,
            "test_RMSE": test_rmse, "test_MAE": test_mae,"test_MAPE": test_mape,
        }

        print(f"Lookback={lookback}| Val RMSE={val_rmse:.2f}, Test RMSE={test_rmse:.2f}")
    
    #Choose the best lookback
    best = min(results.items(), key=lambda x: x[1]["val_RMSE"])
    print("\nBest lookback window:", best[0])
    print("Metrics:", best[1])

    return results, best


    
    


In [24]:
results, best = evaluate_lookbacks(X_train, y_train, X_val, y_val, X_test, y_test, lookbacks=[7,14,30,60])


  super().__init__(**kwargs)


Lookback=7| Val RMSE=0.03, Test RMSE=0.06


  super().__init__(**kwargs)


Lookback=14| Val RMSE=0.03, Test RMSE=0.06


  super().__init__(**kwargs)


Lookback=30| Val RMSE=0.03, Test RMSE=0.05


  super().__init__(**kwargs)


Lookback=60| Val RMSE=0.03, Test RMSE=0.02

Best lookback window: 60
Metrics: {'val_RMSE': np.float64(0.02704031347476639), 'val_MAE': 0.012436347856657405, 'val_MAPE': 5222623666359.886, 'test_RMSE': np.float64(0.017932196248540627), 'test_MAE': 0.014277967840644798, 'test_MAPE': 0.025463631008203258}


Configuring time series generator

In [25]:
def create_sequences(features, target, lookback):
    Xs, ys = [], []
    for i in range(lookback, len(features)):
        Xs.append(features[i-lookback:i])
        ys.append(target[i])
    return np.array(Xs), np.array(ys)

lookback = 60

X_train_seq, y_train_seq = create_sequences(X_train, y_train, lookback)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, lookback)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, lookback)

Build Baseline LSTM Model (Start with 1 LSTM layer and Dense output, RMSE and MAE for evaluation)

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential([
    LSTM(50, input_shape=(lookback, X_train_seq.shape[2])),
    Dense(1)
])

model.compile(optimizer="adam", loss="mse", metrics=["mae"])

  super().__init__(**kwargs)


Train Baseline Model

In [27]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

history = model.fit(
    X_train_seq, y_train_seq,
    validation_data = (X_val_seq, y_val_seq),
    epochs = 50,
    batch_size = 32,
    callbacks=[es],
    verbose=1
)



Epoch 1/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - loss: 0.0422 - mae: 0.1358 - val_loss: 0.0135 - val_mae: 0.1062
Epoch 2/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0024 - mae: 0.0352 - val_loss: 0.0010 - val_mae: 0.0204
Epoch 3/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0013 - mae: 0.0208 - val_loss: 8.4797e-04 - val_mae: 0.0160
Epoch 4/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0011 - mae: 0.0184 - val_loss: 9.8030e-04 - val_mae: 0.0189
Epoch 5/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0011 - mae: 0.0176 - val_loss: 0.0012 - val_mae: 0.0241
Epoch 6/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0011 - mae: 0.0171 - val_loss: 0.0012 - val_mae: 0.0234
Epoch 7/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step -

Evaluate on Test Set

In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import numpy as np

y_pred = model.predict(X_test_seq)
y_true = y_test_seq

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)

print(f"RMSE:{rmse}, MAE:{mae}, MAPE:{mape}")


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
RMSE:0.013861688661906443, MAE:0.01152069776990126, MAPE:0.020576155469833612


Save Model (23 Aug 10pm)

In [29]:
model.save("lstm_baseline.h5")



In [30]:
model.save("lstm_model.keras")