In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import xgboost as xgb

In [3]:
#load cleaned dataset
df= pd.read_parquet("../data_backend/cleaned_data_2.parquet")
df.head()

Unnamed: 0,ffb_1%_oer,import,export,production,end_stock,cpo_futures,usd_myr_rate,brent_oil_futures,soybean_futures,precipitation,...,avg_humidity,lag_1,lag_3,lag_7,rolling_mean_7,rolling_mean_30,rolling_std_7,rolling_std_30,pct_change_1,pct_change_7
0,21.3,81477,1680891,1737461,3002871,2200.0,4.096,61.89,30.48,47.5,...,90.083333,21.2,21.2,20.75,21.071429,20.576667,0.209875,0.332113,0.004717,0.026506
1,21.3,94278,1324615,1544518,3056929,2200.0,4.096,62.75,30.21,7.0,...,89.958333,21.3,21.25,20.85,21.135714,20.62,0.199404,0.339015,0.0,0.021583
2,21.3,94278,1324615,1544518,3056929,2200.0,4.096,62.75,30.21,4.7,...,90.083333,21.3,21.2,20.85,21.2,20.66,0.160728,0.346261,0.0,0.021583
3,21.3,94278,1324615,1544518,3056929,2200.0,4.096,62.75,30.21,13.2,...,89.125,21.3,21.3,20.85,21.264286,20.69,0.047559,0.361606,0.0,0.021583
4,21.35,94278,1324615,1544518,3056929,2207.0,4.0935,62.51,30.44,6.5,...,88.5,21.3,21.3,21.2,21.285714,20.721667,0.047559,0.376619,0.002347,0.007075


In [4]:
target_col = "ffb_1%_oer"
raw_features = ["import", "export", "production", "end_stock", 
                "cpo_futures", "usd_myr_rate", "brent_oil_futures", 
                "soybean_futures", "precipitation", "avg_temperature", "avg_humidity"]

engineered_features = ["lag_1","lag_3","lag_7","rolling_mean_7",
                       "rolling_mean_30","rolling_std_7","rolling_std_30",
                       "pct_change_1","pct_change_7"]

engineered_features_lstm = ["lag_1", "rolling_mean_7"]

X = df[raw_features + engineered_features].values
y = df[target_col].values.reshape(-1,1)


#Splitting into train-validate-test dataa
N = len(df)
train_size = int(N * 0.7)   # 70% train
val_size   = int(N * 0.2)  # 20% validation
test_size  = N - train_size - val_size  # 10% test

X_train_raw = X[:train_size]
X_val_raw   = X[train_size:train_size+val_size]
X_test_raw  = X[train_size+val_size:]

y_train_raw = y[:train_size]
y_val_raw   = y[train_size:train_size+val_size]
y_test_raw  = y[train_size+val_size:]


#scale data
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

X_train = scaler_x.fit_transform(X_train_raw)
X_val   = scaler_x.transform(X_val_raw)
X_test  = scaler_x.transform(X_test_raw)

y_train = scaler_y.fit_transform(y_train_raw)
y_val   = scaler_y.transform(y_val_raw)
y_test  = scaler_y.transform(y_test_raw)

In [5]:
def create_multi_step_sequences(X, y, lookback, horizon):
    Xs, ys = [], []
    for i in range(lookback, len(X) - horizon + 1):
        Xs.append(X[i - lookback:i])
        ys.append(y[i:i + horizon].ravel())  # collect next horizon steps
    return np.array(Xs), np.array(ys)

forecast_horizon = 14
lookback = 90
"""
X_train_lstm, y_train_lstm = create_multi_step_sequences(X_train, y_train, lookback, forecast_horizon)
X_val_lstm, y_val_lstm     = create_multi_step_sequences(X_val, y_val, lookback, forecast_horizon)
X_test_lstm, y_test_lstm   = create_multi_step_sequences(X_test, y_test, lookback, forecast_horizon)
"""
# Use same sequence generation logic for fair comparison
X_train_xgb, y_train_xgb = create_multi_step_sequences(X_train, y_train, lookback, forecast_horizon)
X_val_xgb,   y_val_xgb   = create_multi_step_sequences(X_val, y_val, lookback, forecast_horizon)
X_test_xgb,  y_test_xgb  = create_multi_step_sequences(X_test, y_test, lookback, forecast_horizon)

"""
# Flatten last two dimensions: (samples, lookback * features)
n_samples, seq_len, n_features = X_train_xgb.shape
X_train_xgb = X_train_xgb.reshape(n_samples, seq_len * n_features)
X_val_xgb   = X_val_xgb.reshape(X_val_xgb.shape[0], seq_len * n_features)
X_test_xgb  = X_test_xgb.reshape(X_test_xgb.shape[0], seq_len * n_features)
"""


'\n# Flatten last two dimensions: (samples, lookback * features)\nn_samples, seq_len, n_features = X_train_xgb.shape\nX_train_xgb = X_train_xgb.reshape(n_samples, seq_len * n_features)\nX_val_xgb   = X_val_xgb.reshape(X_val_xgb.shape[0], seq_len * n_features)\nX_test_xgb  = X_test_xgb.reshape(X_test_xgb.shape[0], seq_len * n_features)\n'

In [6]:
forecast_horizon = 14
lookback = 90

xgb_features_idx = [(raw_features + engineered_features).index(f) for f in engineered_features ]

X_train_xgb = X_train[:,xgb_features_idx]
X_val_xgb = X_val[:,xgb_features_idx]
X_test_xgb = X_test[:,xgb_features_idx]

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 3,
    "gamma": 0,
    "lambda": 1,
    "alpha": 0,
    "seed": 42
}

xgb_models = []
y_preds_xgb = []

for step in range(forecast_horizon):
    # target is shifted for each horizon step
    y_train_step = y_train[lookback + step: len(y_train) - forecast_horizon + step + 1]
    y_val_step   = y_val[lookback + step: len(y_val) - forecast_horizon + step + 1]
    y_test_step  = y_test[lookback + step: len(y_test) - forecast_horizon + step + 1]
    
    dtrain = xgb.DMatrix(X_train_xgb[lookback:len(y_train_step)+lookback], label=y_train_step)
    dval   = xgb.DMatrix(X_val_xgb[lookback:len(y_val_step)+lookback], label=y_val_step)
    dtest  = xgb.DMatrix(X_test_xgb[lookback:len(y_test_step)+lookback])

    model_step = xgb.train(
        params,
        dtrain,
        num_boost_round=500,
        evals=[(dtrain, "train"), (dval, "val")],
        early_stopping_rounds=20,
        verbose_eval=True
    )
    xgb_models.append(model_step)
    y_preds_xgb.append(model_step.predict(dtest))

y_pred_xgb = np.column_stack(y_preds_xgb)  # shape: (samples, horizon)
y_pred_xgb_inv = scaler_y.inverse_transform(y_pred_xgb)

[0]	train-rmse:0.20158	val-rmse:0.03606
[1]	train-rmse:0.19174	val-rmse:0.03425
[2]	train-rmse:0.18237	val-rmse:0.03257
[3]	train-rmse:0.17352	val-rmse:0.03095
[4]	train-rmse:0.16511	val-rmse:0.02951


[5]	train-rmse:0.15708	val-rmse:0.02803
[6]	train-rmse:0.14943	val-rmse:0.02665
[7]	train-rmse:0.14214	val-rmse:0.02530
[8]	train-rmse:0.13527	val-rmse:0.02405
[9]	train-rmse:0.12868	val-rmse:0.02286
[10]	train-rmse:0.12243	val-rmse:0.02175
[11]	train-rmse:0.11651	val-rmse:0.02070
[12]	train-rmse:0.11084	val-rmse:0.01966
[13]	train-rmse:0.10549	val-rmse:0.01868
[14]	train-rmse:0.10044	val-rmse:0.01780
[15]	train-rmse:0.09558	val-rmse:0.01691
[16]	train-rmse:0.09096	val-rmse:0.01615
[17]	train-rmse:0.08656	val-rmse:0.01537
[18]	train-rmse:0.08242	val-rmse:0.01463
[19]	train-rmse:0.07845	val-rmse:0.01398
[20]	train-rmse:0.07467	val-rmse:0.01326
[21]	train-rmse:0.07108	val-rmse:0.01261
[22]	train-rmse:0.06767	val-rmse:0.01199
[23]	train-rmse:0.06445	val-rmse:0.01145
[24]	train-rmse:0.06138	val-rmse:0.01093
[25]	train-rmse:0.05843	val-rmse:0.01040
[26]	train-rmse:0.05566	val-rmse:0.00982
[27]	train-rmse:0.05299	val-rmse:0.00939
[28]	train-rmse:0.05046	val-rmse:0.00894
[29]	train-rmse:0.048

In [7]:
# --- Create True Multi-Step Targets for Evaluation ---
y_true_inv = scaler_y.inverse_transform(y_test[:len(y_pred_xgb_inv)])
y_true_horizon = []
for step in range(forecast_horizon):
    y_true_horizon.append(y_true_inv[lookback + step : len(y_true_inv) - forecast_horizon + step + 1, 0])
y_true_horizon = np.column_stack(y_true_horizon)

In [8]:
# --- Evaluate Model Performance ---
print("=== Step-wise Evaluation ===")
for step in range(forecast_horizon):
    y_true_step = y_true_horizon[:, step]
    y_pred_step = y_pred_xgb_inv[:, step]

    rmse = root_mean_squared_error(y_true_step, y_pred_step)

    #rmse = np.sqrt(root_mean_squared_error(y_true_step, y_pred_step))
    mae = mean_absolute_error(y_true_step, y_pred_step)
    r2 = r2_score(y_true_step, y_pred_step)

    print(f"Step {step+1:2d}: RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")


=== Step-wise Evaluation ===


ValueError: Found input variables with inconsistent numbers of samples: [26, 129]

In [None]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

y_true_inv = scaler_y.inverse_transform(y_test[:len(y_pred_xgb_inv)])
y_true_flat = y_true_inv.flatten()
y_pred_flat = y_pred_xgb_inv.flatten()

rmse = np.sqrt(root_mean_squared_error(y_true_inv, y_pred_xgb_inv))
mae = mean_absolute_error(y_true_inv, y_pred_xgb_inv)
r2 = r2_score(y_true_inv, y_pred_xgb_inv)

print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")


In [None]:
plt.figure(figsize=(12,6))
plt.plot(y_true_inv, label="Actual")
plt.plot(y_pred_xgb_inv, label="XGBoost Prediction")
plt.legend()
plt.title("XGBoost Forecast vs Actual")
plt.show()
