In [29]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

done


In [47]:
# Read the processed dataset (parsing `vehicle_timestamp` as datetimes) and put
# the rows in chronological order, since the later split is done by timestamp.
df = (
    pd.read_csv("../data/processed_dataset.csv", parse_dates=["vehicle_timestamp"])
    .sort_values("vehicle_timestamp")
)
print(df.head())

  df = pd.read_csv("../data/processed_dataset.csv", parse_dates=["vehicle_timestamp"])


               timestamp_utc   vehicle_timestamp                vehicle_id  \
167528   2025-11-10 03:17:07 2025-11-10 03:16:27  VehiclePosition-BKK_2005   
4022240  2025-11-10 03:17:07 2025-11-10 03:16:29  VehiclePosition-BKK_6452   
777704   2025-11-10 03:17:07 2025-11-10 03:16:30   VehiclePosition-BKK_335   
349862   2025-11-10 03:17:07 2025-11-10 03:16:33  VehiclePosition-BKK_2037   
2875687  2025-11-10 03:17:07 2025-11-10 03:16:34  VehiclePosition-BKK_5971   

              trip_id route_id last_stop_id end_stop_id  \
167528   D01611118738     3060       009220      009220   
4022240      D0553366     9230       F00461      F00461   
777704       C9848225     9722       007817      007817   
349862   D01611118933     3060       F02476      F02476   
2875687     C97719112     9312       F01027      F01027   

         current_stop_sequence  hour  weekday  delay_seconds_calc  \
167528                      18     3        0             -3453.0   
4022240                     66     3  

In [49]:
# Model inputs and the regression target column.
FEATURES = ["delay_seconds_calc", "hour", "weekday"]
TARGET = "y_end_delay_calc"

# Keep only rows where every feature and the target are present.
df = df.dropna(subset=FEATURES + [TARGET]).copy()
vehicle_ts = df["vehicle_timestamp"]

print("rows:", len(df))
print("min vehicle_timestamp:", vehicle_ts.min())
print("max vehicle_timestamp:", vehicle_ts.max())

# Chronological split: the cut points are the 70% and 85% quantiles of the
# timestamp column, so train < TRAIN_END <= val < VAL_END <= test.
TRAIN_END = vehicle_ts.quantile(0.70)
VAL_END = vehicle_ts.quantile(0.85)

in_train = vehicle_ts < TRAIN_END
in_val = (vehicle_ts >= TRAIN_END) & (vehicle_ts < VAL_END)
in_test = vehicle_ts >= VAL_END

train_df = df[in_train].copy()
val_df = df[in_val].copy()
test_df = df[in_test].copy()

print("train/val/test:", len(train_df), len(val_df), len(test_df))

rows: 6170467
min vehicle_timestamp: 2025-11-10 03:16:27
max vehicle_timestamp: 2025-11-24 23:15:24
train/val/test: 4319310 925566 925591


In [50]:
# Pull the raw feature matrices and float32 target vectors out of each split.
X_train_raw, X_val_raw, X_test_raw = (
    part[FEATURES].values for part in (train_df, val_df, test_df)
)
y_train_raw, y_val_raw, y_test_raw = (
    part[TARGET].values.astype(np.float32) for part in (train_df, val_df, test_df)
)

# Feature scaler: statistics come from the training split only and are then
# applied unchanged to the validation and test splits.
x_scaler = StandardScaler().fit(X_train_raw)
X_train, X_val, X_test = (
    x_scaler.transform(raw) for raw in (X_train_raw, X_val_raw, X_test_raw)
)

# Target scaler: same train-only fitting. StandardScaler needs a 2-D input,
# hence the reshape to a column and the ravel back to a flat float32 vector.
y_scaler = StandardScaler().fit(y_train_raw.reshape(-1, 1))
y_train, y_val, y_test = (
    y_scaler.transform(raw.reshape(-1, 1)).ravel().astype(np.float32)
    for raw in (y_train_raw, y_val_raw, y_test_raw)
)


In [51]:
class DelayDataset(Dataset):
    """Map-style dataset pairing feature rows with their regression targets.

    Both inputs are copied into float32 tensors once, at construction time,
    so indexing afterwards is a plain tensor lookup.

    Args:
        X: Array-like of features; the first dimension indexes samples.
        y: Array-like of targets; the first dimension indexes samples.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        # __len__ only looks at X, so a length mismatch would otherwise surface
        # much later as an index error (or silently ignore trailing targets).
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                "X and y must have the same number of samples, got "
                f"{tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(features, target)`` pair at index ``i``."""
        return self.X[i], self.y[i]

In [53]:
class MLP(nn.Module):
    """Fully connected regressor: Linear/ReLU hidden layers plus a scalar head.

    Args:
        in_dim: Number of input features per sample.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(128, 64)`` reproduces the original fixed architecture, including
            the ``net.0`` / ``net.2`` / ``net.4`` parameter names.
    """

    def __init__(self, in_dim, hidden_dims=(128, 64)):
        super().__init__()
        layers = []
        fan_in = in_dim
        for width in hidden_dims:
            layers.append(nn.Linear(fan_in, width))
            layers.append(nn.ReLU())
            fan_in = width
        # Output head: one value per sample, no activation (regression).
        layers.append(nn.Linear(fan_in, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network and drop the trailing size-1 output dimension."""
        return self.net(x).squeeze(-1)

In [54]:
def eval_mse(m, loader):
    """Return the sample-weighted average of `loss_fn` over every batch in `loader`.

    Switches `m` to eval mode and runs without gradient tracking. Relies on the
    notebook-level globals `device` and `loss_fn`. Each batch's loss is weighted
    by its batch size, and an empty loader yields 0.0.
    """
    m.eval()
    weighted_sum, seen = 0.0, 0
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)
            batch_size = len(features)
            batch_loss = loss_fn(m(features), targets).item()
            weighted_sum += batch_loss * batch_size
            seen += batch_size
    # max(..., 1) avoids a division by zero when the loader produced no batches.
    return weighted_sum / max(seen, 1)

In [55]:
# Use the GPU when one is available; the model and batches are moved to it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the training dataset in this cell: no visible earlier cell defines
# `train_ds`, so on a fresh kernel the print below (and the next cell's
# DataLoader) would otherwise fail with a NameError.
train_ds = DelayDataset(X_train, y_train)

model = MLP(len(FEATURES)).to(device)

# AdamW with weight_decay=0.0, i.e. no weight decay is applied.
opt = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=0.0)
loss_fn = nn.MSELoss()

# Sanity check on the standardized training targets: expect mean ~0, std ~1.
print("yb train mean/std (scaled):", float(train_ds.y.mean()), float(train_ds.y.std()))


yb train mean/std (scaled): -3.6174758744067503e-09 1.0000001192092896


In [56]:
# Sanity check: a fresh model should be able to drive the loss down on one
# fixed batch. The loader is unshuffled, so the batch is the first 128 samples.
OVERFIT_STEPS = 5000
LOG_EVERY = 500

train_loader_fix = DataLoader(train_ds, batch_size=128, shuffle=False, drop_last=True)
first_batch = next(iter(train_loader_fix))
Xb, yb = (part.to(device) for part in first_batch)

# Separate model / loss / optimizer so the main `model` and `opt` stay untouched.
model_overfit = MLP(len(FEATURES)).to(device)
loss_fn_overfit = nn.MSELoss()
opt_overfit = torch.optim.Adam(model_overfit.parameters(), lr=3e-3)

model_overfit.train()
for step in range(1, OVERFIT_STEPS + 1):
    opt_overfit.zero_grad()
    pred = model_overfit(Xb)
    loss = loss_fn_overfit(pred, yb)
    loss.backward()
    opt_overfit.step()
    if not step % LOG_EVERY:
        print(f"step={step:5d}  batch_mse={loss.item():.6f}")

step=  500  batch_mse=0.030817
step= 1000  batch_mse=0.036673
step= 1500  batch_mse=0.027699
step= 2000  batch_mse=0.026048
step= 2500  batch_mse=0.024152
step= 3000  batch_mse=0.023373
step= 3500  batch_mse=0.022734
step= 4000  batch_mse=0.021583
step= 4500  batch_mse=0.020154
step= 5000  batch_mse=0.018676
