# Unpickle X & Y; Train-Validation Split

In [15]:
from sklearn.model_selection import train_test_split
import pickle

X_combined, y_combined = pickle.load(open('Xy_combined.pkl', 'rb'))

In [16]:
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(X_combined, y_combined, test_size=0.05, random_state=42)
X_train_combined, X_va_combined, y_train_combined, y_va_combined = train_test_split(X_train_combined, y_train_combined, test_size=0.05, random_state=42)

# Baseline without Sequential Inputs

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# get the baseline mae again
model_combined = LinearRegression()
model_combined.fit(X_train_combined, y_train_combined)
y_pred_combined = model_combined.predict(X_test_combined)
mae_combined = mean_absolute_error(y_test_combined, y_pred_combined)
print(f"Baseline MAE with route features: {mae_combined:.2f} seconds")

Baseline MAE with route features: 213.83 seconds


In [18]:
from xgboost import XGBRegressor
# Train XGBoost model with route features
xgb_model_combined = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4, random_state=42)
xgb_model_combined.fit(X_train_combined, y_train_combined)
y_pred_combined = xgb_model_combined.predict(X_test_combined)
print("XGBoost with Route Features MAE:", mean_absolute_error(y_test_combined, y_pred_combined))

XGBoost with Route Features MAE: 197.36950778035285


# Transformer - Preprocess

In [19]:
import torch

padded_routes = pickle.load(open('padded_routes.pkl', 'rb'))
encoded_routes = pickle.load(open('encoded_routes.pkl', 'rb'))
attention_mask = torch.tensor([
    [0]+ [1]*len(r) + [0]*(padded_routes.shape[1] - len(r)) for r in encoded_routes
])  # (B, L+1) due to the added start token

In [20]:
len(padded_routes)

16470

In [21]:
import numpy as np
# Sequence input: (B, N, 2) where B=batch, N=#points, 2=(lat, lon)
seq_input = torch.tensor(padded_routes, dtype=torch.float32)  # shape: (B, N, 2)

print(f"Scalars used: {len(X_combined.columns)}")# Scalar input: (B, F)
scalar_input = torch.tensor(X_combined.values, dtype=torch.float32)  # shape: (B, F)

# Output: ETA in seconds
eta_target = torch.tensor(y_combined, dtype=torch.float32)  # shape: (B,)


Scalars used: 22


In [22]:
#take 5% of seq,scalar, and eta for validation
seq_input, seq_test, scalar_input, scalar_test, eta_target, eta_test, attention_mask, attention_mask_test = train_test_split(
    seq_input, scalar_input, eta_target, attention_mask, test_size=0.05, random_state=42)
seq_input, seq_va, scalar_input, scalar_va, eta_target, eta_va, attention_mask, attention_mask_va = train_test_split(
    seq_input, scalar_input, eta_target, attention_mask, test_size=0.05, random_state=42)

In [23]:
seq_input.shape, scalar_input.shape, eta_target.shape, attention_mask.shape

(torch.Size([14863, 114, 2]),
 torch.Size([14863, 22]),
 torch.Size([14863]),
 torch.Size([14863, 115]))

# Transformer - Define

In [24]:
import torch
import torch.nn as nn

class ETA_Transformer(nn.Module):
    def __init__(self, seq_dim=2, scalar_dim=22, d_model=256, nhead=8, num_layers=3):
        super().__init__()
        self.input_proj = nn.Linear(seq_dim, d_model)
        self.pos_encoding = nn.Parameter(torch.randn(500, d_model))  # max 500 points
        
        encoder_layer = nn.TransformerEncoderLayer(dim_feedforward=1024,d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.scalar_proj = nn.Linear(scalar_dim, d_model)
        self.output_head = nn.Sequential(
            nn.Linear(d_model * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
            # nn.ReLU(),
            # nn.Linear(16, 1)  # Final output for ETA prediction
        )

        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))


    def forward(self, route_seq, scalar_feats, attention_mask):
        """
        route_seq: (B, N, 2)
        scalar_feats: (B, F)
        """
        B, N, _ = route_seq.size()
        x = self.input_proj(route_seq)  # (B, N, d_model)

        
        cls_tokens = self.cls_token.expand(B, 1, -1)  # (B, 1, d_model)
        x = torch.cat((cls_tokens, x), dim=1)  # (B, N+1, d_model)

        x = x + self.pos_encoding[:N+1]  # add positional encoding, N+1 because of [CLS] token

        x = self.transformer(x, src_key_padding_mask=(attention_mask == 0))  # (B, N, d_model)
        route_embedding = x[:, 0, :]  # [CLS] token output
        
        mask = attention_mask.unsqueeze(-1)  # (B, L, 1)
        masked_x = x * mask  # zero out padding
        route_embedding = masked_x.sum(1) / mask.sum(1)  # (B, d_model)
        #route_embedding = x.mean(dim=1)  # (B, d_model)

        scalar_embedding = self.scalar_proj(scalar_feats)  # (B, d_model)
        combined = torch.cat([route_embedding, scalar_embedding], dim=1)  # (B, 2*d_model)

        eta_pred = self.output_head(combined).squeeze(-1)  # (B,)
        return eta_pred


In [25]:
model = ETA_Transformer()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
loss_fn = nn.L1Loss()
epochs = 28

In [26]:
for epoch in range(epochs):
    model.train()
    #sample a 5% batch
    batch_size = int(0.05 * seq_input.shape[0])
    indices = torch.randperm(seq_input.shape[0])[:batch_size]
    seq_input_batch = seq_input[indices]
    scalar_input_batch = scalar_input[indices]
    eta_target_batch = eta_target[indices]
    attention_mask_batch = attention_mask[indices]

    pred = model(seq_input_batch, scalar_input_batch,attention_mask_batch)
    loss = loss_fn(pred, eta_target_batch)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.2f}")
    # print validation loss
    with torch.no_grad():
        model.eval()
        pred_test = model(seq_test, scalar_test, attention_mask_test)
        test_loss = loss_fn(pred_test, eta_test)
        print(f"Validation Loss: {test_loss.item():.2f}")


Epoch 1, Loss: 679.38
Validation Loss: 616.64
Epoch 2, Loss: 618.92
Validation Loss: 554.95
Epoch 3, Loss: 533.67
Validation Loss: 496.09
Epoch 4, Loss: 504.80
Validation Loss: 441.51
Epoch 5, Loss: 429.88
Validation Loss: 393.88
Epoch 6, Loss: 380.67
Validation Loss: 353.29
Epoch 7, Loss: 346.31
Validation Loss: 319.40
Epoch 8, Loss: 322.05
Validation Loss: 292.03
Epoch 9, Loss: 276.48
Validation Loss: 272.67
Epoch 10, Loss: 258.75
Validation Loss: 261.44
Epoch 11, Loss: 262.74
Validation Loss: 256.51
Epoch 12, Loss: 238.18
Validation Loss: 256.16
Epoch 13, Loss: 251.34
Validation Loss: 257.98
Epoch 14, Loss: 241.94
Validation Loss: 261.53
Epoch 15, Loss: 262.32
Validation Loss: 266.45
Epoch 16, Loss: 254.66
Validation Loss: 270.49
Epoch 17, Loss: 265.38
Validation Loss: 272.82
Epoch 18, Loss: 263.27
Validation Loss: 273.71
Epoch 19, Loss: 286.95
Validation Loss: 272.34
Epoch 20, Loss: 242.86
Validation Loss: 269.49
Epoch 21, Loss: 246.91
Validation Loss: 265.19
Epoch 22, Loss: 223.16

In [27]:
# save the model
import pickle
pickle.dump(model,open('trained_transformer.pkl', 'wb'))

# Performance eval

In [28]:
with torch.no_grad():
    model.eval()
    pred_va = model(seq_va, scalar_va, attention_mask_va)
    va_loss = loss_fn(pred_va, eta_va)
    print(f"Test Loss: {va_loss.item():.2f}")

Test Loss: 228.75
