In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances

# --- PARAMETERS ---
# Windowing: each sample feeds `history_len` consecutive rows to the model and
# asks it to predict the target station's next `pred_horizon` rows.
history_len = 48
pred_horizon = 4
# Number of nearest stations whose counts are appended as extra input features.
k_neighbors = 2
# Fractions of the sample list reserved for validation and for the final holdout;
# the remainder is used for training.
val_ratio = 0.15
holdout_ratio = 0.15
# Training loop settings. `epochs` and `patience` are read as module-level
# globals inside train_transformer (they are not passed as arguments).
epochs = 50
batch_size = 64
# Transformer model width (d_model); the feed-forward width is 4x this value.
hidden_dim = 128
learning_rate = 0.0005
# Early stopping: number of consecutive epochs without a validation
# improvement that are tolerated before training stops.
patience = 8

# --- LOAD DATA ---
# Paths are relative to the working directory the notebook is run from.
train_df = pd.read_csv('../data/bicikelj_train.csv')
meta = pd.read_csv('../data/bicikelj_metadata.csv')
# Every column after the first is treated as one station's series
# (the first column is presumably the timestamp -- confirm against the CSV).
station_cols = train_df.columns[1:]

# Turn anything non-numeric into NaN so it can be imputed below.
train_df[station_cols] = train_df[station_cols].apply(pd.to_numeric, errors='coerce')
# Impute gaps with the previous reading, then back-fill leading gaps.
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) spelling.
train_df[station_cols] = train_df[station_cols].ffill().bfill()
# Drop rows that still have no station value at all after imputation.
train_df = train_df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- NEIGHBOR DETECTION ---
# Map every station name to the names of its `k_neighbors` closest stations.
station_names = meta['name'].tolist()
# Coordinates are converted from degrees to radians before the haversine call.
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
# 6371 is presumably the Earth's radius in km, turning the pairwise result
# into kilometres -- only the ordering matters below.
dists = haversine_distances(coords, coords) * 6371

neighbors = {}
for row_idx, (station, row) in enumerate(zip(station_names, dists)):
    ranked = np.argsort(row)
    # Exclude the station itself, then keep the nearest k.
    closest = ranked[ranked != row_idx][:k_neighbors]
    neighbors[station] = [station_names[j] for j in closest]

# --- BUILD JOINT DATASET ---
class BicikeljDataset(Dataset):
    """Sliding-window forecasting samples for every selected station.

    For each station and each valid start position, one sample is produced:

    * input:  a ``(history_len, 1 + n_neighbors + 2)`` window whose columns are
      the station's own values, its neighbors' values (in the order given by
      ``neighbors``), the hour of day divided by 23 and the day of week
      divided by 6;
    * target: the station's next ``pred_horizon`` values.

    Samples are stored station by station, and in time order within a station.

    Args:
        df: frame with a ``'timestamp'`` column and one column per station.
        station_cols: station column names to read from ``df``.
        neighbors: mapping from station name to a list of neighbor station
            names; every neighbor must also appear in ``station_cols``.
        history_len: number of past rows in each input window.
        pred_horizon: number of future rows in each target.
        station_filter: optional container of station names; when given, only
            those stations produce samples.

    Raises:
        KeyError: if a station is missing from ``neighbors`` or a neighbor is
            not one of ``station_cols``.
    """

    def __init__(self, df, station_cols, neighbors, history_len, pred_horizon, station_filter=None):
        self.samples = []
        timestamps = pd.to_datetime(df['timestamp'])
        hours = (timestamps.dt.hour / 23.0).values
        dows = (timestamps.dt.dayofweek / 6.0).values
        bikes = df[station_cols].values.astype(float)

        name_to_idx = {name: i for i, name in enumerate(station_cols)}
        n_rows = len(df)

        for s_name in station_cols:
            if station_filter is not None and s_name not in station_filter:
                continue
            s_idx = name_to_idx[s_name]
            nn_idx = [name_to_idx[nb] for nb in neighbors[s_name]]
            # The per-station feature matrix does not depend on the window
            # position, so build it once and slice windows out of it instead
            # of re-assembling every row for every window.
            features = np.column_stack([bikes[:, s_idx], bikes[:, nn_idx], hours, dows])
            for i in range(history_len, n_rows - pred_horizon + 1):
                seq = features[i - history_len:i]
                target = bikes[i:i + pred_horizon, s_idx]
                self.samples.append((seq, target))

    def __len__(self):
        """Return the total number of (window, target) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of float32 tensors (input, target)."""
        x, y = self.samples[idx]
        # torch.tensor copies, so the stored numpy views are never aliased.
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- TRANSFORMER MODEL ---
class TimeSeriesTransformer(nn.Module):
    """Encoder-only transformer mapping a feature window to a forecast vector.

    Pipeline: per-step linear embedding -> stacked transformer encoder layers
    -> average over the time axis -> LayerNorm / ReLU / Linear head.

    Args:
        input_dim: number of features per time step.
        hidden_dim: model width (``d_model``); the feed-forward width is 4x this.
        n_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        output_dim: size of the prediction vector.
        dropout: dropout rate used inside the encoder layers.
    """

    def __init__(self, input_dim, hidden_dim, n_heads, num_layers, output_dim, dropout=0.1):
        super().__init__()
        # Lift each time step's raw features to the model width.
        self.input_proj = nn.Linear(in_features=input_dim, out_features=hidden_dim)

        layer_template = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=n_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer_template, num_layers=num_layers)

        # Prediction head applied to the pooled sequence representation.
        head_layers = [
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a ``[B, T, input_dim]`` batch to ``[B, output_dim]`` predictions."""
        embedded = self.input_proj(x)           # [B, T, hidden]
        encoded = self.transformer(embedded)    # [B, T, hidden]
        pooled = encoded.mean(dim=1)            # average over time -> [B, hidden]
        return self.fc(pooled)

# --- TRAINING FUNCTION ---
def train_transformer(train_loader, val_loader, input_dim, output_dim, hidden_dim, n_heads, num_layers, lr, device):
    """Train a TimeSeriesTransformer with Adam + MSE and early stopping.

    The number of epochs and the early-stopping tolerance are read from the
    module-level ``epochs`` and ``patience`` variables.

    Args:
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        input_dim: number of features per time step.
        output_dim: size of the prediction vector.
        hidden_dim: transformer model width.
        n_heads: attention heads per encoder layer.
        num_layers: number of encoder layers.
        lr: Adam learning rate.
        device: torch device the model and batches are moved to.

    Returns:
        The trained model, with the weights from the epoch that achieved the
        lowest mean validation loss restored. If the validation loss never
        improved (e.g. an empty ``val_loader``), the weights from the last
        epoch are kept.
    """
    model = TimeSeriesTransformer(input_dim, hidden_dim, n_heads, num_layers, output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    best_val_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                loss = loss_fn(preds, yb)
                val_losses.append(loss.item())

        # Epoch metrics are unweighted means of the per-batch losses.
        avg_train = np.mean(train_losses)
        avg_val = np.mean(val_losses)
        print(f"Epoch {epoch+1:03d} | Train: {avg_train:.4f} | Val: {avg_val:.4f}")

        if avg_val < best_val_loss:
            best_val_loss = avg_val
            # state_dict() hands back references to the live tensors, which
            # later optimizer steps would overwrite in place; clone them so
            # the snapshot really is the best epoch's weights.
            best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping.")
                break

    # best_state stays None when no epoch improved on the initial infinity.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# --- MAIN SCRIPT ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build full dataset (all stations, no station_filter)
full_dataset = BicikeljDataset(train_df, station_cols, neighbors, history_len, pred_horizon)

# Split into train/val/holdout by contiguous index ranges.
# NOTE(review): BicikeljDataset appends samples station by station, so these
# ranges follow station order rather than time -- the later splits consist
# mostly of the last stations' windows. Confirm this is the intended scheme.
total = len(full_dataset)
train_end = int(total * (1 - val_ratio - holdout_ratio))
val_end = int(total * (1 - holdout_ratio))
train_set = torch.utils.data.Subset(full_dataset, range(0, train_end))
val_set = torch.utils.data.Subset(full_dataset, range(train_end, val_end))
holdout_set = torch.utils.data.Subset(full_dataset, range(val_end, total))

# Only the training loader is shuffled; evaluation loaders keep dataset order.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size)
holdout_loader = DataLoader(holdout_set, batch_size=batch_size)

# Infer input/output dims from the first sample:
# features per time step and number of predicted steps.
sample_x, sample_y = full_dataset[0]
input_dim = sample_x.shape[1]
output_dim = sample_y.shape[0]

# Train (returns the model with the selected weights loaded)
model = train_transformer(train_loader, val_loader, input_dim=input_dim, output_dim=output_dim,
                          hidden_dim=hidden_dim, n_heads=4, num_layers=2, lr=learning_rate, device=device)

# Holdout evaluation: gradient-free pass over the held-out samples.
model.eval()
holdout_losses = []
with torch.no_grad():
    for xb, yb in holdout_loader:
        xb, yb = xb.to(device), yb.to(device)
        preds = model(xb)
        loss = nn.functional.mse_loss(preds, yb)
        holdout_losses.append(loss.item())

# Reported value is the unweighted mean of the per-batch MSEs.
print(f"\nHoldout MSE: {np.mean(holdout_losses):.4f}")


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")


KeyboardInterrupt: 