In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import haversine_distances

# Station metadata; latitude/longitude are converted to radians because
# haversine_distances expects radians.
meta = pd.read_csv('bicikelj_metadata.csv')
coords = np.deg2rad(meta[['latitude', 'longitude']].values)

# Great-circle distance between every pair of stations, scaled by 6371 (Earth radius in km).
dists = haversine_distances(coords, coords) * 6371

# Edge weight decays exponentially with distance; sigma sets how fast (try 0.5-2.0).
sigma = 1.0
adj = np.exp(-dists / sigma)

# Self-edges are pinned to 1.
np.fill_diagonal(adj, 1.0)

# Keep only the k + 1 strongest entries of each row (the self-edge plus k neighbours).
k = 4
for i, row in enumerate(adj):
    weakest = np.argsort(row)[:-k-1]
    adj[i, weakest] = 0


In [None]:
# Row-normalise the adjacency; the 1e-8 guards against division by zero for an all-zero row.
row_totals = adj.sum(axis=1, keepdims=True)
adj = adj / (row_totals + 1e-8)

In [None]:
# Alternative adjacency: a binary edge wherever the pairwise distance is below 1.0,
# self-edges forced on, then each row divided by its sum.
adj = np.where(dists < 1.0, 1.0, 0.0)
np.fill_diagonal(adj, 1.0)
row_totals = adj.sum(axis=1, keepdims=True)
adj = adj / (row_totals + 1e-8)


# STGCN

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# ------------------------- MODEL DEFINITIONS -------------------------

class GraphConv(nn.Module):
    """Mix node features through a fixed adjacency matrix, then apply a shared linear layer.

    The adjacency is stored as a (non-trainable) buffer named ``adj``; the linear
    layer ``lin`` maps ``in_features`` to ``out_features``.
    """

    def __init__(self, in_features, out_features, adj):
        super().__init__()
        self.register_buffer('adj', torch.FloatTensor(adj))
        self.lin = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return ``lin(adj @ x)`` for ``x`` indexed as [batch, node, feature]."""
        weights = self.adj.to(x.device)
        # out[b, i, f] = sum_j weights[i, j] * x[b, j, f]
        mixed = torch.einsum('ij,bjf->bif', weights, x)
        return self.lin(mixed)

class STGCNBlock(nn.Module):
    """One spatio-temporal block: temporal convolution, ReLU, graph convolution, ReLU.

    The temporal convolution is a ``Conv2d`` with a (3, 1) kernel and (1, 0)
    padding, so it slides along the sequence axis only and keeps its length.
    """

    def __init__(self, in_features, out_features, adj):
        super().__init__()
        self.temp_conv = nn.Conv2d(
            in_channels=in_features, out_channels=out_features,
            kernel_size=(3, 1), padding=(1, 0)
        )
        self.gconv = GraphConv(out_features, out_features, adj)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map [batch, seq_len, nodes, in_features] to [batch, seq_len, nodes, out_features]."""
        x = x.permute(0, 3, 1, 2)  # [batch, in_features, seq_len, nodes]
        x = self.relu(self.temp_conv(x))  # [batch, out_features, seq_len, nodes]
        x = x.permute(0, 2, 3, 1)  # [batch, seq_len, nodes, out_features]

        # The graph convolution treats every time step independently, so fold
        # time into the batch axis and run it once instead of looping in Python.
        batch, steps, nodes, channels = x.shape
        x = self.gconv(x.reshape(batch * steps, nodes, channels))
        x = x.reshape(batch, steps, nodes, x.shape[-1])
        return self.relu(x)

class STGCN(nn.Module):
    """Single-block STGCN followed by a per-node linear read-out.

    ``num_nodes`` is accepted but not used by the layers built here. A second
    STGCNBlock was previously stacked after the first and has been disabled
    to keep the model small.
    """

    def __init__(self, adj, num_nodes, input_len=48, output_len=4, in_features=3, hidden=8):
        super().__init__()
        self.block1 = STGCNBlock(in_features, hidden, adj)
        # Read-out consumes the flattened (time x hidden) vector of each node.
        self.fc = nn.Linear(input_len * hidden, output_len)

    def forward(self, x):
        """Return predictions shaped [batch, output_len, nodes]."""
        hidden_seq = self.block1(x)
        batch, steps, nodes, width = hidden_seq.shape
        # Bring nodes ahead of time, then flatten time and hidden channels together.
        per_node = hidden_seq.transpose(1, 2).reshape(batch, nodes, steps * width)
        return self.fc(per_node).permute(0, 2, 1)


# ------------------------- DATASET & UTILS -------------------------

class TimeSeriesDataset(Dataset):
    """Pairs of (input window, target window) held as float32 tensors."""

    def __init__(self, X, y):
        self.X, self.y = torch.FloatTensor(X), torch.FloatTensor(y)

    def __len__(self):
        # Number of samples is the leading dimension of the inputs.
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

def train_val_split(X, y, val_ratio=0.2):
    """Split X and y in order (no shuffling) into a leading train part and a trailing validation part.

    Returns ``(X_train, X_val, y_train, y_val)``; the cut index is
    ``int(N * (1 - val_ratio))`` with ``N = X.shape[0]``.
    """
    cut = int(X.shape[0] * (1 - val_ratio))
    return X[:cut], X[cut:], y[:cut], y[cut:]

def mse(y_true, y_pred):
    """Mean of the element-wise squared difference between ``y_true`` and ``y_pred``."""
    squared_error = (y_true - y_pred) ** 2
    return squared_error.mean()

# ------------------------- FEATURE ENGINEERING -------------------------

def add_time_features(df, window_size, station_cols, pred_horizon=4):
    """Build sliding-window inputs and targets from a station-count table.

    Args:
        df: DataFrame with a ``'timestamp'`` column and one column per station.
        window_size: Number of consecutive rows in each input window.
        station_cols: Station column names, in the order used for the node axis.
        pred_horizon: Number of rows following each window used as the target
            (defaults to 4, the value that was previously hard-coded).

    Returns:
        ``(X, y)`` where ``X`` is [samples, window_size, stations, 3] with the
        last axis holding (count, hour / 23, day-of-week / 6), and ``y`` is
        [samples, pred_horizon, stations]. Both are empty arrays when ``df``
        is too short to hold a single window plus horizon.
    """
    station_cols = list(station_cols)
    n_stations = len(station_cols)
    n_samples = len(df) - window_size - pred_horizon + 1

    # Convert and parse once for the whole frame instead of once per window.
    bikes = df[station_cols].astype(float).to_numpy()
    timestamps = pd.to_datetime(df['timestamp'])
    hour_of_day = timestamps.dt.hour.to_numpy()[:, None] / 23.0
    day_of_week = timestamps.dt.dayofweek.to_numpy()[:, None] / 6.0
    hour_feat = np.tile(hour_of_day, (1, n_stations))
    dow_feat = np.tile(day_of_week, (1, n_stations))
    all_features = np.stack([bikes, hour_feat, dow_feat], axis=-1)  # [rows, stations, 3]

    X, y = [], []
    for start in range(n_samples):  # empty range when n_samples <= 0
        stop = start + window_size
        X.append(all_features[start:stop])
        y.append(bikes[stop:stop + pred_horizon])
    return np.array(X), np.array(y)

# ------------------------- TRAINING LOOP -------------------------

def train_stgcn_with_val(X_train, y_train, X_val, y_val, adj,
                         num_epochs=100, lr=0.001, patience=10, batch_size=32):
    """Train an STGCN with Adam + MSE loss and early stopping on validation loss.

    Args:
        X_train, X_val: Inputs indexed as [samples, input_len, nodes, features];
            the model dimensions are read from ``X_train.shape[1:]``.
        y_train, y_val: Targets; ``y_train.shape[1]`` is used as the output length.
        adj: Adjacency matrix handed to the STGCN constructor.
        num_epochs: Maximum number of epochs.
        lr: Adam learning rate.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.
        batch_size: Mini-batch size for both loaders.

    Returns:
        The model (on CUDA when available, else CPU) carrying the weights of the
        epoch with the lowest mean validation loss. If the validation loss never
        improved (for example it was NaN throughout), the final weights are kept.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    num_nodes = X_train.shape[2]
    in_features = X_train.shape[3]
    input_len = X_train.shape[1]
    output_len = y_train.shape[1]
    model = STGCN(adj, num_nodes, input_len, output_len, in_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    train_dataset = TimeSeriesDataset(X_train, y_train)
    val_dataset = TimeSeriesDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)

    best_val_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = loss_fn(out, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                out = model(xb)
                val_loss = loss_fn(out, yb)
                val_losses.append(val_loss.item())
        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        print(f"Epoch {epoch+1}/{num_epochs} "
              f"Train Loss: {avg_train_loss:.4f} "
              f"Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live tensors, which the
            # optimizer keeps updating in place; clone them so the snapshot
            # really is the best epoch rather than whatever epoch ran last.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # best_state stays None when no epoch ever improved on +inf (e.g. NaN losses).
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def predict_batches(model, X, batch_size=32, device='cpu'):
    """Run ``model`` over ``X`` in order, batch by batch, and return the stacked outputs.

    The model is switched to eval mode and gradients are disabled. Each batch
    is moved to ``device`` before the forward pass and the result is brought
    back to the CPU as a NumPy array.
    """
    model.eval()
    inputs = torch.utils.data.TensorDataset(torch.FloatTensor(X))
    batches = DataLoader(inputs, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        outputs = [model(batch.to(device)).cpu().numpy() for (batch,) in batches]
    return np.concatenate(outputs, axis=0)

# ------------------------- TEST SET PREDICTION & EXPORT -------------------------

def predict_and_fill(model, test_df, adj, station_cols, device='cpu',
                     bike_range=None, history_len=48, pred_horizon=4):
    """Fill missing station counts in ``test_df`` with model forecasts and write ``napovedi.csv``.

    The frame is scanned top to bottom. Whenever a row with at least one
    missing station value is met and ``history_len`` earlier rows exist, those
    rows form the model input and the forecast is written into the missing
    cells of the next ``pred_horizon`` rows; scanning then resumes after them.

    Args:
        model: Trained network mapping [1, history_len, stations, 3] to
            [1, pred_horizon, stations].
        test_df: Frame with a ``'timestamp'`` column and the station columns.
            Missing values may be NaN or empty strings. Rows are addressed by
            position.
        adj: Unused; kept so existing callers do not break.
        station_cols: Station column names in the model's node order.
        device: Device the model lives on.
        bike_range: Optional ``(bike_min, bike_max)`` used to normalise the
            training counts. When given, inputs are scaled with the same
            formula before the forward pass and forecasts are mapped back to
            raw counts before being stored. ``None`` (default) disables scaling.
        history_len: Number of history rows fed to the model (default 48).
        pred_horizon: Number of rows filled per forecast (default 4).

    Side effects:
        Writes ``napovedi.csv`` (header row plus every row that originally
        contained a missing value) and prints a confirmation line.
    """
    station_cols = list(station_cols)
    n_stations = len(station_cols)
    filled_test = test_df.copy()
    mask_missing = test_df[station_cols].isnull() | (test_df[station_cols] == '')
    rows_with_holes = mask_missing.any(axis=1)

    # Positional views: the scan below works with row positions, not index labels.
    missing = mask_missing.to_numpy()
    has_hole = rows_with_holes.to_numpy()
    col_pos = [filled_test.columns.get_loc(station) for station in station_cols]

    header = list(test_df.columns)
    legend_row = pd.DataFrame([header], columns=header)

    # Same min-max formula as the training pipeline; identity when no range is given.
    if bike_range is None:
        offset, span = 0.0, 1.0
    else:
        offset = float(bike_range[0])
        span = float(bike_range[1]) - offset + 1e-8

    # Time features depend only on the timestamp column, so compute them once.
    timestamps = pd.to_datetime(test_df['timestamp'])
    hour_of_day = timestamps.dt.hour.to_numpy()[:, None] / 23.0
    day_of_week = timestamps.dt.dayofweek.to_numpy()[:, None] / 6.0

    model.eval()
    n_rows = len(test_df)
    i = 0
    while i < n_rows:
        obs_idx = i - history_len
        if not has_hole[i] or obs_idx < 0:
            # Either nothing to fill here or not enough history before this row.
            i += 1
            continue

        window = filled_test.iloc[obs_idx:i]
        bikes = (window[station_cols].astype(float).to_numpy() - offset) / span
        hour_feat = np.tile(hour_of_day[obs_idx:i], (1, n_stations))
        dow_feat = np.tile(day_of_week[obs_idx:i], (1, n_stations))
        features = np.stack([bikes, hour_feat, dow_feat], axis=-1)

        with torch.no_grad():
            inp = torch.FloatTensor(features[None]).to(device)  # [1, history_len, stations, 3]
            pred = model(inp).cpu().numpy()[0]  # [pred_horizon, stations]
        pred = pred * span + offset  # back to raw counts

        for j in range(min(pred_horizon, pred.shape[0])):
            row_idx = i + j
            if row_idx >= n_rows:
                break
            for k in range(n_stations):
                if missing[row_idx, k]:
                    filled_test.iat[row_idx, col_pos[k]] = float(pred[j, k])
        i += pred_horizon

    # Output only rows where there were missing predictions
    final_df = filled_test[rows_with_holes]
    final_with_legend = pd.concat([legend_row, final_df], ignore_index=True)
    final_with_legend.to_csv('napovedi.csv', index=False, header=False)
    print("Saved napovedi.csv with legend and only rows containing predictions.")

# ------------------------- MAIN PIPELINE -------------------------

if __name__ == '__main__':
    # --- Load your training data ---
    train_df = pd.read_csv('bicikelj_train.csv')
    station_cols = list(train_df.columns)[1:]  # skip timestamp

    # --- Simple adjacency: identity (replace with real graph for best results) ---
    adj = np.eye(len(station_cols))

    # --- Feature engineering for train ---
    # Coerce to numbers first so min/max are computed on numeric data, then
    # fill gaps once for the whole frame (not once per column).
    train_df[station_cols] = train_df[station_cols].apply(pd.to_numeric, errors='coerce')
    bike_min = train_df[station_cols].min().min()
    bike_max = train_df[station_cols].max().max()
    train_df[station_cols] = train_df[station_cols].fillna(0)

    # Min-max normalise all station columns with one global range.
    train_df[station_cols] = (train_df[station_cols] - bike_min) / (bike_max - bike_min + 1e-8)

    X, y = add_time_features(train_df, window_size=48, station_cols=station_cols)

    # --- Chronological split ---
    X_train, X_val, y_train, y_val = train_val_split(X, y, val_ratio=0.2)
    print("Any NaN in X_train?", np.isnan(X_train).any())
    print("Any NaN in y_train?", np.isnan(y_train).any())
    print("X_train min/max", X_train.min(), X_train.max())
    print("y_train min/max", y_train.min(), y_train.max())

    # --- Train with early stopping ---
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = train_stgcn_with_val(
        X_train, y_train, X_val, y_val, adj,
        num_epochs=100, patience=10, batch_size=64)

    # --- Validation MSE (in normalised units, like the training loss) ---
    val_pred = predict_batches(model, X_val, batch_size=64, device=device)
    val_mse = mse(y_val, val_pred)
    print(f'Validation MSE: {val_mse:.4f}')

    # --- Test set prediction and export ---
    # The model was trained on normalised counts, so pass the training range:
    # test inputs get the same scaling and forecasts are written as raw counts.
    test_df = pd.read_csv('bicikelj_test.csv', dtype=str)
    predict_and_fill(model, test_df, adj, station_cols, device=device,
                     bike_range=(bike_min, bike_max))


Any NaN in X_train? False
Any NaN in y_train? False
X_train min/max 0.0 1.0
y_train min/max 0.0 0.9999999996153845
Epoch 1/100 Train Loss: 0.0323 Val Loss: 0.0202
Epoch 2/100 Train Loss: 0.0178 Val Loss: 0.0179
Epoch 3/100 Train Loss: 0.0168 Val Loss: 0.0174
Epoch 4/100 Train Loss: 0.0165 Val Loss: 0.0172
Epoch 5/100 Train Loss: 0.0164 Val Loss: 0.0171
Epoch 6/100 Train Loss: 0.0163 Val Loss: 0.0171
Epoch 7/100 Train Loss: 0.0162 Val Loss: 0.0170
Epoch 8/100 Train Loss: 0.0162 Val Loss: 0.0169
Epoch 9/100 Train Loss: 0.0161 Val Loss: 0.0168
Epoch 10/100 Train Loss: 0.0161 Val Loss: 0.0168
Epoch 11/100 Train Loss: 0.0160 Val Loss: 0.0170
Epoch 12/100 Train Loss: 0.0160 Val Loss: 0.0167
Epoch 13/100 Train Loss: 0.0160 Val Loss: 0.0169
Epoch 14/100 Train Loss: 0.0159 Val Loss: 0.0170
Epoch 15/100 Train Loss: 0.0159 Val Loss: 0.0166
Epoch 16/100 Train Loss: 0.0159 Val Loss: 0.0166
Epoch 17/100 Train Loss: 0.0159 Val Loss: 0.0167
Epoch 18/100 Train Loss: 0.0159 Val Loss: 0.0166
Epoch 19/100

# MLP Per station

In [None]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Feed-forward regressor: two ReLU hidden layers of equal width and a linear output."""

    def __init__(self, in_features, hidden=64, out_features=4):
        super().__init__()
        layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_features),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x``."""
        return self.net(x)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import haversine_distances
import torch
from torch.utils.data import Dataset, DataLoader

# --- Parameters ---
history_len = 48      # rows of history fed to each model
pred_horizon = 4      # rows predicted per sample
k_neighbors = 2       # nearest stations whose history is appended to the features
hidden_dim = 64
epochs = 25
batch_size = 128
learning_rate = 0.001

# --- Load data ---
train_df = pd.read_csv('bicikelj_train.csv')
test_df = pd.read_csv('bicikelj_test.csv')
meta = pd.read_csv('bicikelj_metadata.csv')

station_cols = train_df.columns[1:]  # every column after the first (timestamp)
for col in station_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
# Forward-fill then back-fill gaps. DataFrame.ffill()/bfill() replace the
# deprecated fillna(method=...) spelling, which emits a FutureWarning.
train_df[station_cols] = train_df[station_cols].ffill().bfill()
train_df = train_df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- Neighbor detection ---
coords = np.deg2rad(meta[['latitude', 'longitude']].values)  # haversine expects radians
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371  # km
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    # Closest stations first, excluding the station itself.
    nn_idx = [j for j in order if j != i][:k_neighbors]
    neighbors[name] = [station_names[j] for j in nn_idx]


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")


In [None]:
class BikeDataset(Dataset):
    """Feature/target pairs stored as float32 tensors."""

    def __init__(self, features, targets):
        as_float32 = dict(dtype=torch.float32)
        self.X = torch.tensor(features, **as_float32)
        self.y = torch.tensor(targets, **as_float32)

    def __len__(self):
        # One sample per row of the feature tensor.
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample


In [None]:
# Per-station training samples built directly from the DataFrame.
# NOTE(review): the recorded output shows this cell was interrupted; a later
# cell rebuilds the same two dictionaries from a NumPy array.
features_dict = {}
targets_dict = {}

# Parse the timestamp column once instead of twice per sample.
timestamps = pd.to_datetime(train_df['timestamp'])

for station in station_cols:
    features = []
    targets = []
    nn_stations = neighbors[station]
    for i in range(history_len, len(train_df) - pred_horizon + 1):
        own_hist = train_df[station].iloc[i-history_len:i].values.astype(float)
        # The loop variable is deliberately not called `nn`: at cell level that
        # would rebind the `torch.nn` alias to a station name.
        nn_hist = [
            train_df[neighbor].iloc[i-history_len:i].values.astype(float)
            for neighbor in nn_stations
        ]
        nn_hist = np.concatenate(nn_hist) if nn_hist else np.zeros(0)
        # Time features describe the first predicted row.
        hour = timestamps.iloc[i].hour / 23.0
        dow = timestamps.iloc[i].dayofweek / 6.0
        f = np.concatenate([own_hist, nn_hist, [hour, dow]])
        features.append(f)
        target = train_df[station].iloc[i:i+pred_horizon].values.astype(float)
        targets.append(target)
    features_dict[station] = np.array(features)
    targets_dict[station] = np.array(targets)


KeyboardInterrupt: 

In [None]:
# Hour-of-day and day-of-week for every training row, each scaled into [0, 1].
timestamps = pd.to_datetime(train_df['timestamp'])
hour_scaled = timestamps.dt.hour / 23.0
dow_scaled = timestamps.dt.dayofweek / 6.0
hours, dows = hour_scaled.values, dow_scaled.values


In [None]:
# Per-station training samples built from a single NumPy array of counts.
bikes = train_df[station_cols].values.astype(float)
timestamps = pd.to_datetime(train_df['timestamp'])
hours = (timestamps.dt.hour / 23.0).values
dows = (timestamps.dt.dayofweek / 6.0).values
features_dict = {}
targets_dict = {}

# Column positions of each station's neighbours inside `bikes`.
neighbor_indices = {station: [station_cols.get_loc(nn) for nn in neighbors[station]] for station in station_cols}

for s_idx, station in enumerate(station_cols):
    features = []
    targets = []
    nn_idx = neighbor_indices[station]
    for i in range(history_len, len(bikes) - pred_horizon + 1):
        own_hist = bikes[i-history_len:i, s_idx]
        # Lay the neighbour history out neighbour by neighbour (all of
        # neighbour 0, then all of neighbour 1, ...). That is the order the
        # test-time cell builds with np.concatenate; a plain .flatten() on the
        # (history_len, k) slice would interleave the neighbours instead.
        nn_hist = bikes[i-history_len:i][:, nn_idx].T.ravel()  # length: k_neighbors * history_len
        hour = hours[i]
        dow = dows[i]
        f = np.concatenate([own_hist, nn_hist, [hour, dow]])
        features.append(f)
        targets.append(bikes[i:i+pred_horizon, s_idx])
    features_dict[station] = np.array(features)
    targets_dict[station] = np.array(targets)


In [None]:
# Fit one MLP per station with Adam + MSE and keep the trained models on the CPU.
models = {}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
epochs = 50

for station in station_cols:
    X, y = features_dict[station], targets_dict[station]
    dataset = BikeDataset(X, y)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    in_features = X.shape[1]
    model = MLP(in_features=in_features, hidden=hidden_dim, out_features=pred_horizon).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    model.train()
    final_epoch = epochs - 1
    for epoch in range(epochs):
        losses = []
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            batch_loss = loss_fn(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()
            losses.append(batch_loss.item())
        # Report every fifth epoch and always the last one.
        should_report = epoch == final_epoch or epoch % 5 == 0
        if should_report:
            print(f"Station: {station} | Epoch {epoch+1}/{epochs} | Loss: {np.mean(losses):.4f}")
    models[station] = model.cpu()  # Save to CPU for later use


Station: LIDL BEŽIGRAD | Epoch 1/50 | Loss: 17.4032
Station: LIDL BEŽIGRAD | Epoch 6/50 | Loss: 10.9166
Station: LIDL BEŽIGRAD | Epoch 11/50 | Loss: 10.0934
Station: LIDL BEŽIGRAD | Epoch 16/50 | Loss: 9.2770
Station: LIDL BEŽIGRAD | Epoch 21/50 | Loss: 8.5763
Station: LIDL BEŽIGRAD | Epoch 26/50 | Loss: 8.0435
Station: LIDL BEŽIGRAD | Epoch 31/50 | Loss: 7.5349
Station: LIDL BEŽIGRAD | Epoch 36/50 | Loss: 7.1985
Station: LIDL BEŽIGRAD | Epoch 41/50 | Loss: 6.8938
Station: LIDL BEŽIGRAD | Epoch 46/50 | Loss: 6.5745
Station: LIDL BEŽIGRAD | Epoch 50/50 | Loss: 6.5241
Station: ŠMARTINSKI PARK | Epoch 1/50 | Loss: 15.9760
Station: ŠMARTINSKI PARK | Epoch 6/50 | Loss: 7.6463
Station: ŠMARTINSKI PARK | Epoch 11/50 | Loss: 7.2727
Station: ŠMARTINSKI PARK | Epoch 16/50 | Loss: 7.1046
Station: ŠMARTINSKI PARK | Epoch 21/50 | Loss: 6.8524
Station: ŠMARTINSKI PARK | Epoch 26/50 | Loss: 6.7038
Station: ŠMARTINSKI PARK | Epoch 31/50 | Loss: 6.4399
Station: ŠMARTINSKI PARK | Epoch 36/50 | Loss: 6.1

In [None]:
# Walk the test frame in strides of (history_len + pred_horizon) rows: the
# first history_len rows of each stride are the input window, the following
# pred_horizon rows are filled wherever a station value is missing.
test_pred = test_df.copy()
test_pred[station_cols] = test_pred[station_cols].astype(str)
i = 0
while i < len(test_df):
    window = test_df.iloc[i:i+history_len]
    pred_start = i + history_len
    if pred_start + pred_horizon > len(test_df):
        break
    to_pred = test_df.iloc[pred_start:pred_start+pred_horizon]
    mask = to_pred[station_cols].isnull() | (to_pred[station_cols] == '')
    if mask.values.any():
        # Time features come from the first row to predict and are shared by all stations.
        first_ts = pd.to_datetime(to_pred['timestamp'].iloc[0])
        hour = first_ts.hour / 23.0
        dow = first_ts.dayofweek / 6.0
        for station in station_cols:
            if not mask[station].any():
                continue
            own_hist = window[station].values.astype(float)
            # Loop variable is not named `nn`: at cell level that would rebind
            # the `torch.nn` alias to a station name.
            nn_hist = [window[neighbor].values.astype(float) for neighbor in neighbors[station]]
            nn_hist = np.concatenate(nn_hist) if nn_hist else np.zeros(0)
            f = np.concatenate([own_hist, nn_hist, [hour, dow]])[None, :]
            # Predict using PyTorch model
            model = models[station]
            model.eval()  # inference mode, regardless of how the training cell left it
            with torch.no_grad():
                pred = model(torch.tensor(f, dtype=torch.float32)).numpy().flatten()
            for h in range(pred_horizon):
                if mask[station].iloc[h]:
                    test_pred.loc[pred_start + h, station] = pred[h]
    i += history_len + pred_horizon


In [None]:
# Export only the rows that originally had a missing station value,
# preceded by a row repeating the column names.
missing_cells = test_df[station_cols].isnull() | (test_df[station_cols] == '')
rows_with_preds = missing_cells.any(axis=1)
header = pd.DataFrame([test_df.columns], columns=test_df.columns)
predicted_rows = test_pred[rows_with_preds]
final = pd.concat([header, predicted_rows], ignore_index=True)
final.to_csv("pytorch_per_station.csv", index=False, header=False)
print("Predictions saved as pytorch_per_station.csv")


Predictions saved as pytorch_per_station.csv


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances

# --- PARAMETERS ---
history_len = 48       # rows of history per sample
pred_horizon = 4       # rows predicted per sample
k_neighbors = 2        # nearest stations appended to the features
val_ratio = 0.15       # share of samples used for validation
holdout_ratio = 0.15   # share of samples kept as a final hold-out
epochs = 80
batch_size = 32
hidden_dim = 64
dropout_p = 0.3
learning_rate = 0.001
patience = 10          # early-stopping patience in epochs

# --- LOAD AND CLEAN DATA ---
train_df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = train_df.columns[1:]  # every column after the first (timestamp)

for col in station_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
# Forward-fill then back-fill gaps. DataFrame.ffill()/bfill() replace the
# deprecated fillna(method=...) spelling, which emits a FutureWarning.
train_df[station_cols] = train_df[station_cols].ffill().bfill()
train_df = train_df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- NEIGHBOR DETECTION ---
coords = np.deg2rad(meta[['latitude', 'longitude']].values)  # haversine expects radians
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    # Closest stations first, excluding the station itself.
    nn_idx = [j for j in order if j != i][:k_neighbors]
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- FEATURE & TARGET GENERATION ---
def make_features_targets(df, station_cols, neighbors, history_len, pred_horizon):
    """Build per-station feature and target matrices from a sliding window.

    For each station and each anchor row ``i`` the feature vector is
    ``[own history, neighbour 0 history, neighbour 1 history, ..., hour/23, dow/6]``
    where each history covers rows ``i - history_len .. i - 1`` and the time
    features are taken at row ``i``. The target is the station's values at
    rows ``i .. i + pred_horizon - 1``.

    Returns:
        ``(features_dict, targets_dict)``, both keyed by station name.
    """
    stamps = pd.to_datetime(df['timestamp'])
    hour_frac = stamps.dt.hour.values / 23.0
    dow_frac = stamps.dt.dayofweek.values / 6.0
    anchors = range(history_len, len(df) - pred_horizon + 1)

    features_dict, targets_dict = {}, {}
    for station in station_cols:
        series = df[station].values.astype(float)
        neighbor_series = [df[name].values.astype(float) for name in neighbors[station]]
        rows, labels = [], []
        for anchor in anchors:
            start = anchor - history_len
            # Own history first, then each neighbour's history in turn, then time.
            pieces = [series[start:anchor]]
            pieces.extend(other[start:anchor] for other in neighbor_series)
            pieces.append([hour_frac[anchor], dow_frac[anchor]])
            rows.append(np.concatenate(pieces))
            labels.append(series[anchor:anchor + pred_horizon])
        features_dict[station] = np.stack(rows)
        targets_dict[station] = np.stack(labels)
    return features_dict, targets_dict

# Build the per-station training matrices once for the cleaned training frame.
features_dict, targets_dict = make_features_targets(
    df=train_df,
    station_cols=station_cols,
    neighbors=neighbors,
    history_len=history_len,
    pred_horizon=pred_horizon,
)

# --- FIXED SPLITTING FUNCTION ---
def get_splits(n, val_ratio, holdout_ratio):
    """Return three consecutive slices (train, val, holdout) covering ``range(n)``.

    The hold-out takes the trailing ``holdout_ratio`` share, validation the
    ``val_ratio`` share before it, and training everything earlier.
    """
    train_end = int(n * (1 - val_ratio - holdout_ratio))
    val_end = int(n * (1 - holdout_ratio))
    boundaries = [0, train_end, val_end, n]
    return tuple(slice(lo, hi) for lo, hi in zip(boundaries, boundaries[1:]))

# Chronological train / val / holdout index slices for every station.
splits = {}
for station in station_cols:
    n = features_dict[station].shape[0]
    train_idx, val_idx, holdout_idx = get_splits(n, val_ratio, holdout_ratio)
    splits[station] = dict(train=train_idx, val=val_idx, holdout=holdout_idx)

# --- DATASET CLASS ---
class BikeDataset(Dataset):
    """NumPy feature/target arrays exposed as float32 tensor pairs."""

    def __init__(self, X, y):
        features, targets = torch.from_numpy(X), torch.from_numpy(y)
        self.X = features.float()
        self.y = targets.float()

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

# --- MODEL ---
class MLP(nn.Module):
    """Regressor with two hidden stages (Linear -> BatchNorm1d -> GELU -> Dropout) and a linear head."""

    def __init__(self, in_features, hidden=64, out_features=4, dropout=0.3):
        super().__init__()
        layers = []
        width = in_features
        # Two identical hidden stages; only the first one sees the raw input width.
        for _ in range(2):
            layers += [
                nn.Linear(width, hidden),
                nn.BatchNorm1d(hidden),
                nn.GELU(),
                nn.Dropout(dropout),
            ]
            width = hidden
        layers.append(nn.Linear(hidden, out_features))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stacked layers to ``x``."""
        return self.net(x)

# --- TRAINING LOOP ---
def train_mlp(X_train, y_train, X_val, y_val, in_features, out_features,
              epochs=80, batch_size=32, hidden_dim=64, dropout=0.3,
              lr=1e-3, patience=10, device='cpu', weight_decay=0.0):
    """Train one MLP with Adam + MSE loss and early stopping on validation loss.

    Args:
        X_train, y_train: Training features and targets (NumPy arrays).
        X_val, y_val: Validation features and targets (NumPy arrays).
        in_features, out_features: Input and output widths of the MLP.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        hidden_dim, dropout: Passed to the MLP constructor.
        lr, weight_decay: Adam hyper-parameters.
        patience: Consecutive epochs without validation improvement before stopping.
        device: Device the model and batches are moved to.

    Returns:
        The model (on ``device``) with the weights of the epoch that had the
        lowest mean validation loss. If the validation loss never improved
        (for example the validation set is empty, so the mean is NaN), the
        final weights are kept.
    """
    model = MLP(in_features, hidden_dim, out_features, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()
    train_set = BikeDataset(X_train, y_train)
    val_set = BikeDataset(X_val, y_val)
    # BatchNorm1d raises in training mode on a batch holding a single sample,
    # so drop the trailing batch when it would contain exactly one row.
    drop_last = len(train_set) > batch_size and len(train_set) % batch_size == 1
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=drop_last)
    val_loader = DataLoader(val_set, batch_size=batch_size)
    best_val_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                val_loss = loss_fn(preds, yb)
                val_losses.append(val_loss.item())

        avg_train = np.mean(train_losses)
        avg_val = np.mean(val_losses) if val_losses else float('nan')
        print(f"Epoch {epoch+1:03d} | Train: {avg_train:.4f} | Val: {avg_val:.4f}")
        if avg_val < best_val_loss:
            best_val_loss = avg_val
            # state_dict() returns references to the live tensors (including the
            # BatchNorm running statistics); clone them so later optimizer
            # steps cannot overwrite the saved best epoch.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping.")
                break

    # best_state is still None when no epoch beat +inf (NaN validation loss).
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# --- TRAINING PER STATION ---
# Device used for the per-station fits and the grid search that follow.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# station name -> trained MLP.
# NOTE(review): within this cell only the commented-out training loops below
# write to this dict, so it stays empty unless one of them is re-enabled.
mlp_models = {}
# station name -> mean squared error on that station's hold-out slice
# (likewise only written by the commented-out loops below).
holdout_mse = {}

# for station in station_cols:
#     feat = features_dict[station]
#     targ = targets_dict[station]
#     train_idx, val_idx, holdout_idx = splits[station]["train"], splits[station]["val"], splits[station]["holdout"]
#     X_train, y_train = feat[train_idx], targ[train_idx]
#     X_val, y_val = feat[val_idx], targ[val_idx]
#     X_hold, y_hold = feat[holdout_idx], targ[holdout_idx]

#     print(f"\n--- Training station {station} ---")
#     print("Train samples:", len(X_train), "| Val samples:", len(X_val), "| Holdout samples:", len(X_hold))

#     model = train_mlp(X_train, y_train, X_val, y_val, in_features=X_train.shape[1], out_features=pred_horizon,
#                       epochs=epochs, batch_size=batch_size, hidden_dim=hidden_dim, dropout=dropout_p,
#                       lr=learning_rate, patience=patience, device=device)
#     mlp_models[station] = model.cpu()

#     # Evaluate on holdout
#     model.eval()
#     with torch.no_grad():
#         hold_pred = model(torch.from_numpy(X_hold).float())
#         mse = ((hold_pred.numpy() - y_hold) ** 2).mean()
#         holdout_mse[station] = mse
#     print(f"Holdout MSE for {station}: {mse:.4f}")

# print("Mean Holdout MSE across stations:", np.mean(list(holdout_mse.values())))

# Use a small fixed subset for faster iteration
# selected_stations = [
#     'LIDL BEŽIGRAD',
#     'CITYPARK',
# ]


# for station in station_cols:
#     if station not in selected_stations:
#         continue  # Skip stations not in the selected subset

#     feat = features_dict[station]
#     targ = targets_dict[station]
#     train_idx, val_idx, holdout_idx = splits[station]["train"], splits[station]["val"], splits[station]["holdout"]
#     X_train, y_train = feat[train_idx], targ[train_idx]
#     X_val, y_val = feat[val_idx], targ[val_idx]
#     X_hold, y_hold = feat[holdout_idx], targ[holdout_idx]

#     print(f"\n--- Training station {station} ---")
#     print("Train samples:", len(X_train), "| Val samples:", len(X_val), "| Holdout samples:", len(X_hold))

#     model = train_mlp(X_train, y_train, X_val, y_val, in_features=X_train.shape[1], out_features=pred_horizon,
#                       epochs=epochs, batch_size=batch_size, hidden_dim=hidden_dim, dropout=dropout_p,
#                       lr=learning_rate, patience=patience, device=device)
#     mlp_models[station] = model.cpu()

#     # Evaluate on holdout
#     model.eval()
#     with torch.no_grad():
#         hold_pred = model(torch.from_numpy(X_hold).float())
#         mse = ((hold_pred.numpy() - y_hold) ** 2).mean()
#         holdout_mse[station] = mse
#     print(f"Holdout MSE for {station}: {mse:.4f}")


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")


## Grid search

In [None]:
from itertools import product
import random

# --- Define parameter grid ---
hidden_dims = [32, 48, 64]
dropouts = [0.15, 0.2, 0.25]
lrs = [0.001, 0.0007]
weight_decays = [0.0, 1e-5, 5e-5]
batch_sizes = [32]

GRID_SEED = 42      # fixed seed so the sampled configurations are the same on every run
N_GRID_TRIALS = 20  # number of configurations tried per station

grid = list(product(hidden_dims, dropouts, lrs, weight_decays, batch_sizes))
# Private RNG: reproducible subset without touching the global random state.
# min() keeps this valid if the grid is ever shrunk below N_GRID_TRIALS.
grid = random.Random(GRID_SEED).sample(grid, min(N_GRID_TRIALS, len(grid)))
selected_stations = ['LIDL BEŽIGRAD', 'CITYPARK']
results = {}

# --- Grid search per selected station ---
# NOTE(review): the winning configuration is chosen by hold-out MSE, so the
# reported "best holdout MSE" is an optimistic estimate; consider selecting on
# the validation slice and reporting the hold-out score only once.
for station in selected_stations:
    print(f"\n### Grid search for {station} ###")
    feat = features_dict[station]
    targ = targets_dict[station]
    train_idx, val_idx, holdout_idx = splits[station]["train"], splits[station]["val"], splits[station]["holdout"]
    X_train, y_train = feat[train_idx], targ[train_idx]
    X_val, y_val = feat[val_idx], targ[val_idx]
    X_hold, y_hold = feat[holdout_idx], targ[holdout_idx]

    best_mse = float('inf')
    best_config = None
    best_model = None

    for hdim, drop, lr, wd, bs in grid:
        print(f"\nTrying hidden={hdim}, dropout={drop}, lr={lr}, weight_decay={wd}, batch_size={bs}")
        model = train_mlp(
            X_train, y_train, X_val, y_val,
            in_features=X_train.shape[1], out_features=pred_horizon,
            epochs=epochs, batch_size=bs,
            hidden_dim=hdim, dropout=drop,
            lr=lr, patience=patience, device=device,
            weight_decay=wd
        )
        model.eval()
        with torch.no_grad():
            X_hold_tensor = torch.from_numpy(X_hold).float().to(device)
            hold_pred = model(X_hold_tensor).cpu()
            # Named `hold_mse` so the cell does not leave a float called `mse`
            # behind that would shadow a function of the same name.
            hold_mse = ((hold_pred.numpy() - y_hold) ** 2).mean()

        print(f"Holdout MSE: {hold_mse:.4f}")
        if hold_mse < best_mse:
            best_mse = hold_mse
            best_config = (hdim, drop, lr, wd, bs)
            best_model = model.cpu()

    if best_config is None:
        # Every trial produced a NaN hold-out score (or the grid was empty).
        print(f"\nNo usable configuration found for {station}; skipping.")
        continue

    results[station] = {
        "best_mse": best_mse,
        "best_config": {
            "hidden_dim": best_config[0],
            "dropout": best_config[1],
            "learning_rate": best_config[2],
            "weight_decay": best_config[3],
            "batch_size": best_config[4]
        },
        "model": best_model
    }

    print(f"\n✅ Best config for {station}: {results[station]['best_config']}")
    print(f"📉 Best holdout MSE: {best_mse:.4f}")



### Grid search for LIDL BEŽIGRAD ###

Trying hidden=64, dropout=0.15, lr=0.0007, weight_decay=1e-05, batch_size=32
Epoch 001 | Train: 22.9553 | Val: 13.8325
Epoch 002 | Train: 11.9611 | Val: 12.7888
Epoch 003 | Train: 11.5383 | Val: 12.9310
Epoch 004 | Train: 11.2091 | Val: 12.9310
Epoch 005 | Train: 11.0979 | Val: 12.6823
Epoch 006 | Train: 11.0463 | Val: 12.7624
Epoch 007 | Train: 10.7517 | Val: 12.9262
Epoch 008 | Train: 10.5677 | Val: 12.7931
Epoch 009 | Train: 10.5151 | Val: 13.1522
Epoch 010 | Train: 10.4811 | Val: 13.1094
Epoch 011 | Train: 10.3638 | Val: 12.9281
Epoch 012 | Train: 10.3393 | Val: 13.4025
Epoch 013 | Train: 10.1763 | Val: 13.0831
Epoch 014 | Train: 10.2087 | Val: 13.0782
Epoch 015 | Train: 9.9839 | Val: 12.9553
Early stopping.
Holdout MSE: 11.7827

Trying hidden=48, dropout=0.15, lr=0.001, weight_decay=1e-05, batch_size=32
Epoch 001 | Train: 21.7702 | Val: 13.5399
Epoch 002 | Train: 12.0815 | Val: 13.1615
Epoch 003 | Train: 11.6832 | Val: 12.9949
Epoch 004 | Tr

In [None]:
# --- PREDICT ON TEST SET ---
# The test frame is walked in strides of (history_len + pred_horizon) rows:
# the first history_len rows of each stride are the model input, the
# following pred_horizon rows are filled wherever a station value is missing.
test_df = pd.read_csv("bicikelj_test.csv")
test_pred = test_df.copy()
test_pred[station_cols] = test_pred[station_cols].astype(str)  # to match LightGBM logic

i = 0
while i < len(test_df):
    window = test_df.iloc[i:i+history_len]
    pred_start = i + history_len
    if pred_start + pred_horizon > len(test_df):
        break
    to_pred = test_df.iloc[pred_start:pred_start+pred_horizon]
    mask = to_pred[station_cols].isnull() | (to_pred[station_cols] == '')

    # Time features come from the first row to predict.
    timestamps = pd.to_datetime(to_pred['timestamp'].iloc[0])
    hour = timestamps.hour / 23.0
    dow = timestamps.dayofweek / 6.0

    for station in station_cols:
        if station not in mlp_models:
            continue
        if not mask[station].any():
            continue

        try:
            own_hist = window[station].astype(float).values
            # Loop variable is not named `nn`: at cell level that would rebind
            # the `torch.nn` alias to a station name.
            nn_hist = [window[neighbor].astype(float).values for neighbor in neighbors[station]]
            nn_hist = np.concatenate(nn_hist) if nn_hist else np.zeros(0)
            f = np.concatenate([own_hist, nn_hist, [hour, dow]])[None, :]
        except (KeyError, ValueError, TypeError):
            # Unknown station/neighbour column or a value that is not numeric.
            continue
        if np.isnan(f).any():
            continue  # Skip if any required input is missing

        model = mlp_models[station]
        model.eval()
        with torch.no_grad():
            pred = model(torch.from_numpy(f).float()).numpy()[0]
        for h in range(pred_horizon):
            if mask[station].iloc[h]:
                test_pred.loc[pred_start + h, station] = pred[h]
    i += history_len + pred_horizon

# --- EXPORT PREDICTIONS ---
# Keep only the rows that originally had a missing station value and put a
# row repeating the column names in front of them.
missing_cells = test_df[station_cols].isnull() | (test_df[station_cols] == '')
rows_with_preds = missing_cells.any(axis=1)
header = pd.DataFrame([test_df.columns], columns=test_df.columns)
predicted_rows = test_pred[rows_with_preds]
final = pd.concat([header, predicted_rows], ignore_index=True)
final.to_csv("mlp_with_neighbors.csv", index=False, header=False)
print("Predictions saved to mlp_with_neighbors.csv")


Predictions saved to mlp_with_neighbors.csv


In [None]:
import re

def safe_filename(name):
    """Return `name` with every character other than word characters, '-', '_' and '.' replaced by '_'."""
    unsafe_chars = re.compile(r'[^\w\-_.]')
    return unsafe_chars.sub('_', name)

import os

# torch.save does not create missing parent directories, so make sure the
# output folder exists before the first checkpoint is written.
os.makedirs("mlp_models", exist_ok=True)

# Write one state_dict file per station, named after the sanitised station name.
for station, model in mlp_models.items():
    safe_name = safe_filename(station)
    path = f"mlp_models/{safe_name}.pt"
    torch.save(model.state_dict(), path)
    print(f"Saved {station} model to {path}")


Saved LIDL BEŽIGRAD model to mlp_models/LIDL_BEŽIGRAD.pt
Saved ŠMARTINSKI PARK model to mlp_models/ŠMARTINSKI_PARK.pt
Saved SAVSKO NASELJE 1-ŠMARTINSKA CESTA model to mlp_models/SAVSKO_NASELJE_1-ŠMARTINSKA_CESTA.pt
Saved ČRNUČE model to mlp_models/ČRNUČE.pt
Saved VILHARJEVA CESTA model to mlp_models/VILHARJEVA_CESTA.pt
Saved MASARYKOVA DDC model to mlp_models/MASARYKOVA_DDC.pt
Saved POGAČARJEV TRG-TRŽNICA model to mlp_models/POGAČARJEV_TRG-TRŽNICA.pt
Saved CANKARJEVA UL.-NAMA model to mlp_models/CANKARJEVA_UL.-NAMA.pt
Saved ANTONOV TRG model to mlp_models/ANTONOV_TRG.pt
Saved PRUŠNIKOVA model to mlp_models/PRUŠNIKOVA.pt
Saved TEHNOLOŠKI PARK model to mlp_models/TEHNOLOŠKI_PARK.pt
Saved KOSEŠKI BAJER model to mlp_models/KOSEŠKI_BAJER.pt
Saved TIVOLI model to mlp_models/TIVOLI.pt
Saved TRŽNICA MOSTE model to mlp_models/TRŽNICA_MOSTE.pt
Saved GRUDNOVO NABREŽJE-KARLOVŠKA C. model to mlp_models/GRUDNOVO_NABREŽJE-KARLOVŠKA_C..pt
Saved LIDL-LITIJSKA CESTA model to mlp_models/LIDL-LITIJSKA_CES

In [None]:
# Recursively archive the mlp_models directory into mlp_models.zip (shell command).
!zip -r mlp_models.zip mlp_models

  adding: mlp_models/ (stored 0%)
  adding: mlp_models/KOPRSKA_ULICA.pt (deflated 12%)
  adding: mlp_models/ANTONOV_TRG.pt (deflated 12%)
  adding: mlp_models/BRATOVŠEVA_PLOŠČAD.pt (deflated 11%)
  adding: mlp_models/HOFER_-_POLJE.pt (deflated 12%)
  adding: mlp_models/BS4-STOŽICE.pt (deflated 11%)
  adding: mlp_models/BAVARSKI_DVOR.pt (deflated 12%)
  adding: mlp_models/RAKOVNIK.pt (deflated 12%)
  adding: mlp_models/GH_ŠENTPETER-NJEGOŠEVA_C..pt (deflated 11%)
  adding: mlp_models/KINO_ŠIŠKA.pt (deflated 11%)
  adding: mlp_models/MIKLOŠIČEV_PARK.pt (deflated 11%)
  adding: mlp_models/ČRNUČE.pt (deflated 11%)
  adding: mlp_models/POVŠETOVA_-_KAJUHOVA.pt (deflated 11%)
  adding: mlp_models/PREŠERNOV_TRG-PETKOVŠKOVO_NABREŽJE.pt (deflated 11%)
  adding: mlp_models/KONGRESNI_TRG-ŠUBIČEVA_ULICA.pt (deflated 11%)
  adding: mlp_models/SREDNJA_FRIZERSKA_ŠOLA.pt (deflated 11%)
  adding: mlp_models/ALEJA_-_CELOVŠKA_CESTA.pt (deflated 11%)
  adding: mlp_models/POLJANSKA-POTOČNIKOVA.pt (deflated 1

# Transformer

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset, ConcatDataset
from sklearn.metrics.pairwise import haversine_distances
from itertools import product
import random

# --- PARAMETERS ---
history_len = 48      # number of past rows fed to the model
pred_horizon = 4      # number of future rows predicted per sample
k_neighbors = 2       # nearest stations added as extra input channels
val_ratio = 0.15
holdout_ratio = 0.15
epochs = 50
patience = 8          # early-stopping patience (epochs without validation improvement)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

selected_stations = ['LIDL BEŽIGRAD', 'CITYPARK']  # use any subset of station names

# --- HYPERPARAMETER GRID ---
hidden_dims = [32, 64, 128]
dropouts = [0.1, 0.2]
lrs = [1e-3, 5e-4]
weight_decays = [0.0, 1e-5]
batch_sizes = [32, 64]
n_heads_list = [2, 4]
num_layers_list = [1, 2]

# Random search: shuffle the full Cartesian product and keep the first few configs.
grid = list(product(hidden_dims, dropouts, lrs, weight_decays, batch_sizes, n_heads_list, num_layers_list))
random.shuffle(grid)
num_trials = 10  # limit to 10 random configs
grid = grid[:num_trials]

# --- LOAD DATA ---
train_df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = train_df.columns[1:]  # every column after the first one is a station
for col in station_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form, which
# emits a FutureWarning on current pandas.
train_df[station_cols] = train_df[station_cols].ffill().bfill()
train_df = train_df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# neighbors[name] lists the k_neighbors closest other stations by haversine
# distance (scaled by 6371 to get km), nearest first.
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    nn_idx = [j for j in order if j != i][:k_neighbors]
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- DATASET ---
class BicikeljDataset(Dataset):
    """Sliding-window samples for one or more stations.

    Each sample is a pair ``(seq, target)``:

    * ``seq`` has shape ``(history_len, 1 + len(neighbors[station]) + 2)`` and holds,
      per time step, the station's own value, its neighbours' values, the hour
      divided by 23 and the day of week divided by 6;
    * ``target`` holds the station's next ``pred_horizon`` values.

    Args:
        df: frame with a ``timestamp`` column and one numeric column per station.
        station_cols: station column names (defines the column order).
        neighbors: mapping station name -> list of neighbour station names.
        history_len: number of input time steps per sample.
        pred_horizon: number of target time steps per sample.
        station_filter: stations to build samples for; ``None`` means every
            station in ``station_cols`` (previously ``None`` raised a TypeError).
    """

    def __init__(self, df, station_cols, neighbors, history_len, pred_horizon, station_filter=None):
        self.samples = []
        timestamps = pd.to_datetime(df['timestamp'])
        hours = (timestamps.dt.hour / 23.0).values
        dows = (timestamps.dt.dayofweek / 6.0).values
        bikes = df[station_cols].values.astype(float)
        name_to_idx = {name: i for i, name in enumerate(station_cols)}

        if station_filter is None:
            station_filter = list(station_cols)

        for s_name in station_filter:
            s_idx = name_to_idx[s_name]
            nbr_idx = [name_to_idx[nbr] for nbr in neighbors[s_name]]
            # Build the full [T, D] feature matrix once per station; every window is
            # then a cheap slice instead of a per-timestep Python list.
            feats = np.column_stack([bikes[:, [s_idx] + nbr_idx], hours, dows])
            for i in range(history_len, len(df) - pred_horizon + 1):
                seq = feats[i - history_len:i]
                target = bikes[i:i + pred_horizon, s_idx]
                self.samples.append((seq, target))

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- MODEL ---
class TimeSeriesTransformer(nn.Module):
    """Transformer encoder over a feature sequence, mean-pooled over time.

    The input of shape (batch, seq_len, input_dim) is projected to ``hidden_dim``,
    passed through ``num_layers`` encoder layers (batch-first), averaged over the
    sequence dimension and mapped to ``output_dim`` values by a
    LayerNorm -> ReLU -> Linear head.
    """

    def __init__(self, input_dim, hidden_dim, n_heads, num_layers, output_dim, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, hidden_dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=hidden_dim,
                nhead=n_heads,
                dim_feedforward=hidden_dim*4,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        head_layers = [
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        encoded = self.transformer(self.input_proj(x))
        pooled = encoded.mean(dim=1)  # average over the sequence dimension
        return self.fc(pooled)

# --- TRAIN FUNCTION ---
def train_transformer(train_loader, val_loader, input_dim, output_dim, hidden_dim, n_heads, num_layers,
                      dropout, lr, weight_decay, device):
    """Train a TimeSeriesTransformer with early stopping and return it with its best weights.

    Trains with Adam on MSE for up to the module-level ``epochs`` epochs. After
    every epoch the mean validation batch loss is computed; training stops once
    it has failed to improve for ``patience`` (module-level) consecutive epochs.

    Args:
        train_loader: iterable of ``(x, y)`` training batches.
        val_loader: iterable of ``(x, y)`` validation batches.
        input_dim, output_dim, hidden_dim, n_heads, num_layers, dropout:
            forwarded to ``TimeSeriesTransformer``.
        lr, weight_decay: Adam hyperparameters.
        device: device the model and batches are moved to.

    Returns:
        The model carrying the weights of the epoch with the lowest validation
        loss. If no epoch ever improved (e.g. an empty ``val_loader``), the model
        is returned with its last-epoch weights.
    """
    model = TimeSeriesTransformer(input_dim, hidden_dim, n_heads, num_layers, output_dim, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()
    best_val_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()

        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                loss = loss_fn(preds, yb)
                val_losses.append(loss.item())

        # An empty validation loader counts as "no improvement" instead of NaN.
        avg_val = np.mean(val_losses) if val_losses else float('inf')
        if avg_val < best_val_loss:
            best_val_loss = avg_val
            # state_dict() returns references to the live parameter tensors, so the
            # snapshot must be cloned or later optimizer steps would overwrite it.
            best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# --- BUILD DATASET ---
datasets = [BicikeljDataset(train_df, station_cols, neighbors, history_len, pred_horizon, station_filter=[s]) for s in selected_stations]
full_dataset = ConcatDataset(datasets)
total = len(full_dataset)

# Split every station's samples chronologically and only then concatenate.
# Slicing range(0, train_end) over the concatenated dataset is not a time split:
# the samples are ordered station by station, so the training slice would cover
# the full timeline of the first station(s) while validation and holdout would
# come only from the tail of the last station.
train_parts, val_parts, holdout_parts = [], [], []
for station_ds in datasets:
    n_station = len(station_ds)
    station_train_end = int(n_station * (1 - val_ratio - holdout_ratio))
    station_val_end = int(n_station * (1 - holdout_ratio))
    train_parts.append(Subset(station_ds, range(0, station_train_end)))
    val_parts.append(Subset(station_ds, range(station_train_end, station_val_end)))
    holdout_parts.append(Subset(station_ds, range(station_val_end, n_station)))

train_set = ConcatDataset(train_parts)
val_set = ConcatDataset(val_parts)
holdout_set = ConcatDataset(holdout_parts)

# Kept for backward compatibility: cumulative sample counts of the splits.
train_end = len(train_set)
val_end = train_end + len(val_set)

# --- DIMENSIONS ---
x_sample, y_sample = full_dataset[0]
input_dim = x_sample.shape[1]   # features per time step
output_dim = y_sample.shape[0]  # prediction horizon

# --- GRID SEARCH ---
# One training run per sampled configuration; each run is scored by the mean
# per-batch MSE on the holdout loader.
results = []
for i, (hd, dp, lr, wd, bs, nh, nl) in enumerate(grid):
    print(f"\nTrial {i+1}/{len(grid)}: hidden={hd}, dropout={dp}, lr={lr}, weight_decay={wd}, batch_size={bs}, heads={nh}, layers={nl}")

    train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=bs)
    holdout_loader = DataLoader(holdout_set, batch_size=bs)

    model = train_transformer(train_loader, val_loader, input_dim, output_dim, hd, nh, nl, dp, lr, wd, device)

    # Evaluate
    model.eval()
    with torch.no_grad():
        holdout_losses = [
            nn.functional.mse_loss(model(batch_x.to(device)), batch_y.to(device)).item()
            for batch_x, batch_y in holdout_loader
        ]
    mse = np.mean(holdout_losses)
    print(f"Holdout MSE: {mse:.4f}")

    results.append(dict(
        hidden_dim=hd,
        dropout=dp,
        lr=lr,
        weight_decay=wd,
        batch_size=bs,
        n_heads=nh,
        num_layers=nl,
        holdout_mse=mse,
    ))

# --- RESULTS ---
# Lowest holdout MSE first.
results = sorted(results, key=lambda r: r['holdout_mse'])
best_result = results[0]
print("\nBest Config:")
print(best_result)


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")



Trial 1/10: hidden=128, dropout=0.1, lr=0.001, weight_decay=0.0, batch_size=32, heads=4, layers=1
Holdout MSE: 27.3257

Trial 2/10: hidden=128, dropout=0.2, lr=0.001, weight_decay=1e-05, batch_size=64, heads=2, layers=2
Holdout MSE: 23.8305

Trial 3/10: hidden=32, dropout=0.1, lr=0.0005, weight_decay=1e-05, batch_size=32, heads=4, layers=2
Holdout MSE: 28.1930

Trial 4/10: hidden=128, dropout=0.2, lr=0.001, weight_decay=0.0, batch_size=64, heads=4, layers=2
Holdout MSE: 25.2420

Trial 5/10: hidden=128, dropout=0.1, lr=0.0005, weight_decay=1e-05, batch_size=32, heads=4, layers=2
Holdout MSE: 25.6478

Trial 6/10: hidden=64, dropout=0.1, lr=0.0005, weight_decay=1e-05, batch_size=32, heads=4, layers=2
Holdout MSE: 29.4097

Trial 7/10: hidden=64, dropout=0.1, lr=0.001, weight_decay=0.0, batch_size=32, heads=2, layers=2
Holdout MSE: 27.7303

Trial 8/10: hidden=64, dropout=0.2, lr=0.0005, weight_decay=0.0, batch_size=64, heads=4, layers=1
Holdout MSE: 25.4549

Trial 9/10: hidden=128, dropout

# LSTM Per station

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances
from itertools import product
import random

# --- PARAMETERS ---
history_len = 48      # number of past rows fed to the model
pred_horizon = 4      # number of future rows predicted per sample
k_neighbors = 2       # nearest stations added as extra input channels
val_ratio = 0.15
holdout_ratio = 0.15
epochs = 50
patience = 8          # early-stopping patience (epochs without validation improvement)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

selected_stations = ['LIDL BEŽIGRAD', 'CITYPARK']

# --- HYPERPARAMETER GRID ---
hidden_dims = [32, 64, 128]
dropouts = [0.2, 0.3]
lrs = [1e-3, 5e-4]
weight_decays = [0.0, 1e-5]
batch_sizes = [32, 64]

# Random search: shuffle the full Cartesian product and keep the first 10 configs.
grid = list(product(hidden_dims, dropouts, lrs, weight_decays, batch_sizes))
random.shuffle(grid)
grid = grid[:10]  # Try only 10 configurations

# --- LOAD DATA ---
train_df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = train_df.columns[1:]  # every column after the first one is a station
for col in station_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form, which
# emits a FutureWarning on current pandas.
train_df[station_cols] = train_df[station_cols].ffill().bfill()
train_df = train_df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- NEIGHBOR DETECTION ---
# neighbors[name] lists the k_neighbors closest other stations by haversine
# distance (scaled by 6371 to get km), nearest first.
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    nn_idx = [j for j in order if j != i][:k_neighbors]
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- DATASET CLASS ---
class BicikeljDataset(Dataset):
    """Sliding-window samples for a single station.

    Each sample is a pair ``(seq, target)``:

    * ``seq`` has shape ``(history_len, 1 + len(neighbors[station_name]) + 2)`` and
      holds, per time step, the station's own value, its neighbours' values, the
      hour divided by 23 and the day of week divided by 6;
    * ``target`` holds the station's next ``pred_horizon`` values.

    Args:
        df: frame with a ``timestamp`` column and one numeric column per station.
        station_name: station the samples are built for.
        station_cols: station column names (defines the column order).
        neighbors: mapping station name -> list of neighbour station names.
        history_len: number of input time steps per sample.
        pred_horizon: number of target time steps per sample.
    """

    def __init__(self, df, station_name, station_cols, neighbors, history_len, pred_horizon):
        self.samples = []
        timestamps = pd.to_datetime(df['timestamp'])
        hours = (timestamps.dt.hour / 23.0).values
        dows = (timestamps.dt.dayofweek / 6.0).values
        bikes = df[station_cols].values.astype(float)
        name_to_idx = {name: i for i, name in enumerate(station_cols)}

        s_idx = name_to_idx[station_name]
        nbr_idx = [name_to_idx[nbr] for nbr in neighbors[station_name]]

        # Build the full [T, D] feature matrix once; every window is then a cheap
        # slice instead of a per-timestep Python list (same approach as the
        # shared-model dataset further down in this notebook).
        feats = np.column_stack([bikes[:, [s_idx] + nbr_idx], hours, dows])

        for i in range(history_len, len(df) - pred_horizon + 1):
            seq = feats[i - history_len:i]
            target = bikes[i:i + pred_horizon, s_idx]
            self.samples.append((seq, target))

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- LSTM MODEL ---
class LSTMForecast(nn.Module):
    """LSTM followed by dropout and a linear head on the final hidden state."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        # nn.LSTM returns (output, (h_n, c_n)); only the final hidden state is used.
        final_hidden = self.lstm(x)[1][0]
        last_layer_state = final_hidden[-1]
        return self.fc(self.dropout(last_layer_state))

# --- TRAINING FUNCTION ---
def train_lstm(train_loader, val_loader, input_dim, output_dim, hidden_dim, dropout, lr, weight_decay):
    """Train an LSTMForecast with early stopping and return it with its best weights.

    Trains with Adam on MSE on the module-level ``device`` for up to the
    module-level ``epochs`` epochs. After every epoch the mean validation batch
    loss is computed; training stops once it has failed to improve for
    ``patience`` (module-level) consecutive epochs.

    Args:
        train_loader: iterable of ``(x, y)`` training batches.
        val_loader: iterable of ``(x, y)`` validation batches.
        input_dim, hidden_dim, output_dim, dropout: forwarded to ``LSTMForecast``.
        lr, weight_decay: Adam hyperparameters.

    Returns:
        The model carrying the weights of the epoch with the lowest validation
        loss. If no epoch ever improved (e.g. an empty ``val_loader``), the model
        is returned with its last-epoch weights.
    """
    model = LSTMForecast(input_dim, hidden_dim, output_dim, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()
    best_val_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()

        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                loss = loss_fn(preds, yb)
                val_losses.append(loss.item())

        # An empty validation loader counts as "no improvement" instead of NaN.
        val_loss = np.mean(val_losses) if val_losses else float('inf')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameter tensors, so the
            # snapshot must be cloned or later optimizer steps would overwrite it.
            best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# --- PER STATION LOOP ---
results = {}
for station in selected_stations:
    print(f"\n### Grid search for {station} ###")
    dataset = BicikeljDataset(train_df, station, station_cols, neighbors, history_len, pred_horizon)
    total = len(dataset)
    train_end = int(total * (1 - val_ratio - holdout_ratio))
    val_end = int(total * (1 - holdout_ratio))
    train_set = torch.utils.data.Subset(dataset, range(0, train_end))
    val_set = torch.utils.data.Subset(dataset, range(train_end, val_end))
    holdout_set = torch.utils.data.Subset(dataset, range(val_end, total))

    sample_x, sample_y = dataset[0]
    input_dim = sample_x.shape[1]
    output_dim = sample_y.shape[0]

    best_mse = float('inf')
    best_config = None
    best_model = None

    for hdim, drop, lr, wd, bs in grid:
        print(f"\nTrying hidden={hdim}, dropout={drop}, lr={lr}, wd={wd}, bs={bs}")
        train_loader = DataLoader(train_set, batch_size=bs, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=bs)
        holdout_loader = DataLoader(holdout_set, batch_size=bs)

        model = train_lstm(train_loader, val_loader, input_dim, output_dim, hdim, drop, lr, wd)

        # Evaluate on holdout
        model.eval()
        holdout_losses = []
        with torch.no_grad():
            for xb, yb in holdout_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                loss = nn.functional.mse_loss(preds, yb)
                holdout_losses.append(loss.item())
        mse = np.mean(holdout_losses)
        print(f"Holdout MSE: {mse:.4f}")

        if mse < best_mse:
            best_mse = mse
            best_config = (hdim, drop, lr, wd, bs)
            best_model = model.cpu()

    results[station] = {
        "best_mse": best_mse,
        "best_config": {
            "hidden_dim": best_config[0],
            "dropout": best_config[1],
            "lr": best_config[2],
            "weight_decay": best_config[3],
            "batch_size": best_config[4],
        },
        "model": best_model
    }

    print(f"\nBest config for {station}: {results[station]['best_config']}")
    print(f"Best holdout MSE: {best_mse:.4f}")


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")



### Grid search for LIDL BEŽIGRAD ###

Trying hidden=64, dropout=0.3, lr=0.001, wd=1e-05, bs=32
Holdout MSE: 11.0719

Trying hidden=32, dropout=0.2, lr=0.0005, wd=1e-05, bs=64
Holdout MSE: 10.5347

Trying hidden=64, dropout=0.3, lr=0.001, wd=0.0, bs=32
Holdout MSE: 10.9735

Trying hidden=32, dropout=0.3, lr=0.0005, wd=0.0, bs=64
Holdout MSE: 10.3531

Trying hidden=128, dropout=0.3, lr=0.001, wd=0.0, bs=32
Holdout MSE: 11.4727

Trying hidden=128, dropout=0.2, lr=0.001, wd=1e-05, bs=64
Holdout MSE: 11.7817

Trying hidden=32, dropout=0.3, lr=0.001, wd=0.0, bs=64
Holdout MSE: 10.5880

Trying hidden=32, dropout=0.2, lr=0.001, wd=0.0, bs=64
Holdout MSE: 10.6158

Trying hidden=128, dropout=0.3, lr=0.001, wd=1e-05, bs=32
Holdout MSE: 11.1964

Trying hidden=32, dropout=0.3, lr=0.001, wd=1e-05, bs=64
Holdout MSE: 10.3310

Best config for LIDL BEŽIGRAD: {'hidden_dim': 32, 'dropout': 0.3, 'lr': 0.001, 'weight_decay': 1e-05, 'batch_size': 64}
Best holdout MSE: 10.3310

### Grid search for CITYPARK

# LSTM Shared

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances
from tqdm import tqdm

# --- Hyperparameters ---
HISTORY_LEN = 48      # number of past rows fed to the model
PRED_HORIZON = 4      # number of future rows predicted per sample
K_NEIGHBORS = 2       # nearest stations added as extra input channels
HIDDEN_DIM = 32
DROPOUT = 0.3
LR = 0.001
WEIGHT_DECAY = 1e-5
BATCH_SIZE = 64
EPOCHS = 50
PATIENCE = 8          # early-stopping patience (epochs without validation improvement)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load data ---
df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = df.columns[1:]  # every column after the first one is a station

for col in station_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form, which
# emits a FutureWarning on current pandas.
df[station_cols] = df[station_cols].ffill().bfill()
df = df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- Neighbors ---
# neighbors[name] lists the K_NEIGHBORS closest other stations by haversine
# distance (scaled by 6371 to get km), nearest first.
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    nn_idx = [j for j in order if j != i][:K_NEIGHBORS]
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- Dataset ---
class SharedLSTMDataset(Dataset):
    """Sliding-window samples for every station, stored station by station.

    Each sample is ``(seq, target)`` where ``seq`` is a ``history_len``-row window
    of [own value, neighbour values, hour / 23, day-of-week / 6] and ``target``
    is the station's next ``pred_horizon`` values. ``station_indices[k]`` is the
    station name sample ``k`` belongs to.
    """

    def __init__(self, df, station_cols, neighbors, history_len, pred_horizon):
        self.samples = []
        self.station_indices = []

        stamps = pd.to_datetime(df['timestamp'])
        bikes = df[station_cols].values.astype(np.float32)
        name_to_idx = {name: pos for pos, name in enumerate(station_cols)}
        N = len(df)

        # Time features shared by all stations: one [T, 2] block.
        time_feats = np.column_stack((
            (stamps.dt.hour / 23.0).values,
            (stamps.dt.dayofweek / 6.0).values,
        ))

        # End-exclusive positions of every history window that still leaves
        # pred_horizon rows for the target.
        window_ends = range(history_len, N - pred_horizon + 1)

        for s_name in station_cols:
            own_col = name_to_idx[s_name]
            feature_cols = [own_col] + [name_to_idx[nbr] for nbr in neighbors[s_name]]
            all_feats = np.hstack((bikes[:, feature_cols], time_feats))  # [T, D]

            self.samples.extend(
                (all_feats[end - history_len:end], bikes[end:end + pred_horizon, own_col])
                for end in window_ends
            )
            self.station_indices.extend([s_name] * len(window_ends))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        seq, target = self.samples[idx]
        return torch.tensor(seq, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

# --- Model ---
class LSTMForecast(nn.Module):
    """LSTM encoder whose final hidden state is passed through dropout and a linear layer."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        _outputs, states = self.lstm(x)
        hidden_state = states[0]            # h_n; the cell state is not used
        summary = self.dropout(hidden_state[-1])
        return self.fc(summary)

# --- Train ---
def train_lstm(model, train_loader, val_loader, device):
    """Train ``model`` with early stopping and return it with its best weights.

    Uses Adam (module-level ``LR`` / ``WEIGHT_DECAY``) on MSE for up to ``EPOCHS``
    epochs, printing the mean validation batch loss after each one. Training
    stops once that loss has failed to improve for ``PATIENCE`` consecutive epochs.

    Args:
        model: the module to train; it is moved to ``device``.
        train_loader: iterable of ``(x, y)`` training batches.
        val_loader: iterable of ``(x, y)`` validation batches.
        device: device the model and batches are moved to.

    Returns:
        The model carrying the weights of the epoch with the lowest validation
        loss. If no epoch ever improved (e.g. an empty ``val_loader``), the model
        is returned with its last-epoch weights.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = nn.MSELoss()
    best_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(EPOCHS):
        model.train()
        batch_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Training]", leave=False)
        for xb, yb in batch_bar:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            batch_bar.set_postfix(loss=loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                val_losses.append(criterion(pred, yb).item())

        # An empty validation loader counts as "no improvement" instead of NaN.
        val_loss = np.mean(val_losses) if val_losses else float('inf')
        print(f"Epoch {epoch+1}: Val Loss = {val_loss:.4f}")

        if val_loss < best_loss:
            best_loss = val_loss
            # state_dict() returns references to the live parameter tensors, so the
            # snapshot must be cloned or later optimizer steps would overwrite it.
            best_state = {key: value.detach().clone() for key, value in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"Early stopping at epoch {epoch+1}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return model


# --- Data splitting ---
dataset = SharedLSTMDataset(df, station_cols, neighbors, HISTORY_LEN, PRED_HORIZON)
N = len(dataset)

# SharedLSTMDataset appends samples station by station, each station's windows in
# time order. Slicing range(0, train_size) over the whole dataset would therefore
# split by *station* (first stations train, last stations val/holdout) instead of
# by time. Each station's block is split chronologically here instead.
n_stations = len(station_cols)
windows_per_station = N // n_stations if n_stations else 0
assert windows_per_station * n_stations == N, "expected the same number of windows for every station"

station_val = int(0.15 * windows_per_station)
station_holdout = int(0.15 * windows_per_station)
station_train = windows_per_station - station_val - station_holdout

train_idx, val_idx, holdout_idx = [], [], []
for station_pos in range(n_stations):
    base = station_pos * windows_per_station
    train_idx.extend(range(base, base + station_train))
    val_idx.extend(range(base + station_train, base + station_train + station_val))
    holdout_idx.extend(range(base + station_train + station_val, base + windows_per_station))

# Totals across all stations (same names as before for downstream use).
val_size = len(val_idx)
holdout_size = len(holdout_idx)
train_size = len(train_idx)

train_set = torch.utils.data.Subset(dataset, train_idx)
val_set = torch.utils.data.Subset(dataset, val_idx)
holdout_set = torch.utils.data.Subset(dataset, holdout_idx)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)
holdout_loader = DataLoader(holdout_set, batch_size=BATCH_SIZE)

# --- Train model ---
input_dim = 1 + K_NEIGHBORS + 2  # station + neighbors + time features
output_dim = PRED_HORIZON

model = LSTMForecast(input_dim, HIDDEN_DIM, output_dim, DROPOUT)
model = train_lstm(model, train_loader, val_loader, DEVICE)

# --- Evaluate on holdout ---
# Predictions are stacked in holdout_loader order: station by station, each
# station's holdout windows in time order.
model.eval()
holdout_preds = []
with torch.no_grad():
    for xb, yb in holdout_loader:
        xb = xb.to(DEVICE)
        pred = model(xb).cpu().numpy()
        holdout_preds.append(pred)

holdout_preds = np.vstack(holdout_preds)
np.save("shared_lstm_holdout_preds.npy", holdout_preds)
print("Saved shared model predictions to 'shared_lstm_holdout_preds.npy'")


  df[station_cols] = df[station_cols].fillna(method="ffill").fillna(method="bfill")


Epoch 1: Val Loss = 11.8076




Epoch 2: Val Loss = 11.5019




Epoch 3: Val Loss = 11.6014




Epoch 4: Val Loss = 11.3264




Epoch 5: Val Loss = 11.1160




Epoch 6: Val Loss = 11.4553




Epoch 7: Val Loss = 11.0925




Epoch 8: Val Loss = 11.1123




Epoch 9: Val Loss = 11.1308




Epoch 10: Val Loss = 11.0821




Epoch 11: Val Loss = 11.2118




Epoch 12: Val Loss = 11.2074




Epoch 13: Val Loss = 11.0327




Epoch 14: Val Loss = 11.0280




Epoch 15: Val Loss = 10.9593




Epoch 16: Val Loss = 10.9804




Epoch 17: Val Loss = 11.0089




Epoch 18: Val Loss = 10.9491




Epoch 19: Val Loss = 11.0127




Epoch 20: Val Loss = 11.0551




Epoch 21: Val Loss = 10.9530




Epoch 22: Val Loss = 11.0295




Epoch 23: Val Loss = 11.0662




Epoch 24: Val Loss = 11.0872




Epoch 25: Val Loss = 10.8617




Epoch 26: Val Loss = 10.8690




Epoch 27: Val Loss = 10.9643




Epoch 28: Val Loss = 10.9061




Epoch 29: Val Loss = 10.8802




Epoch 30: Val Loss = 10.9756




Epoch 31: Val Loss = 10.7911




Epoch 32: Val Loss = 10.9400




Epoch 33: Val Loss = 10.9721




Epoch 34: Val Loss = 10.7299




Epoch 35: Val Loss = 10.9597




Epoch 36: Val Loss = 10.9000




Epoch 37: Val Loss = 10.9011




Epoch 38: Val Loss = 10.9743




Epoch 39: Val Loss = 10.9573




Epoch 40: Val Loss = 11.0736




Epoch 41: Val Loss = 11.0157




Epoch 42: Val Loss = 10.9561
Early stopping at epoch 42
Saved shared model predictions to 'shared_lstm_holdout_preds.npy'


In [None]:
# Persist the parameters (state_dict only, not the module object) of the current `model`.
torch.save(model.state_dict(), "lstm_model_added_features.pt")


In [None]:
# Recreate the same architecture from the configuration constants instead of
# repeating their values as literals, then load the saved weights.
model = LSTMForecast(
    input_dim=1 + K_NEIGHBORS + 2,  # own station + neighbors + 2 time features
    hidden_dim=HIDDEN_DIM,
    output_dim=PRED_HORIZON,
    dropout=DROPOUT,
)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load("lstm_model_added_features.pt", map_location=DEVICE))
model.to(DEVICE)
model.eval()


LSTMForecast(
  (lstm): LSTM(5, 32, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=32, out_features=4, bias=True)
)

In [None]:
# --- Predict only the unknown rows in bicikelj_test.csv using a single shared model ---

# Recreate model and load the saved weights
input_dim = 1 + K_NEIGHBORS + 2  # own station + neighbors + hour + dow
output_dim = PRED_HORIZON

model = LSTMForecast(input_dim, 32, output_dim, 0.3).to(DEVICE)
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only machine.
# NOTE(review): the save cell in this notebook writes "lstm_model_added_features.pt";
# confirm that "lstm_model.pt" is the checkpoint intended here.
model.load_state_dict(torch.load("lstm_model.pt", map_location=DEVICE))
model.eval()

# Load the test set in this cell: it is not defined by the shared-LSTM cells,
# which is what raised "NameError: name 'test_df' is not defined".
test_df = pd.read_csv("bicikelj_test.csv")

# Prepare helper maps and features
# Coerce to numeric so blank / non-numeric cells become NaN instead of failing the cast.
test_feats = test_df[station_cols].apply(pd.to_numeric, errors="coerce").to_numpy(dtype=np.float32)
timestamps = pd.to_datetime(test_df["timestamp"])
hours = (timestamps.dt.hour / 23.0).values
dows = (timestamps.dt.dayofweek / 6.0).values
name_to_idx = {name: i for i, name in enumerate(station_cols)}

# Collect predictions
pred_matrix = np.full_like(test_feats, np.nan)

with torch.no_grad():
    for i in range(HISTORY_LEN, len(test_df) - PRED_HORIZON + 1):
        # Predict only if all stations are missing at the target range
        if not np.isnan(test_feats[i:i + PRED_HORIZON]).all():
            continue

        history = slice(i - HISTORY_LEN, i)
        for station in station_cols:
            s_idx = name_to_idx[station]
            nbr_idx = [name_to_idx[nbr] for nbr in neighbors[station]]

            # Prepare input sequence: [own, neighbors..., hour, dow] per time step
            seq = np.column_stack([
                test_feats[history][:, [s_idx] + nbr_idx],
                hours[history],
                dows[history],
            ])
            seq = torch.tensor(seq[None], dtype=torch.float32).to(DEVICE)

            # Predict and insert into matrix
            pred = model(seq).cpu().numpy().flatten()
            pred_matrix[i:i + PRED_HORIZON, s_idx] = pred

# Build output DataFrame
pred_df = pd.DataFrame(pred_matrix, columns=station_cols)
pred_df.insert(0, "timestamp", test_df["timestamp"])

# Only include rows where predictions were made (i.e. they were originally all NaN)
rows_to_output = test_df[station_cols].isna().all(axis=1)
pred_df_filtered = pred_df[rows_to_output].copy()

# Save final predictions
pred_df_filtered.to_csv("bicikelj_test_predictions_128.csv", index=False)


NameError: name 'test_df' is not defined

# Added holidays

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances
from tqdm import tqdm
import holidays

# --- Hyperparameters ---
HISTORY_LEN = 48      # number of past rows fed to the model
PRED_HORIZON = 4      # number of future rows predicted per sample
K_NEIGHBORS = 2       # nearest stations added as extra input channels
HIDDEN_DIM = 128
DROPOUT = 0.1
LR = 0.001
WEIGHT_DECAY = 1e-5
BATCH_SIZE = 64
EPOCHS = 50
PATIENCE = 8          # early-stopping patience (epochs without validation improvement)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load data ---
df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = df.columns[1:]  # every column after the first one is a station

# Clean and fill NaNs
for col in station_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) form, which
# emits a FutureWarning on current pandas.
df[station_cols] = df[station_cols].ffill().bfill()
df = df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- Neighbors ---
# neighbors[name] lists the K_NEIGHBORS closest other stations by haversine
# distance (scaled by 6371 to get km), nearest first.
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    nn_idx = [j for j in order if j != i][:K_NEIGHBORS]
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- Dataset with time features ---
class SharedLSTMDataset(Dataset):
    """Sliding-window samples for every station with cyclic-hour, weekend and holiday features.

    Each sample is ``(seq, target)`` where ``seq`` is a ``history_len``-row window
    of [own value, neighbour values, sin(hour), cos(hour), is_weekend, is_holiday]
    and ``target`` is the station's next ``pred_horizon`` values. Samples are
    stored station by station; ``station_indices[k]`` is the station name of
    sample ``k``.

    Args:
        df: frame with a ``timestamp`` column and one numeric column per station.
        station_cols: station column names (defines the column order).
        neighbors: mapping station name -> list of neighbour station names.
        history_len: number of input time steps per sample.
        pred_horizon: number of target time steps per sample.
    """

    def __init__(self, df, station_cols, neighbors, history_len, pred_horizon):
        self.samples = []
        self.station_indices = []

        timestamps = pd.to_datetime(df['timestamp'])
        hour_sin = np.sin(2 * np.pi * timestamps.dt.hour / 24)
        hour_cos = np.cos(2 * np.pi * timestamps.dt.hour / 24)
        is_weekend = (timestamps.dt.dayofweek >= 5).astype(float)

        # holidays.Slovenia() created without `years` starts empty and only fills
        # in a year when a date of that year is looked up, so iterating it yields
        # no dates and the holiday flag would always be 0. Populate it explicitly
        # with the years that occur in the data before listing its dates.
        data_years = sorted(int(y) for y in timestamps.dt.year.dropna().unique())
        slo_holidays = holidays.Slovenia(years=data_years)
        is_holiday = timestamps.dt.date.astype(str).isin([str(d) for d in slo_holidays]).astype(float)

        bikes = df[station_cols].values.astype(np.float32)
        name_to_idx = {name: i for i, name in enumerate(station_cols)}
        N = len(df)

        # Precompute extra features
        time_feats = np.stack([hour_sin, hour_cos, is_weekend, is_holiday], axis=1)

        for s_name in station_cols:
            s_idx = name_to_idx[s_name]
            nbr_idx = [name_to_idx[nbr] for nbr in neighbors[s_name]]

            series = bikes[:, [s_idx] + nbr_idx]  # [T, 1 + k_neighbors]
            all_feats = np.concatenate([series, time_feats], axis=1)  # [T, D]

            for i in range(history_len, N - pred_horizon + 1):
                seq = all_feats[i - history_len:i]
                target = bikes[i:i + pred_horizon, s_idx]
                self.samples.append((seq, target))
                self.station_indices.append(s_name)

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- Model ---
class LSTMForecast(nn.Module):
    """Forecast head: LSTM -> final hidden state -> dropout -> linear projection."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        lstm_result = self.lstm(x)
        h_n = lstm_result[1][0]  # final hidden state; outputs and cell state are unused
        regularised = self.dropout(h_n[-1])
        prediction = self.fc(regularised)
        return prediction

# --- Train ---
def train_lstm(model, train_loader, val_loader, device):
    """Train ``model`` with Adam + MSE and early stopping on validation loss.

    Reads the module-level hyperparameters ``LR``, ``WEIGHT_DECAY``,
    ``EPOCHS`` and ``PATIENCE``.

    Args:
        model: network to train; moved to ``device``.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        device: torch device used for the model and the batches.

    Returns:
        The model with the weights of the epoch that had the lowest mean
        validation loss (or the final weights if no epoch ever improved).
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = nn.MSELoss()
    best_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(EPOCHS):
        model.train()
        batch_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Training]", leave=False)
        for xb, yb in batch_bar:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()
            batch_bar.set_postfix(loss=loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                pred = model(xb)
                val_losses.append(criterion(pred, yb).item())

        val_loss = np.mean(val_losses)
        print(f"Epoch {epoch+1}: Val Loss = {val_loss:.4f}")

        if val_loss < best_loss:
            best_loss = val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite. Clone them so the snapshot
            # really is the best epoch.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # best_state stays None when no epoch improved (e.g. NaN validation loss
    # or zero epochs); keep the current weights instead of crashing.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

# --- Data splitting ---
# Contiguous 70/15/15 split over dataset indices.
# NOTE(review): the dataset appends samples station by station, so these
# contiguous ranges follow station order rather than time — confirm intended.
dataset = SharedLSTMDataset(
    df, station_cols, neighbors,
    history_len=HISTORY_LEN, pred_horizon=PRED_HORIZON,
)
N = len(dataset)
val_size = holdout_size = int(0.15 * N)
train_size = N - (val_size + holdout_size)

train_set = torch.utils.data.Subset(dataset, range(train_size))
val_set = torch.utils.data.Subset(dataset, range(train_size, train_size + val_size))
holdout_set = torch.utils.data.Subset(dataset, range(train_size + val_size, N))

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)
holdout_loader = DataLoader(holdout_set, batch_size=BATCH_SIZE)

# --- Train model ---
input_dim = 1 + K_NEIGHBORS + 4  # station + neighbors + 4 time features
output_dim = PRED_HORIZON

model = train_lstm(
    LSTMForecast(input_dim, HIDDEN_DIM, output_dim, DROPOUT),
    train_loader,
    val_loader,
    DEVICE,
)

# --- Evaluate on holdout ---
# Collect one prediction array per batch, then stack them row-wise.
model.eval()
holdout_preds = []
with torch.no_grad():
    for xb, yb in holdout_loader:
        batch_pred = model(xb.to(DEVICE))
        holdout_preds.append(batch_pred.cpu().numpy())

holdout_preds = np.vstack(holdout_preds)
np.save("shared_lstm_holdout_preds.npy", holdout_preds)
print("✅ Saved shared model predictions to 'shared_lstm_holdout_preds.npy'")


  df[station_cols] = df[station_cols].fillna(method="ffill").fillna(method="bfill")


KeyboardInterrupt: 

In [None]:
# Persist only the learned parameters (the state dict), not the module object.
torch.save(obj=model.state_dict(), f="lstm_model_added_features.pt")


In [None]:
# Recreate the same architecture
# input_dim=7 is 1 station + 2 neighbors + 4 time features; output_dim=4 is
# the prediction horizon used by the other cells of this notebook.
modell = LSTMForecast(input_dim=7, hidden_dim=128, output_dim=4, dropout=0.1)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
modell.load_state_dict(torch.load("lstm_model_added_features.pt", map_location=DEVICE))
modell.to(DEVICE)
modell.eval()


LSTMForecast(
  (lstm): LSTM(7, 128, batch_first=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=128, out_features=4, bias=True)
)

In [None]:
# --- Predict only the unknown rows in bicikelj_test.csv using a single shared model ---

import numpy as np
import pandas as pd
import torch
from datetime import datetime
import holidays

# --- Constants ---
HISTORY_LEN = 48   # rows of history fed to the model
PRED_HORIZON = 4   # rows predicted per window
K_NEIGHBORS = 2    # nearest stations used as extra inputs
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Load data ---
test_df = pd.read_csv("bicikelj_test.csv")
meta = pd.read_csv("bicikelj_metadata.csv")
station_cols = test_df.columns[1:]  # every column after the first one

# --- Neighbors ---
from sklearn.metrics.pairwise import haversine_distances
station_names = meta['name'].tolist()
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    # Rank all stations by distance from station i, drop i itself, keep the closest K.
    order = np.argsort(dists[i])
    nn_idx = list(order[order != i][:K_NEIGHBORS])
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- Load model ---
input_dim = 1 + K_NEIGHBORS + 4  # 1 station + 2 neighbors + 4 time features
output_dim = PRED_HORIZON

class LSTMForecast(torch.nn.Module):
    """LSTM encoder whose last hidden state feeds a dropout + linear head."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        # Attribute names must stay lstm/dropout/fc: they are the state-dict keys.
        self.lstm = torch.nn.LSTM(
            input_size=input_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.dropout = torch.nn.Dropout(p=dropout)
        self.fc = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return ``output_dim`` values per batch item for a batch-first sequence."""
        hidden_per_layer = self.lstm(x)[1][0]  # h_n
        top_layer_hidden = hidden_per_layer[-1]
        regularized = self.dropout(top_layer_hidden)
        return self.fc(regularized)

model = LSTMForecast(input_dim, 128, output_dim, 0.1).to(DEVICE)
# map_location lets a checkpoint saved on another device (e.g. a GPU) load here.
model.load_state_dict(torch.load("lstm_model_added_features.pt", map_location=DEVICE))
model.eval()

# --- Prepare features ---
test_feats = test_df[station_cols].values.astype(np.float32)
timestamps = pd.to_datetime(test_df["timestamp"])

hour_sin = np.sin(2 * np.pi * timestamps.dt.hour / 24)
hour_cos = np.cos(2 * np.pi * timestamps.dt.hour / 24)
is_weekend = (timestamps.dt.dayofweek >= 5).astype(float)
# holidays.Slovenia() is filled lazily on lookup; built without `years` it is
# empty when iterated, which made this flag always 0. Request the years in the
# data explicitly.
# NOTE: the holiday flag must be defined the same way as when the checkpoint
# was trained; retrain if the training-time definition differs.
holiday_years = [int(y) for y in timestamps.dt.year.dropna().unique()]
slo_holidays = holidays.Slovenia(years=holiday_years)
is_holiday = timestamps.dt.date.astype(str).isin({str(d) for d in slo_holidays}).astype(float)

name_to_idx = {name: i for i, name in enumerate(station_cols)}
time_feats = np.stack([hour_sin, hour_cos, is_weekend, is_holiday], axis=1)

# Column indices per station: own column first, then its neighbors.
# Row s of this array corresponds to column s of test_feats / pred_matrix.
feat_cols = np.array([
    [name_to_idx[station]] + [name_to_idx[nb] for nb in neighbors[station]]
    for station in station_cols
])
n_stations = len(station_cols)

# --- Predict ---
pred_matrix = np.full_like(test_feats, np.nan)

with torch.no_grad():
    for i in range(HISTORY_LEN, len(test_df) - PRED_HORIZON + 1):
        # Only predict where the whole next PRED_HORIZON rows are missing.
        if not np.isnan(test_feats[i:i + PRED_HORIZON]).all():
            continue

        history = slice(i - HISTORY_LEN, i)
        # [H, S, 1+K] -> [S, H, 1+K]: one input window per station.
        series = test_feats[history][:, feat_cols].transpose(1, 0, 2)
        clock = time_feats[history]
        clock = np.broadcast_to(clock, (n_stations,) + clock.shape)
        batch = np.concatenate([series, clock], axis=2).astype(np.float32)

        # One forward pass for all stations instead of one call per station.
        pred = model(torch.from_numpy(batch).to(DEVICE)).cpu().numpy()  # [S, PRED_HORIZON]
        pred_matrix[i:i + PRED_HORIZON, :] = pred.T

# --- Save output ---
pred_df = pd.DataFrame(pred_matrix, columns=station_cols)
pred_df.insert(0, "timestamp", test_df["timestamp"])

# Output only the rows where every station value is missing in the test file.
rows_to_output = test_df[station_cols].isna().all(axis=1)
pred_df_filtered = pred_df[rows_to_output].copy()

pred_df_filtered.to_csv("bicikelj_test_predictions_128.csv", index=False)
print("✅ Saved predictions to 'bicikelj_test_predictions_128.csv'")


✅ Saved predictions to 'bicikelj_test_predictions_128.csv'


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances
import holidays
from tqdm import tqdm

# --- Hyperparameters ---
# Window length, forecast steps and neighbor count, in that order.
HISTORY_LEN, PRED_HORIZON, K_NEIGHBORS = 48, 4, 2
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Model ---
class LSTMForecast(nn.Module):
    """Sequence-to-vector forecaster: LSTM -> dropout -> linear layer."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        # Keep these attribute names: the saved checkpoint is keyed by them.
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Predict from the last layer's final hidden state."""
        _, (hidden, _cell) = self.lstm(x)
        last_hidden = hidden[-1]
        return self.fc(self.dropout(last_hidden))

# --- Load data ---
df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = df.columns[1:]  # every column after the first one

# Clean and fill NaNs
# Non-numeric entries become NaN, then gaps are filled forward and the
# remaining leading gaps backward. DataFrame.ffill()/bfill() replace the
# deprecated fillna(method=...) form.
df[station_cols] = df[station_cols].apply(pd.to_numeric, errors='coerce')
df[station_cols] = df[station_cols].ffill().bfill()
df = df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- Neighbors ---
# For each station, the K_NEIGHBORS closest other stations by haversine distance.
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
station_names = meta['name'].tolist()
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    order = np.argsort(dists[i])
    nn_idx = [j for j in order if j != i][:K_NEIGHBORS]
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- Dataset with time features ---
class SharedLSTMDataset(Dataset):
    """Sliding-window samples for all stations, for one shared model.

    Each sample is ``(seq, target)``: ``seq`` has shape
    ``[history_len, 1 + len(neighbors[s]) + 4]`` (own series, neighbor series,
    hour sin/cos, weekend flag, holiday flag) and ``target`` holds the
    station's next ``pred_horizon`` values. Samples are stored station by
    station, so dataset indices are station-major.
    """

    def __init__(self, df, station_cols, neighbors, history_len, pred_horizon):
        """Build all samples eagerly.

        Args:
            df: frame with a 'timestamp' column plus one column per station.
            station_cols: station column names; their order defines column indices.
            neighbors: mapping station name -> list of neighbor station names.
            history_len: number of past rows in each input window.
            pred_horizon: number of future rows in each target.
        """
        self.samples = []
        self.station_indices = []  # station name of each sample, aligned with self.samples

        timestamps = pd.to_datetime(df['timestamp'])
        hour_sin = np.sin(2 * np.pi * timestamps.dt.hour / 24)
        hour_cos = np.cos(2 * np.pi * timestamps.dt.hour / 24)
        is_weekend = (timestamps.dt.dayofweek >= 5).astype(float)

        # holidays.Slovenia() is filled lazily on lookup; built without `years`
        # it is empty when iterated, which made this flag always 0. Request the
        # years present in the data explicitly.
        # Keep this definition identical wherever features are built for the model.
        years = [int(y) for y in timestamps.dt.year.dropna().unique()]
        slo_holidays = holidays.Slovenia(years=years)
        holiday_dates = {str(d) for d in slo_holidays}
        is_holiday = timestamps.dt.date.astype(str).isin(holiday_dates).astype(float)

        bikes = df[station_cols].values.astype(np.float32)
        name_to_idx = {name: i for i, name in enumerate(station_cols)}
        N = len(df)

        time_feats = np.stack([hour_sin, hour_cos, is_weekend, is_holiday], axis=1)

        for s_name in station_cols:
            s_idx = name_to_idx[s_name]
            nn_idx = [name_to_idx[nb] for nb in neighbors[s_name]]

            series = bikes[:, [s_idx] + nn_idx]
            all_feats = np.concatenate([series, time_feats], axis=1)

            for i in range(history_len, N - pred_horizon + 1):
                seq = all_feats[i - history_len:i]
                target = bikes[i:i + pred_horizon, s_idx]
                self.samples.append((seq, target))
                self.station_indices.append(s_name)

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as float32 tensors ``(inputs, target)``."""
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- Prepare holdout set ---
# Same contiguous 70/15/15 index arithmetic as the training split; only the
# last 15% of dataset indices is used here.
dataset = SharedLSTMDataset(df, station_cols, neighbors, HISTORY_LEN, PRED_HORIZON)
N = len(dataset)
val_size = int(0.15 * N)
holdout_size = int(0.15 * N)
train_size = N - val_size - holdout_size

holdout_set = torch.utils.data.Subset(dataset, range(train_size + val_size, N))
holdout_loader = DataLoader(holdout_set, batch_size=64)

# --- Load model ---
input_dim = 1 + K_NEIGHBORS + 4  # station + neighbors + 4 time features
output_dim = PRED_HORIZON
hidden_dim = 128
dropout = 0.1

model = LSTMForecast(input_dim, hidden_dim, output_dim, dropout).to(DEVICE)
# map_location lets a checkpoint saved on another device (e.g. a GPU) load here.
model.load_state_dict(torch.load("lstm_model_added_features.pt", map_location=DEVICE))
model.eval()

# --- Evaluate MSE ---
mse_loss = nn.MSELoss()
all_losses = []          # per-batch mean losses
squared_error_sum = 0.0  # sum of squared errors over every predicted value
n_values = 0

with torch.no_grad():
    for xb, yb in tqdm(holdout_loader, desc="Evaluating on holdout"):
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        preds = model(xb)
        loss = mse_loss(preds, yb)
        all_losses.append(loss.item())
        # Weight each batch by its size so a smaller final batch does not
        # count as much as a full one.
        squared_error_sum += loss.item() * yb.numel()
        n_values += yb.numel()

final_mse = squared_error_sum / n_values if n_values else float('nan')
print(f"✅ Final Holdout MSE = {final_mse:.6f}")


  df[station_cols] = df[station_cols].fillna(method="ffill").fillna(method="bfill")
Evaluating on holdout: 100%|██████████| 4019/4019 [00:07<00:00, 537.36it/s]

✅ Final Holdout MSE = 9.457019





## Grid search for the shared LSTM

In [None]:
import itertools
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.metrics.pairwise import haversine_distances
from tqdm import tqdm
import holidays
import random

# --- Static Params ---
HISTORY_LEN = 48        # rows of history per input window
PRED_HORIZON = 4        # rows predicted per window
K_NEIGHBORS = 2         # neighbor stations added as inputs
EPOCHS = 20             # upper bound on training epochs
PATIENCE = 5            # epochs without validation improvement before stopping
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
MAX_COMBINATIONS = 20   # cap on evaluated hyperparameter combinations
TRAIN_FRACTION = 0.01   # share of all samples used by the search

# --- Load data ---
df = pd.read_csv('bicikelj_train.csv')
meta = pd.read_csv('bicikelj_metadata.csv')
station_cols = df.columns[1:]  # every column after the first one

# Clean and fill
# Coerce every station column to numeric, then fill gaps forward and backward.
df[station_cols] = df[station_cols].apply(pd.to_numeric, errors='coerce')
df[station_cols] = df[station_cols].ffill().bfill()
df = df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- Neighbors ---
station_names = meta['name'].tolist()
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
dists = haversine_distances(coords, coords) * 6371
neighbors = {}
for i, name in enumerate(station_names):
    # Rank all stations by distance from station i, drop i itself, keep the closest K.
    order = np.argsort(dists[i])
    nn_idx = list(order[order != i][:K_NEIGHBORS])
    neighbors[name] = [station_names[j] for j in nn_idx]

# --- Dataset ---
class SharedLSTMDataset(Dataset):
    """Sliding-window samples for all stations, for one shared model.

    Each sample is ``(seq, target)``: ``seq`` has shape
    ``[history_len, 1 + len(neighbors[s]) + 4]`` (own series, neighbor series,
    hour sin/cos, weekend flag, holiday flag) and ``target`` holds the
    station's next ``pred_horizon`` values. Samples are stored station by
    station, so dataset indices are station-major.
    """

    def __init__(self, df, station_cols, neighbors, history_len, pred_horizon):
        """Build all samples eagerly.

        Args:
            df: frame with a 'timestamp' column plus one column per station.
            station_cols: station column names; their order defines column indices.
            neighbors: mapping station name -> list of neighbor station names.
            history_len: number of past rows in each input window.
            pred_horizon: number of future rows in each target.
        """
        self.samples = []
        timestamps = pd.to_datetime(df['timestamp'])

        hour_sin = np.sin(2 * np.pi * timestamps.dt.hour / 24)
        hour_cos = np.cos(2 * np.pi * timestamps.dt.hour / 24)
        is_weekend = (timestamps.dt.dayofweek >= 5).astype(float)

        # holidays.Slovenia() is filled lazily on lookup; built without `years`
        # it is empty when iterated, which made this flag always 0. Request the
        # years present in the data explicitly.
        # Keep this definition identical wherever features are built for the model.
        years = [int(y) for y in timestamps.dt.year.dropna().unique()]
        slo_holidays = holidays.Slovenia(years=years)
        holiday_dates = {str(d) for d in slo_holidays}
        is_holiday = timestamps.dt.date.astype(str).isin(holiday_dates).astype(float)
        time_feats = np.stack([hour_sin, hour_cos, is_weekend, is_holiday], axis=1)

        bikes = df[station_cols].values.astype(np.float32)
        name_to_idx = {name: i for i, name in enumerate(station_cols)}
        N = len(df)

        for s_name in station_cols:
            s_idx = name_to_idx[s_name]
            nn_idx = [name_to_idx[nb] for nb in neighbors[s_name]]
            series = bikes[:, [s_idx] + nn_idx]
            full_feats = np.concatenate([series, time_feats], axis=1)

            for i in range(history_len, N - pred_horizon + 1):
                seq = full_feats[i - history_len:i]
                target = bikes[i:i + pred_horizon, s_idx]
                self.samples.append((seq, target))

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as float32 tensors ``(inputs, target)``."""
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

# --- Model ---
class LSTMForecast(nn.Module):
    """LSTM over the input window, then dropout and a linear output layer."""

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Project the last layer's final hidden state to ``output_dim`` values."""
        _, (h_n, _) = self.lstm(x)
        summary = h_n[-1]
        summary = self.dropout(summary)
        return self.fc(summary)

# --- Train function ---
def train_lstm(model, train_loader, val_loader, lr, weight_decay):
    """Train ``model`` with Adam + MSE and early stopping on validation loss.

    Reads the module-level ``DEVICE``, ``EPOCHS`` and ``PATIENCE``.

    Args:
        model: network to train; moved to ``DEVICE``.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: sized iterable of ``(inputs, targets)`` validation batches.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        ``(model, best_loss)``: the model carrying the weights of the epoch
        with the lowest mean validation loss, and that loss.
    """
    model = model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()
    best_loss = float('inf')
    best_state = None
    patience_counter = 0

    for epoch in range(EPOCHS):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                val_loss += criterion(model(xb), yb).item()
        val_loss /= len(val_loader)

        if val_loss < best_loss:
            best_loss = val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite. Clone them so the snapshot
            # really is the best epoch.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                break

    # best_state stays None when no epoch improved (e.g. NaN validation loss
    # or zero epochs); keep the current weights instead of crashing.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, best_loss

# --- Reproducibility ---
# Seed every RNG used below (combo shuffling, index shuffling, weight init,
# DataLoader shuffling) so repeated runs of this cell give the same results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Grid search space ---
param_grid = {
    # 'hidden_dim': [16, 32, 64],
    # 'dropout': [0.1, 0.3, 0.5],
    # 'lr': [1e-3, 5e-4],
    # 'weight_decay': [1e-5, 1e-4]
    'hidden_dim': [128],
    'dropout': [0.0, 0.1],
    'lr': [1e-3],
    'weight_decay': [1e-5]
}
# Random subset of at most MAX_COMBINATIONS points from the full grid.
param_combos = list(itertools.product(*param_grid.values()))
random.shuffle(param_combos)
param_combos = param_combos[:MAX_COMBINATIONS]

# --- Prepare dataset ---
dataset = SharedLSTMDataset(df, station_cols, neighbors, HISTORY_LEN, PRED_HORIZON)
N = len(dataset)
reduced_N = int(N * TRAIN_FRACTION)  # only a fraction of all samples is used
indices = list(range(N))
# NOTE(review): a random split over sliding windows puts overlapping windows
# of the same station into different splits, so validation/holdout losses are
# probably optimistic — confirm whether a time-based split is wanted.
random.shuffle(indices)

train_size = int(reduced_N * 0.7)
val_size = int(reduced_N * 0.15)
holdout_size = reduced_N - train_size - val_size

train_set = Subset(dataset, indices[:train_size])
val_set = Subset(dataset, indices[train_size:train_size + val_size])
holdout_set = Subset(dataset, indices[train_size + val_size:train_size + val_size + holdout_size])

train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(val_set, batch_size=64)
holdout_loader = DataLoader(holdout_set, batch_size=64)

# --- Run grid search ---
input_dim = 1 + K_NEIGHBORS + 4  # station + neighbors + 4 time features
output_dim = PRED_HORIZON

criterion = nn.MSELoss()  # stateless, so one instance serves every combination
results = []
print(f"⏳ Running grid search over {len(param_combos)} combinations...")
for i, (hdim, dr, lr, wd) in enumerate(param_combos):
    print(f"\n🔍 Combo {i+1}: hidden_dim={hdim}, dropout={dr}, lr={lr}, weight_decay={wd}")
    model = LSTMForecast(input_dim, hdim, output_dim, dr)
    model, val_loss = train_lstm(model, train_loader, val_loader, lr, wd)

    # Evaluate on holdout (mean of per-batch MSE)
    model.eval()
    holdout_loss = 0.0
    with torch.no_grad():
        for xb, yb in holdout_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            holdout_loss += criterion(model(xb), yb).item()
    holdout_loss /= len(holdout_loader)

    print(f"✅ Val Loss: {val_loss:.4f}, Holdout Loss: {holdout_loss:.4f}")
    results.append({
        "hidden_dim": hdim,
        "dropout": dr,
        "lr": lr,
        "weight_decay": wd,
        "val_loss": val_loss,
        "holdout_loss": holdout_loss
    })

# --- Save results ---
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="holdout_loss")
results_df.to_csv("grid_search_results.csv", index=False)
print("\n📊 Top 5 Results:")
print(results_df.head())


⏳ Running grid search over 2 combinations...

🔍 Combo 1: hidden_dim=128, dropout=0.0, lr=0.001, weight_decay=1e-05
✅ Val Loss: 10.7869, Holdout Loss: 10.7490

🔍 Combo 2: hidden_dim=128, dropout=0.1, lr=0.001, weight_decay=1e-05
✅ Val Loss: 10.6357, Holdout Loss: 10.9593

📊 Top 5 Results:
   hidden_dim  dropout     lr  weight_decay   val_loss  holdout_loss
0         128      0.0  0.001       0.00001  10.786934     10.748992
1         128      0.1  0.001       0.00001  10.635716     10.959280
