In [6]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import haversine_distances
from tqdm import tqdm
import holidays
import random

# --- Hyperparameters ---
HISTORY_LEN = 48  # number of rows before a sample index that each window must contain
PRED_HORIZON = 4  # number of rows from the sample index onward that each window must contain
BATCH_SIZE = 128  # not used in this section; presumably the DataLoader batch size -- confirm
VAL_FRAC = 0.1  # target fraction of all rows to reserve for validation blocks
STRIDE = 2  # step between candidate sample indices for the training set (validation uses 1)

# --- Load data ---
df = pd.read_csv("../data/bicikelj_train.csv")
meta = pd.read_csv("../data/bicikelj_metadata.csv")
# Every column after the first is treated as one station's series.
station_cols = df.columns[1:]

# Clean and fill: coerce each station column to numeric (unparseable cells
# become NaN), then close gaps forward first and backward for leading NaNs.
numeric_stations = {
    name: pd.to_numeric(df[name], errors="coerce") for name in station_cols
}
df = df.assign(**numeric_stations)

gap_filled = df[station_cols].ffill().bfill()
df[station_cols] = gap_filled

# Discard rows in which every station value is still missing, then renumber.
df = df.dropna(subset=station_cols, how="all")
df = df.reset_index(drop=True)

# --- Load weather ---
# skiprows=2 drops the first two lines of the file before the header is parsed
# (presumably a metadata preamble -- confirm against the CSV).
weather_df = pd.read_csv("../data/weather_ljubljana.csv", skiprows=2)
weather_features = ['temperature_2m', 'precipitation', 'windspeed_10m', 'cloudcover']

# Rename "<feature> (<unit>)" headers to the bare feature name. Matching on the
# part before " (" instead of the exact header keeps this working however the
# degree sign in the temperature unit was encoded; DataFrame.rename silently
# skips keys that do not match, so an exact-string map can fail without error.
unit_renames = {}
for header in weather_df.columns:
    base = str(header).split(' (', 1)[0].strip()
    if base in weather_features and base not in weather_df.columns:
        unit_renames[header] = base
weather_df = weather_df.rename(columns=unit_renames)

# Fail early with a clear message rather than at the fill step further down.
missing_features = [name for name in weather_features if name not in weather_df.columns]
if missing_features:
    raise KeyError(f"Weather CSV is missing expected columns: {missing_features}")

weather_df['time'] = pd.to_datetime(weather_df['time'])
# Strip any timezone so both join keys are naive timestamps.
df['timestamp'] = pd.to_datetime(df['timestamp']).dt.tz_localize(None)

# Left join keeps every row of df; rows whose timestamp has no exact match in
# weather_df['time'] get NaN weather values, which are then filled forward
# (and backward for any leading gap).
# NOTE(review): if the two sources are sampled at different instants, an as-of
# join may be more appropriate than exact match + fill -- confirm.
df_merged = pd.merge(df, weather_df, left_on='timestamp', right_on='time', how='left')
df_merged[weather_features] = df_merged[weather_features].ffill().bfill()
N = len(df_merged)
print("sample len: ", N)

# --- Randomly select validation cut regions ---
# Each validation region is one contiguous block long enough to hold a full
# history + prediction window.
BLOCK_SIZE = HISTORY_LEN + PRED_HORIZON
np.random.seed(42)
all_possible_starts = np.arange(0, N - BLOCK_SIZE + 1)
val_mask = np.zeros(N, dtype=bool)
val_starts = []
target_val_coverage = int(VAL_FRAC * N)
covered = 0

# Visit candidate start rows in a seeded random order.
np.random.shuffle(all_possible_starts)

for start in all_possible_starts:
    block_end = start + BLOCK_SIZE
    if not val_mask[start:block_end].any():
        # The region is still free of earlier picks: claim it for validation.
        val_mask[start:block_end] = True
        val_starts.append(start)
        covered += BLOCK_SIZE
        if covered >= target_val_coverage:
            break

# Every row not claimed for validation belongs to the training region.
train_mask = ~val_mask

n_val_rows = val_mask.sum()
n_train_rows = train_mask.sum()
print(f"Validation coverage: {n_val_rows} ({n_val_rows/N:.3f})")
print(f"Train coverage:      {n_train_rows} ({n_train_rows/N:.3f})")
print(f"Number of val sequences: {len(val_starts)}")

# --- Strided window sample selection ---
def make_sample_indices(mask, history_len, pred_horizon, stride=1):
    """Return the sample positions whose full window lies inside ``mask``.

    A position ``t`` is kept when every entry of
    ``mask[t - history_len : t + pred_horizon]`` is true. Candidates run from
    ``history_len`` up to and including ``len(mask) - pred_horizon``, stepping
    by ``stride``.

    Args:
        mask: Boolean array whose slices provide ``.all()``.
        history_len: Number of entries required before the position.
        pred_horizon: Number of entries required from the position onward.
        stride: Step between consecutive candidate positions.

    Returns:
        List of accepted positions in increasing order.
    """
    total = len(mask)
    candidates = range(history_len, total - pred_horizon + 1, stride)
    return [
        t
        for t in candidates
        if mask[t - history_len:t + pred_horizon].all()
    ]

# Training windows are taken every STRIDE rows; validation windows are checked
# at every row.
train_indices = make_sample_indices(
    mask=train_mask,
    history_len=HISTORY_LEN,
    pred_horizon=PRED_HORIZON,
    stride=STRIDE,
)
val_indices = make_sample_indices(
    mask=val_mask,
    history_len=HISTORY_LEN,
    pred_horizon=PRED_HORIZON,
    stride=1,
)

n_train_samples = len(train_indices)
n_val_samples = len(val_indices)
print(f"Train samples: {n_train_samples} | Val samples: {n_val_samples}")

# --- Normalize using only train region statistics ---
# Per-station mean over the rows flagged as training rows.
train_station_rows = df_merged.loc[train_mask, station_cols]
station_means = train_station_rows.mean()

sample len:  20463
Validation coverage: 2080 (0.102)
Train coverage:      18383 (0.898)
Number of val sequences: 40
Train samples: 8257 | Val samples: 40
