In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb

# Load the training and test tables. Every column after the first one is
# treated as a station series.
train_df = pd.read_csv('../data/bicikelj_train.csv')
test_df = pd.read_csv('../data/bicikelj_test.csv')

station_cols = train_df.columns[1:]
# Coerce each station column to numeric; entries that cannot be parsed become NaN.
for col in station_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
# Fill gaps from the previous row first, then from the next row for any NaNs
# left at the very start. `fillna(method=...)` is deprecated in recent pandas
# (it emits a FutureWarning), so the dedicated ffill()/bfill() methods are used.
train_df[station_cols] = train_df[station_cols].ffill().bfill()


# --- Prepare features and targets for each station ---
history_len = 48  # previous hours to use
pred_horizon = 4  # how many hours to predict

features_list = []
targets_list = []

# The calendar features depend only on the row, never on the station, so the
# timestamps are parsed once here instead of twice per (station, row) pair.
timestamps = pd.to_datetime(train_df['timestamp'])
hour_feat = timestamps.dt.hour.to_numpy() / 23.0      # hour of day scaled to [0, 1]
dow_feat = timestamps.dt.dayofweek.to_numpy() / 6.0   # day of week scaled to [0, 1]

# Each sample is identified by the first row of its target window: it needs
# `history_len` rows before that row and `pred_horizon` rows starting at it.
sample_starts = range(history_len, len(train_df) - pred_horizon + 1)

for station in station_cols:
    # Convert the column once; plain ndarray slicing is far cheaper than
    # repeated Series.iloc slicing inside the sample loop.
    series = train_df[station].to_numpy(dtype=float)
    features = []
    targets = []
    for i in sample_starts:
        hist = series[i - history_len:i]
        # Feature row layout: [history_len past values, hour, day-of-week].
        features.append(np.concatenate([hist, [hour_feat[i], dow_feat[i]]]))
        targets.append(series[i:i + pred_horizon])
    features = np.array(features)
    targets = np.array(targets)
    features_list.append(features)
    targets_list.append(targets)

# --- Train LightGBM for each station and prediction hour ---
# models[station][h] is the regressor for step `h` of the horizon, fitted on
# that station's feature matrix against column `h` of its target matrix.
models = {}
for idx, station in enumerate(station_cols):
    features, targets = features_list[idx], targets_list[idx]
    fitted = []
    for h in range(pred_horizon):
        regressor = lgb.LGBMRegressor(n_estimators=100)
        regressor.fit(features, targets[:, h])
        fitted.append(regressor)
    models[station] = fitted

# --- Predict missing values in the test set ---
# The test table is walked in consecutive chunks made of `history_len` rows of
# history followed by `pred_horizon` rows that may contain holes to fill.
test_pred = test_df.copy()
test_pred[station_cols] = test_pred[station_cols].astype(str)
i = 0
while i < len(test_df):
    window = test_df.iloc[i:i+history_len]
    pred_start = i + history_len
    # Stop once a full prediction window no longer fits in the table.
    if pred_start + pred_horizon > len(test_df):
        break
    to_pred = test_df.iloc[pred_start:pred_start+pred_horizon]
    # A cell counts as a hole when it is NaN or an empty string.
    mask = to_pred[station_cols].isnull() | (to_pred[station_cols] == '')
    if mask.values.any():
        # The calendar features come from the first row to predict and are the
        # same for every station in this chunk, so compute them once.
        first_ts = pd.to_datetime(to_pred['timestamp'].iloc[0])
        hour = first_ts.hour / 23.0
        dow = first_ts.dayofweek / 6.0
        for station in station_cols:
            # Only predict if at least one value is missing for this station.
            if mask[station].any():
                hist = window[station].values.astype(float)
                # Same layout as in training: [history, hour, day-of-week],
                # reshaped to a single-row 2-D array for predict().
                f = np.concatenate([hist, [hour, dow]])[None, :]
                preds = [mdl.predict(f)[0] for mdl in models[station]]
                # Fill only the cells that were actually missing.
                for h in range(pred_horizon):
                    if mask[station].iloc[h]:
                        # NOTE(review): label-based write using a positional
                        # offset; assumes test_df has the default 0..n-1 index.
                        test_pred.loc[pred_start + h, station] = preds[h]
    i += history_len + pred_horizon

# --- Export only rows with predictions and header row ---
# A row is exported when at least one of its station cells in the raw test
# table was a hole (NaN or an empty string).
station_values = test_df[station_cols]
rows_with_preds = (station_values.isnull() | (station_values == '')).any(axis=1)
# The column names are prepended as an ordinary data row, which is why the
# CSV is then written with header=False.
header = pd.DataFrame([test_df.columns], columns=test_df.columns)
final = pd.concat([header, test_pred[rows_with_preds]], ignore_index=True)
final.to_csv("lgbm_per_station.csv", index=False, header=False)
print("Predictions saved as lgbm_per_station.csv")


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 20412, number of used features: 50
[LightGBM] [Info] Start training from score 5.710464
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 20412, number of used features: 50
[LightGBM] [Info] Start training from score 5.710317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bi

# With k nearest neighbors

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics.pairwise import haversine_distances

# --- PARAMETERS ---
history_len = 48   # Length of history to use as features
pred_horizon = 4   # How many hours to predict
k_neighbors = 2    # Number of nearest neighbors to use

# --- LOAD DATA ---
train_df = pd.read_csv('../data/bicikelj_train.csv')
test_df = pd.read_csv('../data/bicikelj_test.csv')
meta = pd.read_csv('../data/bicikelj_metadata.csv')

# --- CLEAN DATA ---
# Every column after the first one is treated as a station series.
station_cols = train_df.columns[1:]
# Coerce each station column to numeric; entries that cannot be parsed become NaN.
for col in station_cols:
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
# Fill gaps from the previous row first, then from the next row for any NaNs
# left at the very start. `fillna(method=...)` is deprecated in recent pandas
# (it emits a FutureWarning), so the dedicated ffill()/bfill() methods are used.
train_df[station_cols] = train_df[station_cols].ffill().bfill()
# Remove rows with all NaNs (if any slipped through)
train_df = train_df.dropna(subset=station_cols, how='all').reset_index(drop=True)

# --- NEIGHBOR DETECTION ---
# Pairwise distances between all stations: coordinates are converted from
# degrees to radians, and the haversine result is scaled by 6371 to get km.
station_names = meta['name'].tolist()
coords = np.deg2rad(meta[['latitude', 'longitude']].values)
dists = haversine_distances(coords, coords) * 6371  # km

# For each station, keep the names of the `k_neighbors` closest other
# stations (the station itself is skipped), ordered nearest first.
neighbors = {
    name: [station_names[j] for j in np.argsort(dists[i]) if j != i][:k_neighbors]
    for i, name in enumerate(station_names)
}

# --- FEATURE ENGINEERING ---
features_list = []
targets_list = []

# The calendar features depend only on the row, never on the station, so the
# timestamps are parsed once here instead of twice per (station, row) pair.
timestamps = pd.to_datetime(train_df['timestamp'])
hour_feat = timestamps.dt.hour.to_numpy() / 23.0      # hour of day scaled to [0, 1]
dow_feat = timestamps.dt.dayofweek.to_numpy() / 6.0   # day of week scaled to [0, 1]

# Each sample is identified by the first row of its target window: it needs
# `history_len` rows before that row and `pred_horizon` rows starting at it.
sample_starts = range(history_len, len(train_df) - pred_horizon + 1)

for station in station_cols:
    # Convert the station's own column and its neighbors' columns once; plain
    # ndarray slicing is far cheaper than Series.iloc slicing per sample.
    own_series = train_df[station].to_numpy(dtype=float)
    nn_series = [train_df[nn].to_numpy(dtype=float) for nn in neighbors[station]]
    features = []
    targets = []
    for i in sample_starts:
        lo = i - history_len
        # Feature row layout: [own history, each neighbor's history in
        # neighbor order, hour, day-of-week].
        parts = [own_series[lo:i]]
        parts.extend(series[lo:i] for series in nn_series)
        parts.append([hour_feat[i], dow_feat[i]])
        features.append(np.concatenate(parts))
        targets.append(own_series[i:i + pred_horizon])
    features = np.array(features)
    targets = np.array(targets)
    features_list.append(features)
    targets_list.append(targets)

# --- TRAIN LIGHTGBM MODELS ---
# models[station][h] is the regressor for step `h` of the horizon, fitted on
# that station's feature matrix against column `h` of its target matrix.
models = {}
for idx, station in enumerate(station_cols):
    features, targets = features_list[idx], targets_list[idx]
    fitted = []
    for h in range(pred_horizon):
        regressor = lgb.LGBMRegressor(n_estimators=100)
        regressor.fit(features, targets[:, h])
        fitted.append(regressor)
    models[station] = fitted

# --- PREDICTION ON TEST SET ---
# The test table is walked in consecutive chunks made of `history_len` rows of
# history followed by `pred_horizon` rows that may contain holes to fill.
test_pred = test_df.copy()
test_pred[station_cols] = test_pred[station_cols].astype(str)
i = 0
while i < len(test_df):
    window = test_df.iloc[i:i+history_len]
    pred_start = i + history_len
    # Stop once a full prediction window no longer fits in the table.
    if pred_start + pred_horizon > len(test_df):
        break
    to_pred = test_df.iloc[pred_start:pred_start+pred_horizon]
    # A cell counts as a hole when it is NaN or an empty string.
    mask = to_pred[station_cols].isnull() | (to_pred[station_cols] == '')
    if mask.values.any():
        # The calendar features come from the first row to predict and are the
        # same for every station in this chunk, so compute them once.
        first_ts = pd.to_datetime(to_pred['timestamp'].iloc[0])
        hour = first_ts.hour / 23.0
        dow = first_ts.dayofweek / 6.0
        for station in station_cols:
            if mask[station].any():
                own_hist = window[station].values.astype(float)
                nn_hist = []
                for nn in neighbors[station]:
                    nn_hist.append(window[nn].values.astype(float))
                nn_hist = np.concatenate(nn_hist) if nn_hist else np.zeros(0)
                # Same layout as in training: [own history, neighbor
                # histories, hour, day-of-week], as a single-row 2-D array.
                f = np.concatenate([own_hist, nn_hist, [hour, dow]])[None, :]
                preds = [mdl.predict(f)[0] for mdl in models[station]]
                # Fill only the cells that were actually missing.
                for h in range(pred_horizon):
                    if mask[station].iloc[h]:
                        # NOTE(review): label-based write using a positional
                        # offset; assumes test_df has the default 0..n-1 index.
                        test_pred.loc[pred_start + h, station] = preds[h]
    i += history_len + pred_horizon

# --- EXPORT ONLY ROWS WITH PREDICTIONS AND HEADER ROW ---
# A row is exported when at least one of its station cells in the raw test
# table was a hole (NaN or an empty string).
station_values = test_df[station_cols]
rows_with_preds = (station_values.isnull() | (station_values == '')).any(axis=1)
# The column names are prepended as an ordinary data row, which is why the
# CSV is then written with header=False.
header = pd.DataFrame([test_df.columns], columns=test_df.columns)
final = pd.concat([header, test_pred[rows_with_preds]], ignore_index=True)
final.to_csv("lgbm_with_neighbors.csv", index=False, header=False)
print("Predictions saved as lgbm_with_neighbors.csv")


  train_df[station_cols] = train_df[station_cols].fillna(method="ffill").fillna(method="bfill")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3055
[LightGBM] [Info] Number of data points in the train set: 20412, number of used features: 146
[LightGBM] [Info] Start training from score 5.710464
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008527 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3055
[LightGBM] [Info] Number of data points in the train set: 20412, number of used features: 146
[LightGBM] [Info] Start training from score 5.710317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3055
[LightGBM] [Info] Number of data points in the train set: 20412, number of used features: 146
[LightGBM] [Info] Start 

In [7]:
import os
import re

def safe_filename(name):
    """Return `name` with characters unsuitable for a file name replaced by `_`.

    Word characters (letters, digits, underscore), hyphens, dots and spaces
    are kept unchanged; every other character becomes a single underscore.
    Distinct names can map to the same result (e.g. "a/b" and "a:b").
    """
    unsafe_chars = re.compile(r'[^\w\-_\. ]')
    return unsafe_chars.sub('_', name)

# Write every trained model to `save_dir` as a LightGBM model file named
# "<sanitised station name>_h<horizon step>.txt".
save_dir = "lgbm_models"
os.makedirs(save_dir, exist_ok=True)

for station, station_models in models.items():
    safe_station = safe_filename(station)
    for h, model in enumerate(station_models):
        path = os.path.join(save_dir, f"{safe_station}_h{h}.txt")
        # Save the underlying Booster of the fitted regressor.
        model.booster_.save_model(path)


In [None]:
import os
import lightgbm as lgb

# Read the saved models back from disk. This cell relies on `station_cols`
# and `safe_filename` being defined by earlier cells.
save_dir = "lgbm_models"
pred_horizon = 4  # Or whatever you used

# loaded_models[station][h] is the Booster loaded from
# "<sanitised station name>_h<h>.txt" inside `save_dir`.
loaded_models = {}
for station in station_cols:  # station_cols = your list of station names
    safe_station = safe_filename(station)
    loaded_models[station] = [
        lgb.Booster(model_file=os.path.join(save_dir, f"{safe_station}_h{h}.txt"))
        for h in range(pred_horizon)
    ]
