In [1]:
import pandas as pd 
import numpy as np 

# Read the cleaned 30-minute net-flow table and order it by station, then by
# time bucket, so per-station shifts walk each station's rows in time order.
df = (
    pd.read_parquet("processed_data/net_flow_30min_clean.parquet")
      .sort_values(["station_id", "time_bucket"])
      .reset_index(drop=True)
)

# Preview the first rows
df.head()

Unnamed: 0,station_id,time_bucket,net_flow
0,959,2022-09-12 06:30:00,-1
1,959,2022-09-12 07:00:00,-1
2,959,2022-09-12 07:30:00,-1
3,959,2022-09-12 08:00:00,0
4,959,2022-09-12 08:30:00,5


- Thresholds: I chose ±5. A next-period `net_flow` of -5 or lower indicates a shortage of bikes at a station, a `net_flow` of +5 or higher indicates a surplus, and anything in between is treated as balanced.

In [2]:
# Cut-offs on net flow: at or below shortage_th a value is labelled 'shortage',
# at or above surplus_th it is labelled 'surplus'.
shortage_th = -5
surplus_th = 5

def label_flow(x):
    """Classify a net-flow value as 'shortage', 'surplus' or 'balanced'.

    Parameters
    ----------
    x : number
        Net flow for one station and time bucket.

    Returns
    -------
    str or float
        'shortage' if x <= shortage_th, 'surplus' if x >= surplus_th,
        'balanced' otherwise. A missing value (NaN / None / pd.NA) yields NaN
        rather than a label: every comparison against NaN is False, so without
        this guard a missing value would fall through to 'balanced'.
    """
    if pd.isna(x):
        return np.nan
    if x <= shortage_th:
        return 'shortage'
    if x >= surplus_th:
        return 'surplus'
    return 'balanced'

In [3]:
# Next-period net flow per station: the value on the station's following row.
# NOTE(review): this equals "30 minutes later" only if every station has a row
# for every time bucket -- TODO confirm the input table has no gaps.
df["net_flow_tplus1"] = df.groupby("station_id")["net_flow"].shift(-1)

# Drop rows without a next-period value (each station's last row) before
# labelling, so a missing target is never turned into a class label.
df = df.dropna(subset=["net_flow_tplus1"]).copy()

# Label the remaining rows by their next-period net flow
df["target"] = df["net_flow_tplus1"].map(label_flow)

# Class balance of the target
df["target"].value_counts(normalize=True)


target
balanced    0.965177
surplus     0.018129
shortage    0.016695
Name: proportion, dtype: float64

In [5]:
#adding temporal features 
from datetime import datetime 

# hour is 0-23; dayofweek counts Monday as 0
bucket_dt = df['time_bucket'].dt
df['hour'] = bucket_dt.hour
df['dow'] = bucket_dt.dayofweek

In [6]:
# Lagged net flow per station, counted in 30-minute buckets
lags = [1, 2, 3, 6]  # 30 min, 1 h, 1.5 h, 3 h
lag_cols = [f'net_flow_lag_{lag}' for lag in lags]

# Group once, then shift by each lag within the station
station_flow = df.groupby('station_id')['net_flow']
for lag, col in zip(lags, lag_cols):
    df[col] = station_flow.shift(lag)

# Remove rows that do not have a value for every lag
df = df.dropna(subset=lag_cols).copy()

In [7]:
# Rolling features over the three previous periods of each station.

# Previous-period flow per station; shifting by one first keeps the current
# period out of its own window.
prev_flow = df.groupby("station_id")["net_flow"].shift(1)

# Group the window by station explicitly so it can never span two stations,
# independent of where the shift happens to leave NaNs.
rolling_prev = prev_flow.groupby(df["station_id"]).rolling(3)

# The grouped result is indexed by (station_id, original index); dropping the
# station level lets the assignment align on the frame's own index.
df["roll_mean_3"] = rolling_prev.mean().reset_index(level=0, drop=True)
df["roll_std_3"] = rolling_prev.std().reset_index(level=0, drop=True)

# Rows without a full three-period window have NaN statistics; remove them
df = df.dropna(subset=["roll_mean_3", "roll_std_3"]).copy()



In [9]:
# Model inputs: calendar features, lagged flows, rolling statistics
lag_features = [f"net_flow_lag_{lag}" for lag in lags]
feature_cols = ["hour", "dow", *lag_features, "roll_mean_3", "roll_std_3"]

X, y = df[feature_cols], df["target"]

print(X.shape, y.shape)


(17744644, 8) (17744644,)


In [10]:
from pathlib import Path 

# Write station/time identifiers, features and target to the modelling table
out_dir = Path('processed_data')
out_dir.mkdir(exist_ok=True)

model_columns = ['station_id', 'time_bucket', *feature_cols, 'target']
df_model = df[model_columns]
df_model.to_parquet(out_dir / 'net_flow_model_table.parquet', index=False)

## Output 
This notebook creates causal, station-level features and a next-period
classification target (shortage/surplus/balanced). The resulting modelling
table is saved as `net_flow_model_table.parquet` and used for forecasting
in the next notebook.
