In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso


In [2]:
# Columns used as model inputs and the column to predict.
FEATURES = ["delay_seconds_calc", "hour", "weekday"]
TARGET = "y_end_delay_calc"

# Load the processed dataset, order it by timestamp, and drop every row that
# is missing a feature or the target.
df = (
    pd.read_csv("../data/processed_dataset.csv", parse_dates=["vehicle_timestamp"])
    .sort_values("vehicle_timestamp")
    .dropna(subset=FEATURES + [TARGET])
    .copy()
)

# Time-based split: the cut points are the 70% and 85% quantiles of the
# timestamp column, so train < TRAIN_END <= val < VAL_END <= test.
timestamps = df["vehicle_timestamp"]
TRAIN_END = timestamps.quantile(0.70)
VAL_END   = timestamps.quantile(0.85)

in_train = timestamps < TRAIN_END
in_val   = (timestamps >= TRAIN_END) & (timestamps < VAL_END)
in_test  = timestamps >= VAL_END

train_df = df.loc[in_train].copy()
val_df   = df.loc[in_val].copy()
test_df  = df.loc[in_test].copy()

print("rows:", len(df))
print("min/max:", timestamps.min(), timestamps.max())
print("train/val/test:", len(train_df), len(val_df), len(test_df))


  df = pd.read_csv("../data/processed_dataset.csv", parse_dates=["vehicle_timestamp"])


rows: 6170467
min/max: 2025-11-10 03:16:27 2025-11-24 23:15:24
train/val/test: 4319310 925566 925591


In [3]:
def describe_cols(d, cols):
    """Summarise the columns ``cols`` of DataFrame ``d``, one row per column.

    Runs ``DataFrame.describe`` with the 1st, 5th, 50th, 95th and 99th
    percentiles, transposes the result so that every described column becomes
    a row, and returns the statistics in a fixed column order
    (count, mean, std, min, the five percentiles, max).
    """
    quantile_levels = [0.01, 0.05, 0.5, 0.95, 0.99]
    stat_order = ["count", "mean", "std", "min", "1%", "5%", "50%", "95%", "99%", "max"]
    summary = d[cols].describe(percentiles=quantile_levels)
    return summary.T[stat_order]

# Print a heading and show the percentile summary for: the target on the
# training split, the current-delay feature on the training split, and the
# target on the test split.
for heading, frame, columns in (
    ("TARGET stats (train):", train_df, [TARGET]),
    ("delay_seconds stats (train):", train_df, ["delay_seconds_calc"]),
    ("TARGET stats (test):", test_df, [TARGET]),
):
    print(heading)
    display(describe_cols(frame, columns))


TARGET stats (train):


Unnamed: 0,count,mean,std,min,1%,5%,50%,95%,99%,max
y_end_delay_calc,4319310.0,-3529.801658,338.198781,-19166.0,-5015.0,-3791.0,-3557.0,-3080.0,-2554.0,8915.0


delay_seconds stats (train):


Unnamed: 0,count,mean,std,min,1%,5%,50%,95%,99%,max
delay_seconds_calc,4319310.0,-3530.063876,429.180087,-19682.0,-4747.0,-3938.0,-3562.0,-3032.0,-2184.0,13316.0


TARGET stats (test):


Unnamed: 0,count,mean,std,min,1%,5%,50%,95%,99%,max
y_end_delay_calc,925591.0,-3560.709768,313.361869,-10683.0,-5097.0,-3811.0,-3576.0,-3171.0,-2673.0,3275.0


In [4]:
# Baseline 0: predict the training-set mean of the target for every test row.
y_test = test_df[TARGET].values.astype(np.float32)

# Constant prediction vector with the same shape as y_test.
pred_mean = np.full_like(y_test, fill_value=train_df[TARGET].mean(), dtype=np.float32)

mae_mean = mean_absolute_error(y_test, pred_mean)
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root keeps this cell working on every version.
rmse_mean = np.sqrt(mean_squared_error(y_test, pred_mean))

print("Baseline(mean)  MAE:", mae_mean, "RMSE:", rmse_mean)


Baseline(mean)  MAE: 169.79579 RMSE: 314.88232




In [5]:
# Baseline 0: predict the training-set mean of the target for every test row.
# NOTE(review): this cell recomputes the same baseline as the cell before it;
# one of the two can be dropped.
y_test = test_df[TARGET].values.astype(np.float32)

# Constant prediction vector with the same shape as y_test.
pred_mean = np.full_like(y_test, fill_value=train_df[TARGET].mean(), dtype=np.float32)

mae_mean = mean_absolute_error(y_test, pred_mean)
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root keeps this cell working on every version.
rmse_mean = np.sqrt(mean_squared_error(y_test, pred_mean))

print("Baseline(mean)  MAE:", mae_mean, "RMSE:", rmse_mean)


Baseline(mean)  MAE: 169.79579 RMSE: 314.88232




In [6]:
# Baseline 1: use the current delay (delay_seconds_calc) directly as the
# predicted end delay, and score it against the test target y_test.
pred_current = test_df["delay_seconds_calc"].values.astype(np.float32)

mae_cur = mean_absolute_error(y_test, pred_current)
# sqrt(MSE) instead of squared=False: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_cur = np.sqrt(mean_squared_error(y_test, pred_current))

print("Baseline(current_delay)  MAE:", mae_cur, "RMSE:", rmse_cur)


Baseline(current_delay)  MAE: 182.8283 RMSE: 387.02478




In [9]:
# Linear regression on the standardised FEATURES, evaluated on the validation
# and test splits.
X_train_raw = train_df[FEATURES].values
y_train = train_df[TARGET].values.astype(np.float32)

X_val_raw = val_df[FEATURES].values
y_val = val_df[TARGET].values.astype(np.float32)

X_test_raw = test_df[FEATURES].values
# Defined here as well (same expression as in the baseline cells) so this cell
# does not depend on a baseline cell having been executed first.
y_test = test_df[TARGET].values.astype(np.float32)

# Fit the scaler on the training split only, then apply the same transform to
# validation and test so no statistics from those splits enter the model.
x_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train_raw)
X_val   = x_scaler.transform(X_val_raw)
X_test  = x_scaler.transform(X_test_raw)

lr = LinearRegression()
lr.fit(X_train, y_train)

pred_val = lr.predict(X_val)
pred_test = lr.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
val_mae = mean_absolute_error(y_val, pred_val)
val_rmse = np.sqrt(mean_squared_error(y_val, pred_val))

test_mae = mean_absolute_error(y_test, pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, pred_test))

print("LinearRegression VAL  MAE:", val_mae, "RMSE:", val_rmse)
print("LinearRegression TEST MAE:", test_mae, "RMSE:", test_rmse)


LinearRegression VAL  MAE: 155.46814940184208 RMSE: 287.95881840810506
LinearRegression TEST MAE: 157.90903166053425 RMSE: 286.2267978545924




In [10]:
def _metrics_entry(model_type, mae, rmse):
    """Bundle one model's MAE/RMSE as plain Python floats with its type label."""
    return {
        "model_type": model_type,
        "mae": float(mae),
        "rmse": float(rmse)
    }


# Test-set metrics of every baseline, keyed by model name, plus a "meta" entry
# recording which features, target and timestamp column were used.
baseline_results = {
    "meta": {
        "features": FEATURES,
        "target": TARGET,
        "timestamp_col": "vehicle_timestamp"
    },
    "baseline_mean": _metrics_entry("statistical", mae_mean, rmse_mean),
    "baseline_current_delay": _metrics_entry("heuristic", mae_cur, rmse_cur),
    "linear_regression": _metrics_entry("linear", test_mae, test_rmse)
}


In [12]:
import json
from pathlib import Path
import pandas as pd

# Persist the baseline metrics: the full dict (including "meta") as JSON and a
# flat one-row-per-model table as CSV, then display the table.
out_dir = Path("../data/baselines")
# parents=True so the directory is created even when its parent is missing;
# without it mkdir raises FileNotFoundError in that case.
out_dir.mkdir(parents=True, exist_ok=True)

with open(out_dir / "baseline_results.json", "w") as f:
    json.dump(baseline_results, f, indent=2)

rows = []
for name, res in baseline_results.items():
    # "meta" has no model_type/mae/rmse keys, so it is left out of the table.
    if name == "meta":
        continue
    rows.append({
        "model": name,
        "model_type": res["model_type"],
        "MAE_seconds": res["mae"],
        "RMSE_seconds": res["rmse"]
    })

baseline_df = pd.DataFrame(rows)
baseline_df.to_csv(out_dir / "baseline_results.csv", index=False)

print("Saved baselines to:", out_dir)
display(baseline_df)


Saved baselines to: ../data/baselines


Unnamed: 0,model,model_type,MAE_seconds,RMSE_seconds
0,baseline_mean,statistical,169.795792,314.882324
1,baseline_current_delay,heuristic,182.828293,387.02478
2,linear_regression,linear,157.909032,286.226798
