In [1]:
# scripts/tri_compare_predictions.py
from __future__ import annotations
import os, json, time, argparse, threading
from pathlib import Path
from typing import Iterable, Tuple
import sys
import numpy as np
import pandas as pd
import requests
import joblib

# Put the repository root on sys.path so the project imports below can resolve.
try:
    # Executed as a .py file: the root is the parent of the directory holding this file.
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # Executed in Jupyter: `__file__` is undefined, so use the parent of the working directory.
    ROOT = Path().resolve().parents[0]

_root_entry = str(ROOT)
if _root_entry not in sys.path:
    # Front of the list, so this checkout is searched before later sys.path entries.
    sys.path.insert(0, _root_entry)

# Import from your package (assumes `pip install -e .`)
from traffic_flow.service.app import create_app
from traffic_flow.service.runtime import InferenceRuntime
from traffic_flow.pipeline.data_pipeline_orchestrator import TrafficDataPipelineOrchestrator
from traffic_flow.inference.prediction_protocol import make_prediction_frame
from traffic_flow.evaluation.model_comparison import ModelEvaluator

In [2]:
# ---------------- Build test RAW rows with same cleaning params ----------------

def load_artifact(artifact: str | Path) -> dict:
    b = joblib.load(artifact)
    return {"bundle": b, "states": b["states"], "horizon": int(b.get("horizon", 15))}

def make_orchestrator_from_states(raw_path: str | Path, states: dict) -> TrafficDataPipelineOrchestrator:
    """Create an orchestrator for ``raw_path`` and run its base-feature step.

    The cleaning arguments are read from ``states["clean_state"]`` (keys
    ``smoothing_window``, ``relative_threshold`` and ``use_median``).

    Parameters
    ----------
    raw_path : str | Path
        Raw data file; passed to the orchestrator as ``str(raw_path)``.
    states : dict
        Mapping that contains a ``"clean_state"`` entry.

    Returns
    -------
    TrafficDataPipelineOrchestrator
        The orchestrator after ``prepare_base_features`` has been called on it.
    """
    clean_state = states["clean_state"]
    orchestrator = TrafficDataPipelineOrchestrator(
        file_path=str(raw_path),
        sensor_encoding_type="mean",
    )
    base_feature_kwargs = dict(
        window_size=clean_state["smoothing_window"],
        filter_extreme_changes=True,
        smooth_speeds=True,
        relative_threshold=clean_state["relative_threshold"],
        use_median_instead_of_mean_smoothing=clean_state["use_median"],
    )
    orchestrator.prepare_base_features(**base_feature_kwargs)
    return orchestrator

def get_raw_test(raw_path: str | Path, states: dict) -> pd.DataFrame:
    tdp = make_orchestrator_from_states(raw_path, states)
    raw = pd.read_parquet(raw_path)
    raw = raw.drop(columns=[c for c in BAD_WEATHER_COLS if c in raw.columns])  # drop problematic cols
    test = raw.loc[raw["date"] >= tdp.first_test_timestamp].copy()
    test.sort_values(["date","sensor_id"], kind="mergesort", inplace=True)
    return test,tdp

In [3]:
# Notebook settings (adapt as needed).

# Input files: the serialized pipeline bundle and the raw parquet data.
artifact_path = Path("../../artifacts/traffic_pipeline_h-15.joblib")
raw_path = Path("../../data/NDW/ndw_three_weeks.parquet")

# Service settings — presumably for comparing against a running prediction service;
# none of the cells shown here read them (TODO confirm they are still needed).
url = "http://127.0.0.1:8080"
start_server = True

# Numeric knobs.
batch_size = 20_000
tolerance = 1e-6

# Output switch.
save_outputs = False

In [4]:
# Load the serialized bundle and unpack the fitted states and the forecast horizon.
art = load_artifact(artifact_path)
states, horizon = art["states"], art["horizon"]
# NOTE(review): this replaces the `batch_size` assigned in the paths/settings cell;
# everything below uses this value. Keep the two in sync or drop one of them.
batch_size = 600000
clean = states["clean_state"]  # same object as art['states']['clean_state']

# 1) Rebuild training split & X/y using same params; keep df (full).
#    Orchestrator construction and prepare_base_features are delegated to
#    make_orchestrator_from_states so the cleaning arguments are defined once.
tdp = make_orchestrator_from_states(raw_path, states)
tdp.finalise_for_horizon(horizon=horizon, drop_datetime=False)  # keep 'date' visible
X_train, y_train = tdp.X_train, tdp.y_train

# Keep only the first `batch_size` test rows (positional slice).
# NOTE(review): a positional cut can end part-way through a timestamp, while the
# `date <= last_timestamp` filter below keeps every row of that timestamp — confirm
# this is intended when comparing row counts against other prediction paths.
X_test = tdp.X_test.iloc[:batch_size]
y_test = tdp.y_test.iloc[:batch_size]
print(f"X_test columns len before resorting them: {len(X_test.columns)}")
print(f"len X_test before resorting: {len(X_test)}")
last_timestamp = X_test['date'].max()

# Filter first and copy once: the evaluator receives an independent frame, and the
# full-size intermediate copy of tdp.df is avoided.
df_all = tdp.df.loc[tdp.df['date'] <= last_timestamp].copy()

# Select the columns listed in the bundle's `feature_cols`, in that order. The explicit
# copy detaches the selection from the sliced frame, which should avoid pandas'
# SettingWithCopyWarning when the evaluator later assigns columns.
X_test = X_test[art['bundle']["feature_cols"]].copy()

print(f"X_test columns len after resorting them: {len(X_test.columns)}")
print(f"len X_test after resorting: {len(X_test)}")

# 2) Evaluator + canonical preds (test only)
me = ModelEvaluator(
    X_test=X_test,
    df_for_ML=df_all,       # evaluator will internally take test_set
    y_train=y_train,
    y_test=y_test,
    target_is_gman_error_prediction=False,
    y_is_normalized=False,
    rounding=6,
)
model = art["bundle"]["model"]
pred_df = me.to_canonical_predictions(model=model, states=states, horizon_min=horizon)
offline_results = me.evaluate_model_from_path(saved_model=model)




Running prepare_base_features!!!!!!!!!!!!!!!!
[MeanSensorEncoder] Mean encoding learned for 204 sensors. Global mean=93.85.
[AdjacentSensorFeatureAdder] Adding adjacent sensor features.
[AdjacentSensorFeatureAdder] Added features: downstream_sensor_1, upstream_sensor_1




[PreviousWeekdayWindowFeatureEngineer] horizon=15′  window=[-0,+0]′ step=1′  aggs=-  mode=local
[WeatherFeatureDropper] Will drop ['incremental_id', 'Per_cent_frozen_precipitation_surface', 'Precipitable_water_entire_atmosphere_single_layer', 'Precipitation_rate_surface_3_Hour_Average', 'Storm_relative_helicity_height_above_ground_layer', 'Total_precipitation_surface_3_Hour_Accumulation', 'Categorical_Rain_surface_3_Hour_Average', 'Categorical_Freezing_Rain_surface_3_Hour_Average', 'Categorical_Ice_Pellets_surface_3_Hour_Average', 'Categorical_Snow_surface_3_Hour_Average', 'Convective_Precipitation_Rate_surface_3_Hour_Average', 'Convective_precipitation_surface_3_Hour_Accumulation', 'U-Component_Storm_Motion_height_above_ground_layer', 'V-Component_Storm_Motion_height_above_ground_layer', 'Geopotential_height_highest_tropospheric_freezing', 'Relative_humidity_highest_tropospheric_freezing', 'Ice_cover_surface', 'Snow_depth_surface', 'Water_equivalent_of_accumulated_snow_depth_surface',

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_predictions['y_pred'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df_for_ML['y_pred'] = self.y_pred


In [7]:
# Show the canonical predictions returned by me.to_canonical_predictions(...).
pred_df

Unnamed: 0,sensor_id,input_time,prediction_time,y_pred_delta,horizon,y_pred_total,y_act_total
0,RWS01_MONIBAS_0040vwe0633ra,2023-03-15 00:00:00,2023-03-15 00:15:00,-0.348393,15,86.451610,91.080002
1,RWS01_MONIBAS_0040vwe0637ra,2023-03-15 00:00:00,2023-03-15 00:15:00,-5.848662,15,91.351335,93.760002
2,RWS01_MONIBAS_0040vwe0755ra,2023-03-15 00:00:00,2023-03-15 00:15:00,7.703918,15,94.213920,92.078003
3,RWS01_MONIBAS_0040vwe0757ra,2023-03-15 00:00:00,2023-03-15 00:15:00,2.079455,15,88.079455,90.788002
4,RWS01_MONIBAS_0040vwe0758ra,2023-03-15 00:00:00,2023-03-15 00:15:00,1.761346,15,91.901345,95.225998
...,...,...,...,...,...,...,...
599995,RWS01_MONIBAS_0041hrr0611rb,2023-03-17 01:01:00,2023-03-17 01:16:00,0.336931,15,97.616930,93.599998
599996,RWS01_MONIBAS_0041hrr0613ra,2023-03-17 01:01:00,2023-03-17 01:16:00,2.894517,15,96.604516,93.641998
599997,RWS01_MONIBAS_0041hrr0613rb,2023-03-17 01:01:00,2023-03-17 01:16:00,-0.008139,15,96.551858,91.599998
599998,RWS01_MONIBAS_0041hrr0615ra,2023-03-17 01:01:00,2023-03-17 01:16:00,2.647310,15,98.205309,95.517998


In [13]:
# Show the orchestrator's first test timestamp (used below as the lower bound of the raw test window).
tdp.first_test_timestamp

Timestamp('2023-03-15 00:00:00')

In [18]:
# Predict with InferenceRuntime on the raw rows that fall inside the offline test window.
raw_df_test = pd.read_parquet(raw_path)
# Inclusive on both ends: first_test_timestamp <= date <= last_timestamp.
in_window = raw_df_test['date'].between(tdp.first_test_timestamp, last_timestamp)
raw_df_test = raw_df_test.loc[in_window]

rt = InferenceRuntime(artifact_path)
# predict_df returns a pair; only the first element is used here.
pred_df_local, _ = rt.predict_df(raw_df_test)
pred_df_local

[PreviousWeekdayWindowFeatureEngineer] horizon=15′  window=[-0,+0]′ step=1′  aggs=-  mode=local
[AdjacentSensorFeatureAdder] Adding adjacent sensor features.
[AdjacentSensorFeatureAdder] Added features: downstream_sensor_1, upstream_sensor_1


Unnamed: 0,sensor_id,input_time,prediction_time,y_pred_delta,horizon,y_pred_total
0,RWS01_MONIBAS_0040vwe0633ra,2023-03-15 00:00:00,2023-03-15 00:15:00,6.834698,15,93.634701
1,RWS01_MONIBAS_0040vwe0637ra,2023-03-15 00:00:00,2023-03-15 00:15:00,6.468986,15,103.668983
2,RWS01_MONIBAS_0040vwe0755ra,2023-03-15 00:00:00,2023-03-15 00:15:00,3.320071,15,89.830074
3,RWS01_MONIBAS_0040vwe0757ra,2023-03-15 00:00:00,2023-03-15 00:15:00,1.946928,15,87.946928
4,RWS01_MONIBAS_0040vwe0758ra,2023-03-15 00:00:00,2023-03-15 00:15:00,2.287275,15,92.427275
...,...,...,...,...,...,...
600163,RWS01_MONIBAS_0201hrr0461ra,2023-03-17 01:01:00,2023-03-17 01:16:00,-2.365949,15,102.894053
600164,RWS01_MONIBAS_0201hrr0465ra,2023-03-17 01:01:00,2023-03-17 01:16:00,-1.121641,15,106.040361
600165,RWS01_MONIBAS_0201hrr0470ra,2023-03-17 01:01:00,2023-03-17 01:16:00,-5.010976,15,101.537020
600166,RWS01_MONIBAS_0201hrr0475ra,2023-03-17 01:01:00,2023-03-17 01:16:00,-3.083875,15,102.144130


In [None]:
# Compare row counts of the offline (evaluator) and runtime prediction frames.
# A bare `len()` raises TypeError and would stop a Restart-and-Run-All.
len(pred_df), len(pred_df_local)

In [6]:
# Inspect the evaluator's df_for_ML frame; `me` only exists after the evaluator cell has run.
me.df_for_ML

NameError: name 'me' is not defined

In [12]:
# Show the result returned by me.evaluate_model_from_path(...).
offline_results

{'metrics': {'MAE': 4.60136,
  'Median_AE': 2.241791,
  'RMSE': 9.285774,
  'MAPE': 7.752077,
  'SMAPE': 6.208843,
  'inference_time': 0.486934,
  'inference_time_per_sample': 1e-06},
 'metrics_std': {'MAE_std': 8.064352,
  'Median_AE_std': 8.064352,
  'RMSE_std': 406.359192,
  'MAPE_std': 29.625842,
  'SMAPE_std': 13.987556},
 'naive_metrics': {'Naive_MAE': 4.875232,
  'Naive_Median_AE': 2.120003,
  'Naive_RMSE': 9.939258,
  'Naive_MAPE': 7.624079,
  'Naive_SMAPE': 6.706569},
 'naive_metrics_std': {'Naive_MAE_std': 8.660919,
  'Naive_Median_AE_std': 8.660919,
  'Naive_RMSE_std': 440.95224,
  'Naive_MAPE_std': 27.074996,
  'Naive_SMAPE_std': 15.148398}}

In [13]:
# Show the canonical offline predictions again.
pred_df

Unnamed: 0,sensor_id,input_time,prediction_time,y_pred_delta,horizon,y_pred_total
0,RWS01_MONIBAS_0040vwe0633ra,2023-03-15 00:00:00,2023-03-15 00:15:00,-0.348393,15,86.451610
1,RWS01_MONIBAS_0040vwe0637ra,2023-03-15 00:00:00,2023-03-15 00:15:00,-5.848662,15,91.351335
2,RWS01_MONIBAS_0040vwe0755ra,2023-03-15 00:00:00,2023-03-15 00:15:00,7.703918,15,94.213920
3,RWS01_MONIBAS_0040vwe0757ra,2023-03-15 00:00:00,2023-03-15 00:15:00,2.079455,15,88.079455
4,RWS01_MONIBAS_0040vwe0758ra,2023-03-15 00:00:00,2023-03-15 00:15:00,1.761346,15,91.901345
...,...,...,...,...,...,...
599995,RWS01_MONIBAS_0041hrr0611rb,2023-03-17 01:01:00,2023-03-17 01:16:00,0.336931,15,97.616930
599996,RWS01_MONIBAS_0041hrr0613ra,2023-03-17 01:01:00,2023-03-17 01:16:00,2.894517,15,96.604516
599997,RWS01_MONIBAS_0041hrr0613rb,2023-03-17 01:01:00,2023-03-17 01:16:00,-0.008139,15,96.551858
599998,RWS01_MONIBAS_0041hrr0615ra,2023-03-17 01:01:00,2023-03-17 01:16:00,2.647310,15,98.205309


In [11]:
# Show the test targets after truncation to the first `batch_size` rows.
y_test

4112640    4.279999
4112641   -3.439995
4112642    5.568001
4112643    4.788002
4112644    5.085999
             ...   
4712635   -3.680000
4712636   -0.068001
4712637   -4.959999
4712638   -0.040001
4712639   -7.360001
Name: target, Length: 600000, dtype: float32