In [None]:
!pip install pandas numpy joblib pyarrow scikit-learn networkx shapely



In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells below open their data files by bare relative name,
# not under /content/drive -- confirm the working directory holds those files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Random forest speed forecaster (lightweight version)
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
import joblib

# Input time series; main() reads its item_id, timestamp and target columns.
DATA_PATH = "here_speed_timeseries.parquet"
# Number of consecutive past target values used as features (lag window).
W = 5
# Number of future target values predicted from each window (horizon, in steps).
H = 3
# Where the trained bundle {"model", "W", "H"} is written with joblib.
MODEL_OUT = "rf_speed_model_light.joblib"
# Where each item's most recent W target values are written.
STATE_OUT = "rf_last_window.parquet"

def build_supervised(df: pd.DataFrame, W: int, H: int):
    """Turn per-item time series into sliding-window training pairs.

    For every ``item_id`` the ``target`` values are ordered by ``timestamp``;
    each run of ``W + H`` consecutive values yields one sample whose features
    are the first ``W`` values and whose labels are the following ``H``
    values. Samples containing any non-finite value are dropped, and items
    with fewer than ``W + H`` rows contribute nothing.

    Rows are treated as consecutive steps purely by their sort order; gaps
    between timestamps are not detected.

    Args:
        df: Frame with ``item_id``, ``timestamp`` and ``target`` columns.
        W: Number of lagged values per feature row.
        H: Number of future values per label row.

    Returns:
        ``(X, Y)`` as float32 arrays of shape ``(n_samples, W)`` and
        ``(n_samples, H)``, ordered by item and then by time. When no item
        yields a sample both arrays have zero rows instead of raising.
    """
    span = W + H
    ordered = df.sort_values(["item_id", "timestamp"]).reset_index(drop=True)

    blocks = []
    for _, group in ordered.groupby("item_id", sort=False):
        series = group["target"].to_numpy(dtype=np.float32)
        if len(series) < span:
            continue

        # One row per start position: columns [0, W) are the lags and
        # columns [W, W + H) the values to predict.
        candidates = np.lib.stride_tricks.sliding_window_view(series, span)
        keep = np.isfinite(candidates).all(axis=1)
        if keep.any():
            blocks.append(candidates[keep])

    if not blocks:
        # Stacking an empty list raises; return correctly shaped empties so
        # callers can test len(X) instead of catching an exception.
        return (
            np.empty((0, W), dtype=np.float32),
            np.empty((0, H), dtype=np.float32),
        )

    stacked = np.concatenate(blocks, axis=0)
    X = np.ascontiguousarray(stacked[:, :W])
    Y = np.ascontiguousarray(stacked[:, W:])
    return X, Y

def build_last_window_state(df: pd.DataFrame, W: int):
    """Collect the most recent ``W`` target values of every item.

    Args:
        df: Frame with ``item_id``, ``timestamp`` and ``target`` columns.
        W: Number of trailing values to keep per item.

    Returns:
        DataFrame with one row per item that has at least ``W`` rows. Columns
        are ``item_id`` (as ``str``) followed by ``lag_1`` .. ``lag_W``, where
        ``lag_1`` is the oldest and ``lag_W`` the newest value. The columns
        are present even when no item qualifies, so the saved file always has
        the schema its readers index by.
    """
    lag_cols = [f"lag_{i + 1}" for i in range(W)]
    ordered = df.sort_values(["item_id", "timestamp"]).reset_index(drop=True)

    rows = []
    for item_id, group in ordered.groupby("item_id", sort=False):
        series = group["target"].to_numpy(dtype=np.float32)
        if len(series) < W:
            continue
        row = {"item_id": str(item_id)}
        row.update(zip(lag_cols, (float(value) for value in series[-W:])))
        rows.append(row)

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["item_id", *lag_cols])

def main():
    """Train the lightweight random-forest forecaster and save its artifacts.

    Reads ``DATA_PATH``, keeps rows with a non-missing ``item_id``, a valid
    ``timestamp`` and a strictly positive numeric ``target``, builds sliding
    windows of ``W`` lags -> ``H`` future values, fits the multi-output
    random forest, and writes the model bundle to ``MODEL_OUT`` and each
    item's latest window to ``STATE_OUT``.

    Raises:
        ValueError: If no training window can be built from the data.
    """
    raw = pd.read_parquet(DATA_PATH)
    df = (
        raw
        # Drop missing ids *before* the string cast: astype(str) would turn
        # them into literal "nan"/"None" ids that dropna can no longer see.
        .dropna(subset=["item_id"])
        .assign(
            item_id=lambda d: d["item_id"].astype(str),
            timestamp=lambda d: pd.to_datetime(d["timestamp"]),
            target=lambda d: pd.to_numeric(d["target"], errors="coerce"),
        )
        .dropna(subset=["timestamp", "target"])
    )
    # Non-positive readings are discarded along with missing ones.
    df = df[df["target"] > 0]

    print("Loaded df:", df.shape, "unique items:", df["item_id"].nunique())

    X, Y = build_supervised(df, W=W, H=H)
    if len(X) == 0:
        raise ValueError(
            f"No training windows: no item has {W + H} consecutive finite targets"
        )
    print("Supervised X:", X.shape, "Y:", Y.shape)

    base = RandomForestRegressor(
        n_estimators=80,
        max_depth=10,
        min_samples_leaf=50,
        max_features="sqrt",
        random_state=42,
        n_jobs=1
    )

    # MultiOutputRegressor fits one copy of `base` per forecast step.
    model = MultiOutputRegressor(base, n_jobs=1)

    print("Training lightweight RF...")
    model.fit(X, Y)
    print("Training done.")

    # W and H travel with the model so inference rebuilds identical windows.
    joblib.dump({"model": model, "W": W, "H": H}, MODEL_OUT)
    print("Saved model to:", MODEL_OUT)

    state = build_last_window_state(df, W=W)
    state.to_parquet(STATE_OUT, index=False)
    print("Saved last-window state:", len(state))

# Entry point: run the training pipeline when executed as the main module.
if __name__ == "__main__":
    main()


Loaded df: (112403, 3) unique items: 3762
Supervised X: (86122, 5) Y: (86122, 3)
Training lightweight RF...
Training done.
Saved model to: rf_speed_model_light.joblib
Saved last-window state: 3755


In [None]:
import pickle
from datetime import datetime
import networkx as nx

# ------------------ paste function start ------------------
def traffic_light_delay(u, v, data):
    """Return the extra delay added to an edge's travel time for traffic lights.

    Placeholder: every edge currently gets no penalty, whatever its endpoints
    ``u``/``v`` or its attribute dict ``data`` are.
    """
    no_delay = 0
    return no_delay

# Fallback speed (mph) used when an edge has no "speed" attribute.
DEFAULT_SPEED_MPH = 25.0

def ml_predict_future_graph(
    G_in: nx.MultiDiGraph,
    current_time: datetime,
    minutes_ahead: int = 30
) -> nx.MultiDiGraph:
    """Return a copy of ``G_in`` whose mapped edges carry RF-forecast speeds.

    On the first call the model bundle (``rf_speed_model_light.joblib``), the
    per-segment lag windows (``rf_last_window.parquet``) and the
    segment-to-edge mapping (``here_segment_to_osm_edge_filtered.parquet``)
    are loaded from the working directory and cached as attributes on this
    function.

    For every mapped segment whose ``(u, v, key)`` edge exists in the graph a
    window of ``W`` speeds is built: the stored lags with the newest slot
    replaced by the edge's current speed, or the current speed repeated when
    no usable lags exist. The window is rolled forward ``n_steps`` model
    steps; the final forecast becomes the edge's ``speed`` and its
    ``travel_time`` is recomputed from ``length``.

    All segments are forecast together with one batched ``predict`` call per
    rollout iteration instead of one call per segment. Current speeds are all
    read before any edge is updated.

    Args:
        G_in: Graph whose edges may carry ``speed`` (mph) and ``length``
            attributes. Edits are made on ``G_in.copy()``.
        current_time: Currently unused; kept for interface compatibility.
        minutes_ahead: Forecast horizon. Rounded to the nearest integer
            (minimum 1) and used directly as the number of model steps, i.e.
            one step is treated as one minute -- TODO confirm this matches
            the sampling interval of the training data.

    Returns:
        The copied graph with ``speed`` (mph, at least 1.0) and
        ``travel_time`` rewritten on every mapped edge; other edges are
        left as they were.
    """
    import numpy as np

    if not hasattr(ml_predict_future_graph, "_loaded"):
        import pandas as pd
        import joblib

        bundle = joblib.load("rf_speed_model_light.joblib")

        ml_predict_future_graph._rf_model = bundle["model"]
        ml_predict_future_graph._W = int(bundle["W"])
        ml_predict_future_graph._H = int(bundle["H"])

        state = pd.read_parquet("rf_last_window.parquet")
        state["item_id"] = state["item_id"].astype(str)
        ml_predict_future_graph._rf_state = state.set_index("item_id")

        map_df = pd.read_parquet("here_segment_to_osm_edge_filtered.parquet")
        map_df["segment_id"] = map_df["segment_id"].astype(str)
        ml_predict_future_graph._segment_to_edge = (
            map_df.set_index("segment_id")[["u", "v", "key"]].to_dict("index")
        )

        # Set last so a failed load is retried on the next call.
        ml_predict_future_graph._loaded = True

    rf_model = ml_predict_future_graph._rf_model
    W = ml_predict_future_graph._W
    H = ml_predict_future_graph._H
    rf_state = ml_predict_future_graph._rf_state
    segment_to_edge = ml_predict_future_graph._segment_to_edge

    G_pred = G_in.copy()

    # The horizon in minutes is used as the step count, never fewer than one.
    n_steps = max(int(round(float(minutes_ahead))), 1)

    lag_cols = [f"lag_{j + 1}" for j in range(W)]
    total = len(segment_to_edge)

    # ---- Pass 1: one starting window per mapped edge present in the graph.
    edge_refs = []   # (u, v, edge attribute dict)
    windows = []     # (W,) float32 lag window, oldest -> newest
    start_kmh = []   # current speed, used until a valid forecast appears

    for seg_id, info in segment_to_edge.items():
        u, v, k = info["u"], info["v"], info["key"]
        if not G_pred.has_edge(u, v, k):
            continue

        data = G_pred[u][v][k]
        cur_speed_kmh = float(data.get("speed", DEFAULT_SPEED_MPH)) * 1.609344
        if not np.isfinite(cur_speed_kmh) or cur_speed_kmh <= 0:
            cur_speed_kmh = 40.0  # fallback when the edge speed is unusable

        window = None
        if seg_id in rf_state.index:
            try:
                lags = rf_state.loc[seg_id, lag_cols].to_numpy(
                    dtype=np.float32, copy=True
                )
                # Exact shape test also rejects the 2-D result produced by a
                # duplicated segment id in the state index.
                if lags.shape == (W,) and np.all(np.isfinite(lags)):
                    lags[-1] = cur_speed_kmh
                    window = lags
            except (KeyError, IndexError, TypeError, ValueError):
                window = None
        if window is None:
            # No usable history: assume the current speed held throughout.
            window = np.full((W,), cur_speed_kmh, dtype=np.float32)

        edge_refs.append((u, v, data))
        windows.append(window)
        start_kmh.append(cur_speed_kmh)

    print(f"[RF] {len(edge_refs)}/{total} segments processed")

    if not edge_refs:
        # Nothing to forecast; predict() would reject an empty batch.
        return G_pred

    # ---- Pass 2: batched rollout, all segments advance together.
    window_mat = np.vstack(windows).astype(np.float32, copy=False)  # (N, W)
    last_pred_kmh = np.asarray(start_kmh, dtype=np.float64)         # (N,)

    remaining = n_steps
    while remaining > 0:
        y = np.asarray(rf_model.predict(window_mat), dtype=np.float32)  # (N, H)
        if y.ndim != 2 or y.shape[1] < 1:
            break

        take = min(H, remaining, y.shape[1])
        for j in range(take):
            step = y[:, j].astype(np.float64)
            usable = np.isfinite(step) & (step > 0)
            # Invalid forecasts repeat the previous value for that segment.
            last_pred_kmh = np.where(usable, step, last_pred_kmh)
            window_mat = np.concatenate(
                [window_mat[:, 1:], last_pred_kmh.astype(np.float32)[:, None]],
                axis=1,
            )

        remaining -= take

    # ---- Pass 3: write forecasts back onto the copied graph.
    for (u, v, data), kmh in zip(edge_refs, last_pred_kmh.tolist()):
        pred_mph = max(kmh * 0.621371, 1.0)
        data["speed"] = float(pred_mph)

        length_m = data.get("length", 1.0)
        # Floor at 1 m/s so very slow forecasts cannot blow up travel time.
        speed_m_s = max(pred_mph * 0.44704, 1.0)
        base_time = float(length_m / speed_m_s)
        tl_penalty = traffic_light_delay(u, v, data)
        data["travel_time"] = base_time + tl_penalty

    return G_pred
# ------------------ paste function end ------------------


def main():
    """Smoke-test ``ml_predict_future_graph`` on the pickled Manhattan graph.

    Loads ``manhattan_graph.gpickle``, fills in missing ``speed`` and
    ``travel_time`` edge attributes, prints the first five edges before and
    after forecasting 7.8 minutes ahead, and reports how many edges ended up
    with a different ``speed``.
    """
    from datetime import timezone
    from itertools import islice

    def show_sample(graph):
        # islice avoids materialising every edge just to print the first five.
        for u, v, k, d in islice(graph.edges(keys=True, data=True), 5):
            print((u, v, k), "speed(mph)=", d.get("speed"), "travel_time(s)=", d.get("travel_time"))

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # open graph files that come from a trusted source.
    with open("manhattan_graph.gpickle", "rb") as f:
        G = pickle.load(f)

    # Same default as the forecaster, so baseline and forecast agree.
    for u, v, k, data in G.edges(keys=True, data=True):
        if "speed" not in data:
            data["speed"] = DEFAULT_SPEED_MPH
        if "travel_time" not in data:
            length_m = data.get("length", 1.0)
            speed_m_s = max(data["speed"] * 0.44704, 1.0)
            data["travel_time"] = float(length_m / speed_m_s)

    print("\n=== BEFORE ===")
    show_sample(G)

    # datetime.utcnow() is deprecated; take an aware UTC timestamp instead.
    now = datetime.now(timezone.utc)
    t = 7.8
    G2 = ml_predict_future_graph(G, now, minutes_ahead=t)

    print("\n=== AFTER (t=%.2f min) ===" % t)
    show_sample(G2)

    changed = 0
    total = 0
    for u, v, k, before in G.edges(keys=True, data=True):
        after = G2.get_edge_data(u, v, k)
        if after is None:
            continue
        total += 1
        if before.get("speed") != after.get("speed"):
            changed += 1

    print("\nTotal edges checked:", total)
    print("Edges with changed speed:", changed)
    print("Done.")


# Entry point: run the forecast smoke test when executed as the main module.
if __name__ == "__main__":
    main()



=== BEFORE ===
(42421728, 42435337, 0) speed(mph)= 25.0 travel_time(s)= 7.636161094365019
(42421728, 42421731, 0) speed(mph)= 25.0 travel_time(s)= 12.350566807024558
(42421728, 42432736, 0) speed(mph)= 25.0 travel_time(s)= 7.718150179116524
(42421731, 42437916, 0) speed(mph)= 25.0 travel_time(s)= 7.708410966615876
(42421731, 42432737, 0) speed(mph)= 25.0 travel_time(s)= 7.692265833780549


  now = datetime.utcnow()


[RF] 0/3742 segments processed
[RF] 200/3742 segments processed
[RF] 400/3742 segments processed
[RF] 600/3742 segments processed
[RF] 800/3742 segments processed
[RF] 1000/3742 segments processed
[RF] 1200/3742 segments processed
[RF] 1400/3742 segments processed
[RF] 1600/3742 segments processed
[RF] 1800/3742 segments processed
[RF] 2000/3742 segments processed
[RF] 2200/3742 segments processed
[RF] 2400/3742 segments processed
[RF] 2600/3742 segments processed
[RF] 2800/3742 segments processed
[RF] 3000/3742 segments processed
[RF] 3200/3742 segments processed
[RF] 3400/3742 segments processed
[RF] 3600/3742 segments processed

=== AFTER (t=7.80 min) ===
(42421728, 42435337, 0) speed(mph)= 25.0 travel_time(s)= 7.636161094365019
(42421728, 42421731, 0) speed(mph)= 25.0 travel_time(s)= 12.350566807024558
(42421728, 42432736, 0) speed(mph)= 25.0 travel_time(s)= 7.718150179116524
(42421731, 42437916, 0) speed(mph)= 25.0 travel_time(s)= 7.708410966615876
(42421731, 42432737, 0) speed(mp