In [None]:
import os
import time
from datetime import datetime, timezone

import pandas as pd
import requests

# HERE API key. Read from the HERE_API_KEY environment variable so the secret
# does not have to be stored in the notebook; falls back to the previous
# hard-coded empty string when the variable is not set.
API_KEY = os.environ.get("HERE_API_KEY", "")

# Query bounding box, ordered (west, south, east, north).
BBOX_MANHATTAN = (-74.03, 40.70, -73.90, 40.88)

# HERE Traffic Flow v7 endpoint
FLOW_URL = "https://data.traffic.hereapi.com/v7/flow"


In [None]:
def fetch_flow_snapshot(api_key, bbox):
    """Fetch one HERE Traffic Flow snapshot for ``bbox`` as a flat DataFrame.

    Args:
        api_key: HERE API key, sent as the ``apiKey`` query parameter.
        bbox: ``(west, south, east, north)`` tuple used to build the
            ``in=bbox:...`` filter.

    Returns:
        pandas.DataFrame with one row per entry of the response's ``results``
        list. All rows of one call share the same naive-UTC ``snapshot_time``,
        taken after the response has been parsed.

    Raises:
        RuntimeError: if the HTTP status code is not 200 (message carries the
            status code and response body).
    """
    west, south, east, north = bbox
    params = {
        "in": f"bbox:{west},{south},{east},{north}",
        "locationReferencing": "shape",
        "fields": "location,flow",
        "units": "metric",
        "apiKey": api_key,
    }

    resp = requests.get(FLOW_URL, params=params, timeout=30)
    if resp.status_code != 200:
        raise RuntimeError(f"HTTP {resp.status_code}: {resp.text}")

    # `or []` also covers a payload where "results" is present but null.
    results = resp.json().get("results") or []

    # datetime.utcnow() is deprecated; this produces the same naive UTC value.
    snapshot_time = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = []
    for item in results:
        # `or {}` (instead of a .get default) also handles keys that exist
        # with a null value, which would otherwise break the .get calls below.
        loc = item.get("location") or {}
        cur = item.get("currentFlow") or {}
        hist = item.get("historicSpeed") or item.get("historicFlow") or {}

        # NOTE(review): recorded runs show segment_id/polyline empty, so the
        # shape-referenced location may not carry "id"/"polyline" - confirm.
        polyline = loc.get("polyline")
        segment_id = loc.get("id") or polyline

        # The currentFlow objects seen in the raw payload carry the free-flow
        # speed under "freeFlow"; "freeFlowSpeed" is kept as a fallback.
        free_flow = cur.get("freeFlow")
        if free_flow is None:
            free_flow = cur.get("freeFlowSpeed")

        # NOTE(review): the *_kmh column names assume km/h, but the recorded
        # values look like m/s - confirm against the HERE v7 flow reference.
        rows.append({
            "snapshot_time": snapshot_time,
            "segment_id": segment_id,
            "polyline": polyline,
            "length_m": loc.get("length"),
            "primary_ccode": loc.get("primaryCountryCode"),
            "current_speed_kmh": cur.get("speed"),
            "current_free_flow_kmh": free_flow,
            "current_jam_factor": cur.get("jamFactor"),
            "current_confidence": cur.get("confidence"),
            "historic_speed_kmh": hist.get("speed"),
            "historic_free_flow_kmh": hist.get("freeFlowSpeed"),
        })

    return pd.DataFrame(rows)


In [None]:
# Smoke test: pull a single snapshot and preview the first rows.
test_df = fetch_flow_snapshot(api_key=API_KEY, bbox=BBOX_MANHATTAN)
print("Catched data:", test_df.shape[0])
test_df.head()


本次快照抓到的道路条数: 3763


  snapshot_time = datetime.utcnow()


Unnamed: 0,snapshot_time,segment_id,polyline,length_m,primary_ccode,current_speed_kmh,current_free_flow_kmh,current_jam_factor,current_confidence,historic_speed_kmh,historic_free_flow_kmh
0,2025-12-12 01:15:47.273920,,,602.0,,9.722222,,6.4,0.99,,
1,2025-12-12 01:15:47.273920,,,889.0,,1.666667,,9.1,0.99,,
2,2025-12-12 01:15:47.273920,,,359.0,,3.888889,,0.0,0.87,,
3,2025-12-12 01:15:47.273920,,,46.0,,9.444445,,0.0,0.88,,
4,2025-12-12 01:15:47.273920,,,141.0,,4.444445,,4.0,0.9,,


In [None]:
INTERVAL_SECONDS = 60  # pause between two requests, in seconds
NUM_SNAPSHOTS = 30     # how many snapshots to attempt

all_snapshots = []

# Poll the flow endpoint NUM_SNAPSHOTS times. A failed request is reported and
# skipped so a single bad call does not abort the whole collection run.
for attempt in range(1, NUM_SNAPSHOTS + 1):
    print(f"Catching {attempt}/{NUM_SNAPSHOTS} ")
    try:
        df_snapshot = fetch_flow_snapshot(API_KEY, BBOX_MANHATTAN)
        print(f"  Catched {len(df_snapshot)} lines of data")
        all_snapshots.append(df_snapshot)
    except Exception as e:
        print("  catch failed:", e)

    # No pause after the final snapshot.
    if attempt < NUM_SNAPSHOTS:
        print(f"  waiting for {INTERVAL_SECONDS} ")
        time.sleep(INTERVAL_SECONDS)

# Stack every successful snapshot; fall back to an empty frame if none worked.
df_all = pd.concat(all_snapshots, ignore_index=True) if all_snapshots else pd.DataFrame()


正在抓取第 1/30 次快照...


  snapshot_time = datetime.utcnow()  # 你也可以用 data.get("creationTime") 之类字段


  本次抓到 3770 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 2/30 次快照...
  本次抓到 3770 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 3/30 次快照...
  本次抓到 3770 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 4/30 次快照...
  本次抓到 3772 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 5/30 次快照...
  本次抓到 3774 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 6/30 次快照...
  本次抓到 3771 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 7/30 次快照...
  本次抓到 3771 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 8/30 次快照...
  本次抓到 3770 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 9/30 次快照...
  本次抓到 3768 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 10/30 次快照...
  本次抓到 3768 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 11/30 次快照...
  本次抓到 3770 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 12/30 次快照...
  本次抓到 3766 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 13/30 次快照...
  本次抓到 3767 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 14/30 次快照...
  本次抓到 3766 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 15/30 次快照...
  本次抓到 3765 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 16/30 次快照...
  本次抓到 3762 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 17/30 次快照...
  本次抓到 3762 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 18/30 次快照...
  本次抓到 3764 条路段数据
  等待 60 秒后抓取下一次...
正在抓取第 19/30 次快照...


In [None]:
print("catched data amount:", len(df_all))

# The original cell had a bare `df_all.head()` here; a bare expression that is
# not the last statement of a cell is never displayed, so it was removed.

# Persist the concatenated snapshots.
output_path = "/content/here_manhattan_30min_2.parquet"
df_all.to_parquet(output_path)
print("Saved:", output_path)


总共抓到行数: 113032
保存完成: /content/here_manhattan_30min_flow.parquet


In [None]:
def fetch_flow_snapshot_raw(api_key, bbox):
    """Fetch one HERE Traffic Flow snapshot and keep the nested objects as-is.

    Args:
        api_key: HERE API key, sent as the ``apiKey`` query parameter.
        bbox: ``(west, south, east, north)`` tuple used to build the
            ``in=bbox:...`` filter.

    Returns:
        pandas.DataFrame with one row per entry of the response's ``results``
        list and the columns ``snapshot_time``, ``location``, ``currentFlow``,
        ``historicFlow`` and ``historicSpeed``. The last four hold whatever the
        item carries under that key (``None`` when the key is absent).

    Raises:
        RuntimeError: if the HTTP status code is not 200.
    """
    west, south, east, north = bbox
    params = {
        "in": f"bbox:{west},{south},{east},{north}",
        "locationReferencing": "shape",
        "units": "metric",
        "apiKey": api_key,
    }

    resp = requests.get(FLOW_URL, params=params, timeout=30)
    print("HTTP status:", resp.status_code)
    if resp.status_code != 200:
        raise RuntimeError(f"HTTP {resp.status_code}: {resp.text}")

    # `or []` also covers a payload where "results" is present but null.
    results = resp.json().get("results") or []

    # datetime.utcnow() is deprecated; this produces the same naive UTC value.
    snapshot_time = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = [
        {
            "snapshot_time": snapshot_time,
            "location": item.get("location"),
            "currentFlow": item.get("currentFlow"),
            "historicFlow": item.get("historicFlow"),
            "historicSpeed": item.get("historicSpeed"),
        }
        for item in results
    ]

    return pd.DataFrame(rows)


In [None]:
# Smoke test for the raw fetcher: one snapshot, preview the nested columns.
df_raw = fetch_flow_snapshot_raw(api_key=API_KEY, bbox=BBOX_MANHATTAN)
print("Returned data:", df_raw.shape[0])
df_raw.head()


HTTP status: 200
本次快照返回条数: 3759


  snapshot_time = datetime.utcnow()


Unnamed: 0,snapshot_time,location,currentFlow,historicFlow,historicSpeed
0,2025-12-12 01:26:01.255379,"{'description': '3rd Ave/Exit 3', 'length': 60...","{'speed': 9.722222, 'speedUncapped': 9.722222,...",,
1,2025-12-12 01:26:01.255379,"{'description': '42nd St', 'length': 889.0, 's...","{'speed': 1.388889, 'speedUncapped': 1.388889,...",,
2,2025-12-12 01:26:01.255379,"{'description': 'Christopher St', 'length': 35...","{'speed': 3.888889, 'speedUncapped': 3.888889,...",,
3,2025-12-12 01:26:01.255379,"{'description': 'CR-17/Mt Vernon St', 'length'...","{'speed': 5.555556, 'speedUncapped': 5.555556,...",,
4,2025-12-12 01:26:01.255379,"{'description': '161st St', 'length': 141.0, '...","{'speed': 7.777778, 'speedUncapped': 7.777778,...",,




In [None]:
import time

INTERVAL_SECONDS = 60  # target spacing between the starts of two requests, in seconds
NUM_SNAPSHOTS = 30     # how many snapshots to attempt

all_snapshots = []

for i in range(NUM_SNAPSHOTS):
    started = time.monotonic()
    print(f"Catching {i+1}/{NUM_SNAPSHOTS} ")
    try:
        df_snap = fetch_flow_snapshot_raw(API_KEY, BBOX_MANHATTAN)
        print("Catched:", len(df_snap))
        all_snapshots.append(df_snap)
    except Exception as e:
        # Best effort: report the failure and keep collecting.
        print("Catch failed:", e)

    if i < NUM_SNAPSHOTS - 1:
        # Fixed: the original f-string ran the text and the number together.
        print(f"Waiting for {INTERVAL_SECONDS}\n")
        # Sleep only for what is left of the interval so that request latency
        # does not accumulate and push snapshots across minute boundaries
        # (timestamps are floored to one minute later in the notebook).
        elapsed = time.monotonic() - started
        time.sleep(max(0.0, INTERVAL_SECONDS - elapsed))

if all_snapshots:
    df_all = pd.concat(all_snapshots, ignore_index=True)
else:
    df_all = pd.DataFrame()

print("30min catch amount:", len(df_all))
df_all.head()


[正式] 抓取第 1/30 次快照...
HTTP status: 200
  本次返回条数: 3759
  等待 60 秒再抓下一次...



  snapshot_time = datetime.utcnow()


[正式] 抓取第 2/30 次快照...
HTTP status: 200
  本次返回条数: 3757
  等待 60 秒再抓下一次...

[正式] 抓取第 3/30 次快照...
HTTP status: 200
  本次返回条数: 3755
  等待 60 秒再抓下一次...

[正式] 抓取第 4/30 次快照...
HTTP status: 200
  本次返回条数: 3756
  等待 60 秒再抓下一次...

[正式] 抓取第 5/30 次快照...
HTTP status: 200
  本次返回条数: 3755
  等待 60 秒再抓下一次...

[正式] 抓取第 6/30 次快照...
HTTP status: 200
  本次返回条数: 3755
  等待 60 秒再抓下一次...

[正式] 抓取第 7/30 次快照...
HTTP status: 200
  本次返回条数: 3756
  等待 60 秒再抓下一次...

[正式] 抓取第 8/30 次快照...
HTTP status: 200
  本次返回条数: 3756
  等待 60 秒再抓下一次...

[正式] 抓取第 9/30 次快照...
HTTP status: 200
  本次返回条数: 3756
  等待 60 秒再抓下一次...

[正式] 抓取第 10/30 次快照...
HTTP status: 200
  本次返回条数: 3754
  等待 60 秒再抓下一次...

[正式] 抓取第 11/30 次快照...
HTTP status: 200
  本次返回条数: 3753
  等待 60 秒再抓下一次...

[正式] 抓取第 12/30 次快照...
HTTP status: 200
  本次返回条数: 3753
  等待 60 秒再抓下一次...

[正式] 抓取第 13/30 次快照...
HTTP status: 200
  本次返回条数: 3755
  等待 60 秒再抓下一次...

[正式] 抓取第 14/30 次快照...
HTTP status: 200
  本次返回条数: 3754
  等待 60 秒再抓下一次...

[正式] 抓取第 15/30 次快照...
HTTP status: 200
  本次返回条数: 3755
  等待 

Unnamed: 0,snapshot_time,location,currentFlow,historicFlow,historicSpeed
0,2025-12-12 01:30:50.544090,"{'description': '3rd Ave/Exit 3', 'length': 60...","{'speed': 10.555556, 'speedUncapped': 10.55555...",,
1,2025-12-12 01:30:50.544090,"{'description': '42nd St', 'length': 889.0, 's...","{'speed': 1.6666667, 'speedUncapped': 1.666666...",,
2,2025-12-12 01:30:50.544090,"{'description': 'Christopher St', 'length': 35...","{'speed': 3.6111112, 'speedUncapped': 3.611111...",,
3,2025-12-12 01:30:50.544090,"{'description': 'CR-17/Mt Vernon St', 'length'...","{'speed': 9.444445, 'speedUncapped': 9.444445,...",,
4,2025-12-12 01:30:50.544090,"{'description': '161st St', 'length': 141.0, '...","{'speed': 9.444445, 'speedUncapped': 9.444445,...",,


In [None]:
# Persist the concatenated raw snapshots (df_all) as parquet and report the path.
output_path = "/content/here_manhattan_30min_raw.parquet"
df_all.to_parquet(output_path)
print("Saved:", output_path)


保存完成: /content/here_manhattan_30min_raw.parquet


In [None]:
import pandas as pd

# Reload the raw snapshots from disk so the processing below can start here.
raw_path = "/content/here_manhattan_30min_raw.parquet"
df_raw = pd.read_parquet(raw_path)

n_raw_rows = df_raw.shape[0]
print("Raw data amount:", n_raw_rows)
df_raw.head()


原始数据行数: 112683


Unnamed: 0,snapshot_time,location,currentFlow,historicFlow,historicSpeed
0,2025-12-12 01:30:50.544090,"{'description': '3rd Ave/Exit 3', 'length': 60...","{'confidence': 0.99, 'freeFlow': 20.0, 'jamFac...",,
1,2025-12-12 01:30:50.544090,"{'description': '42nd St', 'length': 889.0, 's...","{'confidence': 0.99, 'freeFlow': 8.055555, 'ja...",,
2,2025-12-12 01:30:50.544090,"{'description': 'Christopher St', 'length': 35...","{'confidence': 0.87, 'freeFlow': 4.7222223, 'j...",,
3,2025-12-12 01:30:50.544090,"{'description': 'CR-17/Mt Vernon St', 'length'...","{'confidence': 0.89, 'freeFlow': 8.333334, 'ja...",,
4,2025-12-12 01:30:50.544090,"{'description': '161st St', 'length': 141.0, '...","{'confidence': 0.88, 'freeFlow': 9.722222, 'ja...",,


In [None]:
from shapely.geometry import LineString
import geopandas as gpd

def shape_to_linestring(shape_dict):
    """Convert a HERE ``shape`` object (``links[].points[]``) to a shapely LineString.

    The points of all links are concatenated in order as ``(lng, lat)`` pairs.
    Points missing either coordinate are skipped.

    Args:
        shape_dict: mapping with a ``"links"`` sequence; each link holds a
            ``"points"`` sequence of mappings with ``"lat"`` / ``"lng"``.

    Returns:
        A ``LineString``, or ``None`` when the input is empty, has no usable
        ``links``, or yields fewer than two valid points.
    """
    if not shape_dict or "links" not in shape_dict:
        return None

    # A key that is present but null (e.g. after a parquet round trip of
    # nested data) must be treated like a missing key instead of crashing.
    links = shape_dict["links"]
    if links is None:
        return None

    coords = []
    for link in links:
        if link is None:
            continue
        points = link.get("points")
        if points is None:
            continue
        for p in points:
            lat = p.get("lat")
            lng = p.get("lng")
            if lat is None or lng is None:
                continue
            coords.append((lng, lat))  # shapely uses (lon, lat)

    # A line needs at least two vertices.
    if len(coords) < 2:
        return None

    return LineString(coords)


# Build one geometry per row from location["shape"]; rows whose location is
# not a dict get None.
df_raw["geometry"] = [
    shape_to_linestring(loc.get("shape")) if isinstance(loc, dict) else None
    for loc in df_raw["location"]
]

# Keep only the rows for which a geometry could be built.
has_geometry = df_raw["geometry"].notna()
df_raw = df_raw[has_geometry].reset_index(drop=True)

print("Effective Road amounts:", len(df_raw))
df_raw.head()


有效道路数: 112683


Unnamed: 0,snapshot_time,location,currentFlow,historicFlow,historicSpeed,geometry
0,2025-12-12 01:30:50.544090,"{'description': '3rd Ave/Exit 3', 'length': 60...","{'confidence': 0.99, 'freeFlow': 20.0, 'jamFac...",,,"LINESTRING (-73.90271 40.84455, -73.90136 40.8..."
1,2025-12-12 01:30:50.544090,"{'description': '42nd St', 'length': 889.0, 's...","{'confidence': 0.99, 'freeFlow': 8.055555, 'ja...",,,"LINESTRING (-73.96714 40.75679, -73.96761 40.7..."
2,2025-12-12 01:30:50.544090,"{'description': 'Christopher St', 'length': 35...","{'confidence': 0.87, 'freeFlow': 4.7222223, 'j...",,,"LINESTRING (-74.00839 40.72966, -74.0082 40.73..."
3,2025-12-12 01:30:50.544090,"{'description': 'CR-17/Mt Vernon St', 'length'...","{'confidence': 0.89, 'freeFlow': 8.333334, 'ja...",,,"LINESTRING (-74.02515 40.85664, -74.02525 40.8..."
4,2025-12-12 01:30:50.544090,"{'description': '161st St', 'length': 141.0, '...","{'confidence': 0.88, 'freeFlow': 9.722222, 'ja...",,,"LINESTRING (-73.92772 40.83002, -73.92796 40.8..."


In [None]:
import json
import hashlib
import numpy as np

def to_builtin(obj):
    """Recursively replace numpy containers and scalars with plain Python values.

    Dicts and lists are rebuilt with converted members, numpy arrays become
    lists, numpy scalars become Python scalars; anything else is returned
    unchanged.
    """
    if isinstance(obj, dict):
        return {key: to_builtin(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return list(map(to_builtin, obj))
    if isinstance(obj, np.ndarray):
        # tolist() already unwraps numeric scalars; recurse for object arrays
        # whose members may themselves be dicts or arrays.
        return list(map(to_builtin, obj.tolist()))
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

def generate_segment_id(location_dict):
    """Derive a segment id from a location's ``shape``.

    The shape is converted to builtin types, serialised as key-sorted JSON and
    hashed with MD5; the hex digest is the id.

    Returns:
        The hex digest string, or ``None`` when ``location_dict`` is not a
        dict or has no ``"shape"`` value.
    """
    shape = location_dict.get("shape") if isinstance(location_dict, dict) else None
    if shape is None:
        return None

    # sort_keys makes the serialisation independent of dict key order.
    payload = json.dumps(to_builtin(shape), sort_keys=True).encode("utf-8")
    return hashlib.md5(payload).hexdigest()


In [None]:
# Hash every location's shape into a segment id and drop rows without one.
df_raw["segment_id"] = df_raw["location"].map(generate_segment_id)
df_raw = df_raw[df_raw["segment_id"].notna()]

n_unique_segments = df_raw["segment_id"].nunique()
print("Unique segment_id:", n_unique_segments)
df_raw.head()


唯一 segment_id 数量: 3771


Unnamed: 0,snapshot_time,location,currentFlow,historicFlow,historicSpeed,geometry,segment_id
0,2025-12-12 01:30:50.544090,"{'description': '3rd Ave/Exit 3', 'length': 60...","{'confidence': 0.99, 'freeFlow': 20.0, 'jamFac...",,,"LINESTRING (-73.90271 40.84455, -73.90136 40.8...",89ff6251df05c277f17f5ca83ec2aee3
1,2025-12-12 01:30:50.544090,"{'description': '42nd St', 'length': 889.0, 's...","{'confidence': 0.99, 'freeFlow': 8.055555, 'ja...",,,"LINESTRING (-73.96714 40.75679, -73.96761 40.7...",ecb7eed5645ff558ec0ed0257e9c4068
2,2025-12-12 01:30:50.544090,"{'description': 'Christopher St', 'length': 35...","{'confidence': 0.87, 'freeFlow': 4.7222223, 'j...",,,"LINESTRING (-74.00839 40.72966, -74.0082 40.73...",fa29e445b0865fd4ea44852689544af1
3,2025-12-12 01:30:50.544090,"{'description': 'CR-17/Mt Vernon St', 'length'...","{'confidence': 0.89, 'freeFlow': 8.333334, 'ja...",,,"LINESTRING (-74.02515 40.85664, -74.02525 40.8...",e647745ab4b6ea1691ad0778b94bbd0c
4,2025-12-12 01:30:50.544090,"{'description': '161st St', 'length': 141.0, '...","{'confidence': 0.88, 'freeFlow': 9.722222, 'ja...",,,"LINESTRING (-73.92772 40.83002, -73.92796 40.8...",3f5686b388bda9cb823973a43fc323af


In [None]:
def extract_speed(flow):
    """Return ``flow["speed"]`` for a dict, otherwise ``None`` (also when the key is missing)."""
    return flow.get("speed") if isinstance(flow, dict) else None

# Pull the current speed out of each flow object and drop rows without one.
# NOTE(review): the column is named speed_kmh but the value is stored exactly
# as returned by the API - confirm the unit.
df_raw["speed_kmh"] = df_raw["currentFlow"].apply(extract_speed)
df_raw = df_raw.dropna(subset=["speed_kmh"])

# Snap each snapshot time down to the start of its minute.
df_raw["timestamp"] = pd.to_datetime(df_raw["snapshot_time"]).dt.floor("1min")

# Long-format series table: source column -> output column.
speed_columns = {
    "segment_id": "item_id",
    "timestamp": "timestamp",
    "speed_kmh": "target",
}
df_speed = df_raw[list(speed_columns)].rename(columns=speed_columns)
df_speed = df_speed.sort_values(["item_id", "timestamp"])


In [None]:
# Preview the series table, count the distinct segment ids still present in
# df_raw, and show the table's (rows, columns) shape as the cell result.
print(df_speed.head())
print("Unique segment_id:", df_raw["segment_id"].nunique())
df_speed.shape

                                item_id           timestamp    target
2549   000a1634cb8a6a7d169310433661d62a 2025-12-12 01:30:00  5.555556
6307   000a1634cb8a6a7d169310433661d62a 2025-12-12 01:31:00  6.388889
10062  000a1634cb8a6a7d169310433661d62a 2025-12-12 01:32:00  5.000000
13817  000a1634cb8a6a7d169310433661d62a 2025-12-12 01:33:00  5.277778
17572  000a1634cb8a6a7d169310433661d62a 2025-12-12 01:34:00  5.000000
唯一 segment_id 数量: 3762


(112403, 3)

In [None]:
import geopandas as gpd

# One geometry per segment id (the first occurrence wins).
roads_unique = df_raw[["segment_id", "geometry"]].drop_duplicates(subset="segment_id")

gdf_roads = gpd.GeoDataFrame(roads_unique, geometry="geometry", crs="EPSG:4326")
gdf_roads.to_file("/content/here_roads_clean.geojson", driver="GeoJSON")

# The speed series is saved next to the road geometries.
df_speed.to_parquet("/content/here_speed_timeseries.parquet")


In [None]:
!pip install osmnx

Collecting osmnx
  Downloading osmnx-2.0.7-py3-none-any.whl.metadata (4.9 kB)
Downloading osmnx-2.0.7-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: osmnx
Successfully installed osmnx-2.0.7


In [None]:
import geopandas as gpd
import osmnx as ox
import pandas as pd

here_path = "/content/here_roads_clean.geojson"

# Read the exported segments and keep only complete (id, geometry) rows.
here_gdf = gpd.read_file(here_path)[["segment_id", "geometry"]]
here_gdf = here_gdf.dropna(subset=["segment_id", "geometry"])

# Make sure the layer is in EPSG:4326: tag it when no CRS is recorded,
# reproject it otherwise.
here_gdf = (
    here_gdf.set_crs(epsg=4326)
    if here_gdf.crs is None
    else here_gdf.to_crs(epsg=4326)
)

n_here_segments = len(here_gdf)
print("HERE segments:", n_here_segments, "unique:", here_gdf["segment_id"].nunique())

def load_manhattan_graph():
    """Download the Manhattan drive network via osmnx and return the graph
    after passing it through ``add_edge_speeds`` and ``add_edge_travel_times``."""
    graph = ox.graph_from_place(
        "Manhattan, New York City, New York, USA",
        network_type="drive",
    )
    return ox.add_edge_travel_times(ox.add_edge_speeds(graph))

G = load_manhattan_graph()

# Edge table with (u, v, key) as ordinary columns; rows without geometry are dropped.
edges_gdf = ox.graph_to_gdfs(G, nodes=False, edges=True).reset_index()
edges_gdf = edges_gdf[["u", "v", "key", "geometry"]]
edges_gdf = edges_gdf.dropna(subset=["geometry"])

# Same CRS normalisation as for the HERE layer.
edges_gdf = (
    edges_gdf.set_crs(epsg=4326)
    if edges_gdf.crs is None
    else edges_gdf.to_crs(epsg=4326)
)

n_osm_edges = len(edges_gdf)
print("OSM edges:", n_osm_edges)
edges_gdf.head()


HERE segments: 3762 unique: 3762
OSM edges: 9934


Unnamed: 0,u,v,key,geometry
0,42421728,42435337,0,"LINESTRING (-73.96004 40.79805, -73.96011 40.7..."
1,42421728,42421731,0,"LINESTRING (-73.96004 40.79805, -73.96017 40.7..."
2,42421728,42432736,0,"LINESTRING (-73.96004 40.79805, -73.95997 40.7..."
3,42421731,42437916,0,"LINESTRING (-73.96147 40.79865, -73.96154 40.7..."
4,42421731,42432737,0,"LINESTRING (-73.96147 40.79865, -73.9614 40.79..."


In [None]:
# Reproject both layers into the same CRS before matching them spatially.
# NOTE(review): EPSG:2263 is, as far as I know, defined in US survey feet, so
# distances measured in this CRS would be feet rather than metres - confirm
# before trusting any downstream distance column named "*_m".
TARGET_CRS = "EPSG:2263"

here_proj = here_gdf.to_crs(TARGET_CRS)
edges_proj = edges_gdf.to_crs(TARGET_CRS)

# Sanity check: both layers should report the same CRS.
print("HERE CRS:", here_proj.crs)
print("OSM CRS:", edges_proj.crs)


HERE CRS: EPSG:2263
OSM CRS: EPSG:2263


In [None]:
from geopandas import sjoin_nearest

# Largest acceptable gap between a HERE segment and its OSM edge, in metres.
# Without a cap every HERE segment is forced onto *some* edge, however far
# away; the HERE bounding box is larger than the OSM place polygon, so many
# segments ended up thousands of units from their "match".
MAX_MATCH_DISTANCE_M = 30.0

# Metres per CRS unit (1.0 for metric CRSs, about 0.3048 for foot-based ones).
# Used both to express the cap in CRS units and to make `dist_m` real metres.
crs_unit_to_m = here_proj.crs.axis_info[0].unit_conversion_factor

# how="inner" drops HERE segments with no edge inside the cap. With the cap in
# place, how="left" would keep them with NaN in u/v/key and turn those
# integer id columns into floats.
matched = sjoin_nearest(
    here_proj,
    edges_proj,
    how="inner",
    max_distance=MAX_MATCH_DISTANCE_M / crs_unit_to_m,
    distance_col="dist_m",
)
matched["dist_m"] = matched["dist_m"] * crs_unit_to_m

# Equidistant edges yield one row each, so there can be several rows per segment.
print("Matched lines:", len(matched))
matched.head()


匹配结果行数: 12335


Unnamed: 0,segment_id,geometry,index_right,u,v,key,dist_m
0,89ff6251df05c277f17f5ca83ec2aee3,"LINESTRING (1011168.154 246984.695, 1011541.73...",7043,60925389,1764424345,0,6812.059184
1,ecb7eed5645ff558ec0ed0257e9c4068,"LINESTRING (993353.686 214997.411, 993223.564 ...",5034,42442955,42452882,0,0.0
1,ecb7eed5645ff558ec0ed0257e9c4068,"LINESTRING (993353.686 214997.411, 993223.564 ...",3538,42436714,42436710,0,0.0
1,ecb7eed5645ff558ec0ed0257e9c4068,"LINESTRING (993353.686 214997.411, 993223.564 ...",5018,42442937,11337295608,0,0.0
1,ecb7eed5645ff558ec0ed0257e9c4068,"LINESTRING (993353.686 214997.411, 993223.564 ...",9747,11337295608,42442937,0,0.0


In [None]:
# Discard rows without a matched edge, then order each segment's candidates
# from nearest to farthest.
matched = (
    matched
    .dropna(subset=["u", "v", "key"])
    .copy()
    .sort_values(["segment_id", "dist_m"])
)

# First row per segment = its nearest candidate.
matched_unique = matched.drop_duplicates(subset=["segment_id"], keep="first")

n_here_total = here_gdf["segment_id"].nunique()
n_here_matched = matched_unique["segment_id"].nunique()
print("HERE segment amount:", n_here_total)
print("OSM edge 的 HERE segments:", n_here_matched)

print(matched_unique["dist_m"].describe())


HERE segment 总数: 3762
成功匹配到 OSM edge 的 HERE segments: 3762
count     3762.000000
mean      3412.080603
std       5210.164239
min          0.000000
25%          0.000000
50%          1.123160
75%       5222.055131
max      22084.793029
Name: dist_m, dtype: float64


In [None]:
final_matched = matched_unique

# {segment_id: {"u": ..., "v": ..., "key": ...}}
edge_columns = final_matched.set_index("segment_id")[["u", "v", "key"]]
segment_to_edge = edge_columns.to_dict(orient="index")

# Show one entry as a sanity check.
some_seg = next(iter(segment_to_edge))
print("sample segment_id:", some_seg)
print("OSM edge:", segment_to_edge[some_seg])


示例 segment_id: 000a1634cb8a6a7d169310433661d62a
对应 OSM edge: {'u': 8904913030, 'v': 42421785, 'key': 0}


In [None]:
# Tabular form of the segment -> edge mapping, including the match distance.
mapping_columns = ["segment_id", "u", "v", "key", "dist_m"]
map_df = final_matched[mapping_columns].copy()

map_path = "/content/here_segment_to_osm_edge.parquet"
map_df.to_parquet(map_path)
print("Sheet saved:", map_path)
map_df.head()


映射表已保存: /content/here_segment_to_osm_edge.parquet


Unnamed: 0,segment_id,u,v,key,dist_m
2533,000a1634cb8a6a7d169310433661d62a,8904913030,42421785,0,0.0
3680,00201e4a8e9785f26e9d6f39a54b4c51,11728146527,42429390,0,0.0
1346,0020a2bd140373be227d8fade90014f8,60927650,60927825,0,0.0
1741,002d980671f300434d7c7835c06e3b4f,42437368,42445001,0,0.0
2054,00454b474fac97b60255b264e0cddd82,8677891126,6166333257,0,0.0


In [None]:
!pip install "autogluon.timeseries[chronos]"


Collecting autogluon.timeseries[chronos]
  Downloading autogluon.timeseries-1.4.0-py3-none-any.whl.metadata (12 kB)
Collecting torch<2.8,>=2.2 (from autogluon.timeseries[chronos])
  Downloading torch-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting lightning<2.8,>=2.2 (from autogluon.timeseries[chronos])
  Downloading lightning-2.6.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytorch-lightning (from autogluon.timeseries[chronos])
  Downloading pytorch_lightning-2.6.0-py3-none-any.whl.metadata (21 kB)
Collecting transformers<4.50,>=4.38.0 (from transformers[sentencepiece]<4.50,>=4.38.0->autogluon.timeseries[chronos])
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting gluonts<0.17,>=0.15.0 (fro

In [None]:
# Write the speed time series to the parquet path that the next cell reads back.
df_speed.to_parquet("/content/here_speed_timeseries.parquet")


In [None]:
import pandas as pd

# Reload the long-format speed series (item_id, timestamp, target).
ts_path = "/content/here_speed_timeseries.parquet"
df_speed = pd.read_parquet(ts_path)

print(df_speed.head(), df_speed.shape, sep="\n")
n_items = df_speed["item_id"].nunique()
print("Unique item_id:", n_items)


                                item_id           timestamp    target
2549   000a1634cb8a6a7d169310433661d62a 2025-12-12 01:30:00  5.555556
6307   000a1634cb8a6a7d169310433661d62a 2025-12-12 01:31:00  6.388889
10062  000a1634cb8a6a7d169310433661d62a 2025-12-12 01:32:00  5.000000
13817  000a1634cb8a6a7d169310433661d62a 2025-12-12 01:33:00  5.277778
17572  000a1634cb8a6a7d169310433661d62a 2025-12-12 01:34:00  5.000000
(112403, 3)
不同 item_id 数量: 3762


In [None]:
# Rows without a target value are of no use for training.
df_speed = df_speed[df_speed["target"].notna()].copy()

# Keep only the series that have at least 10 observations.
counts = df_speed["item_id"].value_counts()
valid_ids = counts.loc[counts.ge(10)].index
df_speed = (
    df_speed.loc[df_speed["item_id"].isin(valid_ids)]
    .copy()
    .sort_values(["item_id", "timestamp"])
    .reset_index(drop=True)
)

print("Filtered line:", df_speed.shape[0])
print("Filtered item_id amount:", df_speed["item_id"].nunique())


过滤后行数: 112316
过滤后 item_id 数量: 3742


In [None]:
from autogluon.timeseries import TimeSeriesDataFrame

# Wrap the long-format table in AutoGluon's TimeSeriesDataFrame, telling it
# which columns hold the series id and the timestamp.
ts_data = TimeSeriesDataFrame.from_data_frame(
    df_speed,
    id_column="item_id",
    timestamp_column="timestamp"
)

# Preview the resulting frame.
ts_data.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
000a1634cb8a6a7d169310433661d62a,2025-12-12 01:30:00,5.555556
000a1634cb8a6a7d169310433661d62a,2025-12-12 01:31:00,6.388889
000a1634cb8a6a7d169310433661d62a,2025-12-12 01:32:00,5.0
000a1634cb8a6a7d169310433661d62a,2025-12-12 01:33:00,5.277778
000a1634cb8a6a7d169310433661d62a,2025-12-12 01:34:00,5.0


In [None]:
# Forecast settings handed to TimeSeriesPredictor in the training cell.
PREDICTION_LENGTH = 3  # passed as prediction_length
FREQ = "1min"          # passed as freq


In [None]:
# Try to pin AutoGluon and the transformers/peft versions used for fine-tuning.
# NOTE(review): the recorded training log reports "AutoGluon Version:  1.4.0",
# so the 1.1.1 pin does not appear to have taken effect on this runtime -
# confirm, then drop or update the pin.
!pip install "autogluon.timeseries[chronos]==1.1.1" -q
!pip install "transformers==4.41.2" "peft==0.10.0" -q


[31mERROR: Ignored the following versions that require a different python version: 0.5.0 Requires-Python >=3.7,<3.10; 0.5.0b20220623 Requires-Python >=3.7,<3.10; 0.5.0rc1 Requires-Python >=3.7,<3.10; 0.5.1 Requires-Python >=3.7,<3.10; 0.5.1b20220624 Requires-Python >=3.7,<3.10; 0.5.1b20220625 Requires-Python >=3.7,<3.10; 0.5.1b20220626 Requires-Python >=3.7,<3.10; 0.5.1b20220627 Requires-Python >=3.7,<3.10; 0.5.1b20220628 Requires-Python >=3.7,<3.10; 0.5.1b20220629 Requires-Python >=3.7,<3.10; 0.5.1b20220630 Requires-Python >=3.7,<3.10; 0.5.1b20220701 Requires-Python >=3.7,<3.10; 0.5.1b20220702 Requires-Python >=3.7,<3.10; 0.5.1b20220703 Requires-Python >=3.7,<3.10; 0.5.1b20220704 Requires-Python >=3.7,<3.10; 0.5.1b20220705 Requires-Python >=3.7,<3.10; 0.5.1b20220706 Requires-Python >=3.7,<3.10; 0.5.1b20220707 Requires-Python >=3.7,<3.10; 0.5.1b20220708 Requires-Python >=3.7,<3.10; 0.5.1b20220709 Requires-Python >=3.7,<3.10; 0.5.1b20220710 Requires-Python >=3.7,<3.10; 0.5.1b20220711 R

In [None]:
import transformers, peft
# Report the versions of the two libraries pinned above.
for _lib in (transformers, peft):
    print(f"{_lib.__name__} version:", _lib.__version__)


transformers version: 4.41.2
peft version: 0.10.0


In [None]:
from autogluon.timeseries import TimeSeriesPredictor
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

TIME_LIMIT = 600  # handed to fit() as time_limit

# Settings for the single Chronos model that is fine-tuned.
hyperparameters = {
    "Chronos": dict(
        model_path="bolt_tiny",
        fine_tune=True,
        fine_tune_steps=200,
        fine_tune_lr=1e-5,
        device=device,
    )
}

# Build the predictor and fit it in one expression; fit() hands back the
# predictor, which is what ends up bound to `predictor`.
predictor = TimeSeriesPredictor(
    prediction_length=PREDICTION_LENGTH,
    freq=FREQ,
    path="/content/here_chronos_bolt_tiny",
).fit(
    train_data=ts_data,
    hyperparameters=hyperparameters,
    time_limit=TIME_LIMIT,
)


Frequency '1min' stored as 'min'
Beginning AutoGluon training... Time limit = 600s
AutoGluon will save models to '/content/here_chronos_bolt_tiny'
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
GPU Count:          1
Memory Avail:       10.69 GB / 12.67 GB (84.4%)
Disk Space Avail:   190.71 GB / 235.68 GB (80.9%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'min',
 'hyperparameters': {'Chronos': {'device': 'cuda',
                                 'fine_tune': True,
                                 'fine_tune_lr': 1e-05,
                                 'fine_tune_steps': 200,
                                 'model_path': 'bolt_tiny'}},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 3,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n

使用设备: cuda


Provided train_data has 115746 rows (NaN fraction=3.2%), 3742 time series. Median time series length is 31 (min=10, max=31). 

Provided data contains following columns:
	target: 'target'

AutoGluon will gauge predictive performance using evaluation metric: 'WQL'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2025-12-12 03:21:53
Models that will be trained: ['Chronos[bolt_tiny]']
Training timeseries model Chronos[bolt_tiny]. Training for up to 591.1s of the 591.1s of remaining time.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access p

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

	Saving fine-tuned model to /content/here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/W0/fine-tuned-ckpt
	-0.0675       = Validation score (-WQL)
	23.80   s     = Training runtime
	5.65    s     = Validation (prediction) runtime
Not fitting ensemble as only 1 model was trained.
Training complete. Models trained: ['Chronos[bolt_tiny]']
Total runtime: 29.48 s
Best model: Chronos[bolt_tiny]
Best model score: -0.0675


In [None]:
# Same filter as used before training: no missing targets, and only series
# with at least 10 observations.
df_speed = df_speed[df_speed["target"].notna()].copy()

counts = df_speed["item_id"].value_counts()
valid_ids = counts.loc[counts.ge(10)].index

df_speed = df_speed.loc[df_speed["item_id"].isin(valid_ids)].copy()


In [None]:
# Distinct item ids (as strings) that survived the filtering.
valid_segments = set(df_speed["item_id"].astype(str))

n_valid_segments = len(valid_segments)
print("Training using segment amount:", n_valid_segments)


训练中使用的 segment 数量: 3742


In [None]:
# Reload the segment -> OSM edge mapping table.
map_df = pd.read_parquet("/content/here_segment_to_osm_edge.parquet")
print("Converted lines:", map_df.shape[0])
map_df.head()


原始映射行数: 3762


Unnamed: 0,segment_id,u,v,key,dist_m
2533,000a1634cb8a6a7d169310433661d62a,8904913030,42421785,0,0.0
3680,00201e4a8e9785f26e9d6f39a54b4c51,11728146527,42429390,0,0.0
1346,0020a2bd140373be227d8fade90014f8,60927650,60927825,0,0.0
1741,002d980671f300434d7c7835c06e3b4f,42437368,42445001,0,0.0
2054,00454b474fac97b60255b264e0cddd82,8677891126,6166333257,0,0.0


In [None]:
# Keep only the mapping rows whose segment id is among the valid segments.
in_training = map_df["segment_id"].astype(str).isin(valid_segments)
map_df_filtered = map_df.loc[in_training].copy()

print("Converted lines amount:", map_df_filtered.shape[0])
print("Filtered converted segment_id:", map_df_filtered["segment_id"].nunique())


过滤后映射行数: 3742
过滤后映射中 segment_id 数量: 3742


In [None]:
# {segment_id: {"u": ..., "v": ..., "key": ...}} for the filtered mapping.
edge_columns = map_df_filtered.set_index("segment_id")[["u", "v", "key"]]
segment_to_edge = edge_columns.to_dict(orient="index")

print("Final dictionary size:", len(segment_to_edge))

# Show one entry as a sanity check.
some_seg = next(iter(segment_to_edge))
print("Sample segment_id:", some_seg)
print("Correlated OSM edge:", segment_to_edge[some_seg])


最终映射字典大小: 3742
示例 segment_id: 000a1634cb8a6a7d169310433661d62a
对应 OSM edge: {'u': 8904913030, 'v': 42421785, 'key': 0}


In [None]:
# Persist the filtered segment -> edge mapping as parquet and report the path.
filtered_map_path = "/content/here_segment_to_osm_edge_filtered.parquet"
map_df_filtered.to_parquet(filtered_map_path)
print("filtered map saved:", filtered_map_path)


过滤后的映射表已保存到: /content/here_segment_to_osm_edge_filtered.parquet


In [None]:
# Archive the predictor directory into a single zip file.
# NOTE(review): both paths are relative, so this presumably relies on the
# working directory being /content, where the predictor was saved - confirm.
!zip -r here_chronos_bolt_tiny.zip here_chronos_bolt_tiny

  adding: here_chronos_bolt_tiny/ (stored 0%)
  adding: here_chronos_bolt_tiny/utils/ (stored 0%)
  adding: here_chronos_bolt_tiny/utils/data/ (stored 0%)
  adding: here_chronos_bolt_tiny/utils/data/train.pkl (deflated 87%)
  adding: here_chronos_bolt_tiny/version.txt (stored 0%)
  adding: here_chronos_bolt_tiny/models/ (stored 0%)
  adding: here_chronos_bolt_tiny/models/trainer.pkl (deflated 45%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/ (stored 0%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/utils/ (stored 0%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/utils/oof.pkl (deflated 28%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/model.pkl (deflated 49%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/W0/ (stored 0%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/W0/model.pkl (deflated 43%)
  adding: here_chronos_bolt_tiny/models/Chronos[bolt_tiny]/W0/fine-tuned-ckpt/ (stored 0%)
  adding: here_chronos_b

In [None]:
pip install pandas pyarrow scikit-learn joblib



In [None]:
#Forest
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
import joblib

# Parquet file with the (item_id, timestamp, target) series read by main().
DATA_PATH = "here_speed_timeseries.parquet"
# Number of past values fed to the model as features.
W = 5
# Number of future values the model predicts.
H = 3
# Where main() dumps the fitted model together with W and H.
MODEL_OUT = "rf_speed_model.joblib"
# Where main() writes the last W values of every series.
STATE_OUT = "rf_last_window.parquet"
# NOTE(review): not referenced anywhere in the visible code - no meta file is
# written by main(); confirm whether that is intended.
META_OUT = "rf_meta.json"

def build_supervised(df: pd.DataFrame, W: int, H: int) -> tuple[np.ndarray, np.ndarray]:
    """Turn the long-format series table into sliding-window training arrays.

    Rows are ordered by (item_id, timestamp). Within each item every run of
    ``W + H`` consecutive rows becomes one sample: the first ``W`` targets are
    the features and the following ``H`` targets are the labels. Windows that
    contain a non-finite value are skipped, as are items with fewer than
    ``W + H`` rows. Consecutive rows are used as consecutive steps; timestamps
    are only used for ordering.

    Returns:
        ``(X, Y)`` as float32 arrays of shape ``(n_samples, W)`` and
        ``(n_samples, H)``; both have zero rows when nothing qualifies.
    """
    ordered = df.sort_values(["item_id", "timestamp"]).reset_index(drop=True)
    span = W + H

    feature_blocks, label_blocks = [], []
    for _, group in ordered.groupby("item_id", sort=False):
        series = group["target"].to_numpy(dtype=np.float32)
        if series.size < span:
            continue

        # One row per window of length W + H, in time order.
        windows = np.lib.stride_tricks.sliding_window_view(series, span)
        windows = windows[np.isfinite(windows).all(axis=1)]

        feature_blocks.append(windows[:, :W])
        label_blocks.append(windows[:, W:])

    if feature_blocks:
        X = np.vstack(feature_blocks)
        Y = np.vstack(label_blocks)
    else:
        X = np.zeros((0, W), dtype=np.float32)
        Y = np.zeros((0, H), dtype=np.float32)
    return X, Y

def build_last_window_state(df: pd.DataFrame, W: int) -> pd.DataFrame:
    """Collect the most recent ``W`` target values of every item.

    Rows are ordered by (item_id, timestamp); items with fewer than ``W`` rows
    are left out.

    Returns:
        DataFrame with one row per kept item: ``item_id`` (as str) plus
        ``lag_1`` .. ``lag_W``, where ``lag_1`` is the oldest and ``lag_W`` the
        newest of the last ``W`` values.
    """
    ordered = df.sort_values(["item_id", "timestamp"]).reset_index(drop=True)

    records = []
    for item_id, group in ordered.groupby("item_id", sort=False):
        values = group["target"].to_numpy(dtype=np.float32)
        if values.size < W:
            continue

        tail = values[-W:]
        record = {"item_id": str(item_id)}
        record.update(
            (f"lag_{pos}", float(value))
            for pos, value in zip(range(1, W + 1), tail)
        )
        records.append(record)

    return pd.DataFrame(records)

def main():
    """Train the random-forest speed model and save it with its inference state.

    Reads the series table from DATA_PATH, coerces the column types, drops
    incomplete rows and non-positive targets, builds (W -> H) sliding-window
    samples, fits one random forest per output step, dumps the model together
    with W and H to MODEL_OUT and writes every item's last W values to
    STATE_OUT.

    Raises:
        RuntimeError: if no training sample could be built.
    """
    frame = pd.read_parquet(DATA_PATH)
    frame = frame.assign(
        item_id=frame["item_id"].astype(str),
        timestamp=pd.to_datetime(frame["timestamp"]),
        # Unparseable targets become NaN and are removed just below.
        target=pd.to_numeric(frame["target"], errors="coerce"),
    )
    frame = frame.dropna(subset=["item_id", "timestamp", "target"])
    frame = frame.loc[frame["target"] > 0].copy()

    print("Loaded df:", frame.shape, "unique items:", frame["item_id"].nunique())

    X, Y = build_supervised(frame, W=W, H=H)
    print("Supervised X:", X.shape, "Y:", Y.shape)
    if len(X) == 0:
        raise RuntimeError("No training sample generated")

    # One independent forest per forecast step.
    model = MultiOutputRegressor(
        RandomForestRegressor(
            n_estimators=200,
            random_state=0,
            n_jobs=-1,
            max_depth=None,
        ),
        n_jobs=-1,
    )

    print("Training RF...")
    model.fit(X, Y)
    print("Training done.")

    # W and H are stored with the model so a loader knows the window sizes.
    bundle = {"model": model, "W": W, "H": H}
    joblib.dump(bundle, MODEL_OUT)
    print("Saved model to:", MODEL_OUT)

    state = build_last_window_state(frame, W=W)
    state.to_parquet(STATE_OUT, index=False)
    print("Saved last-window state to:", STATE_OUT, "rows:", len(state))

# Script-style entry point: run the training pipeline when this code is executed as __main__.
if __name__ == "__main__":
    main()


Loaded df: (112403, 3) unique items: 3762
Supervised X: (86122, 5) Y: (86122, 3)
Training RF...
Training done.
Saved model to: rf_speed_model.joblib
Saved last-window state to: rf_last_window.parquet rows: 3755
