In [None]:
!pip install -q sodapy pandas pyarrow
!pip install -q "autogluon.timeseries~=1.0.0"

In [None]:
from sodapy import Socrata
import pandas as pd
from datetime import datetime, timedelta, timezone

# --- Source dataset -------------------------------------------------------
DOMAIN = "data.cityofnewyork.us"
DATASET_ID = "i4gi-tjb9"   # DOT-Traffic-Speeds-NBE

# No app token: the client is created without credentials.
APP_TOKEN = None

# --- Query window: the last 30 days ---------------------------------------
# `datetime.utcnow()` is deprecated (Python 3.12+); an aware UTC "now" formats
# to exactly the same offset-free string with the pattern below.
start_dt = datetime.now(timezone.utc) - timedelta(days=30)
START_DATE_STR = start_dt.strftime("%Y-%m-%dT%H:%M:%S")

where_clause = (
    f"data_as_of >= '{START_DATE_STR}' "
    f"AND borough = 'Manhattan'"
)

print("SoQL WHERE limit:", where_clause)

client = Socrata(DOMAIN, APP_TOKEN, timeout=60)

# --- Paged download -------------------------------------------------------
limit = 50000   # rows requested per page
offset = 0      # index of the first row of the next page
records = []

while True:
    batch = client.get(
        DATASET_ID,
        where=where_clause,
        limit=limit,
        offset=offset,
        # Many rows share the same `data_as_of`; the `:id` tie-breaker keeps the
        # ordering stable so pages neither overlap nor skip rows.
        order="data_as_of, :id",
    )
    if not batch:
        break
    records.extend(batch)
    offset += limit
    # Report the rows actually received; `offset` over-counts on a short page.
    print(f"Fetched {len(records)} rows so far...")
    if len(batch) < limit:
        # A short page is the last page; skip the extra empty round-trip.
        break

df = pd.DataFrame.from_records(records)
print("Total rows (Manhattan, last 30 days):", len(df))
df.head()


  start_dt = datetime.utcnow() - timedelta(days=30)


SoQL WHERE 条件： data_as_of >= '2025-11-11T21:38:23' AND borough = 'Manhattan'


KeyboardInterrupt: 

In [None]:
# Stop early when the download produced nothing to work with.
if df.empty:
    raise ValueError("API error")

# Columns kept for the rest of the notebook, in display order.
keep_columns = [
    "data_as_of",
    "link_id",
    "speed",
    "travel_time",
    "link_points",
    "borough",
    "link_name",
]

# Parse the timestamp, coerce the two measurements to numbers (unparseable
# values become NaN), keep the columns above and order rows chronologically.
df = (
    df.assign(
        data_as_of=pd.to_datetime(df["data_as_of"]),
        speed=pd.to_numeric(df["speed"], errors="coerce"),
        travel_time=pd.to_numeric(df["travel_time"], errors="coerce"),
    )[keep_columns]
    .copy()
    .sort_values("data_as_of")
)

df.head()


Unnamed: 0,data_as_of,link_id,speed,travel_time,link_points,borough,link_name
0,2025-11-11 19:29:03,4456452,33.55,72,"40.8011005,-73.92846 40.80151,-73.93066 40.801...",Manhattan,TBB W - FDR S MANHATTAN TRUSS - E116TH STREET
13,2025-11-11 19:29:03,4620343,0.0,0,"40.77149,-73.99423 40.7719,-73.99401 40.77481,...",Manhattan,Westside Hwy N 57th St - GWB
12,2025-11-11 19:29:03,4616346,0.0,0,"40.8500304,-73.944831 40.8492,-73.945241 40.84...",Manhattan,Westside Hwy S GWB - 57th St
11,2025-11-11 19:29:03,4616342,10.56,237,"40.7081105,-73.99944 40.7084705,-73.99884 40.7...",Manhattan,BKN Bridge Manhattan Side - FDR N Catherine Slip
10,2025-11-11 19:29:03,4616341,15.53,168,"40.70908,-73.9959 40.70895,-73.996941 40.70882...",Manhattan,FDR S Catherine Slip - BKN Bridge Manhattan Side


In [None]:
# Write the cleaned frame to a parquet file in the current working directory.
output_path = "manhattan_1month_speed.parquet"
df.to_parquet(path=output_path)
print("saved:", output_path)


保存完成： manhattan_1month_speed.parquet


In [None]:
!pip install -q --upgrade pip
!pip install -q "autogluon==1.4.0"



In [None]:
!pip install transformers
!pip install peft




In [None]:
import pandas as pd

# Read the file from the same relative location the save cell wrote it to
# ("manhattan_1month_speed.parquet") instead of a Colab-only absolute path, so
# the notebook also runs when the working directory is not /content.
df = pd.read_parquet("manhattan_1month_speed.parquet")
print(df.head())

# Keep only rows with a strictly positive travel time.
df = df[df["travel_time"] > 0].copy()

# Rename to the long-format column names used for the time-series frame:
# one series per link, `travel_time` as the value to forecast.
df_ts = df.rename(columns={
    "link_id": "item_id",
    "data_as_of": "timestamp",
    "travel_time": "target",
})

# Enforce dtypes; anything non-numeric in the target becomes NaN and is dropped.
df_ts["timestamp"] = pd.to_datetime(df_ts["timestamp"])
df_ts["target"] = pd.to_numeric(df_ts["target"], errors="coerce")

df_ts = df_ts.dropna(subset=["target"])
df_ts = df_ts.sort_values(["item_id", "timestamp"])

print(df_ts[["item_id", "timestamp", "target"]].head())
print(df_ts["target"].describe())


           data_as_of  link_id  speed  travel_time  \
0 2025-11-10 23:17:03  4329507  31.68          158   
1 2025-11-10 23:17:03  4620298  44.11          151   
2 2025-11-10 23:17:07  4456501  45.36          156   
3 2025-11-10 23:17:07  4456502  44.11          162   
4 2025-11-10 23:17:07  4456511  35.41          156   

                                         link_points    borough  \
0  40.75719,-73.99724 40.76017,-74.00382 40.76185...  Manhattan   
1  40.8462505,-73.932161 40.846951,-73.933641 40....  Manhattan   
2  40.68036,-74.00441001 40.6822,-74.0057201 40.6...  Manhattan   
3  40.70631,-74.01501 40.705380,-74.01528 40.7049...  Manhattan   
4  40.745726,-73.97359 40.745616,-73.97305 40.745...  Manhattan   

                                           link_name  
0                LINCOLN TUNNEL W NORTH TUBE NY - NJ  
1  GWB E LOWER LEVEL PLAZA - CBE E LOWER LEVEL AM...  
2                BBT W Toll Plaza - Manhattan Portal  
3                BBT E Manhattan Portal - Toll Plaza

In [None]:
from autogluon.timeseries import TimeSeriesDataFrame

# Only the id, timestamp and target columns go into the time-series frame.
ts_input = df_ts.loc[:, ["item_id", "timestamp", "target"]]

ts_df = TimeSeriesDataFrame.from_data_frame(
    ts_input,
    id_column="item_id",
    timestamp_column="timestamp",
)

ts_df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
4329472,2025-11-11 05:38:06,170
4329472,2025-11-11 05:43:06,170
4329472,2025-11-11 05:48:06,182
4329472,2025-11-11 05:53:06,182
4329472,2025-11-11 06:33:06,243


In [None]:
# Most common gap between consecutive timestamps in index order.
# NOTE(review): the diff runs over the whole index, so it also spans the
# boundary between one item and the next — confirm that is acceptable.
ts_index = ts_df.index.get_level_values("timestamp").to_series()
gaps = ts_index.diff()
freq_guess = gaps.mode().iloc[0]
print("Infer:", freq_guess)

# The frequency used downstream is fixed here; `freq_guess` is informational.
FREQ = "5min"


推测时间间隔: 0 days 00:05:00


In [None]:
import pandas as pd

# Everything up to one day before the newest timestamp is training data;
# the final day is held out as test data.
cutoff = df_ts["timestamp"].max() - pd.Timedelta(days=1)

row_timestamps = ts_df.index.get_level_values("timestamp")
train_data = ts_df[row_timestamps <= cutoff]
test_data = ts_df[row_timestamps > cutoff]

print("train amount:", len(train_data), "test amount:", len(test_data))


train 条数: 114825 test 条数: 3850


In [None]:
!pip install "transformers==4.49.0" "peft==0.17.0"

Collecting peft==0.17.0
  Downloading peft-0.17.0-py3-none-any.whl.metadata (14 kB)
Downloading peft-0.17.0-py3-none-any.whl (503 kB)
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.18.0
    Uninstalling peft-0.18.0:
      Successfully uninstalled peft-0.18.0
Successfully installed peft-0.17.0


In [None]:
from autogluon.timeseries import TimeSeriesPredictor

# Forecast horizon, in steps of FREQ.
PRED_LEN = 12

# Settings for the travel-time predictor, gathered in one place.
predictor_settings = {
    "prediction_length": PRED_LEN,
    "freq": FREQ,
    "target": "target",
    "path": "ts_manhattan_travel_time_model",
}
predictor = TimeSeriesPredictor(**predictor_settings)

# "bolt_tiny" selects the Chronos-Bolt Tiny preset.
predictor.fit(train_data=train_data, presets="bolt_tiny")


Beginning AutoGluon training...
AutoGluon will save models to '/content/ts_manhattan_travel_time_model'
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
GPU Count:          1
Memory Avail:       10.54 GB / 12.67 GB (83.2%)
Disk Space Avail:   190.71 GB / 235.68 GB (80.9%)
Setting presets to: bolt_tiny

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': '5min',
 'hyperparameters': {'Chronos': {'model_path': 'bolt_tiny'}},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 12,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': True,
 'target': 'target',
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency '5min'.
Provided train_data has 146495 rows (NaN fracti

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7d15336989e0>

In [None]:
# Forecast the next PRED_LEN steps following the end of each series passed in.
# NOTE(review): `test_data` holds only the final day, so that is all the
# context the model sees here — confirm this is the intended input.
forecast = predictor.predict(test_data)

# The forecast frame has one column per quantile level, named by the level as
# a string ("0.1" … "0.9"), plus "mean". `forecast.quantile(0.5)` is pandas'
# column-wise quantile across all rows and returned one number per column
# rather than the median forecast, so select the "0.5" column instead.
median_forecast = forecast["0.5"]

# Flatten to a plain table: item_id, timestamp, pred_travel_time.
median_df = median_forecast.to_frame(name="pred_travel_time").reset_index()

median_df.head()


data with frequency 'IRREG' has been resampled to frequency '5min'.
Model not specified in predict, will default to the model with the best validation score: Chronos[bolt_tiny]


Unnamed: 0,index,pred_travel_time
0,mean,414.384018
1,0.1,231.96608
2,0.2,338.994751
3,0.3,350.254883
4,0.4,374.181671


In [None]:
!pip install osmnx

Collecting osmnx
  Using cached osmnx-2.0.7-py3-none-any.whl.metadata (4.9 kB)
Downloading osmnx-2.0.7-py3-none-any.whl (101 kB)
Installing collected packages: osmnx
Successfully installed osmnx-2.0.7


In [None]:
import osmnx as ox

def load_manhattan_graph():
    """Return the Manhattan drivable road graph with speed and travel-time edge attributes.

    The graph is requested from OSMnx by place name with ``network_type="drive"``;
    ``add_edge_speeds`` and ``add_edge_travel_times`` are then applied in that order.
    """
    place = "Manhattan, New York City, New York, USA"
    road_graph = ox.graph_from_place(place, network_type="drive")

    # Speeds must be added first; travel times are then added on top of them.
    with_speeds = ox.add_edge_speeds(road_graph)
    return ox.add_edge_travel_times(with_speeds)

In [None]:
# Build the Manhattan road graph with the helper defined in the previous cell.
G = load_manhattan_graph()

In [None]:
# Show only a few (u, v, key, attributes) edge tuples; displaying the full edge
# view prints every edge in the graph and floods the notebook output.
list(G.edges(keys=True, data=True))[:5]

OutMultiEdgeDataView([(42421728, 42435337, 0, {'osmid': 195743153, 'highway': 'secondary', 'maxspeed': '25 mph', 'name': 'Central Park West', 'oneway': False, 'reversed': True, 'length': np.float64(85.34515470462713), 'geometry': <LINESTRING (-73.96 40.798, -73.96 40.798, -73.96 40.798, -73.96 40.797, -73...>, 'speed_kph': 40.2335, 'travel_time': 7.636485936760603}), (42421728, 42421731, 0, {'osmid': [420625565, 420625573, 5668966], 'highway': 'secondary', 'name': 'West 106th Street', 'oneway': False, 'reversed': False, 'length': np.float64(138.03308952853828), 'geometry': <LINESTRING (-73.96 40.798, -73.96 40.798, -73.96 40.798, -73.96 40.798, -73...>, 'speed_kph': 38.86751906141367, 'travel_time': 12.784945741393154}), (42421728, 42432736, 0, {'osmid': [1271523197, 1271523198], 'highway': 'secondary', 'maxspeed': '25 mph', 'name': 'Central Park West', 'oneway': False, 'reversed': False, 'length': np.float64(86.27431578887493), 'geometry': <LINESTRING (-73.96 40.798, -73.96 40.798, -7

In [None]:
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Read the file from the same relative location the save cell wrote it to
# ("manhattan_1month_speed.parquet") instead of a Colab-only absolute path.
df = pd.read_parquet("manhattan_1month_speed.parquet")

# Keep only rows with a strictly positive speed.
df = df[df["speed"] > 0].copy()

# Long-format column names: one series per link, `speed` as the target.
df_speed = df.rename(columns={
    "link_id": "item_id",
    "data_as_of": "timestamp",
    "speed": "target",
})

# Enforce dtypes; anything non-numeric in the target becomes NaN and is dropped.
df_speed["timestamp"] = pd.to_datetime(df_speed["timestamp"])
df_speed["target"] = pd.to_numeric(df_speed["target"], errors="coerce")

df_speed = df_speed.dropna(subset=["target"])
df_speed = df_speed.sort_values(["item_id", "timestamp"])

ts_speed = TimeSeriesDataFrame.from_data_frame(
    df_speed[["item_id", "timestamp", "target"]],
    id_column="item_id",
    timestamp_column="timestamp",
)

ts_speed.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
4329472,2025-11-11 05:38:06,32.31
4329472,2025-11-11 05:43:06,32.31
4329472,2025-11-11 05:48:06,30.44
4329472,2025-11-11 05:53:06,30.44
4329472,2025-11-11 06:33:06,22.36


In [None]:
# Most common gap between consecutive timestamps in index order.
ts_index = ts_speed.index.get_level_values("timestamp").to_series()
gaps = ts_index.diff()
freq_guess = gaps.mode().iloc[0]
print("Infer:", freq_guess)

# The frequency used downstream is fixed here; `freq_guess` is informational.
FREQ = "5min"

import pandas as pd

# Hold out the final day: rows at or before the cutoff train, later rows test.
cutoff = df_speed["timestamp"].max() - pd.Timedelta(days=1)

row_timestamps = ts_speed.index.get_level_values("timestamp")
train_speed = ts_speed[row_timestamps <= cutoff]
test_speed = ts_speed[row_timestamps > cutoff]


推测时间间隔: 0 days 00:05:00


In [None]:
!pip install transformers



In [None]:
# Forecast horizon, in steps of FREQ.
PRED_LEN = 12

# Settings for the speed predictor, gathered in one place.
speed_predictor_settings = {
    "prediction_length": PRED_LEN,
    "freq": FREQ,
    "target": "target",
    "path": "chronos_bolt_manhattan_speed_model",
}
predictor_speed = TimeSeriesPredictor(**speed_predictor_settings)

# Fit on the training split with the "bolt_tiny" preset.
predictor_speed.fit(train_data=train_speed, presets="bolt_tiny")


Beginning AutoGluon training...
AutoGluon will save models to '/content/chronos_bolt_manhattan_speed_model'
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
GPU Count:          1
Memory Avail:       9.67 GB / 12.67 GB (76.3%)
Disk Space Avail:   190.75 GB / 235.68 GB (80.9%)
Setting presets to: bolt_tiny

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': '5min',
 'hyperparameters': {'Chronos': {'model_path': 'bolt_tiny'}},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 12,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': True,
 'target': 'target',
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency '5min'.
Provided train_data has 146495 rows (NaN fra

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7c7e7211e3c0>

#############################################

In [None]:
!pip install -U pip
!pip install "autogluon.timeseries[all]"
!pip install osmnx sodapy pyarrow

Collecting osmnx
  Downloading osmnx-2.0.7-py3-none-any.whl.metadata (4.9 kB)
Downloading osmnx-2.0.7-py3-none-any.whl (101 kB)
Installing collected packages: osmnx
Successfully installed osmnx-2.0.7


In [None]:
import pandas as pd

# Read the file from the same relative location the save cell wrote it to
# ("manhattan_1month_speed.parquet") instead of a Colab-only absolute path.
df = pd.read_parquet("manhattan_1month_speed.parquet")

print("raw amount:", len(df))
df.head()


原始行数: 220159


Unnamed: 0,data_as_of,link_id,speed,travel_time,link_points,borough,link_name
0,2025-11-10 23:17:03,4329507,31.68,158,"40.75719,-73.99724 40.76017,-74.00382 40.76185...",Manhattan,LINCOLN TUNNEL W NORTH TUBE NY - NJ
1,2025-11-10 23:17:03,4620298,44.11,151,"40.8462505,-73.932161 40.846951,-73.933641 40....",Manhattan,GWB E LOWER LEVEL PLAZA - CBE E LOWER LEVEL AM...
2,2025-11-10 23:17:07,4456501,45.36,156,"40.68036,-74.00441001 40.6822,-74.0057201 40.6...",Manhattan,BBT W Toll Plaza - Manhattan Portal
3,2025-11-10 23:17:07,4456502,44.11,162,"40.70631,-74.01501 40.705380,-74.01528 40.7049...",Manhattan,BBT E Manhattan Portal - Toll Plaza
4,2025-11-10 23:17:07,4456511,35.41,156,"40.745726,-73.97359 40.745616,-73.97305 40.745...",Manhattan,QMT E Manhattan Side - Toll Plaza


In [None]:
from autogluon.timeseries import TimeSeriesDataFrame

# Keep only rows with a strictly positive speed.
has_positive_speed = df["speed"] > 0
df = df.loc[has_positive_speed].copy()

# Rename to long-format columns, enforce dtypes, drop rows whose target could
# not be parsed as a number, and order each series chronologically.
df_speed = (
    df.rename(columns={
        "link_id": "item_id",
        "data_as_of": "timestamp",
        "speed": "target",
    })
    .assign(
        timestamp=lambda frame: pd.to_datetime(frame["timestamp"]),
        target=lambda frame: pd.to_numeric(frame["target"], errors="coerce"),
    )
    .dropna(subset=["target"])
    .sort_values(["item_id", "timestamp"])
)

# Only the id, timestamp and target columns go into the time-series frame.
ts_speed = TimeSeriesDataFrame.from_data_frame(
    df_speed.loc[:, ["item_id", "timestamp", "target"]],
    id_column="item_id",
    timestamp_column="timestamp",
)

ts_speed.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,target
item_id,timestamp,Unnamed: 2_level_1
4329472,2025-11-11 05:38:06,32.31
4329472,2025-11-11 05:43:06,32.31
4329472,2025-11-11 05:48:06,30.44
4329472,2025-11-11 05:53:06,30.44
4329472,2025-11-11 06:33:06,22.36


In [None]:
# Most common gap between consecutive timestamps in index order.
ts_index = ts_speed.index.get_level_values("timestamp").to_series()
gaps = ts_index.diff()
freq_guess = gaps.mode().iloc[0]
print("推测时间间隔:", freq_guess)

# The frequency used downstream is fixed here; `freq_guess` is informational.
FREQ = "5min"

import pandas as pd

# Hold out the final day: rows at or before the cutoff train, later rows test.
cutoff = df_speed["timestamp"].max() - pd.Timedelta(days=1)
print("train/test split:", cutoff)

row_timestamps = ts_speed.index.get_level_values("timestamp")
train_speed = ts_speed[row_timestamps <= cutoff]
test_speed = ts_speed[row_timestamps > cutoff]

# `num_items` is the number of distinct series in each split, not a row count.
print("train amount:", train_speed.num_items)
print("test amount:", test_speed.num_items)


推测时间间隔: 0 days 00:05:00
训练/测试分割时间: 2025-12-09 17:18:11
训练集序列数: 19
测试集序列数: 16


In [None]:
from autogluon.timeseries import TimeSeriesPredictor
import torch

PREDICTION_LENGTH = 12  # 12 * 5min = 60min

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("使用设备:", device)

# Predictor for the speed series; its files live under the given path.
predictor_settings = {
    "prediction_length": PREDICTION_LENGTH,
    "freq": FREQ,
    "path": "chronos_bolt_tiny_speed",
}
predictor = TimeSeriesPredictor(**predictor_settings)


使用设备: cuda


In [None]:
# Chronos settings: start from "bolt_tiny" and fine-tune it on this data
# for 200 steps at a learning rate of 1e-5 on the selected device.
chronos_settings = {
    "model_path": "bolt_tiny",
    "fine_tune": True,
    "fine_tune_lr": 1e-5,
    "fine_tune_steps": 200,
    "device": device,
}
hyperparameters = {"Chronos": chronos_settings}

# Time limit passed to `fit`; the training log reports it in seconds.
TIME_LIMIT = 1800

fit_arguments = {
    "train_data": train_speed,
    "hyperparameters": hyperparameters,
    "time_limit": TIME_LIMIT,
}
predictor = predictor.fit(**fit_arguments)


Beginning AutoGluon training... Time limit = 1800s
AutoGluon will save models to '/content/chronos_bolt_tiny_speed'
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
GPU Count:          1
Memory Avail:       10.77 GB / 12.67 GB (85.0%)
Disk Space Avail:   190.71 GB / 235.68 GB (80.9%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': '5min',
 'hyperparameters': {'Chronos': {'device': 'cuda',
                                 'fine_tune': True,
                                 'fine_tune_lr': 1e-05,
                                 'fine_tune_steps': 200,
                                 'model_path': 'bolt_tiny'}},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 12,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': 

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

	Saving fine-tuned model to /content/chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/W0/fine-tuned-ckpt
	-0.2040       = Validation score (-WQL)
	29.08   s     = Training runtime
	0.07    s     = Validation (prediction) runtime
Not fitting ensemble as only 1 model was trained.
Training complete. Models trained: ['Chronos[bolt_tiny]']
Total runtime: 29.17 s
Best model: Chronos[bolt_tiny]
Best model score: -0.2040


In [None]:
# Explicitly persist the fitted predictor (constructed with path "chronos_bolt_tiny_speed").
predictor.save()


In [None]:
# Recursively archive the predictor directory into chronos_bolt_tiny_speed.zip.
!zip -r chronos_bolt_tiny_speed.zip chronos_bolt_tiny_speed

  adding: chronos_bolt_tiny_speed/ (stored 0%)
  adding: chronos_bolt_tiny_speed/utils/ (stored 0%)
  adding: chronos_bolt_tiny_speed/utils/data/ (stored 0%)
  adding: chronos_bolt_tiny_speed/utils/data/train.pkl (deflated 88%)
  adding: chronos_bolt_tiny_speed/version.txt (stored 0%)
  adding: chronos_bolt_tiny_speed/models/ (stored 0%)
  adding: chronos_bolt_tiny_speed/models/trainer.pkl (deflated 45%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/ (stored 0%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/utils/ (stored 0%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/utils/oof.pkl (deflated 22%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/model.pkl (deflated 49%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/W0/ (stored 0%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/W0/model.pkl (deflated 43%)
  adding: chronos_bolt_tiny_speed/models/Chronos[bolt_tiny]/W0/fine-tuned-ckpt/ (stored 0%)
  adding: 

In [None]:
!pip install osmnx geopandas shapely rtree pyarrow



In [None]:
import osmnx as ox

def load_manhattan_graph():
    """Return the Manhattan drivable road graph with speed and travel-time edge attributes.

    The graph is requested from OSMnx by place name with ``network_type="drive"``;
    ``add_edge_speeds`` and ``add_edge_travel_times`` are then applied in that order.
    """
    place = "Manhattan, New York City, New York, USA"
    road_graph = ox.graph_from_place(place, network_type="drive")

    # Speeds must be added first; travel times are then added on top of them.
    with_speeds = ox.add_edge_speeds(road_graph)
    return ox.add_edge_travel_times(with_speeds)

G = load_manhattan_graph()

# Edge table only (no nodes); reset_index turns the edge index into ordinary
# columns (shown as u, v, key in the output below).
edges_gdf = ox.graph_to_gdfs(G, nodes=False, edges=True).reset_index()

print(edges_gdf.head())
print(edges_gdf.crs)


          u         v  key                            osmid      highway  \
0  42421728  42435337    0                        195743153    secondary   
1  42421728  42421731    0  [420625565, 420625573, 5668966]    secondary   
2  42421728  42432736    0         [1271523197, 1271523198]    secondary   
3  42421731  42437916    0                          5671485  residential   
4  42421731  42432737    0                        195743186  residential   

  maxspeed               name  oneway reversed      length  \
0   25 mph  Central Park West   False     True   85.345155   
1      NaN  West 106th Street   False    False  138.033090   
2   25 mph  Central Park West   False    False   86.274316   
3      NaN   Manhattan Avenue   False     True   86.149203   
4      NaN   Manhattan Avenue   False    False   85.968765   

                                            geometry  speed_kph  travel_time  \
0  LINESTRING (-73.96004 40.79805, -73.96011 40.7...  40.233500     7.636486   
1  LINESTR

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString, Point

# One row per link: the first occurrence of each link_id with its point string.
link_columns = ["link_id", "link_points"]
links_df = df.loc[:, link_columns].drop_duplicates(subset="link_id", keep="first").copy()
print("Unique link:", links_df.shape[0])
links_df.head()


唯一 link 数量: 19


Unnamed: 0,link_id,link_points
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185..."
1,4620298,"40.8462505,-73.932161 40.846951,-73.933641 40...."
2,4456501,"40.68036,-74.00441001 40.6822,-74.0057201 40.6..."
3,4456502,"40.70631,-74.01501 40.705380,-74.01528 40.7049..."
4,4456511,"40.745726,-73.97359 40.745616,-73.97305 40.745..."


In [None]:
def parse_link_points(link_points_str):
    """Turn a whitespace-separated string of ``lat,lon`` pairs into a geometry.

    Each pair is converted to an ``(lon, lat)`` tuple; pairs that do not split
    into exactly two comma-separated floats are skipped.

    Returns:
        ``LineString`` when two or more pairs parse, ``Point`` when exactly one
        parses, and ``None`` when nothing parses or the input is not a string.
    """
    if not isinstance(link_points_str, str):
        return None

    def _to_lon_lat(token):
        # A usable token is exactly "<lat>,<lon>"; anything else is dropped.
        pieces = token.split(',')
        if len(pieces) != 2:
            return None
        try:
            latitude = float(pieces[0])
            longitude = float(pieces[1])
        except ValueError:
            return None
        return (longitude, latitude)

    vertices = [
        xy
        for xy in map(_to_lon_lat, link_points_str.split())
        if xy is not None
    ]

    if not vertices:
        return None
    if len(vertices) == 1:
        return Point(vertices[0])
    return LineString(vertices)

# Parse every point string into a geometry and discard links that produced none.
parsed_geometries = [parse_link_points(raw) for raw in links_df["link_points"]]
links_df = links_df.assign(geometry=parsed_geometries).dropna(subset=["geometry"])

# The coordinates are declared as EPSG:4326 when the GeoDataFrame is built.
links_gdf = gpd.GeoDataFrame(links_df, geometry="geometry", crs="EPSG:4326")

links_gdf.head()


Unnamed: 0,link_id,link_points,geometry
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185...","LINESTRING (-73.99724 40.75719, -74.00382 40.7..."
1,4620298,"40.8462505,-73.932161 40.846951,-73.933641 40....","LINESTRING (-73.93216 40.84625, -73.93364 40.8..."
2,4456501,"40.68036,-74.00441001 40.6822,-74.0057201 40.6...","LINESTRING (-74.00441 40.68036, -74.00572 40.6..."
3,4456502,"40.70631,-74.01501 40.705380,-74.01528 40.7049...","LINESTRING (-74.01501 40.70631, -74.01528 40.7..."
4,4456511,"40.745726,-73.97359 40.745616,-73.97305 40.745...","LINESTRING (-73.97359 40.74573, -73.97305 40.7..."


In [None]:
# Put both layers in the same CRS (EPSG:4326) before the spatial join.
edges_gdf, links_gdf = (
    layer.to_crs(epsg=4326) for layer in (edges_gdf, links_gdf)
)


In [None]:
# Only the edge identifiers and geometry are needed on the right-hand side.
edge_join_columns = ["u", "v", "key", "geometry"]
edges_for_join = edges_gdf.loc[:, edge_join_columns].copy()

# Attach the nearest road edge(s) to every link; a link can appear on several
# rows when more than one edge is equally near (see the output below).
# NOTE(review): both layers are in EPSG:4326 here, so `dist_degree` is measured
# in degrees rather than metres — confirm this is accurate enough for matching.
links_matched = gpd.sjoin_nearest(
    left_df=links_gdf,
    right_df=edges_for_join,
    how="left",
    distance_col="dist_degree",
)

links_matched.head()





Unnamed: 0,link_id,link_points,geometry,index_right,u,v,key,dist_degree
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185...","LINESTRING (-73.99724 40.75719, -74.00382 40.7...",7739,593839894,42430651,0,0.0
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185...","LINESTRING (-73.99724 40.75719, -74.00382 40.7...",1624,42430651,593839894,0,0.0
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185...","LINESTRING (-73.99724 40.75719, -74.00382 40.7...",9347,6575692624,42448996,0,0.0
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185...","LINESTRING (-73.99724 40.75719, -74.00382 40.7...",7208,247085867,247081705,0,0.0
0,4329507,"40.75719,-73.99724 40.76017,-74.00382 40.76185...","LINESTRING (-73.99724 40.75719, -74.00382 40.7...",2624,42433644,42423885,0,0.0


In [None]:
# Drop links that did not receive an edge from the join.
lm = links_matched.dropna(subset=["u", "v", "key"]).copy()

# Order by link, and by join distance when that column is present, so the
# first row kept per link below is the closest match.
sort_keys = ["link_id"]
if "dist_degree" in lm.columns:
    sort_keys.append("dist_degree")
lm = lm.sort_values(sort_keys)

# Exactly one edge per link: the first row after sorting.
lm_unique = lm.drop_duplicates(subset="link_id", keep="first")

print("raw link_id:", links_matched["link_id"].nunique())
print("matched link_id:", lm_unique["link_id"].nunique())

# link_id -> {"u": ..., "v": ..., "key": ...}
edge_ids = lm_unique.set_index("link_id").loc[:, ["u", "v", "key"]]
link_to_edge = edge_ids.to_dict(orient="index")

# Print one mapping as a sanity check.
some_link = next(iter(link_to_edge))
print("示例 link_id:", some_link)
print("映射的 edge:", link_to_edge[some_link])


原有 link_id 数量: 19
匹配成功 link_id 数量: 19
示例 link_id: 4329472
映射的 edge: {'u': 6575692624, 'v': 42448996, 'key': 0}


In [None]:
import os
from datetime import datetime, timedelta, timezone
from sodapy import Socrata
import pandas as pd

DOMAIN = "data.cityofnewyork.us"
DATASET_ID = "i4gi-tjb9"

# Credentials must not live in the notebook source. Read the token from the
# environment; when SOCRATA_APP_TOKEN is unset this is None and the client is
# created without a token, as in the 30-day fetch at the top of the notebook.
APP_TOKEN = os.environ.get("SOCRATA_APP_TOKEN")

# Query window: the last 90 days. `datetime.utcnow()` is deprecated
# (Python 3.12+); an aware UTC "now" formats to the same offset-free string.
start_dt = datetime.now(timezone.utc) - timedelta(days=90)
START_DATE_STR = start_dt.strftime("%Y-%m-%dT%H:%M:%S")

where_clause = (
    f"data_as_of >= '{START_DATE_STR}' "
    f"AND borough = 'Manhattan'"
)

client = Socrata(DOMAIN, APP_TOKEN, timeout=60)
records = []
limit = 50000   # rows requested per page
offset = 0      # index of the first row of the next page

while True:
    batch = client.get(
        DATASET_ID,
        where=where_clause,
        limit=limit,
        offset=offset,
        # Many rows share the same `data_as_of`; the `:id` tie-breaker keeps the
        # ordering stable so pages neither overlap nor skip rows.
        order="data_as_of, :id",
    )
    if not batch:
        break
    records.extend(batch)
    offset += limit
    if len(batch) < limit:
        # A short page is the last page; skip the extra empty round-trip.
        break

df_long = pd.DataFrame.from_records(records)
print("amount:", len(df_long))
print("unique link_id:", df_long["link_id"].nunique())


  start_dt = datetime.utcnow() - timedelta(days=90)  # 改成 90 天


HTTPError: 403 Client Error: Forbidden.
	Invalid app_token specified