In [1]:
# Print the kernel's current working directory (later cells use relative paths).
!pwd

/Users/haowu/Desktop/Transportation Research/Traffic-Prediction/data_processing/TSMO


In [2]:
# Step up one directory so the relative paths used below ("./TSMO/...", "../data/...") resolve.
# NOTE(review): this cell is not idempotent - re-running it moves up one more directory each time.
%cd ..

/Users/haowu/Desktop/Transportation Research/Traffic-Prediction/data_processing


In [3]:
import csv
import pandas as pd
import numpy as np
#import geopandas as gpd
#import networkx as nx
import math
import pickle

import datetime
from datetime import datetime as dt
from collections import Counter
from scipy.spatial.distance import cdist
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from matplotlib import pyplot as plt

from utils import *

### 1. Hyperparameters

In [4]:
# Time window of the study.
# Calendar bounds are passed to pd.bdate_range below; the daily bounds correspond
# to the clock times noted beside them (the values read as minutes after midnight).
start_date, end_date = dt(2022, 2, 14), dt(2023, 2, 12)
start_time = 5 * 60 + 30  # 05:30:00
end_time = 21 * 60  # 21:00:00
# Business-day calendar dates between start_date and end_date, as datetime.date objects.
busi_date = pd.bdate_range(start_date, end_date).date

In [5]:
# Segment-ID sets taken from the TSMO shapefiles.
#
# The GeoJSON files only have to be read when the pickled sets
# (set_tmc_segments_shp_TSMO / set_xd_segments_str_shp_TSMO) are not on disk yet.
# This cell therefore reuses the pickles when both exist and rebuilds them otherwise,
# so it is safe to run unconditionally and always leaves both sets defined.
import os

_tmc_set_path = "./TSMO/set_tmc_segments_shp_TSMO.pkl"
_xd_set_path = "./TSMO/set_xd_segments_str_shp_TSMO.pkl"

if os.path.exists(_tmc_set_path) and os.path.exists(_xd_set_path):
    # NOTE: pickle.load executes arbitrary code; only load files produced by this project.
    with open(_tmc_set_path, "rb") as _f:
        set_tmc_segments_shp_TSMO = pickle.load(_f)
    with open(_xd_set_path, "rb") as _f:
        set_xd_segments_str_shp_TSMO = pickle.load(_f)
else:
    # Imported here because the top-of-notebook geopandas import is commented out,
    # which would otherwise make `gpd` an undefined name in this cell.
    import geopandas as gpd

    # Segments
    gdf_tmc_TSMO = gpd.read_file("../data/shape/tmc_shape_TSMO/tmc_shape_TSMO_for_sjoin.geojson")  # 1591 TMC segments in TSMO (selected from Carroll, Howard and Baltimore) used for spatial join
    gdf_xd_TSMO = gpd.read_file("../data/shape/xd_shape_TSMO/xd_shape_TSMO_for_sjoin.geojson")  # 2501 XD segments in TSMO (selected from Carroll, Howard and Baltimore) used for spatial join

    set_tmc_segments_shp_TSMO = set(gdf_tmc_TSMO.tmc)
    # XD ids are stored as strings in this set.
    set_xd_segments_str_shp_TSMO = set(gdf_xd_TSMO.xd.astype(str))

    # Persist both sets; the context managers guarantee the files are flushed and closed.
    with open(_tmc_set_path, "wb") as _f:
        pickle.dump(set_tmc_segments_shp_TSMO, _f)
    with open(_xd_set_path, "wb") as _f:
        pickle.dump(set_xd_segments_str_shp_TSMO, _f)

In [6]:
# Load the pickled df_xd_to_tmc_TSMO table (presumably the XD-to-TMC spatial-join result,
# judging by its name - confirm) and collect its XD ids (column id_xd_str) into a set.
# The context manager closes the file handle, which a bare open() inside pickle.load() leaves open.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
with open("./TSMO/df_xd_to_tmc_TSMO.pkl", "rb") as _f:
    df_xd_to_tmc_TSMO = pickle.load(_f)
set_xd_segments_str_sjoin_TSMO = set(df_xd_to_tmc_TSMO.id_xd_str)  # 2427

### 2. Downsample Raw Speed Data Files

Execute the code blocks in this section ONLY IF the downsampled CSV data are not already available on disk.

The TMC process and the XD process CANNOT be executed simultaneously; otherwise the notebook kernel will crash.


In [7]:
# Downsample part 2 of the B&H "all" TMC file.
# The recorded run of this cell took about 1 h 15 min (382 iterations).
downsampled_tmc_1min_all_bh_part2 = downsample_large_dataset(
    "tmc",
    "/Volumes/Hao's WD/TSMO/Speed/B&H_tmc_1min/B&H_tmc_1min_all_part2.csv",
    "/Volumes/Hao's WD/TSMO/Speed/B&H_tmc_1min/downsampled_B&H_tmc_1min_all_part2.csv",
    set_tmc_segments_shp_TSMO,
    busi_date,
    start_time,
    end_time,
)

382it [1:15:25, 11.85s/it]


Finished reading chunks!


In [7]:
def _downsample_tmc(folder, stem):
    """Downsample one raw TMC speed file.

    Reads   /Volumes/Hao's WD/TSMO/Speed/<folder>/<stem>.csv
    and passes /Volumes/Hao's WD/TSMO/Speed/<folder>/downsampled_<stem>.csv
    as the second path argument of downsample_large_dataset, together with the
    notebook-level segment set, business dates and daily time bounds.
    Returns whatever downsample_large_dataset returns.
    """
    root = "/Volumes/Hao's WD/TSMO/Speed"
    raw_csv = f"{root}/{folder}/{stem}.csv"
    downsampled_csv = f"{root}/{folder}/downsampled_{stem}.csv"
    return downsample_large_dataset(
        "tmc", raw_csv, downsampled_csv,
        set_tmc_segments_shp_TSMO, busi_date, start_time, end_time,
    )


# --- Baltimore & Howard (B&H) ---

# Take ~28 min to execute
# downsampled_tmc_1min_all_bh: shape (203263672, 6)
downsampled_tmc_1min_all_bh_part1 = _downsample_tmc("B&H_tmc_1min", "B&H_tmc_1min_all_part1")

# Take ~2 min to execute
# downsampled_tmc_1min_truck_2022_bh: shape (9509162, 7)
downsampled_tmc_1min_truck_2022_bh = _downsample_tmc("B&H_tmc_1min", "B&H_tmc_1min_truck_2022")

# Take ~31 sec to execute
# downsampled_tmc_1min_truck_2023_bh: shape (1268097, 7)
downsampled_tmc_1min_truck_2023_bh = _downsample_tmc("B&H_tmc_1min", "B&H_tmc_1min_truck_2023")

# Take ~4 min to execute
# downsampled_tmc_1min_pv_2022_bh: shape (14498618, 7)
downsampled_tmc_1min_pv_2022_bh = _downsample_tmc("B&H_tmc_1min", "B&H_tmc_1min_pv_2022")

# Take ~1.5 min to execute
# downsampled_tmc_1min_pv_2023_bh: shape (1860391, 7)
downsampled_tmc_1min_pv_2023_bh = _downsample_tmc("B&H_tmc_1min", "B&H_tmc_1min_pv_2023")

# --- Carroll ---

# Take ~5 min to execute
# downsampled_tmc_1min_all_Carroll: shape (5313712, 8)
downsampled_tmc_1min_all_Carroll = _downsample_tmc("Carroll_tmc_1min", "Carroll_tmc_1min_all")

# Take ~4 sec to execute
# downsampled_tmc_1min_truck_2022_Carroll: shape (244643, 7)
downsampled_tmc_1min_truck_2022_Carroll = _downsample_tmc("Carroll_tmc_1min", "Carroll_tmc_1min_truck_2022")

# Take ~1 sec to execute
# downsampled_tmc_1min_truck_2023_Carroll: shape (33574, 7)
downsampled_tmc_1min_truck_2023_Carroll = _downsample_tmc("Carroll_tmc_1min", "Carroll_tmc_1min_truck_2023")

# Take ~8 sec to execute
# downsampled_tmc_1min_pv_2022_Carroll: shape (340386, 7)
downsampled_tmc_1min_pv_2022_Carroll = _downsample_tmc("Carroll_tmc_1min", "Carroll_tmc_1min_pv_2022")

# Take ~1 sec to execute
# downsampled_tmc_1min_pv_2023_Carroll: shape (45562, 7)
downsampled_tmc_1min_pv_2023_Carroll = _downsample_tmc("Carroll_tmc_1min", "Carroll_tmc_1min_pv_2023")

202it [27:58,  8.31s/it]


Finished reading chunks!


### 3. Pivot Downsampled Data and Generate Speed/Density Dataframe
The following code blocks were executed on an MSI desktop with 64 GB of RAM.

#### 3.1 All Speed & Density

In [7]:
# Reload the two downsampled B&H "all" part files from the external drive.
_bh_tmc_dir = "/Volumes/Hao's WD/TSMO/Speed/B&H_tmc_1min"
downsampled_tmc_1min_all_bh_part1 = pd.read_csv(
    f"{_bh_tmc_dir}/downsampled_B&H_tmc_1min_all_part1.csv"
)
downsampled_tmc_1min_all_bh_part2 = pd.read_csv(
    f"{_bh_tmc_dir}/downsampled_B&H_tmc_1min_all_part2.csv"
)

In [8]:
# Stack part 1 and part 2 into a single frame with a fresh 0..n-1 index.
# ignore_index=True builds that index during the concat itself, so the very large
# result is not copied a second time by a follow-up reset_index(drop=True).
downsampled_tmc_1min_all_bh = pd.concat(
    [downsampled_tmc_1min_all_bh_part1, downsampled_tmc_1min_all_bh_part2],
    ignore_index=True,
)

: 

: 

In [None]:
# Persist the combined B&H frame next to its two parts.
# The mount point is "/Volumes" (as in every other path in this notebook), not "/Volumnes".
downsampled_tmc_1min_all_bh.to_csv(
    "/Volumes/Hao's WD/TSMO/Speed/B&H_tmc_1min/downsampled_tmc_1min_all_bh.csv",
    index=False,
)

In [17]:
# Pivot the combined B&H frame into the 1-minute TMC speed table and write it to ../data.
# num_slot=930 equals end_time - start_time for the current settings (1260 - 330).
df_downsampled_spd_tmc_1min_all_bh = pivot_df(
    "tmc",
    "speed",
    1,
    df=downsampled_tmc_1min_all_bh,
    busi_date=busi_date,
    num_slot=930,
    freq=1,
    start_time=start_time,
    end_time=end_time,
    output_file_path="../data/df_downsampled_spd_tmc_1min_all_bh.pkl",
)

Check dates of incomplete slots:
2022-03-24 927
2022-04-26 928
2022-05-04 923
2022-05-17 929
2022-05-23 929
2022-06-01 929
2022-07-08 929
2022-07-12 926
2022-07-29 928
2022-08-02 929
2022-08-03 918
2022-08-26 927
2022-10-19 929


In [18]:
# Same pivot as the speed cell, but requesting "density" and writing to a separate pickle.
# NOTE(review): the recorded run of this cell ended with KeyError: 'confidence_score'.
# Presumably the "density" path of pivot_df needs a confidence_score column that
# downsampled_tmc_1min_all_bh does not have - confirm against pivot_df and the input columns.
df_downsampled_dens_tmc_1min_all_bh = pivot_df("tmc", "density", 1, df=downsampled_tmc_1min_all_bh, busi_date=busi_date, num_slot=930, freq=1, start_time=start_time, end_time=end_time, output_file_path="../data/df_downsampled_dens_tmc_1min_all_bh.pkl") 

KeyError: 'confidence_score'

#### 3.2 Vehicle-specific Speed & Density

In [35]:
# Stack the B&H and Carroll frames (2022 and 2023) into one frame per vehicle class,
# each with a fresh 0..n-1 index.

# downsampled_tmc_1min_truck_TSMO: shape (11055476, 7)
_truck_parts = [
    downsampled_tmc_1min_truck_2022_bh,
    downsampled_tmc_1min_truck_2023_bh,
    downsampled_tmc_1min_truck_2022_Carroll,
    downsampled_tmc_1min_truck_2023_Carroll,
]
downsampled_tmc_1min_truck_TSMO = pd.concat(_truck_parts).reset_index(drop=True)

# downsampled_tmc_1min_pv_TSMO: shape (16744957, 7)
_pv_parts = [
    downsampled_tmc_1min_pv_2022_bh,
    downsampled_tmc_1min_pv_2023_bh,
    downsampled_tmc_1min_pv_2022_Carroll,
    downsampled_tmc_1min_pv_2023_Carroll,
]
downsampled_tmc_1min_pv_TSMO = pd.concat(_pv_parts).reset_index(drop=True)

In [37]:
# Write the combined truck and pv frames to the external drive (row index omitted).
_speed_dir = "/Volumes/Hao's WD/TSMO/Speed"
downsampled_tmc_1min_truck_TSMO.to_csv(
    f"{_speed_dir}/downsampled_tmc_1min_truck_TSMO.csv", index=False
)
downsampled_tmc_1min_pv_TSMO.to_csv(
    f"{_speed_dir}/downsampled_tmc_1min_pv_TSMO.csv", index=False
)

In [8]:
# Reload the combined truck and pv frames from the external drive.
_speed_dir = "/Volumes/Hao's WD/TSMO/Speed"
downsampled_tmc_1min_truck_TSMO = pd.read_csv(
    f"{_speed_dir}/downsampled_tmc_1min_truck_TSMO.csv"
)
downsampled_tmc_1min_pv_TSMO = pd.read_csv(
    f"{_speed_dir}/downsampled_tmc_1min_pv_TSMO.csv"
)

In [9]:
# Keyword arguments shared by all four pivots below.
# num_slot=930 equals end_time - start_time for the current settings (1260 - 330).
_pivot_common = dict(
    busi_date=busi_date,
    num_slot=930,
    freq=1,
    start_time=start_time,
    end_time=end_time,
)

# Truck: speed, then density.
df_downsampled_spd_tmc_1min_truck_TSMO = pivot_df(
    "tmc", "speed", 1,
    df=downsampled_tmc_1min_truck_TSMO,
    **_pivot_common,
    output_file_path="../data/df_downsampled_spd_tmc_1min_truck_TSMO.pkl",
)
df_downsampled_dens_tmc_1min_truck_TSMO = pivot_df(
    "tmc", "density", 1,
    df=downsampled_tmc_1min_truck_TSMO,
    **_pivot_common,
    output_file_path="../data/df_downsampled_dens_tmc_1min_truck_TSMO.pkl",
)

# pv: speed, then density.
df_downsampled_spd_tmc_1min_pv_TSMO = pivot_df(
    "tmc", "speed", 1,
    df=downsampled_tmc_1min_pv_TSMO,
    **_pivot_common,
    output_file_path="../data/df_downsampled_spd_tmc_1min_pv_TSMO.pkl",
)
df_downsampled_dens_tmc_1min_pv_TSMO = pivot_df(
    "tmc", "density", 1,
    df=downsampled_tmc_1min_pv_TSMO,
    **_pivot_common,
    output_file_path="../data/df_downsampled_dens_tmc_1min_pv_TSMO.pkl",
)

Check dates of incomplete slots:
2022-02-14 186
2022-02-15 186
2022-02-16 186
2022-02-17 186
2022-02-18 186
2022-02-21 186
2022-02-22 186
2022-02-23 186
2022-02-24 186
2022-02-25 186
2022-02-28 186
2022-03-01 186
2022-03-02 186
2022-03-03 186
2022-03-04 186
2022-03-07 186
2022-03-08 186
2022-03-09 186
2022-03-10 186
2022-03-11 186
2022-03-14 186
2022-03-15 186
2022-03-16 186
2022-03-17 186
2022-03-18 186
2022-03-21 186
2022-03-22 186
2022-03-23 186
2022-03-24 186
2022-03-25 186
2022-03-28 186
2022-03-29 186
2022-03-30 186
2022-03-31 186
2022-04-01 186
2022-04-04 186
2022-04-05 186
2022-04-06 186
2022-04-07 186
2022-04-08 186
2022-04-11 186
2022-04-12 186
2022-04-13 186
2022-04-14 186
2022-04-15 186
2022-04-18 186
2022-04-19 186
2022-04-20 186
2022-04-21 186
2022-04-22 186
2022-04-25 186
2022-04-26 186
2022-04-27 186
2022-04-28 186
2022-04-29 186
2022-05-02 186
2022-05-03 186
2022-05-04 186
2022-05-05 186
2022-05-06 186
2022-05-09 186
2022-05-10 186
2022-05-11 186
2022-05-12 186
2022-05