# Data Preprocessing Version 2 TMC

Key Differences from Data Preprocessing Version 1:
- Input:
    - Include XD speed data (aggregated to a 5-minute frequency)
- Output Ground Truth
    - Incident Indicator: include segments whose speed is abnormal
    - Speed: a second output ground truth whose speed values come from the 3 types of TMC speed data (All, Truck only, Personal Vehicle only) rather than from XD data => we will have two output ground-truth files (new_Y_TMC.npy and new_Y_XD.npy)

Version 2 makes no changes to Part 1 (Segment Selection for New Input & Output).


In [9]:
import csv
import pandas as pd
import numpy as np
from datetime import datetime as dt

from collections import Counter
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

### Part 1. Segment Selection for New Input & Output

In [97]:
# TMC segment metadata.
# Columns:
#     'tmc', 'road', 'direction', 'intersection', 'state', 'county', 'zip',
#     'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
#     'miles', 'road_order', 'timezone_name', 'type', 'country', 'tmclinear',
#     'frc', 'border_set', 'f_system', 'urban_code', 'faciltype', 'structype',
#     'thrulanes', 'route_numb', 'route_sign', 'route_qual', 'altrtename',
#     'aadt', 'aadt_singl', 'aadt_combi', 'nhs', 'nhs_pct', 'strhnt_typ',
#     'strhnt_pct', 'truck', 'isprimary', 'active_start_date',
#     'active_end_date'
tmc = pd.read_csv("data/Carnberry_NPMRDS_5min/manually_select_cranberry_2019_dont_average_2/TMC_Identification.csv")  # (1248, 39), this is the comprehensive list, TMC speed csv data (All/Truck/PV) may not include all TMC in this list
tmc_coord = tmc.loc[:, ["tmc", 'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']]

# XD segment metadata.
# Columns:
#     'xd', 'road-name', 'road-num', 'bearing', 'miles', 'frc', 'county',
#     'state', 'zip', 'timezone_name', 'start_latitude', 'start_longitude',
#     'end_latitude', 'end_longitude'
xd = pd.read_csv("data/Cranberry_ritis_1min_class123/manually_select_cranberry_class123_20181101_20190727_dont_average/XD_Identification.csv")  # (1628, 14)
xd["xd"] = xd["xd"].apply(str)  # XD ids are compared against string ids below, so store them as str
xd_coord = xd.loc[:, ["xd", 'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']]

# 162 targeted IDs
#     TMC: 129 (84 found in tmc)
#     XD: 33 (21 found in xd)
# NOTE(review): allow_pickle=True executes pickled objects on load; only use with trusted .npy files.
old_out = list(np.load("data/cran_tmc.npy", allow_pickle=True))

# Split the old target list into its TMC part and its XD part.
# Assumes the TMC ids come first (per the counts above) -- TODO confirm.
# The XD part must start right where the TMC part ends; the previous `old_out[130:]`
# silently dropped the element at index 129 (32 XD ids instead of 33).
N_OLD_OUT_TMC = 129
old_out_tmc = old_out[:N_OLD_OUT_TMC]
old_out_xd = old_out[N_OLD_OUT_TMC:]


# Column names of the old input matrix (see data/X.npy).
col_names = list(np.load("data/col_names.npy", allow_pickle=True))

#### 1.1 Select Targeted Segments (Input & Output) by Referring to Old Targeted Segments (Input & Output)

In [121]:
# Group the old input columns by feature family. The last 21 columns are left out
# (the notebook describes them as weather & time features).
segment_cols = col_names[:-21]
# columns carrying neither an "sd" nor an "inc" marker are the plain tti columns
tti_cols = [name for name in segment_cols if not ("sd" in name or "inc" in name)]

old_in_seg = {
    # 369 (331 tmc + 38 xd), including all 162 segments in old_target
    "tti": {
        "tmc": [name for name in tti_cols if name.startswith("104")],
        "xd": [name for name in tti_cols if not name.startswith("104")],
    },
    # 315 (275 tmc + 40 xd), has 309 in common with old_in_seg["tti"]; first 4 characters are stripped
    "inc": [name[4:] for name in segment_cols if "inc" in name],
    # 303 (266 tmc + 37 xd), subset of old_in_seg["inc"]; first 3 characters are stripped
    "sd": [name[3:] for name in segment_cols if "sd" in name],
}

In [122]:
def _known_ids(meta, id_col, wanted):
    """Return the ids in `meta[id_col]` that also appear in `wanted`, in metadata order."""
    return list(meta.loc[meta[id_col].isin(wanted), id_col])


# Old target segments that still exist in the current metadata tables.
new_out_tmc = set(_known_ids(tmc, "tmc", old_out))  # 84 preliminarily selected TMC segments for output
new_out_xd = set(_known_ids(xd, "xd", old_out))  # 21 preliminarily selected XD segments for output

# Old input segments that still exist in the current metadata tables, per feature family.
new_in_seg = {
    "tti": {
        "tmc": _known_ids(tmc, "tmc", old_in_seg["tti"]["tmc"]),  # 236
        "xd": _known_ids(xd, "xd", old_in_seg["tti"]["xd"]),  # 21
    },
    "inc": {
        "tmc": _known_ids(tmc, "tmc", old_in_seg["inc"]),  # 195, including all new_out_tmc
        "xd": _known_ids(xd, "xd", old_in_seg["inc"]),  # 21, including all new_out_xd
    },
    "sd": {
        "tmc": _known_ids(tmc, "tmc", old_in_seg["sd"]),  # 190
        "xd": _known_ids(xd, "xd", old_in_seg["sd"]),  # 20
    },
}

In [123]:
# Union of the segment ids over the three feature families, per id type.
new_in_tmc = set().union(*(new_in_seg[family]["tmc"] for family in ("tti", "sd", "inc")))  # 236
new_in_xd = set().union(*(new_in_seg[family]["xd"] for family in ("tti", "sd", "inc")))  # 21
new_in = new_in_tmc | new_in_xd  # 257 targeted TMC & XD segments for input

#### 1.2 Match TMCs and XDs

In [16]:
# Pairwise distance between every TMC (rows) and every XD (columns), computed on the
# 4-vector (start_latitude, start_longitude, end_latitude, end_longitude).
# NOTE(review): this is a plain Euclidean distance on raw coordinate values, not a geodesic distance.
coord_cols = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
pairwise_dist_all = cdist(tmc_coord[coord_cols], xd_coord[coord_cols], metric="euclidean")  # (1248, 1628)

1.2.1 Match TMC with XD 

In [49]:
# Position (column of the distance matrix) of the closest XD for every TMC row.
nearest_xd_pos = pairwise_dist_all.argmin(1)

# some XDs may be mapped to multiple TMCs
match_xd_cnt = Counter(nearest_xd_pos)  # 426 XDs that achieve one-to-one match with TMCs
dup_xd = [xd.loc[pos, "xd"] for pos, n_hits in match_xd_cnt.items() if n_hits > 1]  # account for 822 TMCs

match_tmc_to_xd = tmc.copy()
# keep only the first letter of the direction ("South" -> "S", etc) so it can be compared with the XD bearing
match_tmc_to_xd["direction"] = match_tmc_to_xd["direction"].apply(lambda full_name: full_name[0])
match_tmc_to_xd = match_tmc_to_xd.iloc[:, :-14]  # remove auxiliary info (last 14 columns)
match_tmc_to_xd = match_tmc_to_xd.drop(columns=["state", "country", "timezone_name"])

# Attach the nearest XD (and its distance) to every TMC
match_tmc_to_xd["xd_dist"] = pairwise_dist_all.min(1)
match_tmc_to_xd["xd"] = pd.Series(nearest_xd_pos).apply(lambda pos: xd.loc[pos, "xd"])
# XDs that correspond to multiple TMCs (dup_xd) are deliberately kept

# Bring in the XD metadata columns
match_tmc_to_xd = match_tmc_to_xd.merge(xd, on="xd")

# keep a pair only when the directions agree, or when the coordinates coincide exactly
same_direction = match_tmc_to_xd["direction"] == match_tmc_to_xd["bearing"]
zero_distance = match_tmc_to_xd["xd_dist"] == 0
match_tmc_to_xd = match_tmc_to_xd[same_direction | zero_distance]  # 353 TMCs
match_tmc_to_xd.to_csv("data/temp_match_tmc_xd.csv", index=False)

1.2.2 Match XD with TMC

In [50]:
# Position (row of the distance matrix) of the closest TMC for every XD column.
nearest_tmc_pos = pairwise_dist_all.argmin(0)

match_tmc_cnt = Counter(nearest_tmc_pos)  # 308 TMCs achieves one-to-one match with XDs
dup_tmc = [tmc.loc[pos, "tmc"] for pos, n_hits in match_tmc_cnt.items() if n_hits > 1]  # 301 TMCs account for the remaining 1320 XDs

# Attach the nearest TMC (and its distance) to every XD
match_xd_to_tmc = xd.copy()
match_xd_to_tmc["tmc_dist"] = pairwise_dist_all.min(0)
match_xd_to_tmc["tmc"] = pd.Series(nearest_tmc_pos).apply(lambda pos: tmc.loc[pos, "tmc"])
# TMCs that correspond to multiple XDs (dup_tmc) are deliberately kept

# Bring in the TMC metadata columns, then shorten the direction to its first letter
match_xd_to_tmc = match_xd_to_tmc.merge(tmc, on="tmc")
match_xd_to_tmc["direction"] = match_xd_to_tmc["direction"].apply(lambda full_name: full_name[0])

# keep a pair only when the directions agree, or when the coordinates coincide exactly
same_direction = match_xd_to_tmc["direction"] == match_xd_to_tmc["bearing"]
zero_distance = match_xd_to_tmc["tmc_dist"] == 0
match_xd_to_tmc = match_xd_to_tmc[same_direction | zero_distance]
match_xd_to_tmc.to_csv("data/temp_match_xd_to_tmc.csv", index=False)

#### 1.3 Get Final Targeted TMC & XD segments

In [51]:
# Restrict both match tables to the preliminarily selected output segments.
is_target_tmc = match_tmc_to_xd["tmc"].isin(new_out_tmc)
is_target_xd = match_xd_to_tmc["xd"].isin(new_out_xd)
target_tmc = match_tmc_to_xd.loc[is_target_tmc]  # 60 (whose XDs are different from target_xd)
target_xd = match_xd_to_tmc.loc[is_target_xd]  # 10

# Only segments that survived the matching remain output targets.
new_out_tmc = set(target_tmc["tmc"])  # 60 eventually targeted TMC segments for output, is a subset of new_in
new_out_xd = set(target_xd["xd"])  # 10 eventually targeted XD segments for output, is a subset of new_in

In [54]:
# Dump both target match tables to CSV (row index omitted).
for match_table, csv_path in (
    (target_tmc, "data/temp_match_target_tmc_to_xd.csv"),
    (target_xd, "data/temp_match_target_xd_to_tmc.csv"),
):
    match_table.to_csv(csv_path, index=False)

In [106]:
new_out_tmc_xd = new_out_tmc | new_out_xd

# Express every output segment as an XD id: the matched XD of each target TMC, followed by the target XDs.
# Kept as a list (not a set) so that repeated ids are preserved.
new_out_all_in_xd = [*target_tmc["xd"], *new_out_xd]  # 69 unique XD id of 70 targeted TMC & XD segments for output
new_out_all_in_xd_int = list(map(int, new_out_all_in_xd))

# Express every output segment as a TMC id: the target TMCs, followed by the matched TMC of each target XD.
new_out_all_in_tmc = [*new_out_tmc, *target_xd["tmc"]]  # 63 unique TMC id of 70 targeted TMC & XD segments for output

### Part 2. Generate New Input & Output Data

In [158]:
# Date: 2019.2.10 ~ 2019.7.23 (164 days, including all holidays & weekends)
# Time Slots:
#     - For each day, there are 180 targeted time slots from 06:00:00 to 20:55:00
#     - For each targeted time slot t (t in 06:00:00 ~ 20:55:00),
#         - old_Y contains padding of 7 slots (t-6, t-5, t-4, t-3, t-2, t-1, t)
#         - old_X contains padding of 7 slots as input (t-12, t-11, t-10, t-9, t-8, t-7, t-6)
#
#     In new_X and new_Y, to allow for more flexibility of hyperparameters and reduce the file size,
#     there won't be padding. For example, for targeted time slot 06:00:00
#         - old_Y has 05:30:00, 05:35:00, 05:40:00, 05:45:00, 05:50:00, 05:55:00, 06:00:00
#         - old_X has 05:00:00, 05:05:00, 05:10:00, 05:15:00, 05:20:00, 05:25:00, 05:30:00
#         - new_Y has 06:00:00
#         - new_X has 05:30:00
old_col = list(np.load("data/col_names.npy", allow_pickle=True))  # 1008 (369 tti, 315 inc, 303 sd, 21 weather & time)
old_Y = np.load("data/Y.npy")  # (29520, 7, 162)
old_X = np.load("data/X.npy")  # (29520, 7, 1008)

In [496]:
def _segment_id(col_name):
    """Strip the feature marker from an old column name, leaving the bare segment id."""
    if "sd" in col_name:
        col_name = col_name[3:]
    if "inc" in col_name:
        col_name = col_name[4:]
    return col_name


# Indices of the old columns that carry over into new_X:
# 704 in total (257 tti, 210 sd, 216 inc, 21 weather & time).
# The first 987 columns are per-segment features and are kept only if their segment is in new_in ...
col_idx = [pos for pos in range(987) if _segment_id(old_col[pos]) in new_in]
# ... while the trailing 21 columns (987..1007) are always kept.
col_idx += list(range(987, 1008))

#### 2.1 Generate New Input

In [497]:
# new_X without additional features: take the last of the 7 padded slots, then the retained columns.
last_slot = old_X[:, -1, :]
new_X = last_slot[:, col_idx]  # (29520, 704) 164 days * 180 daily time slots (05:30:00 ~ 20:25:00)

In [6]:
# Raw TMC speed tables (one per vehicle class), in long format.
# Columns:
#     'tmc_code', 'measurement_tstamp', 'speed', 'average_speed',
#     'reference_speed', 'travel_time_minutes', 'data_density'
tmc_all = pd.read_csv("data/Carnberry_NPMRDS_5min/manually_select_cranberry_2019_dont_average_2/manually_select_cranberry_2019_dont_average.csv") # 24388983,7
tmc_truck = pd.read_csv("data/Carnberry_NPMRDS_5min/manually_select_cranberry_2019_dont_average/manually_select_cranberry_2019_dont_average.csv") # 9136192, 7
tmc_pv = pd.read_csv("data/Carnberry_NPMRDS_5min/manually_select_cranberry_2019_dont_average_3/manually_select_cranberry_2019_dont_average.csv")  # 21385940, 7

In [394]:
def _all_vehicle_table(value_col):
    """Wide (timestamp x TMC) table of `value_col` for all vehicles, limited to the rows of interest."""
    wide = tmc_all.pivot(index="measurement_tstamp", columns="tmc_code", values=value_col)

    # select 233 tmc segments based on old tmc input segments
    wide = wide.loc[:, [seg for seg in wide.columns if seg in new_in_tmc]]

    # convert index to datetime object, and select 29520 rows of interest
    wide.index = pd.to_datetime(wide.index)
    wide = wide.loc["2019-02-10":"2019-07-23"]
    minute_of_day = wide.index.hour * 60 + wide.index.minute
    return wide[(minute_of_day >= 330) & (minute_of_day <= 1225)]  # 05:30 ~ 20:25


# speed of all vehicles
all_spd = _all_vehicle_table("speed")  # 1858961 NaN
all_avg = _all_vehicle_table("average_speed")  # 1859011 NaN
all_ref = _all_vehicle_table("reference_speed")  # 1858961 NaN

2.1.1 Prepare Density Feature

In [114]:
# Wide (timestamp x TMC) table of the data-density code.
density_tmc = tmc_all.pivot(index="measurement_tstamp", columns="tmc_code", values="data_density")

# select 233 tmc segments based on old tmc input segments. 233 tmc segments include all new_out_all_in_tmc
kept_segments = [seg for seg in density_tmc.columns if seg in new_in_tmc]
# missing entries become "A", which denotes "Fewer than five values"
density_tmc = density_tmc.loc[:, kept_segments].fillna("A")

# convert index to datetime object, and select 29520 rows of interest
density_tmc.index = pd.to_datetime(density_tmc.index)
density_tmc = density_tmc.loc["2019-02-10":"2019-07-23"]
minute_of_day = density_tmc.index.hour * 60 + density_tmc.index.minute
density_tmc = density_tmc[(minute_of_day >= 330) & (minute_of_day <= 1225)]  # 05:30 ~ 20:25

In [125]:
# Sanity check: how many density columns are old tti TMC inputs / new TMC inputs.
sum(1 for seg in density_tmc.columns if seg in old_in_seg["tti"]["tmc"]), sum(1 for seg in density_tmc.columns if seg in new_in_tmc)

(233, 233)

2.1.2 Prepare Features of Truck Speed & Personal Vehicle Speed

In [None]:
# TODO: prepare (1) the TMC all-vehicle speed feature and (2) the XD speed feature (aggregated to 5 min) for new_in_tmc & new_in_xd

In [448]:
def _truck_table(value_col):
    """Wide (timestamp x TMC) table of `value_col` for trucks, limited to the rows of interest."""
    wide = tmc_truck.pivot(index="measurement_tstamp", columns="tmc_code", values=value_col)

    # select 233 tmc segments based on old tmc input segments
    wide = wide.loc[:, [seg for seg in wide.columns if seg in new_in_tmc]]

    # convert index to datetime object, and select 29520 rows of interest
    wide.index = pd.to_datetime(wide.index)
    wide = wide.resample("5 min").asfreq()  # upsampling with 5-min frequency (absent slots become NaN rows)
    wide = wide.loc["2019-02-10":"2019-07-23"]
    minute_of_day = wide.index.hour * 60 + wide.index.minute
    return wide[(minute_of_day >= 330) & (minute_of_day <= 1225)]  # 05:30 ~ 20:25


truck = _truck_table("speed")  # 4461384 NaN
truck_avg = _truck_table("average_speed")  # 4461384 NaN
truck_ref = _truck_table("reference_speed")  # 4461384 NaN

In [451]:
def _pv_table(value_col):
    """Wide (timestamp x TMC) table of `value_col` for personal vehicles, limited to the rows of interest."""
    wide = tmc_pv.pivot(index="measurement_tstamp", columns="tmc_code", values=value_col)

    # select 233 tmc segments based on old tmc input segments
    wide = wide.loc[:, [seg for seg in wide.columns if seg in new_in_tmc]]

    # convert index to datetime object, and select 29520 rows of interest
    wide.index = pd.to_datetime(wide.index)
    wide = wide.loc["2019-02-10":"2019-07-23"]
    minute_of_day = wide.index.hour * 60 + wide.index.minute
    return wide[(minute_of_day >= 330) & (minute_of_day <= 1225)]  # 05:30 ~ 20:25


pv = _pv_table("speed")  # 2258550 NaN
pv_avg = _pv_table("average_speed")  # 2258600 NaN
pv_ref = _pv_table("reference_speed")  # 2258550 NaN

In [452]:
# First fallback: replace missing speeds with the same class's average_speed, then its reference_speed.
for fallback in (truck_avg, truck_ref):
    truck = truck.fillna(fallback)

for fallback in (pv_avg, pv_ref):
    pv = pv.fillna(fallback)

In [453]:
# Per-segment (column-wise) mean speed of each vehicle class, ignoring NaN.
all_seg_avg = all_spd.mean(axis=0, skipna=True)
truck_seg_avg = truck.mean(axis=0, skipna=True)
pv_seg_avg = pv.mean(axis=0, skipna=True)

# compute fillna weight: ratio between the mean speeds of two classes on the same segment
truck_vs_all = truck_seg_avg.div(all_seg_avg)
truck_vs_pv = truck_seg_avg.div(pv_seg_avg)
pv_vs_all = pv_seg_avg.div(all_seg_avg)
pv_vs_truck = pv_seg_avg.div(truck_seg_avg)

In [464]:
# Weighted fallback for truck speed: borrow from all-vehicle data first, then from
# personal-vehicle data, each rescaled by the per-segment ratio of mean speeds.
# The order of the sources matters: earlier sources win.
truck_sources = (
    (all_spd, truck_vs_all),  # reduce NAN from 4461384 to 1858961
    (all_avg, truck_vs_all),
    (all_ref, truck_vs_all),
    (pv, truck_vs_pv),
    (pv_avg, truck_vs_pv),
    (pv_ref, truck_vs_pv),
)
for source, ratio in truck_sources:
    truck = truck.fillna(source * ratio)


In [468]:
# Weighted fallback for personal-vehicle speed: borrow from all-vehicle data first, then
# from truck data, each rescaled by the per-segment ratio of mean speeds.
# The order of the sources matters: earlier sources win.
pv_sources = (
    (all_spd, pv_vs_all),  # reduce NaN from 2258550 to 1858961
    (all_avg, pv_vs_all),
    (all_ref, pv_vs_all),
    (truck, pv_vs_truck),
    (truck_avg, pv_vs_truck),
    (truck_ref, pv_vs_truck),
)
for source, ratio in pv_sources:
    pv = pv.fillna(source * ratio)


In [474]:
# Last two fallbacks, applied per class:
#   1. linear interpolation of the remaining NaN values
#   2. whatever is still NaN after that gets the segment's (column) mean
truck = truck.interpolate(method="linear").fillna(truck_seg_avg)
pv = pv.interpolate(method="linear").fillna(pv_seg_avg)


2.1.3 Merge New Features into New_X

In [512]:
# Ordinal Embedding for Density & Inc Features
# Density codes "A" / "B" / "C" become 1/6, 3/6, 5/6.
# The source table is `density_tmc` (built in 2.1.1); the previous code read an undefined
# name `density`, which raises NameError on a fresh kernel.
# astype makes sure the result is numeric regardless of the pandas version's downcasting rules.
density = density_tmc.replace(["A", "B", "C"], [1/6, 3/6, 5/6]).astype("float64")

# Same embedding for columns 467..682 of new_X: 0 -> 1/6, 1 -> 3/6, 2 -> 5/6.
# A basic slice is a view, so assigning through it updates new_X in place.
# NOTE(review): 467:683 assumes the column layout (257 tti, 210 sd, 216 inc, 21 weather & time);
# the layout depends on the column order of data/col_names.npy -- confirm these are the inc columns.
inc_block = new_X[:, 467:683]
inc_block[inc_block == 0.0] = 1/6
inc_block[inc_block == 1.0] = 3/6
inc_block[inc_block == 2.0] = 5/6


In [522]:
# Put the new feature groups in front of the retained old features, column-wise.
feature_groups = [density.to_numpy(), truck.to_numpy(), pv.to_numpy(), new_X]
final_X = np.hstack(feature_groups)  # (29520, 1403) (233 density, 233 truck, 233 pv, 257 tti, 210 sd, 216 inc, 21 weather & time)

In [529]:
# Scaling Normalization (min-max normalization): the scaler is fitted on final_X
# and then applied to that same matrix.
scaler = MinMaxScaler().fit(final_X)
final_X = scaler.transform(final_X)

In [531]:
# Persist the min-max scaled input matrix to disk.
np.save("data/new_X.npy", final_X)

#### 2.2 Generate New Output

2.2.1 Speed Data

In [None]:
# TO DO: 
# re-run the downsampling based on new_out_all_in_xd_int -> obtain the latest downsampled_xd
# generate new output with XD spd (be careful, we need 70 columns, and there are duplicates because two TMCs match the same XD)
# generate new output with TMC spd (be careful, we need 70 columns, and there are duplicates because several XDs match the same TMCs, and some of those matched TMCs are also in target_tmc)
# reconsider incident data by incorporating speed information
# also be careful about aligning columns of spd data with columns of incident data


In [575]:
# Rows are kept when start_date <= timestamp < end_date (end_date itself is excluded).
start_date = dt(2019, 2, 10)
end_date = dt(2019, 7, 24)

# Header of the raw XD speed file:
#     'xd_id', 'measurement_tstamp', 'speed', 'average_speed', 'reference_speed', 'travel_time_minutes', 'confidence_score', 'cvalue'
#
# the original csv file stores XD speed data in 1-min slots from 2018.11.1 to 2019.7.27, which is too large
# therefore, we read the csv in chunks of `chunksize` rows and filter every chunk individually
chunksize = 10 ** 7
xd_file = "data/Cranberry_ritis_1min_class123/manually_select_cranberry_class123_20181101_20190727_dont_average/manually_select_cranberry_class123_20181101_20190727_dont_average.csv"
chunklist = []
with pd.read_csv(xd_file, chunksize=chunksize) as reader:
    for chunk in tqdm(reader):
        # Filter on the segment id first: it is a cheap comparison, and it means the
        # timestamp strings only have to be parsed for the rows that can still be kept.
        chunk = chunk[chunk.xd_id.isin(new_out_all_in_xd_int)].copy()
        chunk["measurement_tstamp"] = pd.to_datetime(chunk["measurement_tstamp"])

        # keep rows inside the date range and between 06:00 (360) and 20:59 (< 1260) of each day
        tstamp = chunk["measurement_tstamp"]
        minute_of_day = tstamp.dt.hour * 60 + tstamp.dt.minute
        chunk = chunk[
                (start_date <= tstamp) &
                (tstamp < end_date) &
                (minute_of_day >= 360) &
                (minute_of_day < 1260)
                ]
        chunklist.append(chunk)

# concat dataframe chunks into one final dataframe with a fresh 0..n-1 index
downsampled_xd = pd.concat(chunklist).reset_index(drop=True)

# save downsampled xd data
downsampled_xd.to_csv("data/downsampled_xd_data.csv", index=False)

62it [07:08,  6.92s/it]


In [648]:
# Wide (timestamp x XD id) tables, one per speed measure.
xd_spd, xd_avg, xd_ref = (
    downsampled_xd.pivot(index="measurement_tstamp", columns="xd_id", values=measure)
    for measure in ("speed", "average_speed", "reference_speed")
)  # xd_spd: (147600, 78)

In [650]:
# Fill missing XD speeds by linear interpolation between neighbouring rows of each column.
# NOTE(review): rows are treated as equally spaced, so gaps between days are interpolated across as well -- confirm this is intended.
xd_spd = xd_spd.interpolate(method="linear")

2.2.2 Incident Data

In [593]:
# Incident labels, indexed by timestamp.
out_inc_label = pd.read_csv("data/incident_labels.csv")
out_inc_label = out_inc_label.set_index("measurement_tstamp")
out_inc_label.index = pd.to_datetime(out_inc_label.index)

out_inc_label = out_inc_label.resample("1 min").asfreq()  # upsampling with 1-min frequency (new slots are NaN)

# select rows based on timestamps of our interest (2019.2.10 ~ 2019.7.23, 06:00:00~20:59:00 in 1-min frequency everyday)
# relies on start_date / end_date defined in the XD speed cell above
minute_of_day = out_inc_label.index.hour*60 + out_inc_label.index.minute
out_inc_label = out_inc_label[
                (start_date <= out_inc_label.index) &
                (out_inc_label.index < end_date) &
                (minute_of_day >= 360) &
                (minute_of_day < 1260)
                ]

# select columns of new out segments
# pandas >= 2.0 refuses a set as an indexer, and a set has no reproducible order,
# so the ids are passed as a sorted list.
out_inc_label = out_inc_label.loc[:, sorted(new_out_tmc_xd, key=str)]  # (147600, 78)

# rename columns to represent all segments in XD IDs, so that we can combine incident data with xd speed data eventually
# TMC id -> matched XD id; the first row wins if a TMC appears more than once in target_tmc
tmc_to_xd = target_tmc.drop_duplicates("tmc").set_index("tmc")["xd"]
new_col_name = [c if c in new_out_xd else tmc_to_xd[c] for c in out_inc_label.columns]
out_inc_label.columns = new_col_name

In [600]:
# Be cautious about the order of backfilling and forward filling
# We first do a backfilling because when an incident occurs, we want the segment to be marked "1" as early as possible
# Backfilling allows NaN slots before a slot already marked "1" to be marked as "1" as well.
# Then, we do a forward filling to fill NaN at the end of the dataframe
# (.bfill() / .ffill() replace the deprecated fillna(method=...) spelling; the result is the same)
out_inc_label = out_inc_label.bfill()
out_inc_label = out_inc_label.ffill()

2.2.3 Merge into New_Y

In [652]:
# XD ids are strings in the incident table, so stringify the speed table's column labels as well.
xd_spd.columns = list(map(str, xd_spd.columns))

# Keep only the segments present in both tables, so that both carry them in the same order.
aligned_pair = xd_spd.align(out_inc_label, join='inner', axis=1)
final_xd_spd, final_out_inc_label = aligned_pair

In [653]:
# Stack speed (channel 0) and incident label (channel 1) along a new last axis.
speed_values = final_xd_spd.to_numpy().astype("float64")
incident_values = final_out_inc_label.to_numpy()
new_Y = np.stack((speed_values, incident_values), axis=-1)  # (147600, 78, 2)

In [655]:
# Persist the stacked speed / incident ground truth to disk.
np.save("data/new_Y.npy", new_Y)