In [1]:
import csv
import pandas as pd
import numpy as np
import geopandas as gpd
import networkx as nx
import math
import pickle

import datetime
from datetime import datetime as dt
from collections import Counter
from scipy.spatial.distance import cdist
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from matplotlib import pyplot as plt

from utils import *

In [2]:
# Hyper-parameters

# Time
start_date = dt(2019, 2, 10)
end_date = dt(2019, 7, 23)
start_time = 330  # 05:30:00, expressed as minutes after midnight
end_time = 1260  # 21:00:00, expressed as minutes after midnight
# Business days (Mon-Fri) between start_date and end_date, as datetime.date objects.
busi_date = pd.bdate_range(start=start_date, end=end_date).date

# Segments
# The pickles are opened in `with` blocks so every file handle is closed as
# soon as its content is loaded (the bare `pickle.load(open(...))` form leaves
# the handle open until garbage collection).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("./set_tmc_segments_within_spd_shp.pkl", "rb") as pkl_file:
    set_tmc_segments_within_spd_shp = pickle.load(pkl_file)  # 1000 (original author's note)
with open("./set_xd_segments_str_spd_shp.pkl", "rb") as pkl_file:
    set_xd_segments_str_spd_shp = pickle.load(pkl_file)  # 1231 (original author's note)
with open("../../data/xd_to_tmc_v2.pkl", "rb") as pkl_file:
    df_xd_to_tmc = pickle.load(pkl_file)

### 1. Downsample Raw Data Files

In [12]:
'''
Execute this code block ONLY IF downsampled csv data are not available on file
'''

# The calls below are intentionally left commented out: the following cell
# loads the already-downsampled CSV files from disk instead of regenerating them.
# Presumably the second and third arguments are the raw input CSV and the
# downsampled output CSV respectively -- TODO confirm against utils.
# NOTE(review): the xd call passes ".../cranberry_tmc_1min/downsampled_cran_xd_1min.csv",
# whereas the loading cell reads ".../cranberry_xd_1min/downsampled_cran_xd_1min.csv".
# Confirm which directory is intended before re-enabling this cell.
# downsampled_tmc_5min_all = downsample_dataset("tmc", "../../data/speed/cranberry_tmc_5min/cran_tmc_5min_all.csv","../../data/speed/cranberry_tmc_5min/downsampled_cran_tmc_5min_all.csv", set_tmc_segments_within_spd_shp, busi_date, start_time, end_time)
# downsampled_tmc_5min_pv = downsample_dataset("tmc", "../../data/speed/cranberry_tmc_5min/cran_tmc_5min_pv.csv","../../data/speed/cranberry_tmc_5min/downsampled_cran_tmc_5min_pv.csv", set_tmc_segments_within_spd_shp, busi_date, start_time, end_time)
# downsampled_tmc_5min_truck = downsample_dataset("tmc", "../../data/speed/cranberry_tmc_5min/cran_tmc_5min_truck.csv","../../data/speed/cranberry_tmc_5min/downsampled_cran_tmc_5min_truck.csv", set_tmc_segments_within_spd_shp, busi_date, start_time, end_time)
# downsampled_tmc_1min_all = downsample_large_dataset("tmc", "../../data/speed/cranberry_tmc_1min/cran_tmc_1min_all.csv","../../data/speed/cranberry_tmc_1min/downsampled_cran_tmc_1min_all.csv", set_tmc_segments_within_spd_shp, busi_date, start_time, end_time)
# downsampled_xd_1min = downsample_large_dataset("xd", "../../data/speed/cranberry_xd_1min/cran_xd_1min.csv","../../data/speed/cranberry_tmc_1min/downsampled_cran_xd_1min.csv", set_xd_segments_str_spd_shp, busi_date, start_time, end_time)

In [3]:
# Load the previously downsampled tables from disk, one CSV per data source
# and sampling interval. Files are read in the order listed.
(
    downsampled_tmc_5min_all,
    downsampled_tmc_5min_pv,
    downsampled_tmc_5min_truck,
    downsampled_tmc_1min_all,
    downsampled_xd_1min,
) = map(
    pd.read_csv,
    (
        "../../data/speed/cranberry_tmc_5min/downsampled_cran_tmc_5min_all.csv",
        "../../data/speed/cranberry_tmc_5min/downsampled_cran_tmc_5min_pv.csv",
        "../../data/speed/cranberry_tmc_5min/downsampled_cran_tmc_5min_truck.csv",
        "../../data/speed/cranberry_tmc_1min/downsampled_cran_tmc_1min_all.csv",
        "../../data/speed/cranberry_xd_1min/downsampled_cran_xd_1min.csv",
    ),
)

In [4]:
# Display (rows, columns) of every downsampled table as a sanity check.
tuple(
    frame.shape
    for frame in (
        downsampled_tmc_5min_all,
        downsampled_tmc_5min_pv,
        downsampled_tmc_5min_truck,
        downsampled_tmc_1min_all,
        downsampled_xd_1min,
    )
)

((5070347, 7), (4767297, 7), (2186937, 7), (106627081, 8), (117040954, 8))

### 2. Pivot Downsampled Data and Generate Speed/Density Dataframe

#### 2.1 Generate Speed Dataframe

In [5]:
def _pivot_speed(source, freq, df, output_file_path):
    """Pivot one downsampled table into a speed dataframe via ``pivot_df``.

    The number of slots per day is derived from the analysis window
    (``end_time - start_time`` minutes, set in the hyper-parameter cell) and
    the sampling interval, instead of being hard-coded. With the current
    window of 930 minutes this yields 930 slots at 1 min and 186 slots at
    5 min, i.e. the same values that were previously typed in by hand.

    Args:
        source: Data source label forwarded to ``pivot_df`` ("xd" or "tmc").
        freq: Sampling interval in minutes (1 or 5 here); forwarded both as
            the third positional argument and as ``freq=``, as before.
        df: Downsampled table to pivot.
        output_file_path: Pickle path forwarded to ``pivot_df``.

    Returns:
        Whatever ``pivot_df`` returns for the given inputs.

    Raises:
        ValueError: If the analysis window is not a whole multiple of ``freq``.
    """
    window_minutes = end_time - start_time
    if window_minutes % freq != 0:
        raise ValueError(
            f"Analysis window of {window_minutes} min is not divisible by freq={freq}"
        )
    return pivot_df(
        source,
        "speed",
        freq,
        df=df,
        busi_date=busi_date,
        num_slot=window_minutes // freq,
        freq=freq,
        start_time=start_time,
        end_time=end_time,
        output_file_path=output_file_path,
    )


# Calls are kept in the original order so the printed diagnostics line up
# with the previously recorded output.
df_downsampled_spd_xd_1min = _pivot_speed(
    "xd", 1, downsampled_xd_1min,
    "../../data/speed/cranberry_xd_1min/df_downsampled_spd_xd_1min_1104.pkl",
)
df_downsampled_spd_tmc_5min_all = _pivot_speed(
    "tmc", 5, downsampled_tmc_5min_all,
    "../../data/speed/cranberry_tmc_5min/df_downsampled_spd_tmc_5min_all_435.pkl",
)
df_downsampled_spd_tmc_5min_pv = _pivot_speed(
    "tmc", 5, downsampled_tmc_5min_pv,
    "../../data/speed/cranberry_tmc_5min/df_downsampled_spd_tmc_5min_pv_435.pkl",
)
# NOTE(review): the file name ends in "_435" but the recorded shape of this
# frame has 424 columns -- confirm whether the suffix should be updated
# (downstream readers of this path would need to change too).
df_downsampled_spd_tmc_5min_truck = _pivot_speed(
    "tmc", 5, downsampled_tmc_5min_truck,
    "../../data/speed/cranberry_tmc_5min/df_downsampled_spd_tmc_5min_truck_435.pkl",
)
df_downsampled_spd_tmc_1min_all = _pivot_speed(
    "tmc", 1, downsampled_tmc_1min_all,
    "../../data/speed/cranberry_tmc_1min/df_downsampled_spd_tmc_1min_all_1000.pkl",
)

Check dates of incomplete slots:
Check dates of incomplete slots:
2019-02-22 185
2019-02-25 183
2019-04-25 185
Check dates of incomplete slots:
2019-02-13 185
2019-02-20 185
2019-02-22 185
2019-02-25 181
2019-02-27 185
2019-04-25 185
Check dates of incomplete slots:
2019-02-21 177
2019-02-22 178
2019-02-25 177
2019-02-27 177
2019-04-25 177
Check dates of incomplete slots:
2019-02-14 847
2019-03-19 924
2019-03-20 929
2019-03-29 929
2019-04-16 807
2019-04-17 362
2019-04-18 605
2019-04-19 769
2019-04-22 927
2019-04-23 927
2019-04-24 928
2019-04-25 926
2019-04-29 929
2019-04-30 929
2019-05-01 929
2019-05-03 926
2019-05-07 928
2019-05-09 925
2019-05-10 926
2019-05-14 923
2019-05-15 925
2019-05-16 924
2019-05-17 928
2019-05-20 929
2019-05-21 929
2019-05-22 929
2019-05-23 929
2019-05-24 925
2019-05-29 925
2019-05-30 928
2019-05-31 927
2019-06-03 925
2019-06-04 924
2019-06-05 926
2019-06-06 926
2019-06-07 925
2019-06-10 921
2019-06-11 921
2019-06-12 920
2019-06-13 921
2019-06-14 922
2019-06-17

In [6]:
# Display (rows, columns) of every pivoted speed dataframe as a sanity check.
tuple(
    frame.shape
    for frame in (
        df_downsampled_spd_xd_1min,
        df_downsampled_spd_tmc_5min_all,
        df_downsampled_spd_tmc_5min_pv,
        df_downsampled_spd_tmc_5min_truck,
        df_downsampled_spd_tmc_1min_all,
    )
)

((108810, 1104), (21762, 435), (21762, 435), (21762, 424), (108810, 1000))

#### 2.2 Generate Density Dataframe

In [7]:
def _pivot_density(source, freq, df, output_file_path):
    """Pivot one downsampled table into a density dataframe via ``pivot_df``.

    The number of slots per day is derived from the analysis window
    (``end_time - start_time`` minutes, set in the hyper-parameter cell) and
    the sampling interval, instead of being hard-coded. With the current
    window of 930 minutes this yields 930 slots at 1 min and 186 slots at
    5 min, i.e. the same values that were previously typed in by hand.

    Args:
        source: Data source label forwarded to ``pivot_df`` ("xd" or "tmc").
        freq: Sampling interval in minutes (1 or 5 here); forwarded both as
            the third positional argument and as ``freq=``, as before.
        df: Downsampled table to pivot.
        output_file_path: Pickle path forwarded to ``pivot_df``.

    Returns:
        Whatever ``pivot_df`` returns for the given inputs.

    Raises:
        ValueError: If the analysis window is not a whole multiple of ``freq``.
    """
    window_minutes = end_time - start_time
    if window_minutes % freq != 0:
        raise ValueError(
            f"Analysis window of {window_minutes} min is not divisible by freq={freq}"
        )
    return pivot_df(
        source,
        "density",
        freq,
        df=df,
        busi_date=busi_date,
        num_slot=window_minutes // freq,
        freq=freq,
        start_time=start_time,
        end_time=end_time,
        output_file_path=output_file_path,
    )


# Calls are kept in the original order so the printed diagnostics line up
# with the previously recorded output.
df_downsampled_dens_xd_1min = _pivot_density(
    "xd", 1, downsampled_xd_1min,
    "../../data/speed/cranberry_xd_1min/df_downsampled_dens_xd_1min_1104.pkl",
)
df_downsampled_dens_tmc_5min_all = _pivot_density(
    "tmc", 5, downsampled_tmc_5min_all,
    "../../data/speed/cranberry_tmc_5min/df_downsampled_dens_tmc_5min_all_435.pkl",
)
df_downsampled_dens_tmc_5min_pv = _pivot_density(
    "tmc", 5, downsampled_tmc_5min_pv,
    "../../data/speed/cranberry_tmc_5min/df_downsampled_dens_tmc_5min_pv_435.pkl",
)
df_downsampled_dens_tmc_5min_truck = _pivot_density(
    "tmc", 5, downsampled_tmc_5min_truck,
    "../../data/speed/cranberry_tmc_5min/df_downsampled_dens_tmc_5min_truck_435.pkl",
)
df_downsampled_dens_tmc_1min_all = _pivot_density(
    "tmc", 1, downsampled_tmc_1min_all,
    "../../data/speed/cranberry_tmc_1min/df_downsampled_dens_tmc_1min_all_1000.pkl",
)

Check dates of incomplete slots:
Check dates of incomplete slots:
2019-02-22 185
2019-02-25 183
2019-04-25 185
Check dates of incomplete slots:
2019-02-13 185
2019-02-20 185
2019-02-22 185
2019-02-25 181
2019-02-27 185
2019-04-25 185
Check dates of incomplete slots:
2019-02-21 177
2019-02-22 178
2019-02-25 177
2019-02-27 177
2019-04-25 177
Check dates of incomplete slots:
2019-02-14 847
2019-03-19 924
2019-03-20 929
2019-03-29 929
2019-04-16 807
2019-04-17 362
2019-04-18 605
2019-04-19 769
2019-04-22 927
2019-04-23 927
2019-04-24 928
2019-04-25 926
2019-04-29 929
2019-04-30 929
2019-05-01 929
2019-05-03 926
2019-05-07 928
2019-05-09 925
2019-05-10 926
2019-05-14 923
2019-05-15 925
2019-05-16 924
2019-05-17 928
2019-05-20 929
2019-05-21 929
2019-05-22 929
2019-05-23 929
2019-05-24 925
2019-05-29 925
2019-05-30 928
2019-05-31 927
2019-06-03 925
2019-06-04 924
2019-06-05 926
2019-06-06 926
2019-06-07 925
2019-06-10 921
2019-06-11 921
2019-06-12 920
2019-06-13 921
2019-06-14 922
2019-06-17