# PEMS Data prep

The PEMS data and priors are mainly taken from https://github.com/liyaguang/DCRNN, but we change the format to match our model and check whether the priors make sense.

# Create data for training model

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import pickle
import torch

In [3]:
import sys
sys.path.append('../')
from GraphTrafficLib.utils.visual_utils import PEMS_folium_plot

In [4]:
# The pre-processed PEMS splits are stored as NumPy .npz archives; each archive
# provides an 'x' array and a 'y' array.
pems_train = np.load('../../datafolder/rawdata/pems/train.npz')
x_train, y_train = pems_train['x'], pems_train['y']

pems_val = np.load('../../datafolder/rawdata/pems/val.npz')
x_val, y_val = pems_val['x'], pems_val['y']

pems_test = np.load('../../datafolder/rawdata/pems/test.npz')
x_test, y_test = pems_test['x'], pems_test['y']

In [5]:
# Join each split's x and y arrays back together along axis 1 so that the data
# matches the format used by our model.
train_data, val_data, test_data = (
    np.concatenate(pair, axis=1)
    for pair in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

# Uncomment to persist the joined splits to disk.
#np.save('../../datafolder/procdata/pems_data/train_data.npy', train_data)
#np.save('../../datafolder/procdata/pems_data/val_data.npy', val_data)
#np.save('../../datafolder/procdata/pems_data/test_data.npy', test_data)

In [6]:
# Shape of the training split after joining x and y along axis 1.
train_data.shape

(36465, 24, 325, 2)

In [7]:
# Shape of the test split after joining x and y along axis 1.
test_data.shape

(10419, 24, 325, 2)

In [8]:
# Shape of the validation split after joining x and y along axis 1.
val_data.shape

(5209, 24, 325, 2)

In [9]:
# Total number of samples across the train, test and validation splits.
# Derived from the arrays themselves instead of hand-copied counts, so the
# figure stays correct if the input files change.
len(train_data) + len(test_data) + len(val_data)

52093

In [13]:
# Number of training samples divided by 288.
# NOTE(review): 288 is presumably the number of 5-minute steps in a day
# (24 * 12) — confirm against the sampling interval of the raw data.
len(train_data) / 288

126.61458333333333

In [14]:
# Number of test samples divided by 288.
# NOTE(review): 288 is presumably the number of 5-minute steps in a day
# (24 * 12) — confirm against the sampling interval of the raw data.
len(test_data) / 288

36.177083333333336

In [15]:
# Number of validation samples divided by 288.
# NOTE(review): 288 is presumably the number of 5-minute steps in a day
# (24 * 12) — confirm against the sampling interval of the raw data.
len(val_data) / 288

18.086805555555557

In [6]:
# Sensor-to-sensor distances live in a separate, header-less CSV.
# Rows whose distance is exactly 0 (the self loops) are dropped straight away.
distances_df = pd.read_csv(
    '../../datafolder/rawdata/pems/distances_bay_2017.csv',
    header=None,
    names=['from', 'to', 'dist'],
).loc[lambda frame: frame['dist'] != 0]

distances_df

Unnamed: 0,from,to,dist
3,400030,400045,5108.4
4,400030,400065,7401.1
5,400030,400088,10961.1
6,400030,400100,8360.4
7,400030,400122,5430.2
...,...,...,...
8352,414694,404759,11200.3
8353,414694,405701,1016.7
8354,414694,407710,2290.3
8355,414694,408907,8610.4


In [7]:
# Load the DCRNN sensor ids, their sensor-id -> index lookup and their
# adjacency matrix.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only run this on a trusted copy of adj_mx_bay.pkl.
with open('../../datafolder/rawdata/pems/adj_mx_bay.pkl', 'rb') as f:
    sensor_ids, sensor_id_to_ind, adj_mx = pickle.load(f, encoding='latin1')

# Remove the self loops. The identity is sized from the matrix itself rather
# than a hard-coded sensor count, so this keeps working for other sensor sets.
# NOTE(review): subtracting the identity assumes every diagonal entry of
# adj_mx is exactly 1 — confirm for new adjacency files.
adj_mx_no_loop = adj_mx - np.eye(len(adj_mx))

# Reverse lookup: matrix index -> sensor id.
sensor_ind_to_id = {v: k for k, v in sensor_id_to_ind.items()}

# The sensor locations (id, latitude, longitude) come from a header-less CSV;
# the sensor id becomes the index.
location_df = pd.read_csv('../../datafolder/rawdata/pems/graph_sensor_locations_bay.csv', header=None, names=['id', 'lat', 'lon'])
location_df = location_df.set_index('id')

# Point geometries are built as (x=lon, y=lat) in the EPSG:4326 CRS.
gdf = gpd.GeoDataFrame(
    location_df, geometry=gpd.points_from_xy(location_df.lon, location_df.lat), crs='EPSG:4326')



In [8]:
# Build a sparse spatial adjacency matrix from the distance table.
#
# Every sender is linked to its nearest receiver, unless that receiver already
# points back at the sender; in that case the next-nearest receiver is tried.
# Each such skip is recorded by appending the sender to `problem_list`.
spatial_adj_matrix = np.zeros_like(adj_mx_no_loop)
problem_list = []

for sender in distances_df['from'].unique():
    sender_ind = sensor_id_to_ind[str(sender)]

    # Candidate receivers of this sender, nearest first.
    outgoing = distances_df[distances_df['from'] == sender]
    receivers = outgoing.sort_values('dist')['to']

    for receiver in receivers:
        receiver_ind = sensor_id_to_ind[str(receiver)]

        if spatial_adj_matrix[receiver_ind, sender_ind] == 1:
            # The reverse edge already exists: note it and try the next one.
            problem_list.append(sender)
            continue

        spatial_adj_matrix[sender_ind, receiver_ind] = 1
        break

# Make the graph undirected. The matrix only holds 0/1 entries, so the
# element-wise maximum with its transpose is the symmetric 0/1 matrix.
spatial_adj_matrix = np.maximum(spatial_adj_matrix, spatial_adj_matrix.T)

spatial_adj_tensor = torch.Tensor(spatial_adj_matrix)

In [9]:
# Two reference graphs with the same shape as the sparse spatial adjacency.
n_sensors = len(spatial_adj_matrix)

# No edges at all.
empty_adj = np.zeros_like(spatial_adj_matrix)

# Every sensor connected to every other sensor, but not to itself.
full_adj = np.ones_like(spatial_adj_matrix) - np.eye(n_sensors)

In [10]:
# Plot the DCRNN adjacency matrix (self loops removed) on a map; we can see
# that it is a decent proxy for spatial closeness.
PEMS_folium_plot(gdf, adj_mx_no_loop, sensor_ind_to_id)

In [11]:
# Plot my sparse adjacency matrix (built from the distance table) on the same
# kind of map for comparison.
PEMS_folium_plot(gdf, spatial_adj_matrix, sensor_ind_to_id)

In [12]:
# Persist every adjacency matrix built above, one .npy file per matrix.
adjacency_outputs = {
    '../../datafolder/procdata/pems_data/approx_local_adj.npy': adj_mx_no_loop,
    '../../datafolder/procdata/pems_data/sparse_local_adj.npy': spatial_adj_matrix,
    '../../datafolder/procdata/pems_data/pems_full_adj.npy': full_adj,
    '../../datafolder/procdata/pems_data/pems_empty_adj.npy': empty_adj,
}
for out_path, adjacency in adjacency_outputs.items():
    np.save(out_path, adjacency)
