# Create a series of graphs of the road network

In [1]:
# common settings
%matplotlib inline
import lab.setup
import functools
import pandas as pd
import numpy as np
import numba

from pandas.tseries.offsets import *

# Region settings (temporal and spatial).
g_region_temporal = 5
g_region_spatial = 1

# The window covers a single day; the *_time values expand the two dates to the
# first and last second of that day.
g_start_date = g_end_date = '2016-03-03'
g_start_time = f'{g_start_date} 00:00:00'
g_end_time = f'{g_end_date} 23:59:59'

# Directory that holds the input and output data files.
DATA_PATH = 'dataset'

In [133]:
# Load the ds_filled_s1.csv table generated by linear.ipynb. The file lives under
# DATA_PATH, the same directory the panel is pickled to later in this notebook.
# link_ID is read as uint64 because some link IDs are larger than 2**63 - 1.
ds_train_full = pd.read_csv(f'{DATA_PATH}/ds_filled_s1.csv', dtype={'link_ID':'uint64'}, low_memory=False)
ds_train_full.head(1)

Unnamed: 0,link_ID,time_intv,date,time_interval,travel_time,in_links,out_links,filled,uplink_0,uplink_1,uplink_2,uplink_3,downlink_0,downlink_1,downlink_2,downlink_3,uplink_mean_tt,downlink_mean_tt
0,3377906280028510514,2016-03-03 00:00:00,,,5.1,4377906282541600514,4377906280763800514,True,4377906282541600514,0,0,0,4377906280763800514,0,0,0,55.4,8.4


## Crossings as vertices, links as edges

In [138]:
# The dataset records the travel time of every road link. We assume a sensor sits at
# the end of each link and use that sensor as a graph vertex, so the travel time of
# an upstream/downstream link becomes the weight of the edge between two sensors.
links = ds_train_full.link_ID.unique()

# One snapshot per distinct timestamp of the whole table (first-appearance order).
# This replaces a groupby loop that only kept the timestamps of the last link, which
# dropped timestamps missing from that link and failed on an empty table.
time_intv = ds_train_full.time_intv.drop_duplicates()
sample_size = time_intv.shape[0]

# NOTE: pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so this cell
# requires an older pandas release.
# Axes: items = timestamps, major_axis = link, minor_axis = link; all weights start at 0.
panel = pd.Panel(np.zeros((sample_size, links.shape[0], links.shape[0])))
panel.items      = pd.to_datetime(time_intv)
panel.major_axis = links
panel.minor_axis = links

# For every item (i.e. one timestamp) the matching raw records are walked further
# down to fill in the topology and travel times; index the table by timestamp so
# those records can be selected with .loc.
ds_train_full.index = pd.to_datetime(ds_train_full.time_intv)
# pandas uint64index does not supports keys larger than 2**63 - 1
def _get(df, key):
    i = df.index.get_loc(key)
    return df.iloc[i]

def _build_one_snapshot(ds_row, ds):
    """Write the outgoing edge weights of one link into its timestamp's snapshot.

    For every non-zero value in ``downlink_0`` .. ``downlink_3`` of ``ds_row``, the
    cell ``[link_ID, downstream link]`` of the snapshot selected from the
    module-level ``panel`` by ``ds_row['time_intv']`` is set to the downstream
    link's ``travel_time``.

    Parameters
    ----------
    ds_row : one record providing ``link_ID``, ``time_intv`` and ``downlink_0..3``.
    ds : records indexed by ``link_ID``; each must provide ``travel_time``.

    Returns
    -------
    None. The module-level ``panel`` is modified in place.
    """
    # A downlink value of 0 is skipped, exactly as before.
    nbr_links = [
        nbr_link
        for nbr_link in (ds_row.loc[f'downlink_{k}'] for k in range(4))
        if nbr_link != 0
    ]
    if not nbr_links:
        return

    frame = panel.loc[ds_row.loc['time_intv']]
    # The target row only depends on this record, so look it up once instead of
    # once per downstream link.
    # NOTE(review): the assignment below reaches `panel` only if panel.loc[...] and
    # .iloc[...] return views rather than copies - presumably true for this
    # single-dtype panel; confirm against the pandas version in use.
    row = _get(frame, ds_row.loc['link_ID'])
    for nbr_link in nbr_links:
        ds_row_nbr = _get(ds, nbr_link)
        i = row.index.get_loc(nbr_link)
        row.iloc[i] = ds_row_nbr.loc['travel_time']
    
# Fill the panel one timestamp at a time: select the records of that timestamp and
# let every record write its downstream edges into the matching snapshot.
for time_intv in panel.items:
    ds = ds_train_full.loc[time_intv]
    ds_index_by_link = ds.set_index('link_ID')
    fill_snapshot = functools.partial(_build_one_snapshot, ds=ds_index_by_link)
    # apply() is used only for its side effect on `panel`; its result is discarded.
    ds.apply(fill_snapshot, axis=1)

In [144]:
# Persist the filled panel under DATA_PATH.
graph_panel_path = f'{DATA_PATH}/ds_graph_panel'
panel.to_pickle(graph_panel_path)

In [98]:
# `df` was never assigned in this notebook, so this cell failed on a fresh kernel.
# NOTE(review): the recorded output (132 x 132 float64, indexed by link ID) looks
# like a single snapshot of `panel`, so the first one is used here - confirm.
df = panel.iloc[0]
df.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 132 entries, 3377906280028510514 to 9377906289175510514
Columns: 132 entries, 3377906280028510514 to 9377906289175510514
dtypes: float64(132)
memory usage: 142.2 KB
