## Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)
import warnings; warnings.filterwarnings("ignore")

## Read Data

In [2]:
train = pd.read_parquet('../data/DC_train.pqt')
test = pd.read_parquet('../data/DC_test.pqt')
train.sample(5)

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,maximum_speed_limit,weight_restricted,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
4414750,TRAIN_4414750,20220328,월,9,1,103,일반국도16호선,50.0,0.0,0,제2가시교,33.364336,126.769409,없음,양수장,33.361717,126.766958,없음,55.0
978519,TRAIN_0978519,20220620,월,17,3,107,연삼로,70.0,0.0,0,제주은행사거리,33.500772,126.543837,있음,산지2교,33.499427,126.541298,없음,15.0
1286170,TRAIN_1286170,20211219,일,18,1,106,지방도1120호선,60.0,0.0,0,농산물집하장,33.318746,126.247148,없음,저청삼거리,33.323197,126.248749,없음,39.0
4201877,TRAIN_4201877,20211103,수,8,1,103,일반국도16호선,50.0,0.0,0,월산1교,33.260178,126.490345,없음,월산2교,33.260056,126.490987,없음,53.0
3939378,TRAIN_3939378,20220101,토,1,2,103,일반국도95호선,80.0,0.0,0,제2고성교,33.438939,126.427353,없음,고성교,33.439569,126.428087,없음,76.0


## Start-End 구성

In [3]:
# start node와 end node가 같은 데이터 비중
print('start=end train:', train.query('start_node_name == end_node_name').shape[0]/train.shape[0])
print('start=end test:', test.query('start_node_name == end_node_name').shape[0]/test.shape[0])

start=end train: 0.19717086022619248
start=end test: 0.18238160149154822


### Node name
- start node name과 end node name이 겹치는 것도 많아 데이터 내 node_name의 유일한 값은 총 490개이다.

In [4]:
pd.Series(train.start_node_name.tolist() + train.end_node_name.tolist()\
          + test.start_node_name.tolist() + test.end_node_name.tolist()).nunique()

490

- `[DC]` node_name의 labeling을 한다.

In [5]:
label_df = pd.concat([train['start_node_name'].rename('node_name'), train['end_node_name'].rename('node_name')]).drop_duplicates()\
           .reset_index(drop=True).reset_index().rename(columns={'index':'node_label'}).set_index('node_name')

for label in pd.concat([test['start_node_name'].rename('node_name'), test['end_node_name'].rename('node_name')]).drop_duplicates():
    if label not in label_df.index:
        label_df.loc[label] = label_df['node_label'].max() + 1

In [6]:
train['start_node_label'] = pd.merge(train[['id','start_node_name']], 
                                     label_df.reset_index().rename(columns={'node_name':'start_node_name'}), 
                                     on='start_node_name', how='left')['node_label']
train['end_node_label'] = pd.merge(train[['id','end_node_name']], 
                                   label_df.reset_index().rename(columns={'node_name':'end_node_name'}), 
                                   on='end_node_name', how='left')['node_label']

In [7]:
test['start_node_label'] = pd.merge(test[['id','start_node_name']], 
                                     label_df.reset_index().rename(columns={'node_name':'start_node_name'}), 
                                     on='start_node_name', how='left')['node_label']
test['end_node_label'] = pd.merge(test[['id','end_node_name']], 
                                   label_df.reset_index().rename(columns={'node_name':'end_node_name'}), 
                                   on='end_node_name', how='left')['node_label']

In [8]:
# labeling한 node_name은 제거한다.
del train['start_node_name'], train['end_node_name']
del test['start_node_name'], test['end_node_name']
display(train.sample(5))

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,maximum_speed_limit,weight_restricted,road_type,start_latitude,start_longitude,start_turn_restricted,end_latitude,end_longitude,end_turn_restricted,target,start_node_label,end_node_label
4517938,TRAIN_4517938,20220323,수,15,1,103,일반국도16호선,60.0,32400.0,3,33.32363,126.760714,없음,33.323817,126.760979,없음,49.0,325,325
1230008,TRAIN_1230008,20220106,목,4,2,107,-,50.0,0.0,0,33.289003,126.744291,없음,33.287997,126.739326,없음,57.0,264,469
1635212,TRAIN_1635212,20211217,금,6,2,103,일반국도16호선,80.0,43200.0,3,33.280084,126.362058,없음,33.280072,126.362147,없음,74.0,161,107
2646575,TRAIN_2646575,20210909,목,16,1,103,일반국도16호선,30.0,0.0,0,33.478001,126.543843,없음,33.477445,126.542703,없음,29.0,15,476
2884487,TRAIN_2884487,20210916,목,14,2,103,일반국도12호선,50.0,0.0,0,33.483719,126.405338,없음,33.485264,126.410896,없음,23.0,48,62


### Edge
- 특정 개수의 Edge(start-end)로 구성되어 있으며 방향이 반대인 Edge도 있다.

In [9]:
print('학습데이터 내 Edge개수:', train[['start_node_label','end_node_label']].drop_duplicates().shape[0])
print('평가데이터 내 Edge개수:', test[['start_node_label','end_node_label']].drop_duplicates().shape[0])

학습데이터 내 Edge개수: 808
평가데이터 내 Edge개수: 400


In [10]:
# 학습데이터 내 Edge 중 2개를 제외하곤 반대방향 Edge가 있다.
ease = train[['start_node_label','end_node_label']].drop_duplicates()
print('반대 방향 Edge가 존재하는 Edge수:', len([i for i in ease.values if i in ease[['end_node_label','start_node_label']].values]))

반대 방향 Edge가 존재하는 Edge수: 806


- `[DC]` Edge를 labeling한다.

In [11]:
label_df = train[['start_node_label','end_node_label']].drop_duplicates()\
           .reset_index(drop=True).reset_index().rename(columns={'index':'edge_label'}).set_index(['start_node_label','end_node_label'])

for label in test[['start_node_label','end_node_label']].drop_duplicates().values:
    if tuple(label) not in label_df.index:
        label_df.loc[tuple(label), 'edge_label'] = label_df['edge_label'].max()+1

In [12]:
train['edge_label'] = pd.merge(train[['id','start_node_label','end_node_label']], label_df.reset_index(), 
                               on=['start_node_label','end_node_label'], how='left')['edge_label']
test['edge_label'] = pd.merge(test[['id','start_node_label','end_node_label']], label_df.reset_index(), 
                               on=['start_node_label','end_node_label'], how='left')['edge_label']
display(train.sample(5))

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,maximum_speed_limit,weight_restricted,road_type,start_latitude,start_longitude,start_turn_restricted,end_latitude,end_longitude,end_turn_restricted,target,start_node_label,end_node_label,edge_label
357363,TRAIN_0357363,20220309,수,12,3,103,일반국도99호선,70.0,0.0,0,33.476308,126.483321,있음,33.478959,126.482307,없음,44.0,68,376,73
389414,TRAIN_0389414,20220210,목,18,3,103,일반국도12호선,70.0,0.0,0,33.503254,126.534803,없음,33.501795,126.531511,있음,17.0,387,184,741
1719742,TRAIN_1719742,20211001,금,13,1,107,연북로,30.0,0.0,0,33.487633,126.540414,있음,33.489894,126.543623,없음,18.0,96,268,103
3442389,TRAIN_3442389,20220130,일,9,2,103,일반국도95호선,80.0,0.0,0,33.438795,126.427168,없음,33.437744,126.425663,없음,69.0,304,55,390
3703304,TRAIN_3703304,20210925,토,1,2,107,-,80.0,0.0,0,33.466433,126.454583,없음,33.465863,126.456384,없음,74.0,275,118,633


- `[DC]` 반대 방향 Edge는 음수로 처리한다.

In [13]:
label_df = train[['start_latitude','start_longitude','end_latitude','end_longitude']].drop_duplicates()\
           .reset_index(drop=True).reset_index().rename(columns={'index':'coordinate_label'})\
           .set_index(['start_latitude','start_longitude','end_latitude','end_longitude'])

for label in test[['start_latitude','start_longitude','end_latitude','end_longitude']].drop_duplicates().values:
    if tuple(label) not in label_df.index:
        label_df.loc[tuple(label), 'coordinate_label'] = label_df['coordinate_label'].max()+1  

In [14]:
label_df = label_df.reset_index()

# 방향이 반대인 것은 Minus로 labeling한다.
for IDX, SLAT, SLON, ELAT, ELON, COOR in label_df.itertuples():
    CHANGE = label_df.loc[IDX:].query('(start_latitude==@ELAT) & (start_longitude==@ELON) &\
                                       (end_latitude==@SLAT) & (end_longitude==@SLON)').index
    if len(CHANGE)==1:
        label_df.iloc[CHANGE, -1] = -1 * COOR

In [15]:
train['coor_label'] = pd.merge(train[['id','start_latitude','start_longitude','end_latitude','end_longitude']], label_df, 
                               on=['start_latitude','start_longitude','end_latitude','end_longitude'], how='left')['coordinate_label']
test['coor_label'] = pd.merge(test[['id','start_latitude','start_longitude','end_latitude','end_longitude']], label_df, 
                              on=['start_latitude','start_longitude','end_latitude','end_longitude'], how='left')['coordinate_label']
display(train.sample(5))

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,maximum_speed_limit,weight_restricted,road_type,...,start_longitude,start_turn_restricted,end_latitude,end_longitude,end_turn_restricted,target,start_node_label,end_node_label,edge_label,coor_label
3998145,TRAIN_3998145,20220205,토,17,1,107,산서로,50.0,0.0,0,...,126.412579,없음,33.262025,126.41436,없음,32.0,188,421,772,-687
1786938,TRAIN_1786938,20220328,월,15,2,103,일반국도12호선,60.0,0.0,0,...,126.450593,없음,33.493546,126.445105,없음,35.0,229,148,277,-170
2448837,TRAIN_2448837,20211222,수,10,2,103,일반국도12호선,50.0,0.0,0,...,126.845361,없음,33.336848,126.842086,없음,47.0,441,102,676,-112
4693535,TRAIN_4693535,20210908,수,0,2,106,중산간서로,70.0,0.0,0,...,126.446843,없음,33.461055,126.444333,없음,24.0,215,460,493,534
469106,TRAIN_0469106,20220203,목,11,3,103,일반국도12호선,70.0,0.0,0,...,126.512046,없음,33.500103,126.512851,있음,20.0,294,20,646,-20


## Save Data

In [16]:
train.to_parquet('../data/DC_train.pqt', engine='pyarrow', index=False)
test.to_parquet('../data/DC_test.pqt', engine='pyarrow', index=False)