In [None]:
import os
import time
import pickle
import networkx as nx
import pandas as pd
import cudf

In [2]:
# Root directory of the preprocessed input data
DATA_DIR = '../../Input_processed_data'

# IC and road information CSVs
IC_CSV = f'{DATA_DIR}/road_master/ic_preprocessed.csv'
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
IC_SUBNET_CSV = f'{DATA_DIR}/road_master/icnet_sub.csv'


# Search-log CSV locations
def SEARCH_LOG_DIR(month):
    '''Return the directory holding the search-log CSVs of `month`.

    `month` is appended verbatim to the ``csv`` prefix; SEARCH_LOG_CSV passes
    the first six characters of its date string here.
    '''
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date):
    '''Return the path of the search-log CSV for `date` (e.g. '20230901').'''
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'

# 準備

In [3]:
# モジュール内で前処理済み
df_ic = pd.read_csv(IC_CSV, dtype={'ic_code': str})
df_icnet = pd.read_csv(IC_NET_CSV, dtype={'start_code': str, 'end_code': str, 'road_code': str})
sub_icnet = pd.read_csv(IC_SUBNET_CSV, dtype={'start_code': str, 'end_code': str, 'road_code': str})

In [4]:
# Lookup table: IC code -> IC name
code2name = {code: name for code, name in zip(df_ic['ic_code'], df_ic['ic_name'])}
# Reverse lookup: IC name -> IC code (if several codes share a name, the last one wins)
name2code = dict(zip(code2name.values(), code2name.keys()))

In [5]:
# Directed IC graph built from df_icnet: edges run from start_code to end_code
# and carry the listed columns as edge attributes.
edge_attribute_columns = ['distance', 'road_code', 'direction']
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    source='start_code',
    target='end_code',
    edge_attr=edge_attribute_columns,
    create_using=nx.DiGraph(),
)

# 検索ログを簡略化

In [6]:
# Node labels of ic_graph collected into a set; simplify_search_log uses it
# to keep only records whose start and end codes are graph nodes.
ic_nodes_set: set = set(ic_graph.nodes)

In [7]:
def get_log(date):
    '''
    Load the search log of one day as a cudf DataFrame.

    Parameters
    --------------
    date: str
        Day to load, in the form expected by SEARCH_LOG_CSV (e.g. '20230901')

    Returns
    ----------
    cudf.DataFrame or None
        The log read from SEARCH_LOG_CSV(date), or None when that file does not exist
    '''
    csv_path = SEARCH_LOG_CSV(date)
    if os.path.exists(csv_path):
        # Read every code-like column as str
        string_columns = ('start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order')
        raw_log = pd.read_csv(csv_path, dtype=dict.fromkeys(string_columns, str))
        return cudf.from_pandas(raw_log)
    return None

## 列を選択

In [8]:
def simplify_search_log(df):
    '''
    Reduce a raw search log to the records and columns used downstream.

    Parameters
    --------------
    df: DataFrame
        Raw search log with at least the columns date, start_code, end_code,
        spec_day, spec_time, spec_type and car_type

    Returns
    ----------
    DataFrame
        Records whose start_code and end_code are both in ic_nodes_set, with the
        columns datetime, start_code, end_code, spec_datetime, spec_type, car_type
    '''
    # Restrict the records to those whose both endpoints are known IC nodes
    both_known = (df['start_code'].isin(ic_nodes_set)) & (df['end_code'].isin(ic_nodes_set))

    # Restrict the columns
    raw_columns = ['date', 'start_code', 'end_code', 'spec_day', 'spec_time', 'spec_type', 'car_type']
    log = df.loc[both_known].loc[:, raw_columns].rename(columns={'date': 'datetime'})

    # Merge the specified day and time into a single column
    log = log.assign(spec_datetime=log['spec_day'] + ' ' + log['spec_time'])
    log = log.drop(columns=['spec_day', 'spec_time'])

    # Type conversion: categorical codes first, then the two timestamps
    categorical_columns = ['start_code', 'end_code', 'spec_type', 'car_type']
    log = log.astype({column: 'category' for column in categorical_columns})
    log = log.assign(
        datetime=cudf.to_datetime(log['datetime']),
        spec_datetime=cudf.to_datetime(log['spec_datetime']),
    )

    # Final column order
    ordered_columns = ['datetime', 'start_code', 'end_code', 'spec_datetime', 'spec_type', 'car_type']
    return log.loc[:, ordered_columns]

In [9]:
# Load one day of raw search logs, then show its shape and first rows
df = get_log('20230901')
print(df.shape)
df.head()

(373484, 19)


Unnamed: 0,date,start_code,start_name,end_code,end_name,via1_code,via1_name,via2_code,via2_name,via3_code,via3_name,spec_day,spec_time,spec_type,order,car_type,use_nexco,use_urban,use_local
0,2023/09/01 00:00:00,6001006,宝町,1400091,那珂,,,,,,,2023-09-01,07:00,1,2,3,1,1,1
1,2023/09/01 00:00:00,1461080,太田桐生,1040011,浦和（東京方面）,,,,,,,2023-08-31,08:00,1,3,2,1,1,1
2,2023/09/01 00:00:00,1072023,大和まほろばスマート,1612046,須崎東,,,,,,,2023-10-21,15:00,1,2,2,1,1,1
3,2023/09/01 00:00:01,1461080,太田桐生,212B011,大宮,,,,,,,2023-09-01,00:00,1,2,2,1,1,1
4,2023/09/01 00:00:01,1800076,高崎,1040241,仙台宮城,,,,,,,2023-08-31,23:50,1,2,2,1,1,1


In [10]:
# Apply the simplification to the loaded day, then show its shape and first rows
df_simple = simplify_search_log(df)
print(df_simple.shape)
df_simple.head()

(373014, 6)


Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2023-09-01 00:00:00,6001006,1400091,2023-09-01 07:00:00,1,3
1,2023-09-01 00:00:00,1461080,1040011,2023-08-31 08:00:00,1,2
2,2023-09-01 00:00:00,1072023,1612046,2023-10-21 15:00:00,1,2
3,2023-09-01 00:00:01,1461080,212B011,2023-09-01 00:00:00,1,2
4,2023-09-01 00:00:01,1800076,1040241,2023-08-31 23:50:00,1,2


## （第2回コンテスト用）関越道・東北道を通過するレコードのみを抜き出す

### 経路検索用プログラム

In [11]:
def get_route(src: str, dest: str, route_dict: dict):
    '''
    Look up the precomputed route from a source IC to a destination IC.

    Parameters
    --------------
    src: str
        Source IC code
    dest: str
        Destination IC code
    route_dict: dict
        Nested mapping where route_dict[src][dest] is the list of IC codes
        along the route from src to dest

    Returns
    ----------
    path: List[str] or None
        IC codes along the route, or None when either code is missing from
        code2name or route_dict holds no entry for the pair
    '''
    # Both endpoints must be known ICs
    if src not in code2name or dest not in code2name:
        return None
    try:
        return route_dict[src][dest]
    except KeyError:
        # No route exists, or the node is not present on the graph.
        # Only KeyError is caught so that other errors are not silently hidden.
        return None

In [12]:
fname = './route_dict.pkl'

if os.path.exists(fname):  # Reuse the cached route map when it already exists
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that comes from a trusted source (e.g. one written by this notebook).
    with open(fname, 'rb') as f:
        print('Loading IC Routes...')
        route_dict = pickle.load(f)
else:  # Otherwise compute the routes and save them as a binary file
    print('Calculating IC Routes...')
    route_dict = dict(nx.all_pairs_dijkstra_path(ic_graph, weight='distance'))

    # Dump to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated cache that a later run would try to load.
    tmp_fname = fname + '.tmp'
    with open(tmp_fname, 'wb') as f:
        pickle.dump(route_dict, f)
    os.replace(tmp_fname, fname)

### 対象道路のみを抽出

In [20]:
def extract_road_related_queries(df_log: pd.DataFrame, target_road_code_set: set):
    '''
    Keep only the queries whose route uses at least one of the target roads.

    Parameters
    --------------
    df_log: DataFrame
        Search log whose columns at positions 1 and 2 hold the start and end
        IC codes (the rows are read positionally via to_numpy())
    target_road_code_set: set
        Road codes to look for along each route

    Returns
    ----------
    DataFrame
        Rows of df_log whose route (looked up in route_dict) contains an edge
        of ic_graph with a road_code in target_road_code_set, re-indexed from 0.
        Rows for which get_route returns None are dropped.
    '''
    def uses_target_road(route):
        # True as soon as one consecutive IC pair lies on a target road
        return any(
            ic_graph[seg_start][seg_end]['road_code'] in target_road_code_set
            for seg_start, seg_end in zip(route, route[1:])
        )

    matched_positions = []
    for position, row in enumerate(df_log.to_numpy()):
        route = get_route(row[1], row[2], route_dict)
        # A missing route means the query cannot be related to any road
        if route is not None and uses_target_road(route):
            matched_positions.append(position)

    return df_log.iloc[matched_positions].reset_index(drop=True)

In [14]:
# (start_date, end_date) pairs as 'YYYYMMDD' strings, one pair per three-month
# span from 2021-04-01 to 2023-09-30. The processing loop expands each pair
# into daily dates with pd.date_range.
PERIOD_BLOCKS = [
    ('20210401', '20210630'),
    ('20210701', '20210930'),
    ('20211001', '20211231'),
    ('20220101', '20220331'),
    ('20220401', '20220630'),
    ('20220701', '20220930'),
    ('20221001', '20221231'),
    ('20230101', '20230331'),
    ('20230401', '20230630'),
    ('20230701', '20230930'),
]

In [15]:
# road_code values of the roads whose queries are extracted.
# NOTE(review): the section heading names the Kan-Etsu and Tohoku Expressways;
# presumably these two codes are those roads - confirm against the road master.
target_road_code_set = {'1800', '1040'}

In [21]:
# When True, stop after the first date whose log was processed (quick check of
# the pipeline). Set to False to go through every date of every period block.
FIRST_DATE_ONLY = True

for start_date, end_date in PERIOD_BLOCKS:
    date_list = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    for target_date in date_list:
        print('='*20, target_date, '='*20)

        # get_log returns None when the day's CSV does not exist; skip such
        # days instead of passing None to simplify_search_log.
        df_raw_log = get_log(target_date)
        if df_raw_log is None:
            print(f'{target_date} | search log not found, skipped')
            continue

        df_log = simplify_search_log(df_raw_log)

        s = time.time()
        df_related_log = extract_road_related_queries(df_log, target_road_code_set)

        print(f'{target_date} | # of related queries: {len(df_related_log)} ({time.time() - s:.2f} [sec])')
        if FIRST_DATE_ONLY:
            break

    if FIRST_DATE_ONLY:
        break

    # Blank line between period blocks
    print()

20210401 | # of related queries: 64545 (14.46 [sec])


In [22]:
# Compare the size of the simplified log with that of the road-filtered subset
df_log.shape, df_related_log.shape

((217378, 6), (64545, 6))

In [23]:
# First rows of the road-filtered log
df_related_log.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2021-04-01 00:00:01,6016021,1461150,2021-03-31 06:00:00,1,2
1,2021-04-01 00:00:06,214K106,1010066,2021-04-01 00:00:00,1,2
2,2021-04-01 00:00:06,6016021,1461150,2021-03-31 07:00:00,1,2
3,2021-04-01 00:00:10,1040263,1010046,2021-04-01 00:00:00,1,2
4,2021-04-01 00:00:14,1040366,5057020,2021-04-01 00:00:00,1,2


In [None]:
# df_route.to_csv('./route_table.csv', index=False)
# df_route.to_pickle('./route_table.pkl')

In [None]:
# Show the on-disk size of the files matching ./route_table.*
! du -h ./route_table.*

In [None]:
# Load the route table from its pickle file. The lines that would write this
# file are commented out in this notebook, so the file must already exist.
df_route = pd.read_pickle('./route_table.pkl')
df_route.head()

## インデックスを張る

In [None]:
# Index the route table by the (start_code, end_code) pair and print a summary
df_route_indexed = df_route.set_index(['start_code', 'end_code'])
df_route_indexed.info()

In [None]:
# First rows of the indexed route table
df_route_indexed.head()

In [None]:
# df_route_indexed.to_pickle('./route_table_indexed.pkl')

In [None]:
# Show the on-disk size of the files matching ./route_table_indexed.*
! du -h ./route_table_indexed.*

In [None]:
# Reload the indexed route table from its pickle file. The line that would
# write this file is commented out in this notebook, so the file must already exist.
df_route_indexed = pd.read_pickle('./route_table_indexed.pkl')
df_route_indexed.head()