このファイルについて
- about
    - 検索履歴データから館山道・関越道・東北道の各区間の5分単位の検索数を計算する
- author: 松永
- Input
    - ../Input_processed_data/search_records/csv202xxx/*
- Output
    - 時間指定あり検索
        - ../Input_processed_data/search_count/search-count_{tateyama or kannetsu}.csv
    - 時間指定なし検索
        - ../Input_processed_data/search_count/search-count_{tateyama or kannetsu}_unspecified.csv

In [1]:
import os
import time
import tqdm
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cudf

import warnings
warnings.simplefilter('ignore')

In [2]:
# Root directory of the pre-processed input data
DATA_DIR = '../../Input_processed_data'

# Road-master CSVs: IC list, full IC network, and the sub-network whose
# 'limit' column supplies the per-section speed limits
IC_CSV = f'{DATA_DIR}/road_master/ic_merged.csv'
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
# IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/tateyama_kannetsu_icnet.csv'
IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/touhoku_icnet.csv'


# Search-log CSVs
def SEARCH_LOG_DIR(month):
    '''Return the directory that holds the search logs of `month` (e.g. '202308').'''
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date):
    '''Return the path of the search-log CSV of `date` (e.g. '20230817').'''
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'


# CSVs that receive the computed search counts
## searches with a specified date/time
SEARCH_COUNT_CSV_KANNETSU = './search_count/search-count_kannetsu.csv'
SEARCH_COUNT_CSV_TATEYAMA = './search_count/search-count_tateyama.csv'
SEARCH_COUNT_CSV_TOUHOKU = './search_count/search-count_touhoku.csv'

### 準備

In [3]:
# Master tables (already pre-processed inside the module).
# All code columns are read as strings.
df_ic = pd.read_csv(IC_CSV, dtype={'ic_code': str})
df_icnet, sub_icnet = (
    pd.read_csv(net_csv, dtype={'start_code': str, 'end_code': str, 'road_code': str})
    for net_csv in (IC_NET_CSV, IC_NET_SUB_CSV)
)

# Lookup tables between IC code and IC name, in both directions
code2name = dict(zip(df_ic['ic_code'], df_ic['ic_name']))
name2code = dict(zip(code2name.values(), code2name.keys()))

# Directed graph of the IC network; each edge carries distance, road_code and direction
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    source='start_code',
    target='end_code',
    edge_attr=['distance', 'road_code', 'direction'],
    create_using=nx.DiGraph(),
)

In [4]:
# Table of per-section speed limits, plus a (start_code, end_code) -> limit lookup
df_limits = sub_icnet[['start_code', 'end_code', 'start_name', 'end_name', 'road_code', 'limit']]

limit_dict = dict(
    zip(
        zip(df_limits['start_code'], df_limits['end_code']),
        df_limits['limit'],
    )
)

In [5]:
# cuDF versions of the pandas IC-network and speed-limit tables
cudf_icnet = cudf.from_pandas(df_icnet)
cudf_limits = cudf.from_pandas(df_limits)

In [6]:
def str2time(time_str, format='%H:%M'):
    '''
    Parse a time given as text into a datetime.time object.

    Parameters
    ----------
    time_str: str
        Time of day as text
    format: str
        strptime format that `time_str` follows

    Returns
    -------
    time: datetime.time
    '''
    return dt.datetime.strptime(time_str, format).time()

In [7]:
def get_route(ic_graph, src_name, target_name):
    '''
    Look up the pre-computed route from the origin IC (src_name) to the
    destination IC (target_name).

    Parameters
    --------------
    ic_graph: nx.DiGraph
        Directed graph of the IC connections. It is not consulted here (the route
        is read from the module-level PATH_DICT); the parameter is kept so that
        existing callers keep working.
    src_name: str
        Name of the origin IC
    target_name: str
        Name of the destination IC

    Returns
    ----------
    path: List[str] or None
        IC codes along the route, or None when either name is unknown or
        PATH_DICT holds no route between the two ICs.
    '''
    if not (src_name in name2code and target_name in name2code):
        return None

    src = name2code[src_name]
    target = name2code[target_name]

    try:
        return PATH_DICT[src][target]
    except KeyError:
        # No route exists, or the node is not part of the graph.
        # Only KeyError is handled: any other error (for example PATH_DICT not
        # having been built yet) must surface instead of looking like "no route".
        return None


def get_route_with_time(ic_graph, src_name, target_name, departure_time='12:00', arrival_time=None, spec_date=dt.date.today()):
    '''
    ic_graph上で出発地（src_name）から目的地（target_name）までの予想通過時刻付き経路を得る関数

    Parameters
    ----------
    ic_graph: nx.DiGraph
        ICの繋がりを表す有向グラフ
    src_name: str
        出発IC名
    target_name: str
        目的IC名
    departure_time: str or datetime.time
        出発時刻
    arrival_time: str or datetime.time
        到着時刻
    spec_date: str or dateitme.date
        指定日

    Returns
    -------
    path: List[str]
        経路中の各IC codeのリスト
    arrival_time_list: List[datetime.time]
        経路中の各ICへの予想到着時刻（datetime.time型）のリスト
    '''
    # 関越道・館山道 以外の道路の移動速度は80km/hと仮定する
    DEFAULT_SPEED = 80

    path = get_route(ic_graph, src_name, target_name)

    # 経路が存在しない場合
    if path is None:
        return (None, None)
    
    try:
        if isinstance(departure_time, str):
            departure_time = str2time(departure_time)
        if isinstance(arrival_time, str):
            arrival_time = str2time(arrival_time)
    except (ValueError, TypeError) as e:
        return (None, None)
    
    if isinstance(spec_date, str):
        spec_date = dt.datetime.strptime(spec_date, '%Y/%m/%d').date()

    elapsed = dt.timedelta()
    elapsed_time_list = [elapsed]

    for i in range(len(path)-1):
        s, t = path[i], path[i+1]

        dist = ic_graph[s][t]['distance']
        limit_speed = limit_dict.get((s, t), DEFAULT_SPEED)

        # s -> t　までの所要時間を算出
        td = dt.timedelta(hours = dist / limit_speed)

        elapsed += td
        elapsed_time_list.append(elapsed)

    if arrival_time:
        spec_datetime = dt.datetime.combine(spec_date, arrival_time)
        time_list = [spec_datetime - td for td in elapsed_time_list[::-1]]
    else:
        spec_datetime = dt.datetime.combine(spec_date, departure_time)
        time_list = [spec_datetime + td for td in elapsed_time_list]

    return path, time_list

In [8]:
# Shortest path (weighted by 'distance') between every pair of nodes of ic_graph,
# computed once up front; get_route() reads it as PATH_DICT[src_code][target_code]
PATH_DICT = dict(nx.all_pairs_dijkstra_path(ic_graph, weight='distance'))

In [9]:
def get_log(date):
    '''
    Load the search log of one day as a cuDF DataFrame.

    Parameters
    ----------
    date: str
        Day to load, in the form expected by SEARCH_LOG_CSV

    Returns
    -------
    df: cudf.DataFrame or None
        The log with its code columns and 'order' read as strings,
        or None when the CSV of that day does not exist.
    '''
    csv_path = SEARCH_LOG_CSV(date)

    if os.path.exists(csv_path):
        str_columns = ('start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order')
        log = pd.read_csv(csv_path, dtype={column: str for column in str_columns})
        return cudf.from_pandas(log)

    # print(f'{SEARCH_LOG_CSV(date)} not exists')
    return None


def get_unspecified_logs(date):
    '''
    Extract the searches made without a time specification from one day's log.

    A search counts as "unspecified" when its specified datetime
    (spec_day + spec_time) lies less than 15 minutes away from the 'date'
    column of the same row.

    Parameters
    --------------
    date: str
        Day whose search log is loaded (passed to get_log)

    Returns
    ----------
    unspecified: cudf.DataFrame or None
        Searches without a time specification, re-indexed from 0,
        or None when no log exists for `date`.
    '''
    df = get_log(date)

    # get_log returns None for a day without a log file
    if df is None:
        return None

    search_dt = cudf.to_datetime(df['date'])
    specified_dt = cudf.to_datetime(df['spec_day'] + ' ' + df['spec_time'], errors='coerce')

    # Absolute difference in seconds
    t_diff = abs((search_dt - specified_dt) / np.timedelta64(1, 's'))

    td_thresh = 15 * 60
    unspecified = df[t_diff < td_thresh].reset_index(drop=True)

    return unspecified


def get_past_logs(target_date, periods, include_target=False):
    '''
    Collect, from the search logs of the days leading up to target_date, the
    searches whose specified day is target_date.

    Parameters
    --------------
    target_date: str or List[str]
        Day(s) the counts are computed for, as '%Y%m%d' text (single value or list)
    periods: int
        Number of past days of logs to read
    include_target: bool
        Whether the log of target_date itself (the earliest one when a list is
        given) is part of the `periods` days that are read

    Returns
    ----------
    df_all: cudf.DataFrame
        Rows of the logs that were read whose 'spec_day' is one of the target dates

    Raises
    ------
    FileNotFoundError
        When none of the days to read has a log file.
    '''
    if isinstance(target_date, list):
        end_date = min(target_date)
        target_date_set = set(target_date)
    else:
        end_date = target_date
        target_date_set = {target_date}

    # Last day whose log is read. Shifting the end point by one day replaces
    # date_range(..., closed='left'), which pandas 2.0 no longer accepts.
    last_day = pd.Timestamp(end_date)
    if not include_target:
        last_day = last_day - pd.Timedelta(days=1)

    days = [d.strftime('%Y%m%d') for d in pd.date_range(end=last_day, periods=periods)]

    frames = []
    for d in days:
        df = get_log(d)

        if df is None:
            continue

        # Keep only the rows whose specified day (%Y-%m-%d) matches a target date (%Y%m%d)
        frames.append(df[df['spec_day'].str.replace('-', '').isin(target_date_set)])

    if not frames:
        raise FileNotFoundError(f'no search log found for any of {days}')

    # Concatenate once instead of growing the frame inside the loop
    df_all = cudf.concat(frames, ignore_index=True)

    return df_all

## functions

In [10]:
def expand_search_path(df, road_code):
    '''
    Break every searched route into its individual edges.
    A searched route A => B => C => D becomes the edges A => B, B => C and C => D.

    Parameters
    --------------
    df: cudf.DataFrame
        Search-log rows; start_code, end_code, start_name, end_name, spec_type,
        spec_time and spec_day are read
    road_code: str
        Road code; only edges that lie on this road are kept

    Returns
    ----------
    result: cudf.DataFrame
        One row per kept edge: the IC codes at both ends (start_code, end_code)
        and the estimated time at which the edge is entered (passing_time)
    '''
    known_codes = set(ic_graph.nodes)

    # Searches whose two end points are not both nodes of the graph are dropped
    both_known = df['start_code'].isin(known_codes) & df['end_code'].isin(known_codes)
    searches = df.loc[both_known]
    rows = searches[['start_name', 'end_name', 'spec_type', 'spec_time', 'spec_day']].to_numpy()

    edges = []  # (start_code, end_code, passing_time)
    for s_name, t_name, spec_type, spec_time, spec_day in rows:
        # spec_type 1 passes spec_time as the departure time, anything else as the arrival time
        time_kwargs = {'spec_date': spec_day.replace('-', '/')}
        if spec_type == 1:
            time_kwargs['departure_time'] = spec_time
        else:
            time_kwargs['arrival_time'] = spec_time

        path, time_list = get_route_with_time(ic_graph, s_name, t_name, **time_kwargs)
        if path is None:
            continue

        for idx in range(len(path) - 1):
            s, t = path[idx], path[idx + 1]
            if ic_graph[s][t]['road_code'] == road_code:
                edges.append((s, t, time_list[idx]))

    result = cudf.DataFrame({
        'start_code': [edge[0] for edge in edges],
        'end_code': [edge[1] for edge in edges],
        'passing_time': [edge[2] for edge in edges],
    })

    return result

In [11]:
def timeslice_grouping(df, timeslice):
    '''
    Resample the edge-expanded searches with `timeslice` and sum the number of
    searches per edge and time bin.

    Parameters
    --------------
    df: cudf.DataFrame
        Edge-expanded searches (start_code, end_code, passing_time)
    timeslice:
        Resampling frequency (values such as 1D, 1h or 5min are expected)

    Returns
    ---------
    result: cudf.DataFrame
        Indexed by passing_time, with start_code, end_code and the summed 'search' count
    '''
    def _resampled_search_sum(group):
        # Sum of the per-row counts inside every time bin of one edge
        return group['search'].resample(timeslice).sum()

    # Every row contributes one search; the codes become categoricals
    prepared = df.assign(
        search=1,
        passing_time=cudf.to_datetime(df['passing_time']),
        start_code=df['start_code'].astype('category'),
        end_code=df['end_code'].astype('category'),
    )

    # The grouping and resampling are done in pandas, then converted back to cuDF
    indexed = prepared.set_index('passing_time').to_pandas()
    per_edge = indexed.groupby(['start_code', 'end_code']).apply(_resampled_search_sum)
    counts = per_edge.reset_index().set_index('passing_time')

    return cudf.from_pandas(counts)

In [12]:
def count_minutely(date, past_periods, road_code):
    '''
    For the day `date`, read the search logs of the past `past_periods` days
    and sum, in 5-minute bins, the searches over the edges of `road_code`.

    Returns a cudf.DataFrame restricted to the rows whose passing_time falls on `date`.
    '''
    past_logs = get_past_logs(date, periods=past_periods)
    edge_passes = expand_search_path(past_logs, road_code)
    counts_5min = timeslice_grouping(edge_passes, '5min')

    # Keep only the bins that belong to `date` (label lookup on the passing_time index)
    counts_on_date = counts_5min.to_pandas().loc[date]
    return cudf.from_pandas(counts_on_date)

## 時間指定あり検索数の作成

In [13]:
# (start_date, end_date) pairs, both as '%Y%m%d' text; the write cells below process
# every day from start_date to end_date of each active block.
# NOTE: the block starting on '20210402' is special-cased by those cells (it
# overwrites the output CSV); every other block is appended to the existing file.
period_blocks = [
    # ('20210402', '20210630'),
    # ('20210701', '20210930'),
    # ('20211001', '20211231'),
    # ('20220101', '20220331'),
    # ('20220401', '20220630'),
    # ('20220701', '20220930'),
    # ('20221001', '20221231'),
    # ('20230101', '20230331'),
    # ('20230401', '20230630'),
    # ('20230701', '20230731'),
    # ('20230801', '20230816'),
    ('20230817', '20230930'),
]

### 関越道

In [17]:
# Read the search logs of the past 7 days
past_periods = 7
# Kan-etsu Expressway
road_code = '1800'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    # Collect the per-day results and concatenate once (concat inside the loop is quadratic)
    daily_counts = []
    for date in DAYS:
        s = time.time()

        df_minutely = count_minutely(date, past_periods, road_code)

        print('-'*30, f'[{date}] *{len(df_minutely)} records* ({time.time() - s:.3f} sec)', '-'*30)

        daily_counts.append(df_minutely)

    df_count = cudf.concat(daily_counts) if daily_counts else cudf.DataFrame()
    df_count = df_count.reset_index()

    os.makedirs(os.path.dirname(SEARCH_COUNT_CSV_KANNETSU), exist_ok=True)
    if start_date == '20210402':
        # The first block (re)creates the file together with its header
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_KANNETSU, index=False)
    else:
        # Later blocks are appended; the header is written only when the file
        # does not exist yet, so the CSV never ends up without a header row
        write_header = not os.path.exists(SEARCH_COUNT_CSV_KANNETSU)
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_KANNETSU, mode='a', header=write_header, index=False)

    print()

------------------------------ [20230817] *27645 records* (30.405 sec) ------------------------------
------------------------------ [20230818] *27646 records* (30.420 sec) ------------------------------
------------------------------ [20230819] *27646 records* (43.509 sec) ------------------------------
------------------------------ [20230820] *27644 records* (31.041 sec) ------------------------------
------------------------------ [20230821] *27458 records* (24.500 sec) ------------------------------
------------------------------ [20230822] *27612 records* (20.491 sec) ------------------------------
------------------------------ [20230823] *27602 records* (19.257 sec) ------------------------------
------------------------------ [20230824] *27580 records* (18.455 sec) ------------------------------
------------------------------ [20230825] *27571 records* (24.497 sec) ------------------------------
------------------------------ [20230826] *27632 records* (39.968 sec) -----------

In [18]:
!head -n5 ./search_count/search-count_kannetsu.csv

passing_time,start_code,end_code,search
2021-04-02 00:00:00,1080291,1800186,0
2021-04-02 00:05:00,1080291,1800186,0
2021-04-02 00:10:00,1080291,1800186,0
2021-04-02 00:15:00,1080291,1800186,0


In [19]:
!tail -n5 ./search_count/search-count_kannetsu.csv

2023-09-30 23:35:00,1800186,1800183,0
2023-09-30 23:40:00,1800186,1800183,8
2023-09-30 23:45:00,1800186,1800183,6
2023-09-30 23:50:00,1800186,1800183,0
2023-09-30 23:55:00,1800186,1800183,0


### 東北道

In [17]:
# Read the search logs of the past 7 days
past_periods = 7
# Tohoku Expressway
road_code = '1040'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    # Collect the per-day results and concatenate once (concat inside the loop is quadratic)
    daily_counts = []
    for date in DAYS:
        s = time.time()

        df_minutely = count_minutely(date, past_periods, road_code)

        print('-'*30, f'[{date}] *{len(df_minutely)} records* ({time.time() - s:.3f} sec)', '-'*30)

        daily_counts.append(df_minutely)

    df_count = cudf.concat(daily_counts) if daily_counts else cudf.DataFrame()
    df_count = df_count.reset_index()

    os.makedirs(os.path.dirname(SEARCH_COUNT_CSV_TOUHOKU), exist_ok=True)
    if start_date == '20210402':
        # The first block (re)creates the file together with its header
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_TOUHOKU, index=False)
    else:
        # Later blocks are appended; the header is written only when the file
        # does not exist yet, so the CSV never ends up without a header row
        write_header = not os.path.exists(SEARCH_COUNT_CSV_TOUHOKU)
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_TOUHOKU, mode='a', header=write_header, index=False)

    print()

------------------------------ [20230817] *67931 records* (30.878 sec) ------------------------------
------------------------------ [20230818] *67794 records* (31.188 sec) ------------------------------
------------------------------ [20230819] *67623 records* (44.126 sec) ------------------------------
------------------------------ [20230820] *67907 records* (31.516 sec) ------------------------------
------------------------------ [20230821] *67834 records* (24.802 sec) ------------------------------
------------------------------ [20230822] *67848 records* (20.834 sec) ------------------------------
------------------------------ [20230823] *67800 records* (19.794 sec) ------------------------------
------------------------------ [20230824] *67611 records* (18.972 sec) ------------------------------
------------------------------ [20230825] *67909 records* (25.087 sec) ------------------------------
------------------------------ [20230826] *67935 records* (40.779 sec) -----------

In [18]:
!head -n5 ./search_count/search-count_touhoku.csv

passing_time,start_code,end_code,search
2021-04-02 00:00:00,1040001,1040011,3
2021-04-02 00:05:00,1040001,1040011,0
2021-04-02 00:10:00,1040001,1040011,3
2021-04-02 00:15:00,1040001,1040011,0


In [19]:
!tail -n5 ./search_count/search-count_touhoku.csv

2023-09-30 23:30:00,1040456,1040451,4
2023-09-30 23:35:00,1040456,1040451,0
2023-09-30 23:40:00,1040456,1040451,3
2023-09-30 23:45:00,1040456,1040451,0
2023-09-30 23:50:00,1040456,1040451,2


In [20]:
!wc -l ./search_count/search-count_touhoku.csv

61013596 ./search_count/search-count_touhoku.csv


### 館山道

In [14]:
# Read the search logs of the past 7 days
past_periods = 7
# Tateyama Expressway
road_code = '1130'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    # Collect the per-day results and concatenate once (concat inside the loop is quadratic)
    daily_counts = []
    for date in DAYS:
        s = time.time()

        df_minutely = count_minutely(date, past_periods, road_code)

        print('-'*30, f'[{date}] *{len(df_minutely)} records* ({time.time() - s:.3f} sec)', '-'*30)

        daily_counts.append(df_minutely)

    df_count = cudf.concat(daily_counts) if daily_counts else cudf.DataFrame()
    df_count = df_count.reset_index()

    os.makedirs(os.path.dirname(SEARCH_COUNT_CSV_TATEYAMA), exist_ok=True)
    if start_date == '20210402':
        # The first block (re)creates the file together with its header
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_TATEYAMA, index=False)
    else:
        # Later blocks are appended; the header is written only when the file
        # does not exist yet, so the CSV never ends up without a header row
        write_header = not os.path.exists(SEARCH_COUNT_CSV_TATEYAMA)
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_TATEYAMA, mode='a', header=write_header, index=False)

    print()

------------------------------ [20210402] *3160 records* (3.037 sec) ------------------------------
------------------------------ [20210403] *3890 records* (8.045 sec) ------------------------------
------------------------------ [20210404] *3425 records* (6.016 sec) ------------------------------
------------------------------ [20210405] *3412 records* (6.679 sec) ------------------------------
------------------------------ [20210406] *3514 records* (6.969 sec) ------------------------------
------------------------------ [20210407] *3017 records* (7.971 sec) ------------------------------
------------------------------ [20210408] *3054 records* (8.511 sec) ------------------------------
------------------------------ [20210409] *4278 records* (9.944 sec) ------------------------------
------------------------------ [20210410] *4693 records* (16.865 sec) ------------------------------
------------------------------ [20210411] *4512 records* (11.447 sec) -----------------------------

In [15]:
!head -n5 ./search_count/search-count_tateyama.csv

passing_time,start_code,end_code,search
2021-04-02 00:10:00,1130001,1130006,1
2021-04-02 00:15:00,1130001,1130006,1
2021-04-02 00:20:00,1130001,1130006,0
2021-04-02 00:25:00,1130001,1130006,0


In [16]:
!tail -n5 ./search_count/search-count_tateyama.csv

2023-08-16 22:00:00,1130041,1130046,0
2023-08-16 22:05:00,1130041,1130046,0
2023-08-16 22:10:00,1130041,1130046,0
2023-08-16 22:15:00,1130041,1130046,0
2023-08-16 22:20:00,1130041,1130046,2
