このファイルについて
- about
    - 検索履歴データから館山道・関越道・東北道の各区間の検索数を計算する（時間指定あり検索は5分単位、時間指定なし検索は1日単位で集計）
- author: 松永
- Input
    - ../Input_processed_data/search_records/csv202xxx/*
- Output
    - 時間指定あり検索
        - ../Input_processed_data/search_count/search-count_{tateyama or kannetsu}.csv
    - 時間指定なし検索
        - ../Input_processed_data/search_count/search-count_{tateyama or kannetsu or touhoku}_unspecified.csv

In [1]:
import os
import time
import tqdm
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cudf

import warnings
warnings.simplefilter('ignore')

In [17]:
# Root directory of the preprocessed input data
DATA_DIR = '../../Input_processed_data'

# IC / road network master CSVs
IC_CSV = f'{DATA_DIR}/road_master/ic_merged.csv'
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
# IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/tateyama_kannetsu_icnet.csv'
IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/touhoku_icnet.csv'


# Search-log CSV locations.
# The upper-case names are kept because other cells call them by these names.
def SEARCH_LOG_DIR(month):
    """Return the directory that holds the search-log CSVs of `month` ('YYYYMM')."""
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date):
    """Return the search-log CSV path for `date` ('YYYYMMDD')."""
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'


# CSVs that store the search counts
## searches without a specified date/time
SEARCH_COUNT_CSV_KANNETSU_UNSPECIFIED = './search_count/search-count_kannetsu_unspecified.csv'
SEARCH_COUNT_CSV_TATEYAMA_UNSPECIFIED = './search_count/search-count_tateyama_unspecified.csv'
SEARCH_COUNT_CSV_TOUHOKU_UNSPECIFIED = './search_count/search-count_touhoku_unspecified.csv'

### 準備

In [3]:
# Already preprocessed inside the module (translation of the original note).
# The code columns are read as strings.
df_ic = pd.read_csv(IC_CSV, dtype={'ic_code': str})

edge_code_dtypes = dict.fromkeys(('start_code', 'end_code', 'road_code'), str)
df_icnet = pd.read_csv(IC_NET_CSV, dtype=edge_code_dtypes)
sub_icnet = pd.read_csv(IC_NET_SUB_CSV, dtype=edge_code_dtypes)

# IC code <-> IC name lookup tables
code2name = dict(zip(df_ic['ic_code'], df_ic['ic_name']))
name2code = {ic_name: ic_code for ic_code, ic_name in code2name.items()}

# Directed graph with one edge per row of the IC network table
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    source='start_code',
    target='end_code',
    edge_attr=['distance', 'road_code', 'direction'],
    create_using=nx.DiGraph(),
)

In [4]:
# Build a table and a map holding the speed limit of each section
limit_columns = ['start_code', 'end_code', 'start_name', 'end_name', 'road_code', 'limit']
df_limits = sub_icnet.loc[:, limit_columns]

# (start_code, end_code) -> limit
limit_dict = {}
for s_code, e_code, lim in zip(df_limits['start_code'], df_limits['end_code'], df_limits['limit']):
    limit_dict[(s_code, e_code)] = lim

In [5]:
# cudf copies of the IC network table and the speed-limit table
cudf_icnet, cudf_limits = map(cudf.from_pandas, (df_icnet, df_limits))

In [7]:
def str2time(time_str, format='%H:%M'):
    '''
    Convert a time given as a string into a datetime.time object.

    Parameters
    ----------
    time_str: str
        Time of day as text
    format: str
        strptime format that time_str follows

    Returns
    -------
    time: datetime.time

    Raises
    ------
    ValueError
        Raised by strptime when time_str does not match format
    '''
    return dt.datetime.strptime(time_str, format).time()

In [8]:
def get_route(ic_graph, src_name, target_name):
    '''
    Look up the route from the origin IC (src_name) to the destination IC (target_name).

    The route is read from the module-level PATH_DICT table; IC names are
    translated to IC codes through the module-level name2code map.

    Parameters
    ----------
    ic_graph: nx.DiGraph
        Directed graph of IC connections. Kept for interface compatibility;
        this function does not read it.
    src_name: str
        Name of the origin IC
    target_name: str
        Name of the destination IC

    Returns
    -------
    path: List[str] or None
        IC codes along the route, or None when either name is unknown or
        PATH_DICT has no entry for the pair.
    '''
    src = name2code.get(src_name)
    target = name2code.get(target_name)
    if src is None or target is None:
        return None

    try:
        return PATH_DICT[src][target]
    except KeyError:
        # No route between the two ICs, or a node missing from the table.
        # Only KeyError is treated as "no route"; any other error (for
        # example PATH_DICT not having been built yet) is left to propagate.
        return None


def get_route_with_time(ic_graph, src_name, target_name, departure_time='12:00', arrival_time=None, spec_date=dt.date.today()):
    '''
    ic_graph上で出発地（src_name）から目的地（target_name）までの予想通過時刻付き経路を得る関数

    Parameters
    ----------
    ic_graph: nx.DiGraph
        ICの繋がりを表す有向グラフ
    src_name: str
        出発IC名
    target_name: str
        目的IC名
    departure_time: str or datetime.time
        出発時刻
    arrival_time: str or datetime.time
        到着時刻
    spec_date: str or dateitme.date
        指定日

    Returns
    -------
    path: List[str]
        経路中の各IC codeのリスト
    arrival_time_list: List[datetime.time]
        経路中の各ICへの予想到着時刻（datetime.time型）のリスト
    '''
    # 関越道・館山道 以外の道路の移動速度は80km/hと仮定する
    DEFAULT_SPEED = 80

    path = get_route(ic_graph, src_name, target_name)

    # 経路が存在しない場合
    if path is None:
        return (None, None)
    
    try:
        if isinstance(departure_time, str):
            departure_time = str2time(departure_time)
        if isinstance(arrival_time, str):
            arrival_time = str2time(arrival_time)
    except (ValueError, TypeError) as e:
        return (None, None)
    
    if isinstance(spec_date, str):
        spec_date = dt.datetime.strptime(spec_date, '%Y/%m/%d').date()

    elapsed = dt.timedelta()
    elapsed_time_list = [elapsed]

    for i in range(len(path)-1):
        s, t = path[i], path[i+1]

        dist = ic_graph[s][t]['distance']
        limit_speed = limit_dict.get((s, t), DEFAULT_SPEED)

        # s -> t　までの所要時間を算出
        td = dt.timedelta(hours = dist / limit_speed)

        elapsed += td
        elapsed_time_list.append(elapsed)

    if arrival_time:
        spec_datetime = dt.datetime.combine(spec_date, arrival_time)
        time_list = [spec_datetime - td for td in elapsed_time_list[::-1]]
    else:
        spec_datetime = dt.datetime.combine(spec_date, departure_time)
        time_list = [spec_datetime + td for td in elapsed_time_list]

    return path, time_list

In [9]:
# Precompute the distance-weighted Dijkstra path for every pair of nodes in
# ic_graph; get_route reads this table as PATH_DICT[src][target].
PATH_DICT = {
    source: paths_from_source
    for source, paths_from_source in nx.all_pairs_dijkstra_path(ic_graph, weight='distance')
}

In [10]:
def get_log(date):
    '''
    Load the search log of one day as a cudf DataFrame.

    Parameters
    ----------
    date: str
        Day to load; passed to SEARCH_LOG_CSV to build the file path

    Returns
    -------
    df: cudf.DataFrame or None
        The log, or None when the CSV file does not exist
    '''
    csv_path = SEARCH_LOG_CSV(date)
    if os.path.exists(csv_path):
        # These columns are read as strings
        text_columns = ('start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order')
        log = pd.read_csv(csv_path, dtype=dict.fromkeys(text_columns, str))
        return cudf.from_pandas(log)

    return None


def get_unspecified_logs(date, threshold_sec=15 * 60):
    '''
    Extract the searches made without a specified time from one day's log.

    A search counts as "time unspecified" when the specified date/time
    (`spec_day` + `spec_time`) lies within threshold_sec seconds of the
    `date` column (used here as the time the search was executed).
    Rows whose specified date/time cannot be parsed are dropped.

    Parameters
    ----------
    date: str
        Day of the search log to load (passed to get_log)
    threshold_sec: int or float
        Maximum absolute gap, in seconds, between search time and specified
        time. Defaults to 15 minutes.

    Returns
    -------
    unspecified: cudf.DataFrame
        Time-unspecified search records, with a fresh 0..n-1 index

    Raises
    ------
    FileNotFoundError
        When no search log exists for date (get_log returned None)
    '''
    df = get_log(date)
    if df is None:
        raise FileNotFoundError(f'search log not found: {SEARCH_LOG_CSV(date)}')

    search_dt = cudf.to_datetime(df['date'])
    specified_dt = cudf.to_datetime(df['spec_day'] + ' ' + df['spec_time'], errors='coerce')

    # Absolute gap between the two timestamps, in seconds
    t_diff = abs((search_dt - specified_dt) / np.timedelta64(1, 's'))

    return df[t_diff < threshold_sec].reset_index(drop=True)


def get_past_logs(target_date, periods, include_target=False):
    '''
    Collect the search logs of the days leading up to target_date, keeping
    only the searches whose specified day is target_date.

    Parameters
    ----------
    target_date: str or List[str]
        Target day(s) in '%Y%m%d' form
    periods: int
        How many days of logs to look back over
    include_target: bool
        Whether the log of target_date itself (the earliest one when a list
        is given) is included in the scanned days

    Returns
    -------
    df_all: cudf.DataFrame
        Rows from the scanned daily logs whose `spec_day` is one of the
        target dates. An empty DataFrame when none of the scanned days has
        a log file.
    '''
    if isinstance(target_date, list):
        end_date = min(target_date)
        target_date_set = set(target_date)
    else:
        end_date = target_date
        target_date_set = {target_date}

    # Days whose logs are scanned
    if include_target:
        dt_range = pd.date_range(end=end_date, periods=periods)
    else:
        # Generate one extra day and drop end_date itself. This avoids the
        # `closed=` argument, which recent pandas versions no longer accept.
        dt_range = pd.date_range(end=end_date, periods=periods + 1)[:-1]

    DAYS = [d.strftime('%Y%m%d') for d in dt_range]

    frames = []
    for d in DAYS:
        df = get_log(d)

        # No log file for this day
        if df is None:
            continue

        # Keep only rows whose specified day (%Y-%m-%d) matches one of the
        # target dates (%Y%m%d)
        frames.append(df[df['spec_day'].str.replace('-', '').isin(target_date_set)])

    if not frames:
        return cudf.DataFrame()

    return cudf.concat(frames, ignore_index=True)

### functions

In [11]:
def expand_search_path(df, road_code):
    '''
    Expand every searched route into its individual edges.
    A searched route A => B => C => D is cut into A => B, B => C, C => D.

    Parameters
    ----------
    df: DataFrame
        Search-log records
    road_code: str
        Road code; only edges whose `road_code` attribute in ic_graph equals
        this value are kept in the result

    Returns
    -------
    result: cudf.DataFrame
        One row per kept edge: the IC codes at both ends (start_code,
        end_code) and passing_time, the expected time at the edge's start IC
    '''
    known_nodes = set(ic_graph.nodes)

    # Only searches whose origin and destination codes are nodes of the graph
    both_known = df['start_code'].isin(known_nodes) & df['end_code'].isin(known_nodes)
    df = df.loc[both_known]

    edge_starts, edge_ends, edge_times = [], [], []

    wanted = ['start_name', 'end_name', 'spec_type', 'spec_time', 'spec_day']
    for origin, destination, spec_type, spec_time, spec_day in df[wanted].to_numpy():
        # spec_type 1 passes the time as a departure time, anything else as
        # an arrival time
        time_keyword = 'departure_time' if spec_type == 1 else 'arrival_time'
        path, time_list = get_route_with_time(
            ic_graph, origin, destination,
            spec_date=spec_day.replace('-', '/'),
            **{time_keyword: spec_time},
        )
        if path is None:
            continue

        for (head, tail), reached_at in zip(zip(path, path[1:]), time_list):
            if ic_graph[head][tail]['road_code'] != road_code:
                continue
            edge_starts.append(head)
            edge_ends.append(tail)
            edge_times.append(reached_at)

    return cudf.DataFrame({
        'start_code': edge_starts,
        'end_code': edge_ends,
        'passing_time': edge_times,
    })

In [12]:
def timeslice_grouping(df, timeslice):
    '''
    Resample the edge-expanded search routes by timeslice and sum up the
    search volume of each edge.

    Parameters
    --------------
    df: cudf.DataFrame
    Edge-expanded search routes with the columns start_code, end_code and
    passing_time (the code calls .to_pandas() on it, so a cudf frame is expected)
    timeslice:
    Resampling interval handed to pandas resample (1D, 1h, 5min and the like)

    Returns
    ---------
    result: cudf.DataFrame
    Search volume per (start_code, end_code) and time slice, indexed by passing_time
    '''
    # Add the column that will later be summed up as the search volume;
    # passing_time is converted to datetime and the code columns to categories
    result = df\
        .assign(search=1)\
        .assign(passing_time=cudf.to_datetime(df['passing_time']))\
        .assign(start_code=df['start_code'].astype('category'))\
        .assign(end_code=df['end_code'].astype('category'))
    
    # Resample by timeslice and take the sum of the search volume.
    # The groupby / resample step runs in pandas (after .to_pandas()).
    result = result\
        .set_index('passing_time')\
        .to_pandas()\
        .groupby(['start_code', 'end_code'])\
        .apply(lambda g: g['search'].resample(timeslice).sum())\
        .reset_index()\
        .set_index('passing_time')

    # Hand the aggregated table back as a cudf DataFrame
    result = cudf.from_pandas(result)

    return result

In [13]:
def count_minutely(date, past_periods, road_code):
    '''
    For the day `date`, read the search logs of the past `past_periods` days
    and accumulate, in 5-minute slices, the search volume of the edges on
    `road_code`.

    Returns
    -------
    cudf.DataFrame
        The 5-minute counts restricted to `date` (selected with .loc[date])
    '''
    past_logs = get_past_logs(date, periods=past_periods)
    edges = expand_search_path(past_logs, road_code)
    counts = timeslice_grouping(edges, '5min')

    # Keep only the slices that fall on the target day
    counts_on_date = counts.to_pandas().loc[date]
    return cudf.from_pandas(counts_on_date)

## 検索数作成: 時間指定なし

In [31]:
# (start_date, end_date) pairs in 'YYYYMMDD' form. The aggregation loops below
# expand each pair into daily dates with pd.date_range.
# Those loops overwrite the output CSV (with a header) only for the block that
# starts on '20210401' and append for every other block, so the blocks are
# presumably meant to be processed in chronological order.
period_blocks = [
    # ('20210401', '20210630'),
    # ('20210701', '20210930'),
    # ('20211001', '20211231'),
    # ('20220101', '20220331'),
    # ('20220401', '20220630'),
    # ('20220701', '20220930'),
    # ('20221001', '20221231'),
    # ('20230101', '20230331'),
    # ('20230401', '20230630'),
    # ('20230701', '20230731'),
    ('20230801', '20230816'),
]

### 関越道

In [16]:
# Kan-etsu Expressway
road_code = '1800'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    df_all = cudf.DataFrame()
    for date in DAYS:
        s = time.time()

        # A day without a log file is reported and skipped instead of
        # aborting the whole block.
        if not os.path.exists(SEARCH_LOG_CSV(date)):
            print(f'[{date}] search log not found, skipped')
            continue

        df = get_unspecified_logs(date)
        expanded = expand_search_path(df, road_code)

        result = cudf.from_pandas(
            (
                expanded
                # unit column that is summed up as the search count
                .assign(search=1)
                .assign(start_code=expanded['start_code'].astype('category'))
                .assign(end_code=expanded['end_code'].astype('category'))
                # daily total per (start_code, end_code), computed in pandas
                .to_pandas()
                .groupby(['start_code', 'end_code'])
                .apply(lambda g: g['search'].sum())
                .reset_index()
                .rename(columns={0: 'search'})
                # day the searches were executed, formatted as YYYY-MM-DD
                .assign(search_date=dt.datetime.strptime(date, '%Y%m%d').strftime('%Y-%m-%d'))
                .loc[:, ['search_date', 'start_code', 'end_code', 'search']]
            )
        )

        df_all = cudf.concat([df_all, result])

        print('-'*30, f'[{date}] *{len(expanded)} records* ({time.time() - s:.3f} sec)', '-'*30)

    if len(df_all) == 0:
        # Nothing aggregated for this block: leave the output file untouched
        print('no records aggregated, output CSV left untouched')
        print()
        continue

    df_all.reset_index(drop=True, inplace=True)

    out_dir = os.path.dirname(SEARCH_COUNT_CSV_KANNETSU_UNSPECIFIED)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The first block starts a fresh file. Later blocks append, but still
    # write the header when the file does not exist yet.
    write_fresh = start_date == '20210401' or not os.path.exists(SEARCH_COUNT_CSV_KANNETSU_UNSPECIFIED)
    df_all.to_pandas().to_csv(
        SEARCH_COUNT_CSV_KANNETSU_UNSPECIFIED,
        mode='w' if write_fresh else 'a',
        header=write_fresh,
        index=False,
    )

    print()

------------------------------ [20230801] *306854 records* (45.725 sec) ------------------------------
------------------------------ [20230802] *336179 records* (46.747 sec) ------------------------------
------------------------------ [20230803] *325579 records* (46.269 sec) ------------------------------
------------------------------ [20230804] *305506 records* (45.996 sec) ------------------------------
------------------------------ [20230805] *317062 records* (48.286 sec) ------------------------------
------------------------------ [20230806] *325003 records* (46.958 sec) ------------------------------
------------------------------ [20230807] *321637 records* (51.473 sec) ------------------------------
------------------------------ [20230808] *309585 records* (48.369 sec) ------------------------------
------------------------------ [20230809] *318788 records* (53.388 sec) ------------------------------
------------------------------ [20230810] *378915 records* (61.278 sec) -

In [17]:
# IPython shell escape: show the first 5 lines of the Kan-etsu output CSV
!head -n5 ./search_count/search-count_kannetsu_unspecified.csv

search_date,start_code,end_code,search
2021-04-01,1080291,1800186,883
2021-04-01,1110210,1800001,384
2021-04-01,1110210,1800004,3221
2021-04-01,1800001,1110210,927


In [18]:
# IPython shell escape: show the last 5 lines of the Kan-etsu output CSV
!tail -n5 ./search_count/search-count_kannetsu_unspecified.csv

2023-08-16,1800181,1800183,1623
2023-08-16,1800183,1800181,1426
2023-08-16,1800183,1800186,1633
2023-08-16,1800186,1080291,1658
2023-08-16,1800186,1800183,1402


### 東北道

In [32]:
# Tohoku Expressway
road_code = '1040'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    df_all = cudf.DataFrame()
    for date in DAYS:
        s = time.time()

        # A day without a log file is reported and skipped instead of
        # aborting the whole block.
        if not os.path.exists(SEARCH_LOG_CSV(date)):
            print(f'[{date}] search log not found, skipped')
            continue

        df = get_unspecified_logs(date)
        expanded = expand_search_path(df, road_code)

        result = cudf.from_pandas(
            (
                expanded
                # unit column that is summed up as the search count
                .assign(search=1)
                .assign(start_code=expanded['start_code'].astype('category'))
                .assign(end_code=expanded['end_code'].astype('category'))
                # daily total per (start_code, end_code), computed in pandas
                .to_pandas()
                .groupby(['start_code', 'end_code'])
                .apply(lambda g: g['search'].sum())
                .reset_index()
                .rename(columns={0: 'search'})
                # day the searches were executed, formatted as YYYY-MM-DD
                .assign(search_date=dt.datetime.strptime(date, '%Y%m%d').strftime('%Y-%m-%d'))
                .loc[:, ['search_date', 'start_code', 'end_code', 'search']]
            )
        )

        df_all = cudf.concat([df_all, result])

        print('-'*30, f'[{date}] *{len(expanded)} records* ({time.time() - s:.3f} sec)', '-'*30)

    if len(df_all) == 0:
        # Nothing aggregated for this block: leave the output file untouched
        print('no records aggregated, output CSV left untouched')
        print()
        continue

    df_all.reset_index(drop=True, inplace=True)

    out_dir = os.path.dirname(SEARCH_COUNT_CSV_TOUHOKU_UNSPECIFIED)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The first block starts a fresh file. Later blocks append, but still
    # write the header when the file does not exist yet.
    write_fresh = start_date == '20210401' or not os.path.exists(SEARCH_COUNT_CSV_TOUHOKU_UNSPECIFIED)
    df_all.to_pandas().to_csv(
        SEARCH_COUNT_CSV_TOUHOKU_UNSPECIFIED,
        mode='w' if write_fresh else 'a',
        header=write_fresh,
        index=False,
    )

    print()

------------------------------ [20230801] *855063 records* (45.829 sec) ------------------------------
------------------------------ [20230802] *859503 records* (47.116 sec) ------------------------------
------------------------------ [20230803] *894350 records* (46.873 sec) ------------------------------
------------------------------ [20230804] *894660 records* (46.470 sec) ------------------------------
------------------------------ [20230805] *916765 records* (48.636 sec) ------------------------------
------------------------------ [20230806] *907545 records* (47.481 sec) ------------------------------
------------------------------ [20230807] *1009977 records* (51.798 sec) ------------------------------
------------------------------ [20230808] *992494 records* (48.766 sec) ------------------------------
------------------------------ [20230809] *1069803 records* (53.880 sec) ------------------------------
------------------------------ [20230810] *1208597 records* (61.345 sec

In [33]:
# IPython shell escape: show the first 5 lines of the Tohoku output CSV
!head -n5 ./search_count/search-count_touhoku_unspecified.csv

search_date,start_code,end_code,search
2021-04-01,1040001,1040011,3854
2021-04-01,1040011,1040001,3176
2021-04-01,1040011,1040013,3796
2021-04-01,1040013,1040011,3143


In [34]:
# IPython shell escape: show the last 5 lines of the Tohoku output CSV
!tail -n5 ./search_count/search-count_touhoku_unspecified.csv

2023-08-16,1040446,1040443,804
2023-08-16,1040446,1040451,871
2023-08-16,1040451,1040446,669
2023-08-16,1040451,1040456,571
2023-08-16,1040456,1040451,337


### 館山道

In [19]:
# Tateyama Expressway
road_code = '1130'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    df_all = cudf.DataFrame()
    for date in DAYS:
        s = time.time()

        # A day without a log file is reported and skipped instead of
        # aborting the whole block.
        if not os.path.exists(SEARCH_LOG_CSV(date)):
            print(f'[{date}] search log not found, skipped')
            continue

        df = get_unspecified_logs(date)
        expanded = expand_search_path(df, road_code)

        result = cudf.from_pandas(
            (
                expanded
                # unit column that is summed up as the search count
                .assign(search=1)
                .assign(start_code=expanded['start_code'].astype('category'))
                .assign(end_code=expanded['end_code'].astype('category'))
                # daily total per (start_code, end_code), computed in pandas
                .to_pandas()
                .groupby(['start_code', 'end_code'])
                .apply(lambda g: g['search'].sum())
                .reset_index()
                .rename(columns={0: 'search'})
                # day the searches were executed, formatted as YYYY-MM-DD
                .assign(search_date=dt.datetime.strptime(date, '%Y%m%d').strftime('%Y-%m-%d'))
                .loc[:, ['search_date', 'start_code', 'end_code', 'search']]
            )
        )

        df_all = cudf.concat([df_all, result])

        print('-'*30, f'[{date}] *{len(expanded)} records* ({time.time() - s:.3f} sec)', '-'*30)

    if len(df_all) == 0:
        # Nothing aggregated for this block: leave the output file untouched
        print('no records aggregated, output CSV left untouched')
        print()
        continue

    df_all.reset_index(drop=True, inplace=True)

    out_dir = os.path.dirname(SEARCH_COUNT_CSV_TATEYAMA_UNSPECIFIED)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The first block starts a fresh file. Later blocks append, but still
    # write the header when the file does not exist yet.
    write_fresh = start_date == '20210401' or not os.path.exists(SEARCH_COUNT_CSV_TATEYAMA_UNSPECIFIED)
    df_all.to_pandas().to_csv(
        SEARCH_COUNT_CSV_TATEYAMA_UNSPECIFIED,
        mode='w' if write_fresh else 'a',
        header=write_fresh,
        index=False,
    )

    print()

------------------------------ [20230801] *8574 records* (45.393 sec) ------------------------------
------------------------------ [20230802] *8256 records* (46.568 sec) ------------------------------
------------------------------ [20230803] *7941 records* (46.192 sec) ------------------------------
------------------------------ [20230804] *7980 records* (46.060 sec) ------------------------------
------------------------------ [20230805] *9077 records* (48.025 sec) ------------------------------
------------------------------ [20230806] *7987 records* (46.879 sec) ------------------------------
------------------------------ [20230807] *8704 records* (51.396 sec) ------------------------------
------------------------------ [20230808] *8048 records* (48.481 sec) ------------------------------
------------------------------ [20230809] *8236 records* (53.559 sec) ------------------------------
------------------------------ [20230810] *9062 records* (60.950 sec) ---------------------

In [20]:
# IPython shell escape: show the first 5 lines of the Tateyama output CSV
!head -n5 ./search_count/search-count_tateyama_unspecified.csv

search_date,start_code,end_code,search
2021-04-01,1130001,1130006,672
2021-04-01,1130006,1130001,441
2021-04-01,1130006,1130011,558
2021-04-01,1130011,1130006,353


In [21]:
# IPython shell escape: show the last 5 lines of the Tateyama output CSV
!tail -n5 ./search_count/search-count_tateyama_unspecified.csv

2023-08-16,1130039,1130036,89
2023-08-16,1130039,1130041,199
2023-08-16,1130041,1130039,80
2023-08-16,1130041,1130046,51
2023-08-16,1130046,1130041,38
