In [1]:
import os
import time
import tqdm
import random
import pickle
import datetime as dt
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cudf

import warnings
warnings.simplefilter('ignore')

In [2]:
# Root directory of the pre-processed input data.
DATA_DIR = '../../Input_processed_data'

# Road master CSVs: the interchange (IC) list and the IC-to-IC network tables.
IC_CSV = f'{DATA_DIR}/road_master/ic_merged.csv'
IC_NET_CSV = f'{DATA_DIR}/road_master/220303-doronet_ic.csv'
IC_NET_SUB_CSV = f'{DATA_DIR}/road_master/tateyama_kannetsu_icnet.csv'


# Search-log CSV locations.
def SEARCH_LOG_DIR(month):
    """Return the directory holding the search-log CSVs of one month (e.g. '202305')."""
    return f'{DATA_DIR}/search_records/csv{month}'


def SEARCH_LOG_CSV(date):
    """Return the path of the search-log CSV of one day (e.g. '20230501').

    The first six characters of `date` select the monthly directory.
    """
    return f'{SEARCH_LOG_DIR(date[:6])}/record_{date}.csv'


# CSVs that store the aggregated search counts (searches with a specified date/time).
# NOTE(review): both constants are currently disabled, but the aggregation cells
# further down still reference them; they must be defined before those cells run.
# SEARCH_COUNT_CSV_KANNETSU = f'./search_count/search-count_kannetsu.csv'
# SEARCH_COUNT_CSV_TATEYAMA = f'./search_count/search-count_tateyama.csv'

# 準備

In [3]:
# The master tables were already pre-processed elsewhere, so they are read as-is.
# Every code column is read as a string.
_edge_code_dtypes = {'start_code': str, 'end_code': str, 'road_code': str}

df_ic = pd.read_csv(IC_CSV, dtype={'ic_code': str})
df_icnet = pd.read_csv(IC_NET_CSV, dtype=_edge_code_dtypes)
sub_icnet = pd.read_csv(IC_NET_SUB_CSV, dtype=_edge_code_dtypes)

# Two-way lookup between IC codes and IC names.
code2name = dict(zip(df_ic['ic_code'], df_ic['ic_name']))
name2code = {name: code for code, name in code2name.items()}

# Directed graph with one edge per row of the IC network table; each edge
# carries its distance, road code and direction.
ic_graph = nx.from_pandas_edgelist(
    df_icnet,
    source='start_code',
    target='end_code',
    edge_attr=['distance', 'road_code', 'direction'],
    create_using=nx.DiGraph(),
)

In [4]:
# Speed limit of every section, kept both as a table (df_limits) and as a
# (start_code, end_code) -> limit mapping (limit_dict).
_limit_columns = ['start_code', 'end_code', 'start_name', 'end_name', 'road_code', 'limit']
df_limits = sub_icnet.loc[:, _limit_columns]

limit_dict = dict(
    ((s_code, e_code), lim)
    for s_code, e_code, lim in df_limits.loc[:, ['start_code', 'end_code', 'limit']].values
)

In [5]:
cudf_icnet = cudf.from_pandas(df_icnet)
cudf_limits = cudf.from_pandas(df_limits)

# 経路表をDict形式で作成

In [6]:
# route_dict = dict(nx.all_pairs_dijkstra_path(ic_graph, weight='distance'))

# # バイナリ保存
# fname_out = './route_dict.pkl'
# with open(fname_out, 'wb') as f:
#     pickle.dump(route_dict, f)

In [7]:
! du -h ./route_dict.pkl

3.4G	./route_dict.pkl


In [6]:
# Load the route dictionary back from its pickled binary form.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# a route_dict.pkl that was produced by this notebook (see the dump cell above).
import pickle

fname_in = './route_dict.pkl'
with open(fname_in, 'rb') as f:
    route_dict = pickle.load(f)

# 経路表をpandas.DataFrameの形式で作成

In [70]:
# Flatten the nested route dict into one row per (start_code, end_code) pair:
# build a wide start x end frame, stack it to long form, then name the columns
# and store both code columns as categories.
df_route = (
    pd.DataFrame.from_dict(route_dict, orient='index')
    .stack()
    .reset_index()
    .set_axis(['start_code', 'end_code', 'route'], axis=1)
    .astype({'start_code': 'category', 'end_code': 'category'})
)
df_route.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6632408 entries, 0 to 6632407
Data columns (total 3 columns):
 #   Column      Dtype   
---  ------      -----   
 0   start_code  category
 1   end_code    category
 2   route       object  
dtypes: category(2), object(1)
memory usage: 76.1+ MB


In [71]:
df_route.head()

Unnamed: 0,start_code,end_code,route
0,1010001,1010001,[1010001]
1,1010001,1010004,"[1010001, 1010004]"
2,1010001,6009021,"[1010001, 6009021]"
3,1010001,6009016,"[1010001, 6009021, 6009016]"
4,1010001,6009011,"[1010001, 6009021, 6009016, 6009011]"


In [10]:
# df_route.to_csv('./route_table.csv', index=False)
# df_route.to_pickle('./route_table.pkl')

In [9]:
! du -h ./route_table.*

7.8G	./route_table.csv
3.4G	./route_table.pkl


In [7]:
df_route = pd.read_pickle('./route_table.pkl')
df_route.head()

Unnamed: 0,start_code,end_code,route
0,1010001,1010001,[1010001]
1,1010001,1010004,"[1010001, 1010004]"
2,1010001,6009021,"[1010001, 6009021]"
3,1010001,6009016,"[1010001, 6009021, 6009016]"
4,1010001,6009011,"[1010001, 6009021, 6009016, 6009011]"


## インデックスを張る

In [72]:
df_route_indexed = df_route.set_index(['start_code', 'end_code'])
df_route_indexed.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6632408 entries, ('1010001', '1010001') to ('8010006', '8010006')
Data columns (total 1 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   route   object
dtypes: object(1)
memory usage: 76.1+ MB


In [73]:
df_route_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,route
start_code,end_code,Unnamed: 2_level_1
1010001,1010001,[1010001]
1010001,1010004,"[1010001, 1010004]"
1010001,6009021,"[1010001, 6009021]"
1010001,6009016,"[1010001, 6009021, 6009016]"
1010001,6009011,"[1010001, 6009021, 6009016, 6009011]"


In [12]:
# df_route_indexed.to_pickle('./route_table_indexed.pkl')

In [13]:
! du -h ./route_table_indexed.*

3.4G	./route_table_indexed.pkl


In [8]:
df_route_indexed = pd.read_pickle('./route_table_indexed.pkl')
df_route_indexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,route
start_code,end_code,Unnamed: 2_level_1
1010001,1010001,[1010001]
1010001,1010004,"[1010001, 1010004]"
1010001,6009021,"[1010001, 6009021]"
1010001,6009016,"[1010001, 6009021, 6009016]"
1010001,6009011,"[1010001, 6009021, 6009016, 6009011]"


# 経路の検索速度を比較
- 検索1回あたりの所要時間: 辞書 < テーブル（インデックス） < テーブル（辞書が最速）
- 辞書はテーブル（インデックス）よりも1000倍以上速い（135 ns vs 228 µs）
- テーブル（インデックス）はテーブルのおよそ30倍速い（228 µs vs 7.18 ms）

In [15]:
start_code = "1040191"
end_code = "1440096"

In [16]:
%timeit route_dict[start_code][end_code]

135 ns ± 0.146 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [17]:
%timeit df_route.loc[(df_route.start_code == start_code) & (df_route.end_code == end_code)]

7.18 ms ± 20.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
%timeit df_route_indexed.loc[(start_code, end_code)]

228 µs ± 26.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# 検索ログを簡略化

In [20]:
def get_log(date):
    """Load the search log of one day as a cudf.DataFrame.

    `date` is the day string handed to SEARCH_LOG_CSV to build the file path
    (e.g. '20230501'). Returns None when that day's CSV file does not exist.
    """
    csv_path = SEARCH_LOG_CSV(date)
    if os.path.exists(csv_path):
        # The IC-code columns and `order` are read as strings.
        string_columns = ('start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order')
        log_pd = pd.read_csv(csv_path, dtype={column: str for column in string_columns})
        return cudf.from_pandas(log_pd)
    return None

In [23]:
nodes_set = set(ic_graph.nodes)

def simplify_search_log(df):
    """Reduce a raw search log to the columns used downstream, with tidy dtypes.

    Steps:
      1. keep only rows whose start and end IC codes are both in the
         module-level `nodes_set`;
      2. keep the needed columns and rename `date` to `datetime`;
      3. merge `spec_day` and `spec_time` into one `spec_datetime` column;
      4. cast the code/type columns to category and parse both timestamps.

    Returns a frame with the columns
    datetime, start_code, end_code, spec_datetime, spec_type, car_type.
    """
    # Restrict the records.
    endpoints_known = (df['start_code'].isin(nodes_set)) & (df['end_code'].isin(nodes_set))
    known = df.loc[endpoints_known]

    # Restrict the columns.
    source_columns = ['date', 'start_code', 'end_code', 'spec_day', 'spec_time', 'spec_type', 'car_type']
    slim = known.loc[:, source_columns].rename(columns={'date': 'datetime'})

    # Combine the specified day and time into a single column.
    spec_text = slim['spec_day'] + ' ' + slim['spec_time']
    slim = slim.assign(spec_datetime=spec_text).drop(['spec_day', 'spec_time'], axis=1)

    # Type conversion.
    category_columns = ['start_code', 'end_code', 'spec_type', 'car_type']
    slim = slim.astype({column: 'category' for column in category_columns})
    slim = slim.assign(
        datetime=cudf.to_datetime(slim['datetime']),
        spec_datetime=cudf.to_datetime(slim['spec_datetime']),
    )

    # Final column order.
    output_columns = ['datetime', 'start_code', 'end_code', 'spec_datetime', 'spec_type', 'car_type']
    return slim.loc[:, output_columns]

In [24]:
df = get_log('20230501')
print(df.shape)
df.head()

(494076, 19)


Unnamed: 0,date,start_code,start_name,end_code,end_name,via1_code,via1_name,via2_code,via2_name,via3_code,via3_name,spec_day,spec_time,spec_type,order,car_type,use_nexco,use_urban,use_local
0,2023/05/01 00:00:00,1040191,福島西,1440096,新潟中央,,,,,,,2023-05-04,06:00,1,2,2,1,1,1
1,2023/05/01 00:00:00,5004001,玉川,5010041,箱根口,,,,,,,2023-04-30,00:00,1,2,2,1,1,1
2,2023/05/01 00:00:00,6009021,用賀,1461200,友部,,,,,,,2023-05-01,06:00,1,2,2,1,1,1
3,2023/05/01 00:00:00,1400171,広野,7006081,西宮山口東,,,,,,,2023-04-30,01:00,1,2,2,1,1,1
4,2023/05/01 00:00:00,1120081,潮来,6004046,大師,,,,,,,2023-05-05,16:00,1,2,2,1,1,1


In [25]:
df = simplify_search_log(df)
df.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2023-05-01,1040191,1440096,2023-05-04 06:00:00,1,2
1,2023-05-01,5004001,5010041,2023-04-30 00:00:00,1,2
2,2023-05-01,6009021,1461200,2023-05-01 06:00:00,1,2
3,2023-05-01,1400171,7006081,2023-04-30 01:00:00,1,2
4,2023-05-01,1120081,6004046,2023-05-05 16:00:00,1,2


In [27]:
df.info()

<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 493645 entries, 0 to 494075
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   datetime       493645 non-null  datetime64[ns]
 1   start_code     493645 non-null  category
 2   end_code       493645 non-null  category
 3   spec_datetime  493645 non-null  datetime64[ns]
 4   spec_type      493645 non-null  category
 5   car_type       493645 non-null  category
dtypes: category(4), datetime64[ns](2)
memory usage: 14.2 MB


# 検索ログと経路表をマージ

In [28]:
df = get_log('20230501')
df = simplify_search_log(df)
df.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2023-05-01,1040191,1440096,2023-05-04 06:00:00,1,2
1,2023-05-01,5004001,5010041,2023-04-30 00:00:00,1,2
2,2023-05-01,6009021,1461200,2023-05-01 06:00:00,1,2
3,2023-05-01,1400171,7006081,2023-04-30 01:00:00,1,2
4,2023-05-01,1120081,6004046,2023-05-05 16:00:00,1,2


In [29]:
# df_route = pd.read_pickle('./route_table.pkl')
df_route.head()

Unnamed: 0,start_code,end_code,route
0,1010001,1010001,[1010001]
1,1010001,1010004,"[1010001, 1010004]"
2,1010001,6009021,"[1010001, 6009021]"
3,1010001,6009016,"[1010001, 6009021, 6009016]"
4,1010001,6009011,"[1010001, 6009021, 6009016, 6009011]"


In [30]:
df.info()

<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 493645 entries, 0 to 494075
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   datetime       493645 non-null  datetime64[ns]
 1   start_code     493645 non-null  category
 2   end_code       493645 non-null  category
 3   spec_datetime  493645 non-null  datetime64[ns]
 4   spec_type      493645 non-null  category
 5   car_type       493645 non-null  category
dtypes: category(4), datetime64[ns](2)
memory usage: 14.2 MB


In [31]:
pd.merge(df.to_pandas(), df_route, how='left', on=['start_code', 'end_code'])

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type,route
0,2023-05-01 00:00:00,1040191,1440096,2023-05-04 06:00:00,1,2,"[1040191, 1040186, 1040181, 1040176, 1040171, ..."
1,2023-05-01 00:00:00,5004001,5010041,2023-04-30 00:00:00,1,2,"[5004001, 5004006, 5004011, 5004021, 6028040, ..."
2,2023-05-01 00:00:00,6009021,1461200,2023-05-01 06:00:00,1,2,"[6009021, 6009016, 6009011, 6016076, 6009006, ..."
3,2023-05-01 00:00:00,1400171,7006081,2023-04-30 01:00:00,1,2,"[1400171, 1400166, 1400161, 1400156, 1400151, ..."
4,2023-05-01 00:00:00,1120081,6004046,2023-05-05 16:00:00,1,2,"[1120081, 1120076, 1120071, 1120066, 1120061, ..."
...,...,...,...,...,...,...,...
493640,2023-05-01 23:59:09,1080321,1820036,2023-07-22 06:30:00,1,2,"[1080321, 1080316, 1080311, 1080306, 1080301, ..."
493641,2023-05-01 23:59:11,1410036,1040331,2023-05-28 16:00:00,1,2,"[1410036, 1410031, 1410029, 1410026, 1410021, ..."
493642,2023-05-01 23:59:20,6026086,1010221,2023-06-10 00:00:00,1,1,"[6026086, 6026081, 6026076, 6026071, 6016056, ..."
493643,2023-05-01 23:59:44,6026086,1010221,2023-06-10 00:00:00,1,1,"[6026086, 6026081, 6026076, 6026071, 6016056, ..."


# 検索ログの扱い
- **所要時間の計算**を改良するアプローチ
    - 車両種別によって速度を変化させる
        - https://www.driveplaza.com/search/division/
            - 1: 軽自動車等
            - 2: 普通車
            - 3: 中型車
            - 4: 大型車
            - 5: 特大車
    - 過去の交通データを参考にする
        - 1年前のある日時・ある区間はこの平均速度で車両が通過していた、という情報
        - 厳密にやるには<u>予測対象道路（関越道 + 東北道）以外の交通データが必要になってしまう</u>
    - <u>結局1時間単位で丸めてしまえば多少の計算方法の違いでは影響出にくい？</u>
    
- **検索ログの集計方法**を改良するアプローチ
    - 検索した日時と指定日時の時間差によって重みを変える
        - 7日前の検索はあまり参考にせず、昨日の検索を重視する
        - 昨日の朝の検索よりも、昨日の夜の検索を特に重視する
    - 短時間で重複した検索は除外し、その中で最新の検索のみ参照する
        - 出発時刻を変えながら何度か経路を検索してみる人もいそう
        - <u>重複した検索をどう検出するかが問題</u>
            - 出発地・到着地の完全一致
            - 出発と到着のICを少しずらすパターンもある

In [32]:
df = get_log('20230501')
df = simplify_search_log(df)
df.head()

Unnamed: 0,datetime,start_code,end_code,spec_datetime,spec_type,car_type
0,2023-05-01,1040191,1440096,2023-05-04 06:00:00,1,2
1,2023-05-01,5004001,5010041,2023-04-30 00:00:00,1,2
2,2023-05-01,6009021,1461200,2023-05-01 06:00:00,1,2
3,2023-05-01,1400171,7006081,2023-04-30 01:00:00,1,2
4,2023-05-01,1120081,6004046,2023-05-05 16:00:00,1,2


In [33]:
df['car_type'].value_counts()

2    408131
1     47724
3     16369
4     12994
5      8427
Name: car_type, dtype: int32

# old

In [12]:
# (start_date, end_date) pairs, each processed as one block by the aggregation loop.
# Every entry ends with a comma so that re-enabling a commented-out pair cannot
# turn the list body into a call expression.
period_blocks = [
    ('20220501', '20220501'),
    # ('20221001', '20221231'),
    # ('20230101', '20230331'),
    # ('20230401', '20230507'),
]

### 関越道

In [32]:
# NOTE(review): this cell calls count_minutely, which is only defined in a later
# cell, and SEARCH_COUNT_CSV_KANNETSU, whose definition is commented out in the
# configuration cell; it cannot run on a fresh kernel in notebook order.
# Look back over the past 7 days of search logs.
past_periods = 7
# Kan-Etsu Expressway
road_code = '1800'

for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)
    
    # Every day of the block as a 'YYYYMMDD' string.
    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]
    
    df_count = cudf.DataFrame()
    for i, date in enumerate(DAYS):
        s = time.time()

        df_minutely = count_minutely(date, past_periods, road_code)

        print('-'*30, f'[{date}] *{len(df_minutely)} records* ({time.time() - s:.3f} sec)', '-'*30)

        # Grow the block-wide table one day at a time.
        df_count = cudf.concat([df_count, df_minutely])

    df_count.reset_index(inplace=True)
    
    # Only a block starting on 2021-04-02 (re)creates the CSV with a header;
    # every other block is appended without a header.
    if start_date == '20210402':
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_KANNETSU, index=False)
    else:
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_KANNETSU, mode='a', header=False, index=False)
        
    print()

------------------------------ [20230401] *27640 records* (41.507 sec) ------------------------------
------------------------------ [20230402] *27476 records* (26.539 sec) ------------------------------
------------------------------ [20230403] *27620 records* (21.342 sec) ------------------------------
------------------------------ [20230404] *27534 records* (17.820 sec) ------------------------------
------------------------------ [20230405] *27066 records* (17.184 sec) ------------------------------
------------------------------ [20230406] *27521 records* (15.937 sec) ------------------------------
------------------------------ [20230407] *27401 records* (19.229 sec) ------------------------------
------------------------------ [20230408] *27635 records* (35.410 sec) ------------------------------
------------------------------ [20230409] *27469 records* (25.349 sec) ------------------------------
------------------------------ [20230410] *27332 records* (19.136 sec) -----------

In [11]:
def get_log(date):
    '''
    Load the search log of one day.

    Parameters
    --------------
    date: str
        Day string handed to SEARCH_LOG_CSV to build the file path (e.g. '20230501')

    Returns
    ----------
    df: cudf.DataFrame or None
        The day's search log, or None when its CSV file does not exist
    '''
    path = SEARCH_LOG_CSV(date)
    if not os.path.exists(path):
        return None

    # The IC-code columns and `order` are read as strings.
    string_dtypes = dict.fromkeys(
        ['start_code', 'end_code', 'via1_code', 'via2_code', 'via3_code', 'order'], str)
    return cudf.from_pandas(pd.read_csv(path, dtype=string_dtypes))


def get_unspecified_logs(date):
    '''
    Return the searches of one day that were made without specifying a time.

    A row is kept when its specified datetime (`spec_day` + `spec_time`) lies
    less than 15 minutes away from the moment the search was issued (`date`
    column). Rows whose specified datetime cannot be parsed are coerced to
    NaT and therefore never satisfy that comparison.

    Parameters
    --------------
    date: str
        Day of the search log to load; passed on to get_log (e.g. '20230501')

    Returns
    ----------
    unspecified: cudf.DataFrame or None
        The rows without a time specification, re-indexed from 0.
        None when get_log finds no log file for `date`.
    '''
    df = get_log(date)
    if df is None:
        # get_log reports a missing CSV with None; pass that on instead of
        # failing below with an opaque "NoneType is not subscriptable".
        return None

    search_dt = cudf.to_datetime(df['date'])
    specified_dt = cudf.to_datetime(df['spec_day'] + ' ' + df['spec_time'], errors='coerce')

    # Absolute gap between search time and specified time, in seconds.
    t_diff = abs((search_dt - specified_dt) / np.timedelta64(1, 's'))

    # 15 minutes, expressed in seconds.
    td_thresh = 15 * 60
    unspecified = df[t_diff < td_thresh]

    return unspecified.reset_index(drop=True)


def get_past_logs(target_date, periods, include_target=False):
    '''
    Collect, from the search logs of the days leading up to a target date, the
    searches whose specified day is that target date.

    Parameters
    --------------
    target_date: str or List[str]
        Date(s) the congestion is computed for, as 'YYYYMMDD' strings.
        When a list is given, the earliest date anchors the look-back window.
    periods: int
        Number of days of search logs to look back over
    include_target: bool
        If True the window ends on the (earliest) target date itself;
        otherwise it ends on the day before it

    Returns
    ----------
    df_all: cudf.DataFrame or None
        Rows of the `periods` daily logs whose `spec_day` is one of the target
        dates, with a fresh 0..n-1 index. Days without a log file are skipped;
        None is returned when no day in the window has a log file.
    '''
    if isinstance(target_date, list):
        end_date = sorted(target_date)[0]
        target_date_set = set(target_date)
    else:
        end_date = target_date
        target_date_set = {target_date}

    # Build the window explicitly instead of relying on date_range's `closed`
    # keyword, which newer pandas releases no longer accept.
    last_day = pd.Timestamp(end_date)
    if not include_target:
        last_day = last_day - pd.Timedelta(days=1)
    dt_range = pd.date_range(end=last_day, periods=periods)

    DAYS = [d.strftime('%Y%m%d') for d in dt_range]

    # Gather the per-day frames and concatenate once at the end.
    frames = []
    for d in DAYS:
        df = get_log(d)

        if df is None:
            continue

        # Keep only the rows whose specified day (%Y-%m-%d) equals one of the
        # target dates (%Y%m%d).
        df = df[df['spec_day'].str.replace('-', '').isin(target_date_set)]
        frames.append(df)

    if not frames:
        # No log file existed for any day of the window.
        return None

    return cudf.concat(frames, ignore_index=True)

## functions

In [10]:
def expand_search_path(df, road_code):
    '''
    Break every searched route into its consecutive IC-to-IC edges.
    A route A => B => C => D is cut into A => B, B => C and C => D.

    Parameters
    --------------
    df: cudf.DataFrame
        Search-log rows; read columns are start_code, end_code, start_name,
        end_name, spec_type, spec_time and spec_day
    road_code: str
        Road code; only edges whose `road_code` attribute in ic_graph equals
        this value are stored in the result

    Returns
    ----------
    result: cudf.DataFrame
        One row per kept edge with the IC codes of its two ends (start_code,
        end_code) and `passing_time`, the matching entry of the time list
        returned by get_route_with_time
    '''
    known_nodes = set(ic_graph.nodes)

    # Searches with an endpoint outside the graph are ignored.
    searches = df.loc[df['start_code'].isin(known_nodes) & df['end_code'].isin(known_nodes)]

    edge_starts, edge_ends, edge_times = [], [], []

    wanted = ['start_name', 'end_name', 'spec_type', 'spec_time', 'spec_day']
    for origin, destination, spec_type, spec_time, spec_day in searches[wanted].to_numpy():
        # spec_type 1 passes the specified time as the departure time;
        # any other value passes it as the arrival time.
        time_keyword = 'departure_time' if spec_type == 1 else 'arrival_time'
        path, time_list = get_route_with_time(
            ic_graph, origin, destination,
            spec_date=spec_day.replace('-', '/'),
            **{time_keyword: spec_time},
        )
        if path is None:
            continue

        # Walk over consecutive node pairs; time_list is indexed like the edges.
        for pos in range(len(path) - 1):
            u, v = path[pos], path[pos + 1]
            if ic_graph[u][v]['road_code'] != road_code:
                continue
            edge_starts.append(u)
            edge_ends.append(v)
            edge_times.append(time_list[pos])

    return cudf.DataFrame({
        'start_code': edge_starts,
        'end_code': edge_ends,
        'passing_time': edge_times,
    })

In [11]:
def timeslice_grouping(df, timeslice):
    '''
    Resample the edge-expanded searches at `timeslice` and sum the number of
    searches per edge and time bin.

    Parameters
    --------------
    df: cudf.DataFrame
        Edge-expanded searches with columns start_code, end_code, passing_time
    timeslice:
        Resampling rule handed to pandas `resample` (e.g. 1D, 1h, 5min)

    Returns
    ---------
    result: cudf.DataFrame
        The resampled search counts, indexed by passing_time
    '''
    # `search` is the per-row unit that gets summed into the search volume.
    prepared = df.assign(
        search=1,
        passing_time=cudf.to_datetime(df['passing_time']),
        start_code=df['start_code'].astype('category'),
        end_code=df['end_code'].astype('category'),
    )

    # The per-edge resampling is done in pandas, then moved back to cudf.
    prepared_pd = prepared.set_index('passing_time').to_pandas()
    per_edge = prepared_pd.groupby(['start_code', 'end_code'])
    counts = per_edge.apply(lambda g: g['search'].resample(timeslice).sum())
    counts = counts.reset_index().set_index('passing_time')

    return cudf.from_pandas(counts)

In [12]:
def count_minutely(date, past_periods, road_code):
    '''
    For one day `date`, look at the search logs of the preceding `past_periods`
    days and sum, in 5-minute bins, the searches over the edges of `road_code`.

    Returns a cudf.DataFrame holding only the bins selected by `date`.
    '''
    past_logs = get_past_logs(date, periods=past_periods)
    edges = expand_search_path(past_logs, road_code)
    binned = timeslice_grouping(edges, '5min')

    # Label-based selection on the passing_time index is done in pandas.
    on_target_day = binned.to_pandas().loc[date]
    return cudf.from_pandas(on_target_day)

## 時間指定あり検索数の作成

In [13]:
# (start_date, end_date) pairs, each processed as one block by the aggregation loop.
# Every entry ends with a comma so that re-enabling a commented-out pair cannot
# turn the list body into a call expression.
period_blocks = [
    ('20220501', '20220501'),
    # ('20221001', '20221231'),
    # ('20230101', '20230331'),
    # ('20230401', '20230507'),
]

### 関越道

In [32]:
# Look back over the past 7 days of search logs.
past_periods = 7
# Kan-Etsu Expressway
road_code = '1800'

# A block starting on this date (re)creates the output CSV with a header;
# every other block is appended to the existing file.
FIRST_DATE = '20210402'

# NOTE(review): SEARCH_COUNT_CSV_KANNETSU is commented out in the configuration
# cell; it must be defined before this cell is run.
for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)

    # Every day of the block as a 'YYYYMMDD' string.
    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]

    # Collect the per-day frames and concatenate once, instead of re-copying
    # the growing table on every iteration.
    daily_counts = []
    for date in DAYS:
        s = time.time()

        df_minutely = count_minutely(date, past_periods, road_code)

        print('-'*30, f'[{date}] *{len(df_minutely)} records* ({time.time() - s:.3f} sec)', '-'*30)

        daily_counts.append(df_minutely)

    if not daily_counts:
        # Empty date range: nothing to write for this block.
        print()
        continue

    df_count = cudf.concat(daily_counts).reset_index()

    if start_date == FIRST_DATE:
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_KANNETSU, index=False)
    else:
        # When appending, still emit the header if the file does not exist yet
        # (or is empty), so the CSV never ends up without column names.
        needs_header = (
            not os.path.exists(SEARCH_COUNT_CSV_KANNETSU)
            or os.path.getsize(SEARCH_COUNT_CSV_KANNETSU) == 0
        )
        df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_KANNETSU, mode='a', header=needs_header, index=False)

    print()

------------------------------ [20230401] *27640 records* (41.507 sec) ------------------------------
------------------------------ [20230402] *27476 records* (26.539 sec) ------------------------------
------------------------------ [20230403] *27620 records* (21.342 sec) ------------------------------
------------------------------ [20230404] *27534 records* (17.820 sec) ------------------------------
------------------------------ [20230405] *27066 records* (17.184 sec) ------------------------------
------------------------------ [20230406] *27521 records* (15.937 sec) ------------------------------
------------------------------ [20230407] *27401 records* (19.229 sec) ------------------------------
------------------------------ [20230408] *27635 records* (35.410 sec) ------------------------------
------------------------------ [20230409] *27469 records* (25.349 sec) ------------------------------
------------------------------ [20230410] *27332 records* (19.136 sec) -----------

In [35]:
!head -n5 ./search_count/search-count_kannetsu.csv

passing_time,start_code,end_code,search
2021-04-02 00:00:00,1080291,1800186,0
2021-04-02 00:05:00,1080291,1800186,0
2021-04-02 00:10:00,1080291,1800186,0
2021-04-02 00:15:00,1080291,1800186,0


In [36]:
!tail -n5 ./search_count/search-count_kannetsu.csv

2023-05-07 23:20:00,1800186,1800183,0
2023-05-07 23:25:00,1800186,1800183,0
2023-05-07 23:30:00,1800186,1800183,0
2023-05-07 23:35:00,1800186,1800183,0
2023-05-07 23:40:00,1800186,1800183,1


### 館山道

In [14]:
# Look back over the past 7 days of search logs.
past_periods = 7
# Tateyama Expressway
road_code = '1130'

# NOTE(review): everything after the count_minutely call is commented out, so this
# cell currently computes each day's counts and discards them (nothing is written).
for start_date, end_date in period_blocks:
    print('='*40, f'{start_date} -> {end_date}', '='*40)
    
    # Every day of the block as a 'YYYYMMDD' string.
    DAYS = [d.strftime('%Y%m%d') for d in pd.date_range(start_date, end_date, freq='1D')]
    
    df_count = cudf.DataFrame()
    for i, date in enumerate(DAYS):
        s = time.time()
        
        df_minutely = count_minutely(date, past_periods, road_code)

#         print('-'*30, f'[{date}] *{len(df_minutely)} records* ({time.time() - s:.3f} sec)', '-'*30)

#         df_count = cudf.concat([df_count, df_minutely])

#     df_count.reset_index(inplace=True)
    
#     if start_date == '20210402':
#         df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_TATEYAMA, index=False)
#     else:
#         df_count.to_pandas().to_csv(SEARCH_COUNT_CSV_TATEYAMA, mode='a', header=False, index=False)
        
#     print()

20220501


In [37]:
!head -n5 ./search_count/search-count_tateyama.csv

passing_time,start_code,end_code,search
2021-04-02 00:10:00,1130001,1130006,1
2021-04-02 00:15:00,1130001,1130006,1
2021-04-02 00:20:00,1130001,1130006,0
2021-04-02 00:25:00,1130001,1130006,0


In [38]:
!tail -n5 ./search_count/search-count_tateyama.csv

2023-05-07 10:35:00,1130041,1130046,0
2023-05-07 10:40:00,1130041,1130046,0
2023-05-07 10:45:00,1130041,1130046,0
2023-05-07 10:50:00,1130041,1130046,0
2023-05-07 10:55:00,1130041,1130046,2
