In [1]:
import sys
sys.version

'3.6.8 (default, Jan 14 2019, 11:02:34) \n[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]'

In [2]:
import os
# Switch to the project directory so the relative 'model_mr/...' paths used by
# later cells resolve.
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
os.chdir("/project/work/Passenger_Demand")

## Install packages
# NOTE(review): versions are not pinned, and `!pip` runs in a shell that may not
# be the kernel's environment -- consider `%pip install <pkg>==<version>`.
!pip install seaborn
!pip install haversine
!pip install pyarrow
!pip install multiprocessing_on_dill
!pip install statsmodels

In [3]:
import pandas as pd
import numpy as np
import math
import numbers

from haversine import haversine
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import datetime

In [11]:
from multiprocessing_on_dill import Pool, cpu_count

In [5]:
def groupby_process(df,
                    func,
                    group_keys,
                    num_cores = None,
                    **params) :
    """Apply ``func`` to every group of ``df`` in parallel and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split into groups.
    func : callable
        Called once per group as ``func(group_df=<copy of the group>, **params)``.
        Its return value must be something ``pd.concat`` accepts.
    group_keys : label or list of labels
        Passed straight to ``df.groupby``.
    num_cores : int, optional
        Upper bound on the number of worker processes; defaults to
        ``cpu_count()``. No more workers than groups are started.
    **params
        Extra keyword arguments forwarded unchanged to ``func``.

    Returns
    -------
    The ``pd.concat`` of the per-group results, in groupby iteration order.

    Raises
    ------
    ValueError
        If ``df`` yields no groups (previously raised by ``pd.concat`` only
        after the worker pool had already been started).
    """
    if num_cores is None:
        num_cores = cpu_count()

    groups = [group for _, group in df.groupby(group_keys)]
    if not groups:
        raise ValueError(
            "No objects to concatenate: df produced no groups for group_keys=%r" % (group_keys,)
        )

    def map_func(data):
        # Each worker receives its own copy so ``func`` may mutate it freely.
        return func(group_df = data.copy(), **params)

    # Starting more processes than there are groups only costs start-up time.
    n_workers = min(num_cores, len(groups))
    with Pool(n_workers) as p:
        results = p.map(map_func, groups)

    return pd.concat(results)

In [6]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

# Use the Nanum Gothic font for matplotlib and seaborn.
# NOTE(review): presumably needed so Korean text renders in plots -- confirm the
# font is installed on the machine running the notebook.
plt.rcParams['font.family'] = 'Nanum Gothic'
sns.set(font="NanumGothic")

In [None]:
%%time
bus_demand_401 = pd.read_parquet('model_mr/mybicard_401_agg.parquet', engine='pyarrow')

In [None]:
bus_demand_401.groupby(pd.Grouper(key="transdate", freq='1D')).size()

In [None]:
bus_demand_401.sort_values('transdate').head(10)

In [None]:
# Work on a copy so the helper column below is not added to bus_demand_401.
bus_demand_401_check = bus_demand_401.copy()
# Calendar date of each record (time of day dropped).
bus_demand_401_check['date'] = bus_demand_401['transdate'].dt.date

In [None]:
bus_demand_401_check.groupby(['mybi_stop_id']).nunique().head(10)

In [None]:
# First and last timestamps present in the data
min_date, max_date = bus_demand_401['transdate'].min(), bus_demand_401['transdate'].max()
min_date, max_date

In [None]:
# Number of whole days between the first and the last timestamp
( bus_demand_401['transdate'].max() - bus_demand_401['transdate'].min() ).days

In [None]:
# Build one entry per calendar day, from the first to the last observed day
# (both inclusive). The number of days is derived from the data instead of
# being hard-coded, and the start is normalised to midnight because the hour of
# day is added separately when the full timestamp grid is built.
first_day = min_date.normalize()
n_days = (max_date.normalize() - first_day).days + 1
entire_date = [first_day + datetime.timedelta(days= i) for i in range(n_days)]
entire_date[:1], entire_date[-1:]

In [None]:
# Distinct hours of the day that occur in the data, sorted in place (ascending)
hours = bus_demand_401['transdate'].dt.hour.unique()
hours.sort(); hours

In [None]:
# Full timestamp grid: every day in entire_date combined with every observed hour
transdate = [x + datetime.timedelta(hours= int(y)) for x in entire_date for y in hours]
transdate[18:22]

In [None]:
# Distinct stop IDs that occur in the data
stop_ids = bus_demand_401['mybi_stop_id'].unique()
len(stop_ids)

In [None]:
# Full (stop, timestamp) grid: every stop ID is repeated once per timestamp,
# and the whole timestamp list is repeated once per stop, so the two columns
# line up row by row.
data = dict(
    mybi_stop_id=[stop_id for stop_id in stop_ids for _ in transdate],
    transdate=transdate * len(stop_ids),
)
bus_stop_df = pd.DataFrame(data)

In [None]:
bus_stop_df.head(3)

In [None]:
bus_demand_401.head(3)

In [None]:
# Left-join the observed counts onto the full (stop, timestamp) grid; grid rows
# without a matching record keep NaN in the count columns.
aa = pd.merge(bus_stop_df, bus_demand_401, on=['mybi_stop_id', 'transdate'], how='left')

In [None]:
aa.shape

In [None]:
def fill_blank_data(data, group_data):
    """Return one (stop, timestamp) row with missing passenger counts imputed.

    The count columns are ``normalcnt``, ``studentcnt``, ``childcnt`` and
    ``totalcnt``. A row whose ``totalcnt`` is present is returned unchanged.
    Otherwise the counts are filled using the first rule that applies:

    1. The stop has at least one observed row on the same calendar day: the
       missing hour is treated as "no passengers" and filled with zeros.
    2. The same time slot was observed in any of the previous four weeks: the
       mean of those observations, rounded up.
    3. The same time slot was observed more than four weeks earlier: the most
       recent such observation (searched one week at a time).
    4. Nothing found: zeros.

    Parameters
    ----------
    data : pandas.Series
        The row to fill; must contain ``transdate`` and the count columns.
    group_data : pandas.DataFrame
        Rows of the same stop used as evidence. Rows whose ``totalcnt`` is
        missing are ignored, so the frame may still contain blank grid rows.

    Returns
    -------
    pandas.Series
        A copy of ``data`` (same index) with the count columns filled.
    """
    col_names = ['normalcnt','studentcnt','childcnt','totalcnt']
    row = data.copy()

    # Row already has data: nothing to impute.
    if not pd.isnull(row['totalcnt']):
        return row

    def _filled(values):
        # Copy of the row with the count columns replaced by ``values``.
        filled = row.copy()
        for name, value in zip(col_names, values):
            filled[name] = value
        return filled

    # Only rows that actually carry counts can be used as evidence or donors;
    # blank grid rows would otherwise make every day look "observed".
    observed = group_data[group_data['totalcnt'].notnull()].sort_values(['transdate'])
    target_time = row['transdate']

    # Rule 1: the stop reported on that day, so this hour had no passengers.
    same_day = observed[observed['transdate'].dt.date == target_time.date()]
    if len(same_day) > 0:
        return _filled([0, 0, 0, 0])

    # Rule 2: same time slot in each of the previous four weeks.
    date_4W_list = [target_time - datetime.timedelta(weeks= i) for i in range(1,5)]
    recent = observed[observed['transdate'].isin(date_4W_list)]
    if len(recent) > 0:
        means = recent[col_names].mean()
        return _filled([0 if pd.isnull(v) else math.ceil(v) for v in means])

    # Rule 3: keep stepping back one week at a time until data is found or the
    # start of the observed history is passed.
    if len(observed) > 0:
        earliest = observed['transdate'].min()
        tmp_date = target_time - datetime.timedelta(weeks= 5)
        while tmp_date >= earliest:
            match = observed[observed['transdate'] == tmp_date]
            if len(match) > 0:
                return _filled(match.iloc[0][col_names].tolist())
            tmp_date -= datetime.timedelta(weeks= 1)

    # Rule 4: no usable history at all.
    return _filled([0, 0, 0, 0])

In [None]:
%%time
# NOTE(review): `parallelize_dataframe` is not defined in any cell shown in this
# notebook, so this cell fails on a fresh kernel. The helper that is defined,
# `groupby_process`, calls func(group_df=...), which does not match the
# fill_blank_data(data, group_data) signature -- confirm the intended call.
bb = parallelize_dataframe(df = aa, func = fill_blank_data, group_keys = 'mybi_stop_id', num_cores = 40)

In [None]:
bb.shape

In [None]:
bb.head(10)

In [None]:
bb.isnull().sum()

In [7]:
def period_before_cnt(group_df, cols, i, rows_per_day= 20):
    """Add lag columns holding each count ``i`` days and ``i`` weeks earlier.

    For every column in ``cols`` two columns are appended:

    * ``<col>_D-<i>``: value ``i * rows_per_day`` rows earlier within the stop.
    * ``<col>_W-<i>``: value ``i * rows_per_day * 7`` rows earlier within the stop.

    Where no earlier row exists, the row's own value of ``<col>`` is used.

    Lags are computed by row offset, not by timestamp. This assumes each stop
    has exactly ``rows_per_day`` rows for every day with no gaps -- TODO confirm
    against the input data.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Must contain ``mybi_stop_id``, ``transdate`` and the columns in ``cols``.
    cols : iterable of str
        Columns to create lags for.
    i : int
        Lag size in days / weeks.
    rows_per_day : int, default 20
        Number of rows each stop has per day.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group_df`` sorted by ``transdate`` with a fresh 0..n-1 index
        and the lag columns appended.
    """
    group_df_copy = group_df.sort_values(['transdate']).reset_index(drop=True)

    # (column-name infix, number of rows to shift)
    lag_specs = (('_D-', i * rows_per_day), ('_W-', i * rows_per_day * 7))

    for col in cols:
        for infix, periods in lag_specs:
            lagged = group_df_copy.groupby('mybi_stop_id')[col].shift(periods)
            # No history that far back: fall back to the current value.
            group_df_copy[col + infix + str(i)] = np.where(pd.notnull(lagged), lagged, group_df_copy[col])

    return group_df_copy

In [8]:
# Count columns for which the lag and moving-average features below are generated.
target_cols = ['normalcnt', 'studentcnt', 'childcnt', 'totalcnt']

In [9]:
%%time
# Load the aggregated counts from the '..._imputation' parquet file.
# NOTE(review): presumably the gap-filled output of the imputation step above --
# no cell in this notebook writes this file, so confirm where it comes from.
mybicard_401_agg_imputation = pd.read_parquet('model_mr/mybicard_401_agg_imputation.parquet', engine='pyarrow')

CPU times: user 654 ms, sys: 1.35 s, total: 2.01 s
Wall time: 458 ms


In [12]:
%%time
# Add the 1-day / 1-week lag columns, one stop per worker. The next five cells
# repeat this call with i = 2..6, each taking the previous cell's result as input.
# NOTE: this rebinds `aa`, which an earlier cell used for the merged grid.
aa = groupby_process(df= mybicard_401_agg_imputation[['mybi_stop_id', 'transdate'] + target_cols], func= period_before_cnt, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 1)

CPU times: user 581 ms, sys: 382 ms, total: 963 ms
Wall time: 1.21 s


In [13]:
%%time
bb = groupby_process(df= aa, func= period_before_cnt, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 2)

CPU times: user 734 ms, sys: 362 ms, total: 1.1 s
Wall time: 1.36 s


In [14]:
%%time
cc = groupby_process(df= bb, func= period_before_cnt, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 3)

CPU times: user 890 ms, sys: 562 ms, total: 1.45 s
Wall time: 1.58 s


In [15]:
%%time
dd = groupby_process(df= cc, func= period_before_cnt, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 4)

CPU times: user 968 ms, sys: 692 ms, total: 1.66 s
Wall time: 1.76 s


In [16]:
%%time
ee = groupby_process(df= dd, func= period_before_cnt, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 5)

CPU times: user 1.25 s, sys: 888 ms, total: 2.14 s
Wall time: 2.21 s


In [17]:
%%time
ff = groupby_process(df= ee, func= period_before_cnt, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 6)

CPU times: user 1.35 s, sys: 1.17 s, total: 2.51 s
Wall time: 2.6 s


1) 이전 n개일자들의 동일 시간대 평균
2) n주전까지의 동일 요일의 동일 시간대 평균

3) 이전 n개일자들의 전체 평균
4) n주전까지의 동일 요일의 전체 평균
5) n주전까지의 전체 평균

In [18]:
def moving_average_hour(group_df, cols, i, rows_per_day= 20):
    """Add same-time-slot moving mean / std columns over previous days and weeks.

    For every column in ``cols`` four columns are appended (current row excluded
    from every window):

    * ``<col>_MA_hour_mean_D<i>`` / ``<col>_MA_hour_std_D<i>``: mean / standard
      deviation of the values 1..i days earlier, i.e. the rows
      ``j * rows_per_day`` positions back for ``j = 1..i``.
    * ``<col>_MA_hour_mean_W<i>`` / ``<col>_MA_hour_std_W<i>``: the same over the
      values 1..i weeks earlier (``j * rows_per_day * 7`` positions back).

    Missing look-back values are skipped. Where a statistic cannot be computed
    (no history for the mean, fewer than two values for the std) the row's own
    value of ``<col>`` is used instead.

    The shifts are positional and are not grouped by stop, so ``group_df`` is
    expected to hold a single stop with exactly ``rows_per_day`` rows per day
    and no gaps -- TODO confirm against the input data.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Must contain ``transdate`` and the columns in ``cols``.
    cols : iterable of str
        Columns to build the statistics for.
    i : int
        Number of previous days / weeks in each window.
    rows_per_day : int, default 20
        Number of rows per day.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group_df`` sorted by ``transdate`` with a fresh 0..n-1 index
        and the new columns appended (all day-window columns first, then all
        week-window columns).
    """
    group_df_copy = group_df.sort_values(['transdate']).reset_index(drop=True)

    # (column-name tag, rows between two consecutive look-back steps)
    windows = (('D', rows_per_day), ('W', rows_per_day * 7))

    for tag, step in windows:
        for col in cols:
            mean_col_name = col + '_MA_hour_mean_' + tag + str(i)
            std_col_name = col + '_MA_hour_std_' + tag + str(i)

            # One column per look-back step; the current row is never included.
            history = pd.DataFrame(
                {col + '_' + str(j): group_df_copy[col].shift(j * step) for j in range(1, i + 1)},
                index= group_df_copy.index,
            )
            window_mean = history.mean(axis=1)
            window_std = history.std(axis=1)

            # Fall back to the current value where the statistic is undefined.
            group_df_copy[mean_col_name] = np.where(pd.notnull(window_mean), window_mean, group_df_copy[col])
            group_df_copy[std_col_name] = np.where(pd.notnull(window_std), window_std, group_df_copy[col])

    return group_df_copy

In [19]:
%%time
# Add the same-time-slot moving mean/std columns for a window of 2, one stop per
# worker. The next four cells repeat this call with i = 3..6, each taking the
# previous cell's result as input.
aaa = groupby_process(df= ff, func= moving_average_hour, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 2)

CPU times: user 1.72 s, sys: 1.26 s, total: 2.98 s
Wall time: 3.27 s


In [20]:
%%time
bbb = groupby_process(df= aaa, func= moving_average_hour, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 3)

CPU times: user 1.99 s, sys: 1.54 s, total: 3.53 s
Wall time: 3.81 s


In [21]:
%%time
ccc = groupby_process(df= bbb, func= moving_average_hour, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 4)

CPU times: user 2.3 s, sys: 1.68 s, total: 3.98 s
Wall time: 4.26 s


In [22]:
%%time
ddd = groupby_process(df= ccc, func= moving_average_hour, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 5)

CPU times: user 2.64 s, sys: 2.13 s, total: 4.77 s
Wall time: 4.92 s


In [23]:
%%time
eee = groupby_process(df= ddd, func= moving_average_hour, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols= target_cols, i= 6)

CPU times: user 3.1 s, sys: 2.28 s, total: 5.37 s
Wall time: 5.58 s


In [24]:
def moving_average_all(group_df, cols, i):
    """Add moving mean / std columns computed on daily totals.

    The rows are first summed per calendar day. For every column in ``cols``
    and every day, three windows over the previous daily totals are built (the
    current day is excluded from all of them):

    * ``A``: all of the previous ``7 * i`` days
    * ``D``: the previous ``i`` days
    * ``W``: the same weekday 1..i weeks earlier (7, 14, ..., ``7 * i`` days back)

    and the columns ``<col>_MA_all_mean_<A|D|W><i>`` and
    ``<col>_MA_all_std_<A|D|W><i>`` are appended. Every row of a day receives
    that day's values. Missing look-back days are skipped; where a statistic
    cannot be computed (no history for the mean, fewer than two values for the
    std) the current day's own total is used instead.

    Look-back is by position in the daily table, so days are assumed to be
    consecutive with no missing dates -- TODO confirm against the input data.
    ``group_df`` is expected to hold a single stop; rows are not grouped by
    stop here.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Must contain ``transdate`` (datetime) and the columns in ``cols``.
    cols : list of str
        Columns to build the statistics for.
    i : int
        Window size in days / weeks.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group_df`` sorted by ``transdate`` with a fresh 0..n-1 index
        and the new columns appended.
    """
    group_df_copy = group_df.sort_values(['transdate']).reset_index(drop=True)
    cols = list(cols)

    # Daily totals. Only the count columns are summed; a timestamp column has
    # no meaningful sum.
    day_key = group_df_copy['transdate'].dt.date
    daily_sum = group_df_copy[cols].groupby(day_key).sum()

    daily_features = pd.DataFrame(index= daily_sum.index)
    for col in cols:
        # Column j-1 holds the daily total j days earlier (j = 1 .. 7*i).
        history = pd.DataFrame(
            {col + '_' + str(j): daily_sum[col].shift(j) for j in range(1, (i*7)+1)},
            index= daily_sum.index,
        )
        windows = (
            ('A', history),
            ('D', history[history.columns[:i]]),
            # Same weekday 1..i weeks back. The upper bound is inclusive so the
            # i-th week is part of the window (and i=1 is not an empty window).
            ('W', history[[history.columns[7*n-1] for n in range(1, i+1)]]),
        )
        for stat in ('mean', 'std'):
            for tag, window in windows:
                values = getattr(window, stat)(axis=1)
                # Fall back to the current day's total where undefined.
                daily_features[col + '_MA_all_' + stat + '_' + tag + str(i)] = np.where(
                    pd.notnull(values), values, daily_sum[col])

    # Broadcast each day's features back onto every row of that day.
    per_row = daily_features.reindex(day_key.values)
    per_row.index = group_df_copy.index
    return pd.concat([group_df_copy, per_row], axis=1)

In [25]:
%%time
# Add the daily-total moving mean/std columns for a window of 1, one stop per
# worker. The next five cells repeat this call with i = 2..6, each taking the
# previous cell's result as input.
aaaa = groupby_process(df= eee, func= moving_average_all, group_keys= 'mybi_stop_id', num_cores= 10, cols= target_cols, i= 1)

CPU times: user 3.79 s, sys: 2.93 s, total: 6.72 s
Wall time: 6.71 s


In [26]:
%%time
bbbb = groupby_process(df= aaaa, func= moving_average_all, group_keys= 'mybi_stop_id', num_cores= 10, cols= target_cols, i= 2)

CPU times: user 4.36 s, sys: 3.68 s, total: 8.04 s
Wall time: 8.14 s


In [27]:
%%time
cccc = groupby_process(df= bbbb, func= moving_average_all, group_keys= 'mybi_stop_id', num_cores= 10, cols= target_cols, i= 3)

CPU times: user 4.44 s, sys: 3.41 s, total: 7.85 s
Wall time: 8.03 s


In [28]:
%%time
dddd = groupby_process(df= cccc, func= moving_average_all, group_keys= 'mybi_stop_id', num_cores= 10, cols= target_cols, i= 4)

CPU times: user 4.82 s, sys: 4.06 s, total: 8.87 s
Wall time: 9.32 s


In [29]:
%%time
eeee = groupby_process(df= dddd, func= moving_average_all, group_keys= 'mybi_stop_id', num_cores= 10, cols= target_cols, i= 5)

CPU times: user 5.5 s, sys: 4.86 s, total: 10.4 s
Wall time: 10.6 s


In [30]:
%%time
ffff = groupby_process(df= eeee, func= moving_average_all, group_keys= 'mybi_stop_id', num_cores= 10, cols= target_cols, i= 6)

CPU times: user 5.86 s, sys: 6.44 s, total: 12.3 s
Wall time: 14.3 s


In [31]:
ffff.head(3)

Unnamed: 0,mybi_stop_id,transdate,normalcnt,studentcnt,childcnt,totalcnt,normalcnt_D-1,normalcnt_W-1,studentcnt_D-1,studentcnt_W-1,childcnt_D-1,childcnt_W-1,totalcnt_D-1,totalcnt_W-1,normalcnt_D-2,normalcnt_W-2,studentcnt_D-2,studentcnt_W-2,childcnt_D-2,childcnt_W-2,totalcnt_D-2,totalcnt_W-2,normalcnt_D-3,normalcnt_W-3,studentcnt_D-3,studentcnt_W-3,childcnt_D-3,childcnt_W-3,totalcnt_D-3,totalcnt_W-3,normalcnt_D-4,normalcnt_W-4,studentcnt_D-4,studentcnt_W-4,childcnt_D-4,childcnt_W-4,totalcnt_D-4,totalcnt_W-4,normalcnt_D-5,normalcnt_W-5,studentcnt_D-5,studentcnt_W-5,childcnt_D-5,childcnt_W-5,totalcnt_D-5,totalcnt_W-5,normalcnt_D-6,normalcnt_W-6,studentcnt_D-6,studentcnt_W-6,childcnt_D-6,childcnt_W-6,totalcnt_D-6,totalcnt_W-6,normalcnt_MA_hour_mean_D2,normalcnt_MA_hour_std_D2,studentcnt_MA_hour_mean_D2,studentcnt_MA_hour_std_D2,childcnt_MA_hour_mean_D2,childcnt_MA_hour_std_D2,totalcnt_MA_hour_mean_D2,totalcnt_MA_hour_std_D2,normalcnt_MA_hour_mean_W2,normalcnt_MA_hour_std_W2,studentcnt_MA_hour_mean_W2,studentcnt_MA_hour_std_W2,childcnt_MA_hour_mean_W2,childcnt_MA_hour_std_W2,totalcnt_MA_hour_mean_W2,totalcnt_MA_hour_std_W2,normalcnt_MA_hour_mean_D3,normalcnt_MA_hour_std_D3,studentcnt_MA_hour_mean_D3,studentcnt_MA_hour_std_D3,childcnt_MA_hour_mean_D3,childcnt_MA_hour_std_D3,totalcnt_MA_hour_mean_D3,totalcnt_MA_hour_std_D3,normalcnt_MA_hour_mean_W3,normalcnt_MA_hour_std_W3,studentcnt_MA_hour_mean_W3,studentcnt_MA_hour_std_W3,childcnt_MA_hour_mean_W3,childcnt_MA_hour_std_W3,totalcnt_MA_hour_mean_W3,totalcnt_MA_hour_std_W3,normalcnt_MA_hour_mean_D4,normalcnt_MA_hour_std_D4,studentcnt_MA_hour_mean_D4,studentcnt_MA_hour_std_D4,childcnt_MA_hour_mean_D4,childcnt_MA_hour_std_D4,totalcnt_MA_hour_mean_D4,totalcnt_MA_hour_std_D4,normalcnt_MA_hour_mean_W4,normalcnt_MA_hour_std_W4,studentcnt_MA_hour_mean_W4,studentcnt_MA_hour_std_W4,childcnt_MA_hour_mean_W4,childcnt_MA_hour_std_W4,totalcnt_MA_hour_mean_W4,totalcnt_MA_hour_std_W4,normalcnt_MA_hour_mean_D5,normalcnt_MA_hour_std_D5,stud
entcnt_MA_hour_mean_D5,studentcnt_MA_hour_std_D5,childcnt_MA_hour_mean_D5,childcnt_MA_hour_std_D5,totalcnt_MA_hour_mean_D5,totalcnt_MA_hour_std_D5,normalcnt_MA_hour_mean_W5,normalcnt_MA_hour_std_W5,studentcnt_MA_hour_mean_W5,studentcnt_MA_hour_std_W5,childcnt_MA_hour_mean_W5,childcnt_MA_hour_std_W5,totalcnt_MA_hour_mean_W5,totalcnt_MA_hour_std_W5,normalcnt_MA_hour_mean_D6,normalcnt_MA_hour_std_D6,studentcnt_MA_hour_mean_D6,studentcnt_MA_hour_std_D6,childcnt_MA_hour_mean_D6,childcnt_MA_hour_std_D6,totalcnt_MA_hour_mean_D6,totalcnt_MA_hour_std_D6,normalcnt_MA_hour_mean_W6,normalcnt_MA_hour_std_W6,studentcnt_MA_hour_mean_W6,studentcnt_MA_hour_std_W6,childcnt_MA_hour_mean_W6,childcnt_MA_hour_std_W6,totalcnt_MA_hour_mean_W6,totalcnt_MA_hour_std_W6,normalcnt_MA_all_mean_A1,normalcnt_MA_all_mean_D1,normalcnt_MA_all_mean_W1,normalcnt_MA_all_std_A1,normalcnt_MA_all_std_D1,normalcnt_MA_all_std_W1,studentcnt_MA_all_mean_A1,studentcnt_MA_all_mean_D1,studentcnt_MA_all_mean_W1,studentcnt_MA_all_std_A1,studentcnt_MA_all_std_D1,studentcnt_MA_all_std_W1,childcnt_MA_all_mean_A1,childcnt_MA_all_mean_D1,childcnt_MA_all_mean_W1,childcnt_MA_all_std_A1,childcnt_MA_all_std_D1,childcnt_MA_all_std_W1,totalcnt_MA_all_mean_A1,totalcnt_MA_all_mean_D1,totalcnt_MA_all_mean_W1,totalcnt_MA_all_std_A1,totalcnt_MA_all_std_D1,totalcnt_MA_all_std_W1,normalcnt_MA_all_mean_A2,normalcnt_MA_all_mean_D2,normalcnt_MA_all_mean_W2,normalcnt_MA_all_std_A2,normalcnt_MA_all_std_D2,normalcnt_MA_all_std_W2,studentcnt_MA_all_mean_A2,studentcnt_MA_all_mean_D2,studentcnt_MA_all_mean_W2,studentcnt_MA_all_std_A2,studentcnt_MA_all_std_D2,studentcnt_MA_all_std_W2,childcnt_MA_all_mean_A2,childcnt_MA_all_mean_D2,childcnt_MA_all_mean_W2,childcnt_MA_all_std_A2,childcnt_MA_all_std_D2,childcnt_MA_all_std_W2,totalcnt_MA_all_mean_A2,totalcnt_MA_all_mean_D2,totalcnt_MA_all_mean_W2,totalcnt_MA_all_std_A2,totalcnt_MA_all_std_D2,totalcnt_MA_all_std_W2,normalcnt_MA_all_mean_A3,normalcnt_MA_all_mean_D3,normalcnt_MA_all_mean_W3,normalcn
t_MA_all_std_A3,normalcnt_MA_all_std_D3,normalcnt_MA_all_std_W3,studentcnt_MA_all_mean_A3,studentcnt_MA_all_mean_D3,studentcnt_MA_all_mean_W3,studentcnt_MA_all_std_A3,studentcnt_MA_all_std_D3,studentcnt_MA_all_std_W3,childcnt_MA_all_mean_A3,childcnt_MA_all_mean_D3,childcnt_MA_all_mean_W3,childcnt_MA_all_std_A3,childcnt_MA_all_std_D3,childcnt_MA_all_std_W3,totalcnt_MA_all_mean_A3,totalcnt_MA_all_mean_D3,totalcnt_MA_all_mean_W3,totalcnt_MA_all_std_A3,totalcnt_MA_all_std_D3,totalcnt_MA_all_std_W3,normalcnt_MA_all_mean_A4,normalcnt_MA_all_mean_D4,normalcnt_MA_all_mean_W4,normalcnt_MA_all_std_A4,normalcnt_MA_all_std_D4,normalcnt_MA_all_std_W4,studentcnt_MA_all_mean_A4,studentcnt_MA_all_mean_D4,studentcnt_MA_all_mean_W4,studentcnt_MA_all_std_A4,studentcnt_MA_all_std_D4,studentcnt_MA_all_std_W4,childcnt_MA_all_mean_A4,childcnt_MA_all_mean_D4,childcnt_MA_all_mean_W4,childcnt_MA_all_std_A4,childcnt_MA_all_std_D4,childcnt_MA_all_std_W4,totalcnt_MA_all_mean_A4,totalcnt_MA_all_mean_D4,totalcnt_MA_all_mean_W4,totalcnt_MA_all_std_A4,totalcnt_MA_all_std_D4,totalcnt_MA_all_std_W4,normalcnt_MA_all_mean_A5,normalcnt_MA_all_mean_D5,normalcnt_MA_all_mean_W5,normalcnt_MA_all_std_A5,normalcnt_MA_all_std_D5,normalcnt_MA_all_std_W5,studentcnt_MA_all_mean_A5,studentcnt_MA_all_mean_D5,studentcnt_MA_all_mean_W5,studentcnt_MA_all_std_A5,studentcnt_MA_all_std_D5,studentcnt_MA_all_std_W5,childcnt_MA_all_mean_A5,childcnt_MA_all_mean_D5,childcnt_MA_all_mean_W5,childcnt_MA_all_std_A5,childcnt_MA_all_std_D5,childcnt_MA_all_std_W5,totalcnt_MA_all_mean_A5,totalcnt_MA_all_mean_D5,totalcnt_MA_all_mean_W5,totalcnt_MA_all_std_A5,totalcnt_MA_all_std_D5,totalcnt_MA_all_std_W5,normalcnt_MA_all_mean_A6,normalcnt_MA_all_mean_D6,normalcnt_MA_all_mean_W6,normalcnt_MA_all_std_A6,normalcnt_MA_all_std_D6,normalcnt_MA_all_std_W6,studentcnt_MA_all_mean_A6,studentcnt_MA_all_mean_D6,studentcnt_MA_all_mean_W6,studentcnt_MA_all_std_A6,studentcnt_MA_all_std_D6,studentcnt_MA_all_std_W6,childcnt_MA_all_mean_A6,childcnt_MA_a
ll_mean_D6,childcnt_MA_all_mean_W6,childcnt_MA_all_std_A6,childcnt_MA_all_std_D6,childcnt_MA_all_std_W6,totalcnt_MA_all_mean_A6,totalcnt_MA_all_mean_D6,totalcnt_MA_all_mean_W6,totalcnt_MA_all_std_A6,totalcnt_MA_all_std_D6,totalcnt_MA_all_std_W6
0,3100020,2020-04-08 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0
1,3100020,2020-04-08 05:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0
2,3100020,2020-04-08 06:00:00,2.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,48.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,50.0,50.0,50.0,50.0,50.0


In [32]:
# Persist the final feature table (lags plus both moving-average families).
ffff.to_parquet("model_mr/mybicard_401_agg_moving_average.parquet")

In [33]:
ffff.shape

(620940, 278)

In [35]:
# Spot check: the 18:00 rows of one stop for the first two weeks (14 rows),
# showing the raw total next to a sample of the generated features.
ffff[(ffff['mybi_stop_id']==3100020) & (ffff['transdate'].dt.hour==18)] \
    [['mybi_stop_id', 'transdate', 'totalcnt', 'totalcnt_D-1', 'totalcnt_D-2',
      'totalcnt_MA_hour_mean_D2', 'totalcnt_MA_hour_mean_W2',
      'totalcnt_MA_all_mean_A2', 'totalcnt_MA_all_mean_D2', 'totalcnt_MA_all_mean_W2']].head(14)

Unnamed: 0,mybi_stop_id,transdate,totalcnt,totalcnt_D-1,totalcnt_D-2,totalcnt_MA_hour_mean_D2,totalcnt_MA_hour_mean_W2,totalcnt_MA_all_mean_A2,totalcnt_MA_all_mean_D2,totalcnt_MA_all_mean_W2
14,3100020,2020-04-08 18:00:00,2.0,2.0,2.0,2.0,2.0,50.0,50.0,50.0
34,3100020,2020-04-09 18:00:00,0.0,2.0,0.0,2.0,0.0,50.0,50.0,27.0
54,3100020,2020-04-10 18:00:00,2.0,0.0,2.0,1.0,2.0,38.5,38.5,42.0
74,3100020,2020-04-11 18:00:00,1.0,2.0,0.0,1.0,1.0,39.666667,34.5,25.0
94,3100020,2020-04-12 18:00:00,2.0,1.0,2.0,1.5,2.0,36.0,33.5,23.0
114,3100020,2020-04-13 18:00:00,3.0,2.0,1.0,1.5,3.0,33.4,24.0,41.0
134,3100020,2020-04-14 18:00:00,7.0,3.0,2.0,2.5,7.0,34.666667,32.0,57.0
154,3100020,2020-04-15 18:00:00,0.0,7.0,3.0,5.0,2.0,37.857143,49.0,50.0
174,3100020,2020-04-16 18:00:00,0.0,0.0,7.0,3.5,0.0,33.125,28.5,27.0
194,3100020,2020-04-17 18:00:00,0.0,0.0,0.0,0.0,2.0,29.444444,0.0,42.0
