In [1]:
import sys
sys.version

'3.6.8 (default, Jan 14 2019, 11:02:34) \n[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]'

In [2]:
import os
os.chdir("/project/work/Passenger_Demand")

## 패키지 설치
!pip install seaborn
!pip install haversine
!pip install pyarrow
!pip install multiprocessing_on_dill
!pip install statsmodels

In [3]:
import pandas as pd
import numpy as np
import math
import numbers

from haversine import haversine
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import datetime

In [5]:
from multiprocessing_on_dill import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import pandas as pd
import numpy as np


def parallelize_dataframe(df, 
                          func, 
                          group_keys = None, 
                          num_cores = None,
                          **params):
    """
        Pandas DataFrame의 apply함수를 병렬처리 하는 함수

        Args: 
            df: 적용 대상 데이터프레임 (Pandas.DataFrame)
            group_keys: apply를 적용할 때 기준이되는 group by key

        Returns:
            df: (Pandas.DataFrame)
            
        Exception: 
    """

    if num_cores is None:
        num_cores = cpu_count()

    if group_keys is None:
        df_list = np.array_split(df, num_cores)
    elif group_keys is not None:
        gr_df = df.groupby(group_keys)
        df_list = [group for name, group in gr_df]
    
    #func = partial(func, **params)
    
    def map_func(data):
        return data.apply(func, axis = 1, group_data= data.copy() ,**params)
        
        
    with Pool(num_cores) as p:

        pd_result = pd.concat(p.map(map_func, df_list))
#         pd_result = p.starmap(func, **params)
        
    return pd_result

In [6]:
pd.set_option('display.max_columns', None)

plt.rcParams['font.family'] = 'Nanum Gothic'
sns.set(font="NanumGothic")

In [7]:
%%time
bus_demand_401 = pd.read_parquet('model_mr/mybicard_401_agg.parquet', engine='pyarrow')

CPU times: user 83.8 ms, sys: 81.3 ms, total: 165 ms
Wall time: 114 ms


In [8]:
bus_demand_401.sort_values('transdate').head(10)

Unnamed: 0,mybi_stop_id,transdate,normalcnt,studentcnt,childcnt,totalcnt
223368,3101482,2020-04-08 00:00:00,1,1,0,2
16084,3100144,2020-04-08 00:00:00,2,0,0,2
89825,3101414,2020-04-08 00:00:00,0,1,0,1
164705,3101446,2020-04-08 00:00:00,1,0,0,1
156351,3101442,2020-04-08 00:00:00,1,0,0,1
95714,3101416,2020-04-08 00:00:00,1,0,0,1
150970,3101440,2020-04-08 00:00:00,1,0,0,1
278482,3101537,2020-04-08 05:00:00,2,0,0,2
293924,3101542,2020-04-08 05:00:00,16,0,0,16
13909,3100101,2020-04-08 05:00:00,1,0,0,1


In [9]:
bus_demand_401_check = bus_demand_401.copy()
bus_demand_401_check['date'] = bus_demand_401['transdate'].dt.date

In [10]:
bus_demand_401_check.groupby(['mybi_stop_id']).nunique().head(10)

Unnamed: 0_level_0,mybi_stop_id,transdate,normalcnt,studentcnt,childcnt,totalcnt,date
mybi_stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3100020,1,2797,15,6,1,16,190
3100021,1,3311,25,5,3,23,190
3100057,1,1518,19,3,2,19,190
3100058,1,2102,20,4,3,19,190
3100085,1,1150,6,6,2,8,189
3100086,1,3031,15,5,4,14,190
3100101,1,1444,8,4,1,8,190
3100106,1,37,4,2,1,3,35
3100142,1,315,7,21,1,22,140
3100143,1,379,6,11,2,13,152


In [11]:
#처음날짜, 마지막날짜
min_date, max_date = bus_demand_401['transdate'].min(), bus_demand_401['transdate'].max()
min_date, max_date

(Timestamp('2020-04-08 00:00:00'), Timestamp('2020-11-30 23:00:00'))

In [12]:
#전체 일수 계산
( bus_demand_401['transdate'].max() - bus_demand_401['transdate'].min() ).days

236

In [13]:
#전체 날짜 생성
entire_date = [min_date + datetime.timedelta(days= i) for i in range(237)]
entire_date[:1], entire_date[-1:]

([Timestamp('2020-04-08 00:00:00')], [Timestamp('2020-11-30 00:00:00')])

In [14]:
#보유데이터 시각 추출
hours = bus_demand_401['transdate'].dt.hour.unique()
hours.sort(); hours

array([ 0,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21, 22, 23])

In [15]:
#전체 transdate 생성
transdate = [x + datetime.timedelta(hours= int(y)) for x in entire_date for y in hours]
transdate[18:22]

[Timestamp('2020-04-08 22:00:00'),
 Timestamp('2020-04-08 23:00:00'),
 Timestamp('2020-04-09 00:00:00'),
 Timestamp('2020-04-09 05:00:00')]

In [16]:
#보유데이터 정류장번호 추출
stop_ids = bus_demand_401['mybi_stop_id'].unique()
len(stop_ids)

131

In [17]:
data = {'mybi_stop_id' : [stop_id for stop_id in stop_ids for i in range(len(transdate))],
        'transdate' : transdate * len(stop_ids)}
bus_stop_df = pd.DataFrame(data= data)

In [18]:
bus_stop_df.head(3)

Unnamed: 0,mybi_stop_id,transdate
0,3100020,2020-04-08 00:00:00
1,3100020,2020-04-08 05:00:00
2,3100020,2020-04-08 06:00:00


In [19]:
bus_demand_401.head(3)

Unnamed: 0,mybi_stop_id,transdate,normalcnt,studentcnt,childcnt,totalcnt
0,3100020,2020-04-08 06:00:00,2,0,0,2
1,3100020,2020-04-08 08:00:00,7,0,0,7
2,3100020,2020-04-08 10:00:00,2,0,0,2


In [20]:
aa = pd.merge(bus_stop_df, bus_demand_401, on=['mybi_stop_id', 'transdate'], how='left')

In [21]:
aa.shape

(620940, 6)

In [22]:
def fill_blank_data(data, group_data):
    data_copy = data.copy()
    # 데이터 있는지 확인
    if math.isnan(data_copy['totalcnt']) : # 데이터 없음
        col_names = ['normalcnt','studentcnt','childcnt','totalcnt']
        subset_df = group_data.sort_values(['transdate']).reset_index(drop=True).copy()

        # 해당 일의 데이터가 있는지 확인
        check_day_df = subset_df[subset_df['transdate'].dt.date == data_copy['transdate'].date()][col_names].copy()
        
        if len(check_day_df) == 0 : # 해당 일의 데이터 없음
            
            # 4주 전까지의 데이터 확인
            date_4W_list = [data_copy['transdate'] - datetime.timedelta(weeks= i) for i in range(1,5)]
            if len(date_4W_list) == 0 : # 4주 전까지의 데이터 없음
                tmp_date = data_copy['transdate'].date()
                
                # n주 전 데이터 찾기
                while tmp_date < subset_df['transdate'].min() :
                    result_data = subset_df[subset_df['transdate'] == tmp_date][col_names].copy()
                    if len(result_data) == 0 :
                        tmp_date -= datetime.timedelta(weeks= 1)
                    else :
                        break
                        
                # n주 전 데이터 찾았는지 확인
                if len(result_data) == 0: # False
                    result_data = pd.DataFrame([[data_copy['mybi_stop_id'], data_copy['transdate'], 0, 0, 0, 0]], columns= subset_df.columns)
                    
            else : # 4주 전까지의 데이터 있음
                result_data = subset_df[subset_df['transdate'].isin(date_4W_list)].groupby('mybi_stop_id').mean() \
                            .apply(lambda x: math.ceil(x), axis=0).T[col_names].copy()
                
        else : # 해당 일의 데이터 있음 => 해당 시간대에는 0으로 처리
            result_data = pd.DataFrame([[data_copy['mybi_stop_id'], data_copy['transdate'], 0, 0, 0, 0]], columns= subset_df.columns)
            
    else : # 데이터 있음
        result_data = pd.DataFrame([data_copy])
    return result_data.reset_index(drop=True).loc[0]

In [23]:
%%time
bb = parallelize_dataframe(df = aa, func = fill_blank_data, group_keys = 'mybi_stop_id', num_cores = 40)

CPU times: user 4.89 s, sys: 5.93 s, total: 10.8 s
Wall time: 25min 56s


In [24]:
bb.shape

(620940, 6)

In [25]:
bb.head(10)

Unnamed: 0,mybi_stop_id,transdate,normalcnt,studentcnt,childcnt,totalcnt
0,3100020,2020-04-08 00:00:00,0.0,0.0,0.0,0.0
1,3100020,2020-04-08 05:00:00,0.0,0.0,0.0,0.0
2,3100020,2020-04-08 06:00:00,2.0,0.0,0.0,2.0
3,3100020,2020-04-08 07:00:00,0.0,0.0,0.0,0.0
4,3100020,2020-04-08 08:00:00,7.0,0.0,0.0,7.0
5,3100020,2020-04-08 09:00:00,0.0,0.0,0.0,0.0
6,3100020,2020-04-08 10:00:00,2.0,0.0,0.0,2.0
7,3100020,2020-04-08 11:00:00,2.0,0.0,0.0,2.0
8,3100020,2020-04-08 12:00:00,1.0,0.0,0.0,1.0
9,3100020,2020-04-08 13:00:00,2.0,1.0,0.0,3.0


In [26]:
bb.isnull().sum()

mybi_stop_id    0
transdate       0
normalcnt       0
studentcnt      0
childcnt        0
totalcnt        0
dtype: int64

In [27]:
def period_before_cnt(data, group_data, i):
    col_names = ['normalcnt','studentcnt','childcnt','totalcnt']
    col_all = ['mybi_stop_id', 'transdate'] + col_names
    data_copy = data.copy()
    subset_df = group_data[col_all].copy()
    
    before_day = subset_df[(subset_df['transdate'] == data_copy['transdate'] - datetime.timedelta(days=i))][col_names].copy()
    before_week = subset_df[(subset_df['transdate'] == data_copy['transdate'] - datetime.timedelta(weeks=i))][col_names].copy()
    
    if len(before_day) == 0:
        before_day = pd.DataFrame([data_copy[col_names]])
    if len(before_week) == 0:
        before_week = pd.DataFrame([data_copy[col_names]])
    
    for name in col_names:
        data_copy[name + '_D-' + str(i)] = before_day.reset_index(drop=True).loc[0][name]
        data_copy[name + '_W-' + str(i)] = before_week.reset_index(drop=True).loc[0][name]
    return data_copy

In [28]:
%%time
cc = parallelize_dataframe(df = bb, func = period_before_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 1)

CPU times: user 2.79 s, sys: 2.63 s, total: 5.42 s
Wall time: 38min 47s


In [29]:
%%time
dd = parallelize_dataframe(df = cc, func = period_before_cnt, group_keys = 'mybi_stop_id', num_cores = 20, i= 2)

CPU times: user 6.42 s, sys: 7.54 s, total: 14 s
Wall time: 1h 3min 31s


In [30]:
%%time
ee = parallelize_dataframe(df = dd, func = period_before_cnt, group_keys = 'mybi_stop_id', num_cores = 40, i= 3)

CPU times: user 14.5 s, sys: 20.3 s, total: 34.8 s
Wall time: 1h 34min 18s


In [31]:
%%time
ff = parallelize_dataframe(df = ee, func = period_before_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 4)

CPU times: user 3.12 s, sys: 3.34 s, total: 6.46 s
Wall time: 39min 17s


In [32]:
ff.shape

(620940, 38)

In [33]:
col = ['mybi_stop_id', 'transdate', 'normalcnt', 'studentcnt', 'childcnt', 'totalcnt'
       , 'normalcnt_D-1', 'studentcnt_D-1', 'childcnt_D-1', 'totalcnt_D-1'
       , 'normalcnt_D-2', 'studentcnt_D-2', 'childcnt_D-2', 'totalcnt_D-2'
       , 'normalcnt_D-3', 'studentcnt_D-3', 'childcnt_D-3', 'totalcnt_D-3'
       , 'normalcnt_D-4', 'studentcnt_D-4', 'childcnt_D-4', 'totalcnt_D-4'
       , 'normalcnt_W-1', 'studentcnt_W-1', 'childcnt_W-1','totalcnt_W-1'
       , 'normalcnt_W-2', 'studentcnt_W-2', 'childcnt_W-2','totalcnt_W-2'
       , 'normalcnt_W-3', 'studentcnt_W-3', 'childcnt_W-3','totalcnt_W-3'
       , 'normalcnt_W-4', 'studentcnt_W-4', 'childcnt_W-4','totalcnt_W-4']

In [34]:
ff[col].to_parquet("model_mr/mybicard_401_agg_imputation.parquet")

In [35]:
mybicard_401_agg_imputation = ff.copy()

In [36]:
mybicard_401_agg_imputation['date'] = mybicard_401_agg_imputation['transdate'].dt.date
mybicard_401_agg_imputation['hour'] = mybicard_401_agg_imputation['transdate'].dt.hour
mybicard_401_agg_imputation['dayofweek'] = mybicard_401_agg_imputation['transdate'].dt.dayofweek

1) 이전 n개일자들의 동일 시간대 평균
2) n주전까지의 동일 요일의 동일 시간대 평균

3) 이전 n개일자들의 전체 평균
4) n주전까지의 동일 요일의 전체 평균
5) n주전까지의 전체 평균

In [37]:
def moving_average_hour_cnt(data, group_data, i):
    col_names = ['normalcnt','studentcnt','childcnt','totalcnt']
    col_all = ['mybi_stop_id', 'transdate'] + col_names
    
    data_copy = data.copy()
    subset_df = group_data[group_data['hour'] == data_copy['hour']][col_all].copy()
    
    date_day_list = [data_copy['transdate'] - datetime.timedelta(days= j) for j in range(1,i+1)]
    date_week_list = [data_copy['transdate'] - datetime.timedelta(weeks= j) for j in range(1,i+1)]
    
    subset_day_df = subset_df[subset_df['transdate'].isin(date_day_list)][['mybi_stop_id'] + col_names].copy()
    subset_week_df = subset_df[subset_df['transdate'].isin(date_week_list)][['mybi_stop_id'] + col_names].copy()
    
    if len(subset_day_df) == 0:
        moving_average_day = pd.DataFrame([data_copy[['mybi_stop_id'] + col_names]])
        
    if len(subset_week_df) == 0:
        moving_average_week = pd.DataFrame([data_copy[['mybi_stop_id'] + col_names]])
        
    moving_average_day = subset_day_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()
    moving_average_week = subset_week_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()

    for name in col_names:
        data_copy[name + '_MA_hour_D-' + str(i)] = moving_average_day[name]
        data_copy[name + '_MA_hour_W-' + str(i)] = moving_average_week[name]
    
    return data_copy

In [38]:
%%time
aaa = parallelize_dataframe(df = mybicard_401_agg_imputation, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 2)

CPU times: user 37.7 s, sys: 6.42 s, total: 44.1 s
Wall time: 48min 22s


In [39]:
%%time
bbb = parallelize_dataframe(df = aaa, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 3)

CPU times: user 1min 45s, sys: 8.93 s, total: 1min 54s
Wall time: 49min 30s


In [40]:
%%time
ccc = parallelize_dataframe(df = bbb, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 4)

CPU times: user 2min 59s, sys: 14.8 s, total: 3min 13s
Wall time: 50min 32s


In [41]:
%%time
ddd = parallelize_dataframe(df = ccc, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 5)

CPU times: user 4min 7s, sys: 19.7 s, total: 4min 27s
Wall time: 51min 18s


In [42]:
%%time
eee = parallelize_dataframe(df = ddd, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 6)

CPU times: user 5min 26s, sys: 23.6 s, total: 5min 50s
Wall time: 52min 46s


In [54]:
[eee['date'][0] - datetime.timedelta(days= j) for j in range(1,3)]

[datetime.date(2020, 4, 7), datetime.date(2020, 4, 6)]

In [59]:
def moving_average_entire_cnt(data, group_data, i):
    col_names = ['normalcnt','studentcnt','childcnt','totalcnt']
    col_all = ['mybi_stop_id', 'date'] + col_names
    
    data_copy = data.copy()
    subset_df = group_data[col_all].groupby(['mybi_stop_id', 'date']).sum().copy()
    
    date_day_list = [data_copy['date'] - datetime.timedelta(days= j) for j in range(1,i+1)]
    date_week_list = [data_copy['date'] - datetime.timedelta(weeks= j) for j in range(1,i+1)]
    date_entire_list = [data_copy['date'] - datetime.timedelta(days= j) for j in range(1,i*7+1)]
                                            
    subset_day_df = subset_df[subset_df['date'].isin(date_day_list)][['mybi_stop_id'] + col_names].copy()
    subset_week_df = subset_df[subset_df['date'].isin(date_week_list)][['mybi_stop_id'] + col_names].copy()
    subset_entire_df = subset_df[subset_df['date'].isin(date_entire_list)][['mybi_stop_id'] + col_names].copy()
    
    if len(subset_day_df) == 0:
        subset_day_df = pd.DataFrame([data_copy[col_names]])
    if len(subset_week_df) == 0:
        subset_week_df = pd.DataFrame([data_copy[col_names]])
    if len(subset_entire_df) == 0:
        subset_entire_df = pd.DataFrame([data_copy[col_names]])
    
    moving_average_day = subset_day_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()
    moving_average_week = subset_week_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()
    moving_average_entire = subset_entire_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()

    for name in col_names:
        data_copy[name + '_MA_all_D-' + str(i)] = moving_average_day[name]
        data_copy[name + '_MA_all_W-' + str(i)] = moving_average_week[name]
        data_copy[name + '_MA_all_A-' + str(i)] = moving_average_entire[name]
    
    return data_copy

In [7]:
%%time
aaa = pd.read_parquet('model_mr/mybicard_401_agg_imputation.parquet', engine='pyarrow')

CPU times: user 712 ms, sys: 1.14 s, total: 1.85 s
Wall time: 907 ms


In [16]:
aa = aaa[:560][['mybi_stop_id', 'transdate','totalcnt']].copy()

In [48]:
aa.head(3)

Unnamed: 0,mybi_stop_id,transdate,totalcnt
0,3100020,2020-04-08 00:00:00,0.0
1,3100020,2020-04-08 05:00:00,0.0
2,3100020,2020-04-08 06:00:00,2.0


In [None]:
def moving_average_entire_cnt(data, group_data, i):
    col_names = ['normalcnt','studentcnt','childcnt','totalcnt']
    col_all = ['mybi_stop_id', 'date'] + col_names
    
    data_copy = data.copy()
    subset_df = group_data[col_all].groupby(['mybi_stop_id', 'date']).sum().copy()
    
    date_day_list = [data_copy['date'] - datetime.timedelta(days= j) for j in range(1,i+1)]
    date_week_list = [data_copy['date'] - datetime.timedelta(weeks= j) for j in range(1,i+1)]
    date_entire_list = [data_copy['date'] - datetime.timedelta(days= j) for j in range(1,i*7+1)]
                                            
    subset_day_df = subset_df[subset_df['date'].isin(date_day_list)][['mybi_stop_id'] + col_names].copy()
    subset_week_df = subset_df[subset_df['date'].isin(date_week_list)][['mybi_stop_id'] + col_names].copy()
    subset_entire_df = subset_df[subset_df['date'].isin(date_entire_list)][['mybi_stop_id'] + col_names].copy()
    
    if len(subset_day_df) == 0:
        subset_day_df = pd.DataFrame([data_copy[col_names]])
    if len(subset_week_df) == 0:
        subset_week_df = pd.DataFrame([data_copy[col_names]])
    if len(subset_entire_df) == 0:
        subset_entire_df = pd.DataFrame([data_copy[col_names]])
    
    moving_average_day = subset_day_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()
    moving_average_week = subset_week_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()
    moving_average_entire = subset_entire_df.groupby('mybi_stop_id').mean().apply(lambda x: math.ceil(x), axis=0).copy()

    for name in col_names:
        data_copy[name + '_MA_all_D-' + str(i)] = moving_average_day[name]
        data_copy[name + '_MA_all_W-' + str(i)] = moving_average_week[name]
        data_copy[name + '_MA_all_A-' + str(i)] = moving_average_entire[name]
    
    return data_copy

In [116]:
aa[['transdate', 'totalcnt']].groupby(aa['transdate'].dt.date).sum()

Unnamed: 0_level_0,totalcnt
transdate,Unnamed: 1_level_1
2020-04-08,50.0
2020-04-09,27.0
2020-04-10,42.0
2020-04-11,25.0
2020-04-12,23.0
2020-04-13,41.0
2020-04-14,57.0
2020-04-15,0.0
2020-04-16,0.0
2020-04-17,0.0


In [74]:
def moving_average_entire(group_df, cols, i):
    group_df_copy = group_df.sort_values(['transdate']).reset_index(drop=True).copy()
    group_sum_df_copy = group_df_copy[['transdate'] + cols].groupby(group_df_copy['transdate'].dt.date).sum().copy()
    # group_sum_df_copy['dayofweek'] = group_sum_df_copy['transdate'].dt.dayofweek

    for col in cols:
        tmp_df = pd.DataFrame()
        col_name = col+'_MA_all_A'+str(i)

        for j in range(1, (i*7)+1):
            tmp_df[col+'_'+str(j)] = group_df[col].shift(j)

        group_df_copy[col_name] = tmp_df.mean(axis=1)
        group_df_copy[col_name] = np.where(pd.notnull(group_df_copy[col_name]), group_df_copy[col_name], group_df_copy[col])
        group_df_copy[col_name] = group_df_copy[col_name].apply(np.ceil)
    
    for col in cols:
        tmp_df = pd.DataFrame()
        col_name = col+'_MA_all_D'+str(i)

        for j in range(1, i+1):
            tmp_df[col+'_'+str(j)] = group_df[col].shift(j)

        group_df_copy[col_name] = tmp_df.mean(axis=1)
        group_df_copy[col_name] = np.where(pd.notnull(group_df_copy[col_name]), group_df_copy[col_name], group_df_copy[col])
        group_df_copy[col_name] = group_df_copy[col_name].apply(np.ceil)
    
    for col in cols:
        tmp_df = pd.DataFrame()
        col_name = col+'_MA_all_W'+str(i)

        for j in range(1, i+1):
            tmp_df[col+'_'+str(j)] = group_df[col].shift(j*7)

        group_df_copy[col_name] = tmp_df.mean(axis=1)
        group_df_copy[col_name] = np.where(pd.notnull(group_df_copy[col_name]), group_df_copy[col_name], group_df_copy[col])
        group_df_copy[col_name] = group_df_copy[col_name].apply(np.ceil)

    return group_df_copy

In [121]:
aa.mybi_stop_id.unique()

array([3100020])

In [125]:
a1 = aa[['transdate'] +  ['totalcnt']].groupby(aa['transdate'].dt.date).sum().copy()

In [126]:
tmp_df = pd.DataFrame()

In [127]:
for j in range(1, (2*7)+1):
    tmp_df['totalcnt'+'_'+str(j)] = a1['totalcnt'].shift(j)

In [154]:
bb = tmp_df[[tmp_df.columns[6], tmp_df.columns[13]]][14:16]

In [162]:
pd.merge(aa[280:320], bb, left_on = aa[280:320]['transdate'].dt.date, right_index = True, how = 'left')

Unnamed: 0,mybi_stop_id,transdate,totalcnt,totalcnt_7,totalcnt_14
280,3100020,2020-04-22 00:00:00,0.0,0.0,50.0
281,3100020,2020-04-22 05:00:00,0.0,0.0,50.0
282,3100020,2020-04-22 06:00:00,0.0,0.0,50.0
283,3100020,2020-04-22 07:00:00,0.0,0.0,50.0
284,3100020,2020-04-22 08:00:00,0.0,0.0,50.0
285,3100020,2020-04-22 09:00:00,0.0,0.0,50.0
286,3100020,2020-04-22 10:00:00,0.0,0.0,50.0
287,3100020,2020-04-22 11:00:00,0.0,0.0,50.0
288,3100020,2020-04-22 12:00:00,0.0,0.0,50.0
289,3100020,2020-04-22 13:00:00,0.0,0.0,50.0


In [160]:
aa[280:320]

Unnamed: 0,mybi_stop_id,transdate,totalcnt
280,3100020,2020-04-22 00:00:00,0.0
281,3100020,2020-04-22 05:00:00,0.0
282,3100020,2020-04-22 06:00:00,0.0
283,3100020,2020-04-22 07:00:00,0.0
284,3100020,2020-04-22 08:00:00,0.0
285,3100020,2020-04-22 09:00:00,0.0
286,3100020,2020-04-22 10:00:00,0.0
287,3100020,2020-04-22 11:00:00,0.0
288,3100020,2020-04-22 12:00:00,0.0
289,3100020,2020-04-22 13:00:00,0.0


In [155]:
bb

Unnamed: 0_level_0,totalcnt_7,totalcnt_14
transdate,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-22,0.0,50.0
2020-04-23,0.0,27.0


In [145]:
[tmp_df.columns[6], tmp_df.columns[13]]

['totalcnt_7', 'totalcnt_14']

['totalcnt_7', 'totalcnt_14']

[7, 14]

In [138]:
tmp_df.columns[7, 14]

IndexError: too many indices for array

In [137]:
tmp_df[tmp_df.columns[[7*n for n in range(1,3)]]]

IndexError: index 14 is out of bounds for axis 0 with size 14

In [None]:
for j in range(1, (i*7)+1):
            tmp_df[col+'_'+str(j)] = group_sum_df_copy[col].shift(j)

In [None]:
def moving_average_entire(group_df, cols, i):
    group_df_copy = group_df.sort_values(['transdate']).reset_index(drop=True).copy()
    group_sum_df_copy = group_df_copy[['transdate'] + cols].groupby(group_df_copy['transdate'].dt.date).sum().copy()
    # group_sum_df_copy['dayofweek'] = group_sum_df_copy['transdate'].dt.dayofweek

    for col in cols:
        tmp_df = pd.DataFrame()
        col_name_all = col+'_MA_all_A'+str(i)
        col_name_day = col+'_MA_all_D'+str(i)
        col_name_week = col+'_MA_all_W'+str(i)

        for j in range(1, (i*7)+1):
            tmp_df[col+'_'+str(j)] = group_sum_df_copy[col].shift(j)

        group_sum_df_copy[col_name_all] = tmp_df.mean(axis=1)
        group_sum_df_copy[col_name_day] = tmp_df[tmp_df.columns[:i]].mean(axis=1)
        group_sum_df_copy[col_name_week] = tmp_df[[tmp_df.columns[7*n-1] for n in range(1,i)]].mean(axis=1)
        
        group_sum_df_copy[col_name_all] = np.where(pd.notnull(group_sum_df_copy[col_name_all]), group_sum_df_copy[col_name_all], group_sum_df_copy[col])
        group_sum_df_copy[col_name_day] = np.where(pd.notnull(group_sum_df_copy[col_name_day]), group_sum_df_copy[col_name_day], group_sum_df_copy[col])
        group_sum_df_copy[col_name_week] = np.where(pd.notnull(group_sum_df_copy[col_name_week]), group_sum_df_copy[col_name_week], group_sum_df_copy[col])
        
        group_sum_df_copy[col_name_all] = group_sum_df_copy[col_name_all].apply(np.ceil)
        group_sum_df_copy[col_name_day] = group_sum_df_copy[col_name_day].apply(np.ceil)
        group_sum_df_copy[col_name_week] = group_sum_df_copy[col_name_week].apply(np.ceil)
    
    group_df_copy = pd.merge(group_df_copy, group_sum_df_copy, left_on= group_df_copy.dt.date, right_index= True, how= 'left')
    return group_df_copy

In [109]:
def moving_average_hour(group_df, cols, i):
    group_df_copy = group_df.sort_values(['transdate']).reset_index(drop=True).copy()

    for col in cols:
        tmp_df = pd.DataFrame()
        col_name = col+'_MA_hour_D-'+str(i)

        for j in range(1, i+1):
            tmp_df[col+'_'+str(j)] = group_df[col].shift(j*20)

        group_df_copy[col_name] = tmp_df.mean(axis=1)
        group_df_copy[col_name] = np.where(pd.notnull(group_df_copy[col_name]), group_df_copy[col_name], group_df_copy[col])
        group_df_copy[col_name] = group_df_copy[col_name].apply(np.ceil)

    return group_df_copy

In [110]:
def groupby_process(df,
                    func, 
                    group_keys,
                    num_cores = None,
                    **params) :
    
    if num_cores is None:
        num_cores = cpu_count()
        
    gr_df = df.groupby(group_keys)
    df_list = [group for name, group in gr_df]
    
    def map_func(data):
        return func(group_df = data.copy(), **params)
        
    with Pool(10) as p:

        pd_result = pd.concat(p.map(map_func, df_list))
#         pd_result = p.starmap(func, **params)
        
    return pd_result

In [111]:
%%time
aaaa = groupby_process(df= aaa, func= moving_average_hour, group_keys= 'mybi_stop_id', num_cores= 10
                       , cols=['normalcnt', 'studentcnt', 'childcnt', 'totalcnt'], i= 2)

CPU times: user 1.88 s, sys: 1.58 s, total: 3.46 s
Wall time: 10.9 s


In [112]:
aaaa['dayofweek'] = aaaa['transdate'].dt.dayofweek

In [113]:
aaaa['hour'] = aaaa['transdate'].dt.hour

In [114]:
aaaa[(aaaa['hour']==18)][['transdate','normalcnt', 'studentcnt', 'childcnt', 'totalcnt'
                                                , 'normalcnt_MA_hour_D-2', 'studentcnt_MA_hour_D-2'
                                                , 'childcnt_MA_hour_D-2', 'totalcnt_MA_hour_D-2'
                                                ]]

Unnamed: 0,transdate,normalcnt,studentcnt,childcnt,totalcnt,normalcnt_MA_hour_D-2,studentcnt_MA_hour_D-2,childcnt_MA_hour_D-2,totalcnt_MA_hour_D-2
14,2020-04-08 18:00:00,2.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0
34,2020-04-09 18:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0
54,2020-04-10 18:00:00,2.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0
74,2020-04-11 18:00:00,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
94,2020-04-12 18:00:00,2.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0
114,2020-04-13 18:00:00,2.0,1.0,0.0,3.0,2.0,0.0,0.0,2.0
134,2020-04-14 18:00:00,7.0,0.0,0.0,7.0,2.0,1.0,0.0,3.0
154,2020-04-15 18:00:00,0.0,0.0,0.0,0.0,5.0,1.0,0.0,5.0
174,2020-04-16 18:00:00,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0
194,2020-04-17 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
aaaa[(aaaa['hour']==18)][['transdate','normalcnt', 'studentcnt', 'childcnt', 'totalcnt'
                                                , 'normalcnt_MA_hour_D-2', 'studentcnt_MA_hour_D-2'
                                                , 'childcnt_MA_hour_D-2', 'totalcnt_MA_hour_D-2'
                                                ]]

Unnamed: 0,transdate,normalcnt,studentcnt,childcnt,totalcnt,normalcnt_MA_hour_D-2,studentcnt_MA_hour_D-2,childcnt_MA_hour_D-2,totalcnt_MA_hour_D-2
14,2020-04-08 18:00:00,2.0,0.0,0.0,2.0,2.0,0.0,0.0,2.0
34,2020-04-09 18:00:00,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0
54,2020-04-10 18:00:00,2.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0
74,2020-04-11 18:00:00,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
94,2020-04-12 18:00:00,2.0,0.0,0.0,2.0,1.5,0.0,0.0,1.5
114,2020-04-13 18:00:00,2.0,1.0,0.0,3.0,1.5,0.0,0.0,1.5
134,2020-04-14 18:00:00,7.0,0.0,0.0,7.0,2.0,0.5,0.0,2.5
154,2020-04-15 18:00:00,0.0,0.0,0.0,0.0,4.5,0.5,0.0,5.0
174,2020-04-16 18:00:00,0.0,0.0,0.0,0.0,3.5,0.0,0.0,3.5
194,2020-04-17 18:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
bb['totalcnt_MA_hour_D-2'] = np.where(pd.notnull(bb['totalcnt_MA_hour_D-2']), bb['totalcnt_MA_hour_D-2'], bb['totalcnt'])


In [None]:
from multiprocessing_on_dill import Pool, cpu_count
from functools import partial
import pandas as pd
import numpy as np


def parallelize_dataframe(df, 
                          func, 
                          group_keys = None, 
                          num_cores = None,
                          **params):
    """
        Pandas DataFrame의 apply함수를 병렬처리 하는 함수

        Args: 
            df: 적용 대상 데이터프레임 (Pandas.DataFrame)
            group_keys: apply를 적용할 때 기준이되는 group by key

        Returns:
            df: (Pandas.DataFrame)
            
        Exception: 
    """

    if num_cores is None:
        num_cores = cpu_count()

    if group_keys is None:
        df_list = np.array_split(df, num_cores)
    elif group_keys is not None:
        gr_df = df.groupby(group_keys)
        df_list = [group for name, group in gr_df]
    
    #func = partial(func, **params)
    
    def map_func(data):
        return data.apply(func, axis = 1, group_data= data.copy() ,**params)
        
        
    with Pool(num_cores) as p:

        pd_result = pd.concat(p.map(map_func, df_list))
#         pd_result = p.starmap(func, **params)
        
    return pd_result

In [17]:
aa.tail()

Unnamed: 0,mybi_stop_id,transdate,totalcnt
555,3100020,2020-05-05 19:00:00,0.0
556,3100020,2020-05-05 20:00:00,0.0
557,3100020,2020-05-05 21:00:00,0.0
558,3100020,2020-05-05 22:00:00,0.0
559,3100020,2020-05-05 23:00:00,0.0


In [36]:
tmp = pd.DataFrame()

In [37]:
tmp['totalcnt_1'] = aa['totalcnt'].shift(1*20)
tmp['totalcnt_2'] = aa['totalcnt'].shift(2*20)

In [38]:
aa.shape

(560, 3)

In [39]:
tmp.shape

(560, 2)

In [40]:
tmp

Unnamed: 0,totalcnt_1,totalcnt_2
0,,
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,


SyntaxError: invalid syntax (<ipython-input-8-f4084c14c139>, line 2)

In [None]:
%%time
aaaa = parallelize_dataframe(df = eee, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 2)

In [None]:
%%time
bbbb = parallelize_dataframe(df = aaaa, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 3)

In [None]:
%%time
cccc = parallelize_dataframe(df = bbbb, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 4)

In [None]:
%%time
dddd = parallelize_dataframe(df = cccc, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 5)

In [None]:
%%time
eeee = parallelize_dataframe(df = dddd, func = moving_average_hour_cnt, group_keys = 'mybi_stop_id', num_cores = 10, i= 6)