In [None]:
from multiprocessing_on_dill import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import pandas as pd
import numpy as np


def parallelize_dataframe(df, 
                          func, 
                          group_keys = None, 
                          num_cores = None,
                          **params):
    """
        Run a row-wise ``DataFrame.apply`` in parallel worker processes.

        The frame is cut into chunks, every chunk is handed to a process pool,
        and each worker evaluates
        ``chunk.apply(func, axis=1, group_data=<copy of the chunk>, **params)``.
        ``func`` must therefore accept a ``group_data`` keyword argument in
        addition to the row it receives as first positional argument.

        Args:
            df: DataFrame whose rows are processed (pandas.DataFrame).
            func: callable invoked as ``func(row, group_data=chunk, **params)``.
            group_keys: column name(s) passed to ``df.groupby``; every group
                becomes one chunk. When None the rows are split positionally
                into at most ``num_cores`` contiguous chunks instead.
            num_cores: number of worker processes; defaults to ``cpu_count()``.
            **params: extra keyword arguments forwarded unchanged to ``func``.

        Returns:
            The per-chunk results concatenated with ``pd.concat`` in chunk
            order (groupby iteration order when ``group_keys`` is given).
            An empty copy of ``df`` is returned when there is nothing to
            process.

        Raises:
            ValueError: if ``num_cores`` is smaller than 1.
    """

    # Nothing to do: avoid spawning a pool and the ValueError that
    # pd.concat raises for an empty list of results.
    if len(df) == 0:
        return df.copy()

    if num_cores is None:
        num_cores = cpu_count()
    if num_cores < 1:
        raise ValueError("num_cores must be at least 1, got {!r}".format(num_cores))

    if group_keys is None:
        # Split row positions rather than the frame itself: same chunk sizes
        # as np.array_split(df, n), but without relying on the deprecated
        # DataFrame code path, and never producing empty chunks.
        n_chunks = min(num_cores, len(df))
        df_list = [df.iloc[positions]
                   for positions in np.array_split(np.arange(len(df)), n_chunks)]
    else:
        df_list = [group for _, group in df.groupby(group_keys)]

    # groupby skips rows whose key is NaN by default, so a non-empty frame
    # can still yield zero groups.
    if not df_list:
        return df.iloc[0:0].copy()

    def map_func(data):
        # Each row sees a private copy of its whole chunk via ``group_data``.
        return data.apply(func, axis = 1, group_data = data.copy(), **params)

    # More workers than chunks would only sit idle.
    with Pool(min(num_cores, len(df_list))) as p:
        pd_result = pd.concat(p.map(map_func, df_list))

    return pd_result

In [None]:
def _find_previous_counts(history, row, step, col_names):
    """Locate the closest earlier observation that lies a whole number of ``step``s before ``row``.

    Args:
        history: DataFrame with a 'transdate' column and the ``col_names`` columns.
        row: the current record (Series) with 'transdate' and ``col_names`` entries.
        step: ``datetime.timedelta`` between candidate dates (one day, one week, ...).
        col_names: count columns to pull from the matching record.

    Returns:
        ``(counts, check)`` where ``counts`` is a Series indexed by ``col_names``
        and ``check`` is 0 when the record exactly one step back exists, ``n``
        (>= 2) when the first hit was ``n`` steps back, and -1 when no earlier
        record was found. In the -1 case ``counts`` comes from the first
        ``history`` record sharing the row's own date, or from ``row`` itself
        if there is none.
    """
    row_date = row['transdate']
    has_history = len(history) > 0

    # A missing date or an empty history can never match; searching would loop forever.
    if has_history and not pd.isna(row_date):
        earliest = history['transdate'].min()
        steps_back = 1
        # Keep stepping back while the candidate date is still inside the history.
        # The bound is inclusive so the earliest record itself can be matched.
        while not pd.isna(earliest) and earliest <= row_date - steps_back * step:
            candidate = row_date - steps_back * step
            matches = history.loc[history['transdate'] == candidate, col_names]
            if len(matches):
                return matches.iloc[0], (0 if steps_back == 1 else steps_back)
            steps_back += 1

    # No earlier observation: fall back to the record of the row's own date.
    if has_history:
        same_date = history.loc[history['transdate'] == row_date, col_names]
        if len(same_date):
            return same_date.iloc[0], -1
    return row[col_names], -1


def period_before_cnt(data, group_data):
    """Attach the previous-day and previous-week passenger counts to one record.

    For each of 'totalcnt', 'normalcnt', 'studentcnt' and 'childcnt' two new
    entries are added: ``<name>D-1`` (value from the closest earlier record a
    whole number of days back) and ``<name>W-1`` (same, in whole weeks).
    ``checkD-1`` / ``checkW-1`` describe how the value was found: 0 means
    exactly one day/week back, ``n`` >= 2 means ``n`` days/weeks back, and -1
    means nothing earlier existed and the values of the row's own date are used.

    Args:
        data: one record (Series) containing 'transdate' and the count columns.
        group_data: DataFrame of the records to search, with the same columns.

    Returns:
        A copy of ``data`` extended with the ``D-1`` / ``W-1`` entries.
    """
    # Imported here so the function does not depend on notebook-level state
    # when it is shipped to a worker process.
    import datetime

    col_names = ['totalcnt','normalcnt','studentcnt','childcnt']

    prev_day, day_check = _find_previous_counts(
        group_data, data, datetime.timedelta(days=1), col_names)
    prev_week, week_check = _find_previous_counts(
        group_data, data, datetime.timedelta(weeks=1), col_names)

    data_copy = data.copy()
    for name in col_names:
        data_copy[name + 'D-1'] = prev_day[name]
        data_copy[name + 'W-1'] = prev_week[name]
    data_copy['checkD-1'] = day_check
    data_copy['checkW-1'] = week_check
    return data_copy

In [None]:
%%time
aa = parallelize_dataframe(df = bus_demand_401_10m[:1000][['stop_id', 'transdate', 'totalcnt', 'normalcnt', 'studentcnt', 'childcnt']]
                                          , func = period_before_cnt, group_keys = 'stop_id', num_cores = 10, entire_df = bus_demand_401_10m[:1000][['stop_id', 'transdate', 'totalcnt', 'normalcnt', 'studentcnt', 'childcnt']])