window 돌면서 feature engineering

train시에는 첫달 버리고 진행

In [1]:
import pandas as pd
import numpy as np
import gc
import joblib
import datetime
import itertools
import os.path as path
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import json

In [2]:
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
%matplotlib inline

font_dirs = ['/home/workspace/user-workspace/font']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
    
plt.rcParams['font.family'] = 'NanumGothic'

In [3]:
train_path = '/home/workspace/user-workspace/slim_train.parquet'
test_path ='/home/workspace/user-workspace/slim_test.parquet'
encoder = '/home/workspace/user-workspace/cat_encoder.json'
decoder = '/home/workspace/user-workspace/inverse_cat_encoder.json'
data_dir = '/home/workspace/user-workspace/junheon/data/task150/'

In [4]:
negative_ratio = 1
bagging_size = 5

In [5]:
train_df = pd.read_parquet(train_path).reset_index().rename(columns={"index": "id"})

In [6]:
test_df = pd.read_parquet(test_path).reset_index().rename(columns={"index": "id"})

In [7]:
with open("/home/workspace/user-workspace/cat_encoder.json") as json_file:
    decoder = json.load(json_file)

In [8]:
train_df.shape

(7866548, 42)

# Create Indices 
# (7월, ACUM_RCPT_AMT<1 빼고)

In [9]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 04:56:56.126504


In [10]:
df_train = train_df[['id', 'REQ_DD', "ACUM_RCPT_AMT", 'target']]

In [11]:
df_train['datetime'] = pd.to_datetime(df_train['REQ_DD'], format='%Y%m%d')
df_train['month'] = df_train['datetime'].dt.month.astype("uint8")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['datetime'] = pd.to_datetime(df_train['REQ_DD'], format='%Y%m%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['month'] = df_train['datetime'].dt.month.astype("uint8")


In [12]:
positive = df_train[(df_train['target']==1)&(df_train['month']!=7)&(df_train['ACUM_RCPT_AMT']>=1)]
negative = df_train[(df_train['target']==0)&(df_train['month']!=7)&(df_train['ACUM_RCPT_AMT']>=1)]

In [13]:
print(len(positive))
print(len(negative))

58665
5653077


In [14]:
for seed in range(bagging_size):
    negative_sample = negative.sample(n=(len(positive)*negative_ratio), random_state=seed)
    id_list = positive.id.tolist() + negative_sample.id.tolist()
    joblib.dump(id_list, f"{data_dir}indices_{seed}.pkl")

# Basic features

In [7]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 10:11:09.701543


In [11]:
FILE_NAME = "basic_feature"

In [7]:
basic_features = {
    "id": "uint32",
    "AC_PAY_AMT": "uint32",
    "AGE": "uint8",
    "SMS_RE_SND_CNT": "uint8",
    "ACUM_RCPT_AMT": "int32",
    "MAX_NPAY_CNT_24M": "uint8",
    "TRD_CNT_6M": "uint8",
    "REAL_TRD_CNT_6M": "uint8",
    "NPAY_CNT_24M": "uint8",
    "NPAY_CNT_12MNTS": "uint8",
    "MM_LMT_AMT": "float32",
    "REMD_LMT_AMT": "float32",
    "NPAY_AMT_24M": "float32",
    "NIGHT_TRD_RT_6M": "float32",
    "AVG_AMT_6M": "float32",
    "MAX_LMT_3M_RT": "float32",
    "NPAY_AMT_60M": "float32",
    "SUB_IP_A": "uint16",
    "SUB_IP_B": "uint16",
    "SUB_IP_C": "uint16",
    "SUB_IP_D": "uint16",
    "CP_CD": "uint32"
}

In [8]:
category_features = [
    "COMMC_CLF", "NPAY_YN", "PAY_MTHD_CD", "ARS_AUTHTI_YN", "GNDR", "FOREI_YN",  "AUTHTI_CLF_FLG", 
    "SVC_CLF_NM", "CP_M_CLF_NM", "CP_S_CLF_NM" 
]

In [9]:
df_train = train_df[list(basic_features.keys()) + category_features + ["target"]].astype(basic_features)
df_test = test_df[list(basic_features.keys()) + category_features].astype(basic_features)

df_train['NPAY_YN'] = [-99 if x==2 else x for x in df_train['NPAY_YN']]
df_test['NPAY_YN'] = [-99 if x==2 else x for x in df_test['NPAY_YN']]

df_train['PAY_MTHD_CD'] = [-99 if x==0 else x for x in df_train['PAY_MTHD_CD']]
df_test['PAY_MTHD_CD'] = [-99 if x==0 else x for x in df_test['PAY_MTHD_CD']]

df_train['MM_LMT_AMT'] = [1000000 if np.isnan(x) else x for x in df_train['MM_LMT_AMT']]
df_test['MM_LMT_AMT'] = [1000000 if np.isnan(x) else x for x in df_test['MM_LMT_AMT']]

df_train['ARS_AUTHTI_YN'] = [-99 if x==2 else x for x in df_train['ARS_AUTHTI_YN']]
df_test['ARS_AUTHTI_YN'] = [-99 if x==2 else x for x in df_test['ARS_AUTHTI_YN']]

df_train['CP_M_CLF_NM'] = [-99 if x==5 else x for x in df_train['CP_M_CLF_NM']]
df_test['CP_M_CLF_NM'] = [-99 if x==5 else x for x in df_test['CP_M_CLF_NM']]

df_train['CP_S_CLF_NM'] = [-99 if x==34 else x for x in df_train['CP_S_CLF_NM']]
df_test['CP_S_CLF_NM'] = [-99 if x==34 else x for x in df_test['CP_S_CLF_NM']]

In [12]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [13]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

# day, weekday

In [23]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 04:58:28.225266


In [24]:
FILE_NAME = "day_weekday"

In [25]:
df_train = train_df[['id', 'REQ_DD']].astype({"REQ_DD": "str"})
df_test = test_df[['id', 'REQ_DD']].astype({"REQ_DD": "str"})

In [26]:
df_train['datetime'] = pd.to_datetime(df_train['REQ_DD'], format='%Y%m%d')
df_test['datetime'] = pd.to_datetime(df_test['REQ_DD'], format='%Y%m%d')

In [27]:
df_train['day'] = df_train['datetime'].dt.day.astype("uint8")
df_train['weekday'] = df_train['datetime'].dt.weekday.astype('uint8')
df_train['month'] = df_train['datetime'].dt.month.astype("uint8")
df_test['day'] = df_test['datetime'].dt.day.astype("uint8")
df_test['weekday'] = df_test['datetime'].dt.weekday.astype('uint8')
df_test['month'] = df_test['datetime'].dt.month.astype("uint8")

In [28]:
df_train = df_train[['id', 'day', 'weekday', 'month']]
df_test = df_test[['id', 'day', 'weekday', 'month']]

In [29]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [30]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [31]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 04:58:45.801035


# SUB IP

In [32]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 04:58:45.806186


In [33]:
FILE_NAME = "sub_ip"

In [34]:
ip_features = {
    "id": "uint32",
    "SUB_IP_A": "uint16",
    "SUB_IP_B": "uint16",
    "SUB_IP_C": "uint16",
    "SUB_IP_D": "uint16"
}

In [35]:
df_train = train_df[list(ip_features.keys())].astype(ip_features)
df_test = test_df[list(ip_features.keys())].astype(ip_features)

In [36]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [37]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [38]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 04:58:47.701355


# 거래금액 (AC_PAY_AMT) 49000원, 11000원

49000원: 롤 rp 충전 최고 금액

11000원: 아프리카 별풍선 100

In [39]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 04:58:47.706274


In [40]:
FILE_NAME = "pay_amt"

In [41]:
df_train = train_df[['id', 'AC_PAY_AMT']]
df_test = test_df[['id', 'AC_PAY_AMT']]

In [42]:
df_train["is_49900_PAY_AMT"] = (df_train['AC_PAY_AMT']==49900)
df_train["is_11000_PAY_AMT"] = (df_train['AC_PAY_AMT']==11000)
df_train["is_11000s_PAY_AMT"] = (df_train['AC_PAY_AMT']%11000==0)
df_test["is_49900_PAY_AMT"] = (df_test['AC_PAY_AMT']==49900)
df_test["is_11000_PAY_AMT"] = (df_test['AC_PAY_AMT']==11000)
df_test["is_11000s_PAY_AMT"] = (df_test['AC_PAY_AMT']%11000==0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_49900_PAY_AMT"] = (df_train['AC_PAY_AMT']==49900)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_11000_PAY_AMT"] = (df_train['AC_PAY_AMT']==11000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_11000s_PAY_AMT"] = (df_train['AC_PAY_AMT']%11000==0)
A value is tr

In [43]:
df_train = df_train[['id', 'is_49900_PAY_AMT', 'is_11000_PAY_AMT', 'is_11000s_PAY_AMT']]
df_test = df_test[['id', 'is_49900_PAY_AMT', 'is_11000_PAY_AMT', 'is_11000s_PAY_AMT']]

In [44]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [45]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [46]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 04:58:49.288392


# Store(CP_CD)

## count

In [47]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 04:58:53.144600


In [48]:
WINDOW_SIZE = [3, 10, 30]

In [49]:
FILE_NAME = "cp_count"
COLUMN = "CP_CD"

In [50]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD']], test_df[['id', COLUMN, 'REQ_DD']]], ignore_index=True)

In [51]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [52]:
for window_size in WINDOW_SIZE:
    print(f"window size: {window_size}D")
    count = df.groupby([COLUMN, 'REQ_DD']).count().reset_index().rename(columns={"id": "count"})
    count['datetime'] = pd.to_datetime(count['REQ_DD'], format='%Y%m%d')
    count = count.drop(columns=['REQ_DD'])
    cum_count = count.sort_values(by=['datetime']).groupby([COLUMN]).rolling(f"{window_size+1}D", min_periods=1, on='datetime')['count'].sum().reset_index().rename(columns={"count": f"cum_{window_size}_count"})
    cum_count['REQ_DD'] = cum_count['datetime'].dt.strftime('%Y%m%d')
    cum_count = cum_count.drop(columns=['datetime'])
    count['REQ_DD'] = count['datetime'].dt.strftime('%Y%m%d')
    count = count.drop(columns=['datetime'])
    count = count.merge(cum_count, on=[COLUMN, 'REQ_DD'], how='left')
    count[f'{FILE_NAME}_{window_size}'] = count[f'cum_{window_size}_count'] - count['count']
    count = count[[COLUMN, 'REQ_DD', f'{FILE_NAME}_{window_size}']].astype({'REQ_DD': 'int32'})
    df_train = df_train.merge(count, on=[COLUMN, 'REQ_DD'], how='left')
    df_test = df_test.merge(count, on=[COLUMN, 'REQ_DD'], how='left')

window size: 3D
window size: 10D
window size: 30D


In [53]:
df_train = df_train.drop(columns=['REQ_DD', COLUMN])
df_test = df_test.drop(columns=['REQ_DD', COLUMN])

In [54]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [55]:
df_test.fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [56]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 04:59:12.741398


## unique count: 'GODS_NM', 'PAYR_SEQ', 'PAYR_IP', 'MPHN_NO', 'COMMC_CLF'

In [58]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:09:01.159047


In [59]:
WINDOW_SIZE = [3, 10, 30]

In [60]:
FILE_NAME = "unique_count_wrt_cp"
COLUMN = "CP_CD"
TARGET_LIST = ["GODS_NM", "PAYR_SEQ", "PAYR_IP", "MPHN_NO", "COMMC_CLF"]

In [61]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [62]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].drop_duplicates(subset=[COLUMN, target, 'REQ_DD']).sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        count_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            count_df = count_df.groupby([COLUMN]).agg(['nunique']).reset_index()
            count_df.columns = [COLUMN, f'unique_{target}_wrt_{COLUMN}_{window}']
            count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            count_list.append(count_df)
            start_date = start_date + datetime.timedelta(days=1)
        count_df = pd.concat(count_list, axis=0)
        count_df.to_parquet(data_path)        

window: 3, target: GODS_NM
/home/workspace/user-workspace/junheon/data/task150/unique_GODS_NM_count_wrt_CP_CD_3 exists!
window: 3, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_SEQ_count_wrt_CP_CD_3 exists!
window: 3, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_IP_count_wrt_CP_CD_3 exists!
window: 3, target: MPHN_NO
/home/workspace/user-workspace/junheon/data/task150/unique_MPHN_NO_count_wrt_CP_CD_3 exists!
window: 3, target: COMMC_CLF
/home/workspace/user-workspace/junheon/data/task150/unique_COMMC_CLF_count_wrt_CP_CD_3 exists!
window: 10, target: GODS_NM
/home/workspace/user-workspace/junheon/data/task150/unique_GODS_NM_count_wrt_CP_CD_10 exists!
window: 10, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_SEQ_count_wrt_CP_CD_10 exists!
window: 10, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_IP_count_wrt_CP_CD_10 exists!
window: 10, target: MPHN_N

In [63]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [64]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"
        
        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: GODS_NM
window: 3, target: PAYR_SEQ
window: 3, target: PAYR_IP
window: 3, target: MPHN_NO
window: 3, target: COMMC_CLF
window: 10, target: GODS_NM
window: 10, target: PAYR_SEQ
window: 10, target: PAYR_IP
window: 10, target: MPHN_NO
window: 10, target: COMMC_CLF
window: 30, target: GODS_NM
window: 30, target: PAYR_SEQ
window: 30, target: PAYR_IP
window: 30, target: MPHN_NO
window: 30, target: COMMC_CLF


In [65]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [66]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [67]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 07:43:24.644363


## average: 'NPAY_AMT_24M', 'MAX_NPAY_CNT_24M', 'NIGHT_TRD_RT_6M' 

In [66]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:10:17.278208


In [67]:
WINDOW_SIZE = [3, 10, 30]

In [68]:
FILE_NAME = "average_wrt_cp"
COLUMN = "CP_CD"
TARGET_LIST = ["NPAY_AMT_24M", "MAX_NPAY_CNT_24M", "NIGHT_TRD_RT_6M"]

In [69]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [70]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}average_{target}_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        average_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            average_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            average_df = average_df.groupby([COLUMN]).mean().reset_index()
            average_df.columns = [COLUMN, f'average_{target}_wrt_{COLUMN}_{window}']
            average_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            average_list.append(average_df)
            start_date = start_date + datetime.timedelta(days=1)
        average_df = pd.concat(average_list, axis=0)
        average_df.to_parquet(data_path)        

window: 3, target: NPAY_AMT_24M
/home/workspace/user-workspace/junheon/data/task150/average_NPAY_AMT_24M_wrt_CP_CD_3 exists!
window: 3, target: MAX_NPAY_CNT_24M
/home/workspace/user-workspace/junheon/data/task150/average_MAX_NPAY_CNT_24M_wrt_CP_CD_3 exists!
window: 3, target: NIGHT_TRD_RT_6M
/home/workspace/user-workspace/junheon/data/task150/average_NIGHT_TRD_RT_6M_wrt_CP_CD_3 exists!
window: 10, target: NPAY_AMT_24M
/home/workspace/user-workspace/junheon/data/task150/average_NPAY_AMT_24M_wrt_CP_CD_10 exists!
window: 10, target: MAX_NPAY_CNT_24M
/home/workspace/user-workspace/junheon/data/task150/average_MAX_NPAY_CNT_24M_wrt_CP_CD_10 exists!
window: 10, target: NIGHT_TRD_RT_6M
/home/workspace/user-workspace/junheon/data/task150/average_NIGHT_TRD_RT_6M_wrt_CP_CD_10 exists!
window: 30, target: NPAY_AMT_24M
/home/workspace/user-workspace/junheon/data/task150/average_NPAY_AMT_24M_wrt_CP_CD_30 exists!
window: 30, target: MAX_NPAY_CNT_24M
/home/workspace/user-workspace/junheon/data/task150/

In [71]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [72]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}average_{target}_wrt_{COLUMN}_{window}"
        
        average_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(average_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(average_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: NPAY_AMT_24M
window: 3, target: MAX_NPAY_CNT_24M
window: 3, target: NIGHT_TRD_RT_6M
window: 10, target: NPAY_AMT_24M
window: 10, target: MAX_NPAY_CNT_24M
window: 10, target: NIGHT_TRD_RT_6M
window: 30, target: NPAY_AMT_24M
window: 30, target: MAX_NPAY_CNT_24M
window: 30, target: NIGHT_TRD_RT_6M


In [73]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [19]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [20]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 11:28:20.351843


## proportion: 'ARS_AUTHTI_YN', 'NPAY_YN', 'GNDR', 'FOREI_YN'

#### 각 feature 조건의 COUNT

In [167]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 08:02:12.314960


In [168]:
WINDOW_SIZE = [3, 10, 30]

In [169]:
FILE_NAME = "count_for_prop_wrt_cp"
COLUMN = "CP_CD"
TARGET_LIST = ["ARS_AUTHTI_YN", "NPAY_YN", "GNDR", "FOREI_YN"]

In [170]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [171]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}{target}_count_for_prop_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        count_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)&(temp_df[target]==1)][[COLUMN, target]]
            count_df = count_df.groupby([COLUMN]).count().reset_index()
            count_df.columns = [COLUMN, f'{target}_count_for_prop_wrt_{COLUMN}_{window}']
            count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            count_list.append(count_df)
            start_date = start_date + datetime.timedelta(days=1)
        count_df = pd.concat(count_list, axis=0)
        count_df.to_parquet(data_path)        

window: 3, target: ARS_AUTHTI_YN
/home/workspace/user-workspace/junheon/data/task150/ARS_AUTHTI_YN_count_for_prop_wrt_CP_CD_3 exists!
window: 3, target: NPAY_YN
/home/workspace/user-workspace/junheon/data/task150/NPAY_YN_count_for_prop_wrt_CP_CD_3 exists!
window: 3, target: GNDR
/home/workspace/user-workspace/junheon/data/task150/GNDR_count_for_prop_wrt_CP_CD_3 exists!
window: 3, target: FOREI_YN
/home/workspace/user-workspace/junheon/data/task150/FOREI_YN_count_for_prop_wrt_CP_CD_3 exists!
window: 10, target: ARS_AUTHTI_YN
/home/workspace/user-workspace/junheon/data/task150/ARS_AUTHTI_YN_count_for_prop_wrt_CP_CD_10 exists!
window: 10, target: NPAY_YN
/home/workspace/user-workspace/junheon/data/task150/NPAY_YN_count_for_prop_wrt_CP_CD_10 exists!
window: 10, target: GNDR
/home/workspace/user-workspace/junheon/data/task150/GNDR_count_for_prop_wrt_CP_CD_10 exists!
window: 10, target: FOREI_YN
/home/workspace/user-workspace/junheon/data/task150/FOREI_YN_count_for_prop_wrt_CP_CD_10 exists!


In [172]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [173]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}{target}_count_for_prop_wrt_{COLUMN}_{window}"
        
        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: ARS_AUTHTI_YN
window: 3, target: NPAY_YN
window: 3, target: GNDR
window: 3, target: FOREI_YN
window: 10, target: ARS_AUTHTI_YN
window: 10, target: NPAY_YN


KeyboardInterrupt: 

In [None]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [44]:
df_test.fillna(0).drop(columns=[COLUMN, 'REQ_DD']).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [45]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 11:44:24.141346


#### count와 count_for_prop 불러와서 비율 구하기 

In [174]:
BASE_FILE_NAME = "cp_count"
RESULT_FILE_NAME = "prop_wrt_cp"

In [175]:
for seed in range(bagging_size):
    count_df = pd.read_parquet(f"{data_dir}{BASE_FILE_NAME}_{seed}.parquet")
    count_for_prop_df = pd.read_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")
    COLUMNS = ["id"]
    for window in WINDOW_SIZE:
        for target in TARGET_LIST:
            count_df[f"prop_{target}_wrt_{COLUMN}_{window}"] = \
                count_for_prop_df[f'{target}_count_for_prop_wrt_{COLUMN}_{window}'] / \
                count_df[f'{BASE_FILE_NAME}_{window}']
            COLUMNS.append(f"prop_{target}_wrt_{COLUMN}_{window}")
    count_df[COLUMNS].to_parquet(f"{data_dir}{RESULT_FILE_NAME}_{seed}.parquet")

In [176]:
count_df = pd.read_parquet(f"{data_dir}{BASE_FILE_NAME}_test.parquet")
count_for_prop_df = pd.read_parquet(f"{data_dir}{FILE_NAME}_test.parquet")
COLUMNS = ["id"]
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        count_df[f"prop_{target}_wrt_{COLUMN}_{window}"] = \
            count_for_prop_df[f'{target}_count_for_prop_wrt_{COLUMN}_{window}'] / \
            count_df[f'{BASE_FILE_NAME}_{window}']
        COLUMNS.append(f"prop_{target}_wrt_{COLUMN}_{window}")
count_df[COLUMNS].to_parquet(f"{data_dir}{RESULT_FILE_NAME}_test.parquet")

## ratio: unique('PAYR_IP')/unique('MPHN_NO'), unique('GODS_NM')/unique('PAYR_SEQ'), unique('PAYR_IP')/unique('PAYR_SEQ'), unique('MPHN_NO')/unique('PAYR_SEQ')

In [177]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 08:03:15.185034


In [178]:
WINDOW_SIZE = [3, 10, 30]

In [179]:
LOAD_FILE_NAME = "unique_count_wrt_cp"
FILE_NAME = "unique_divide_wrt_cp"
COLUMN = "CP_CD"
COLUMNS = []
TARGET_LIST = ["GODS_NM", "PAYR_SEQ", "PAYR_IP", "MPHN_NO"]
NEW_TARGET_LIST = [("PAYR_IP", "MPHN_NO"), ("GODS_NM", "PAYR_SEQ"), ("PAYR_IP", "PAYR_SEQ"), ("MPHN_NO", "PAYR_SEQ")]

In [180]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        COLUMNS.append(f'unique_{target}_wrt_{COLUMN}_{window}')

In [181]:
for seed in range(bagging_size):
    df_train = pd.read_parquet(f"{data_dir}{LOAD_FILE_NAME}_{seed}.parquet")[["id"] + COLUMNS]
    new_columns = []
    for window in WINDOW_SIZE:
        for new_target in NEW_TARGET_LIST:
            new_column = f"unique_{new_target[0]}_by_{new_target[1]}_wrt_{COLUMN}_{window}"
            df_train[new_column] = df_train[f"unique_{new_target[0]}_wrt_{COLUMN}_{window}"] / df_train[f"unique_{new_target[1]}_wrt_{COLUMN}_{window}"]
            new_columns.append(new_column)
    df_train = df_train[["id"] + new_columns].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [128]:
df_test = pd.read_parquet(f"{data_dir}{LOAD_FILE_NAME}_test.parquet")[["id"] + COLUMNS]

new_columns = []
for window in WINDOW_SIZE:
    for new_target in NEW_TARGET_LIST:
        new_column = f"unique_{new_target[0]}_by_{new_target[1]}_wrt_{COLUMN}_{window}"
        df_test[new_column] = df_test[f"unique_{new_target[0]}_wrt_{COLUMN}_{window}"] / df_test[f"unique_{new_target[1]}_wrt_{COLUMN}_{window}"]
        new_columns.append(new_column)
df_test = df_test[["id"] + new_columns].to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [129]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 12:16:45.475957


# Item(GODS_NM)

## count

In [112]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:21:47.850967


In [113]:
WINDOW_SIZE = [3, 10, 30]

In [114]:
FILE_NAME = "gods_count"
COLUMN = "GODS_NM"

In [115]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD']], test_df[['id', COLUMN, 'REQ_DD']]], ignore_index=True)

In [116]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [117]:
for window_size in WINDOW_SIZE:
    print(f"window size: {window_size}D")
    count = df.groupby([COLUMN, 'REQ_DD']).count().reset_index().rename(columns={"id": "count"})
    count['datetime'] = pd.to_datetime(count['REQ_DD'], format='%Y%m%d')
    count = count.drop(columns=['REQ_DD'])
    cum_count = count.sort_values(by=['datetime']).groupby([COLUMN]).rolling(f"{window_size+1}D", min_periods=1, on='datetime')['count'].sum().reset_index().rename(columns={"count": f"cum_{window_size}_count"})
    cum_count['REQ_DD'] = cum_count['datetime'].dt.strftime('%Y%m%d')
    cum_count = cum_count.drop(columns=['datetime'])
    count['REQ_DD'] = count['datetime'].dt.strftime('%Y%m%d')
    count = count.drop(columns=['datetime'])
    count = count.merge(cum_count, on=[COLUMN, 'REQ_DD'], how='left')
    count[f'{FILE_NAME}_{window_size}'] = count[f'cum_{window_size}_count'] - count['count']
    count = count[[COLUMN, 'REQ_DD', f'{FILE_NAME}_{window_size}']].astype({'REQ_DD': 'int32'})
    df_train = df_train.merge(count, on=[COLUMN, 'REQ_DD'], how='left')
    df_test = df_test.merge(count, on=[COLUMN, 'REQ_DD'], how='left')

window size: 3D
window size: 10D
window size: 30D


In [118]:
df_train = df_train.drop(columns=['REQ_DD', COLUMN])
df_test = df_test.drop(columns=['REQ_DD', COLUMN])

In [119]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [92]:
df_test.fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [93]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 07:49:06.811148


## unique count: 'CP_CD', 'PAYR_SEQ', 'PAYR_IP', 'MPHN_NO', 'COMMC_CLF'

In [86]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:11:29.312257


In [87]:
WINDOW_SIZE = [3, 10, 30]

In [88]:
FILE_NAME = "unique_count_wrt_gods"
COLUMN = "GODS_NM"
TARGET_LIST = ["CP_CD", "PAYR_SEQ", "PAYR_IP", "MPHN_NO", "COMMC_CLF"]

In [89]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [90]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].drop_duplicates(subset=[COLUMN, target, 'REQ_DD']).sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        count_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            count_df = count_df.groupby([COLUMN]).agg(['nunique']).reset_index()
            count_df.columns = [COLUMN, f'unique_{target}_wrt_{COLUMN}_{window}']
            count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            count_list.append(count_df)
            start_date = start_date + datetime.timedelta(days=1)
        count_df = pd.concat(count_list, axis=0)
        count_df.to_parquet(data_path)        

window: 3, target: CP_CD
/home/workspace/user-workspace/junheon/data/task150/unique_CP_CD_count_wrt_GODS_NM_3 exists!
window: 3, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_SEQ_count_wrt_GODS_NM_3 exists!
window: 3, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_IP_count_wrt_GODS_NM_3 exists!
window: 3, target: MPHN_NO
/home/workspace/user-workspace/junheon/data/task150/unique_MPHN_NO_count_wrt_GODS_NM_3 exists!
window: 3, target: COMMC_CLF
/home/workspace/user-workspace/junheon/data/task150/unique_COMMC_CLF_count_wrt_GODS_NM_3 exists!
window: 10, target: CP_CD
/home/workspace/user-workspace/junheon/data/task150/unique_CP_CD_count_wrt_GODS_NM_10 exists!
window: 10, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_SEQ_count_wrt_GODS_NM_10 exists!
window: 10, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_IP_count_wrt_GODS_NM_10 exists!
window: 10, target

In [91]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [92]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"
        
        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: CP_CD
window: 3, target: PAYR_SEQ
window: 3, target: PAYR_IP
window: 3, target: MPHN_NO
window: 3, target: COMMC_CLF
window: 10, target: CP_CD
window: 10, target: PAYR_SEQ
window: 10, target: PAYR_IP
window: 10, target: MPHN_NO
window: 10, target: COMMC_CLF
window: 30, target: CP_CD
window: 30, target: PAYR_SEQ
window: 30, target: PAYR_IP
window: 30, target: MPHN_NO
window: 30, target: COMMC_CLF


In [93]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [102]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [103]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 08:06:55.861451


## average: 'NPAY_AMT_24M', 'MAX_NPAY_CNT_24M', 'NIGHT_TRD_RT_6M'

In [94]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:16:02.921330


In [95]:
WINDOW_SIZE = [3, 10, 30]

In [96]:
FILE_NAME = "average_wrt_gods"
COLUMN = "GODS_NM"
TARGET_LIST = ["NPAY_AMT_24M", "MAX_NPAY_CNT_24M", "NIGHT_TRD_RT_6M"]

In [97]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [98]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}average_{target}_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        average_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            average_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            average_df = average_df.groupby([COLUMN]).mean().reset_index()
            average_df.columns = [COLUMN, f'average_{target}_wrt_{COLUMN}_{window}']
            average_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            average_list.append(average_df)
            start_date = start_date + datetime.timedelta(days=1)
        average_df = pd.concat(average_list, axis=0)
        average_df.to_parquet(data_path)        

window: 3, target: NPAY_AMT_24M
/home/workspace/user-workspace/junheon/data/task150/average_NPAY_AMT_24M_wrt_GODS_NM_3 exists!
window: 3, target: MAX_NPAY_CNT_24M
/home/workspace/user-workspace/junheon/data/task150/average_MAX_NPAY_CNT_24M_wrt_GODS_NM_3 exists!
window: 3, target: NIGHT_TRD_RT_6M
/home/workspace/user-workspace/junheon/data/task150/average_NIGHT_TRD_RT_6M_wrt_GODS_NM_3 exists!
window: 10, target: NPAY_AMT_24M
/home/workspace/user-workspace/junheon/data/task150/average_NPAY_AMT_24M_wrt_GODS_NM_10 exists!
window: 10, target: MAX_NPAY_CNT_24M
/home/workspace/user-workspace/junheon/data/task150/average_MAX_NPAY_CNT_24M_wrt_GODS_NM_10 exists!
window: 10, target: NIGHT_TRD_RT_6M
/home/workspace/user-workspace/junheon/data/task150/average_NIGHT_TRD_RT_6M_wrt_GODS_NM_10 exists!
window: 30, target: NPAY_AMT_24M
/home/workspace/user-workspace/junheon/data/task150/average_NPAY_AMT_24M_wrt_GODS_NM_30 exists!
window: 30, target: MAX_NPAY_CNT_24M
/home/workspace/user-workspace/junheon

In [99]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [100]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}average_{target}_wrt_{COLUMN}_{window}"
        
        average_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(average_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(average_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: NPAY_AMT_24M
window: 3, target: MAX_NPAY_CNT_24M
window: 3, target: NIGHT_TRD_RT_6M
window: 10, target: NPAY_AMT_24M
window: 10, target: MAX_NPAY_CNT_24M
window: 10, target: NIGHT_TRD_RT_6M
window: 30, target: NPAY_AMT_24M
window: 30, target: MAX_NPAY_CNT_24M
window: 30, target: NIGHT_TRD_RT_6M


In [101]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [31]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [32]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 11:37:17.865382


## proposition: 'ARS_AUTHTI_YN', 'NPAY_YN', 'GNDR', 'FOREI_YN'

#### 각 feature 조건의 COUNT

In [123]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:27:11.757375


In [124]:
WINDOW_SIZE = [3, 10, 30]

In [125]:
FILE_NAME = "count_for_prop_wrt_gods"
COLUMN = "GODS_NM"
TARGET_LIST = ["ARS_AUTHTI_YN", "NPAY_YN", "GNDR", "FOREI_YN"]

In [126]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [106]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}{target}_count_for_prop_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        count_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)&(temp_df[target]==1)][[COLUMN, target]]
            count_df = count_df.groupby([COLUMN]).count().reset_index()
            count_df.columns = [COLUMN, f'{target}_count_for_prop_wrt_{COLUMN}_{window}']
            count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            count_list.append(count_df)
            start_date = start_date + datetime.timedelta(days=1)
        count_df = pd.concat(count_list, axis=0)
        count_df.to_parquet(data_path)        

window: 3, target: ARS_AUTHTI_YN
/home/workspace/user-workspace/junheon/data/task150/ARS_AUTHTI_YN_count_for_prop_wrt_GODS_NM_3 exists!
window: 3, target: NPAY_YN
/home/workspace/user-workspace/junheon/data/task150/NPAY_YN_count_for_prop_wrt_GODS_NM_3 exists!
window: 3, target: GNDR
/home/workspace/user-workspace/junheon/data/task150/GNDR_count_for_prop_wrt_GODS_NM_3 exists!
window: 3, target: FOREI_YN
/home/workspace/user-workspace/junheon/data/task150/FOREI_YN_count_for_prop_wrt_GODS_NM_3 exists!
window: 10, target: ARS_AUTHTI_YN
/home/workspace/user-workspace/junheon/data/task150/ARS_AUTHTI_YN_count_for_prop_wrt_GODS_NM_10 exists!
window: 10, target: NPAY_YN
/home/workspace/user-workspace/junheon/data/task150/NPAY_YN_count_for_prop_wrt_GODS_NM_10 exists!
window: 10, target: GNDR
/home/workspace/user-workspace/junheon/data/task150/GNDR_count_for_prop_wrt_GODS_NM_10 exists!
window: 10, target: FOREI_YN
/home/workspace/user-workspace/junheon/data/task150/FOREI_YN_count_for_prop_wrt_GOD

In [107]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [108]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}{target}_count_for_prop_wrt_{COLUMN}_{window}"
        
        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: ARS_AUTHTI_YN
window: 3, target: NPAY_YN
window: 3, target: GNDR
window: 3, target: FOREI_YN
window: 10, target: ARS_AUTHTI_YN
window: 10, target: NPAY_YN
window: 10, target: GNDR
window: 10, target: FOREI_YN
window: 30, target: ARS_AUTHTI_YN
window: 30, target: NPAY_YN
window: 30, target: GNDR
window: 30, target: FOREI_YN


In [109]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [69]:
df_test.fillna(0).drop(columns=[COLUMN, 'REQ_DD']).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [70]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 11:57:13.557496


#### count와 count_for_prop 불러와서 비율 구하기 

In [127]:
BASE_FILE_NAME = "gods_count"
RESULT_FILE_NAME = "prop_wrt_gods"

In [128]:
for seed in range(bagging_size):
    count_df = pd.read_parquet(f"{data_dir}{BASE_FILE_NAME}_{seed}.parquet")
    count_for_prop_df = pd.read_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")
    COLUMNS = ["id"]
    for window in WINDOW_SIZE:
        for target in TARGET_LIST:
            count_df[f"prop_{target}_wrt_{COLUMN}_{window}"] = \
                count_for_prop_df[f'{target}_count_for_prop_wrt_{COLUMN}_{window}'] / \
                count_df[f'{BASE_FILE_NAME}_{window}']
            COLUMNS.append(f"prop_{target}_wrt_{COLUMN}_{window}")
    count_df[COLUMNS].to_parquet(f"{data_dir}{RESULT_FILE_NAME}_{seed}.parquet")

In [None]:
count_df = pd.read_parquet(f"{data_dir}{BASE_FILE_NAME}_test.parquet")
count_for_prop_df = pd.read_parquet(f"{data_dir}{FILE_NAME}_test.parquet")
COLUMNS = ["id"]
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        count_df[f"prop_{target}_wrt_{COLUMN}_{window}"] = \
            count_for_prop_df[f'{target}_count_for_prop_wrt_{COLUMN}_{window}'] / \
            count_df[f'{BASE_FILE_NAME}_{window}']
        COLUMNS.append(f"prop_{target}_wrt_{COLUMN}_{window}")
count_df[COLUMNS].to_parquet(f"{data_dir}{RESULT_FILE_NAME}_test.parquet")

## ratio: unique('PAYR_IP')/unique('MPHN_NO'), unique('CP_CD')/unique('PAYR_SEQ'), unique('PAYR_IP')/unique('PAYR_SEQ'), unique('MPHN_NO')/unique('PAYR_SEQ')

In [129]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:27:44.041148


In [130]:
WINDOW_SIZE = [3, 10, 30]

In [131]:
LOAD_FILE_NAME = "unique_count_wrt_gods"
FILE_NAME = "unique_divide_wrt_gods"
COLUMN = "GODS_NM"
COLUMNS = []
TARGET_LIST = ["CP_CD", "PAYR_SEQ", "PAYR_IP", "MPHN_NO"]
NEW_TARGET_LIST = [("PAYR_IP", "MPHN_NO"), ("CP_CD", "PAYR_SEQ"), ("PAYR_IP", "PAYR_SEQ"), ("MPHN_NO", "PAYR_SEQ")]

In [132]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        COLUMNS.append(f'unique_{target}_wrt_{COLUMN}_{window}')

In [133]:
for seed in range(bagging_size):
    df_train = pd.read_parquet(f"{data_dir}{LOAD_FILE_NAME}_{seed}.parquet")[["id"] + COLUMNS]
    new_columns = []
    for window in WINDOW_SIZE:
        for new_target in NEW_TARGET_LIST:
            new_column = f"unique_{new_target[0]}_by_{new_target[1]}_wrt_{COLUMN}_{window}"
            df_train[new_column] = df_train[f"unique_{new_target[0]}_wrt_{COLUMN}_{window}"] / df_train[f"unique_{new_target[1]}_wrt_{COLUMN}_{window}"]
            new_columns.append(new_column)
    df_train = df_train[["id"] + new_columns].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test = pd.read_parquet(f"{data_dir}{LOAD_FILE_NAME}_test.parquet")[["id"] + COLUMNS]

new_columns = []
for window in WINDOW_SIZE:
    for new_target in NEW_TARGET_LIST:
        new_column = f"unique_{new_target[0]}_by_{new_target[1]}_wrt_{COLUMN}_{window}"
        df_test[new_column] = df_test[f"unique_{new_target[0]}_wrt_{COLUMN}_{window}"] / df_test[f"unique_{new_target[1]}_wrt_{COLUMN}_{window}"]
        new_columns.append(new_column)
df_test = df_test[["id"] + new_columns].to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [136]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 12:17:26.079300


# User(PAYR_SEQ)

## count

In [134]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 05:28:38.347491


In [135]:
WINDOW_SIZE = [3, 10, 30]

In [136]:
FILE_NAME = "seq_count"
COLUMN = "PAYR_SEQ"

In [137]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD']], test_df[['id', COLUMN, 'REQ_DD']]], ignore_index=True)

In [138]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [139]:
for window_size in WINDOW_SIZE:
    print(f"window size: {window_size}D")
    count = df.groupby([COLUMN, 'REQ_DD']).count().reset_index().rename(columns={"id": "count"})
    count['datetime'] = pd.to_datetime(count['REQ_DD'], format='%Y%m%d')
    count = count.drop(columns=['REQ_DD'])
    cum_count = count.sort_values(by=['datetime']).groupby([COLUMN]).rolling(f"{window_size+1}D", min_periods=1, on='datetime')['count'].sum().reset_index().rename(columns={"count": f"cum_{window_size}_count"})
    cum_count['REQ_DD'] = cum_count['datetime'].dt.strftime('%Y%m%d')
    cum_count = cum_count.drop(columns=['datetime'])
    count['REQ_DD'] = count['datetime'].dt.strftime('%Y%m%d')
    count = count.drop(columns=['datetime'])
    count = count.merge(cum_count, on=[COLUMN, 'REQ_DD'], how='left')
    count[f'{FILE_NAME}_{window_size}'] = count[f'cum_{window_size}_count'] - count['count']
    count = count[[COLUMN, 'REQ_DD', f'{FILE_NAME}_{window_size}']].astype({'REQ_DD': 'int32'})
    df_train = df_train.merge(count, on=[COLUMN, 'REQ_DD'], how='left')
    df_test = df_test.merge(count, on=[COLUMN, 'REQ_DD'], how='left')

window size: 3D
window size: 10D
window size: 30D


In [140]:
df_train = df_train.drop(columns=['REQ_DD', COLUMN])
df_test = df_test.drop(columns=['REQ_DD', COLUMN])

In [141]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [112]:
df_test.fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [113]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-12 08:25:26.436829


## unique count of 'CP_CD', 'GODS_NM', 'PAYR_IP', 'MPHN_NO', 'COMMC_CLF'

In [9]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 06:00:07.688342


In [10]:
WINDOW_SIZE = [3, 10, 30]

In [11]:
FILE_NAME = "unique_count_wrt_seq"
COLUMN = "PAYR_SEQ"
TARGET_LIST = ["CP_CD", "GODS_NM", "PAYR_IP", "MPHN_NO", "COMMC_CLF"]

In [12]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [13]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].drop_duplicates(subset=[COLUMN, target, 'REQ_DD']).sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        count_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            count_df = count_df.groupby([COLUMN]).agg(['nunique']).reset_index()
            count_df.columns = [COLUMN, f'unique_{target}_wrt_{COLUMN}_{window}']
            count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            count_list.append(count_df)
            start_date = start_date + datetime.timedelta(days=1)
        count_df = pd.concat(count_list, axis=0)
        count_df.to_parquet(data_path)        

window: 3, target: CP_CD
/home/workspace/user-workspace/junheon/data/task150/unique_CP_CD_count_wrt_PAYR_SEQ_3 exists!
window: 3, target: GODS_NM
/home/workspace/user-workspace/junheon/data/task150/unique_GODS_NM_count_wrt_PAYR_SEQ_3 exists!
window: 3, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_IP_count_wrt_PAYR_SEQ_3 exists!
window: 3, target: MPHN_NO
/home/workspace/user-workspace/junheon/data/task150/unique_MPHN_NO_count_wrt_PAYR_SEQ_3 exists!
window: 3, target: COMMC_CLF
/home/workspace/user-workspace/junheon/data/task150/unique_COMMC_CLF_count_wrt_PAYR_SEQ_3 exists!
window: 10, target: CP_CD
/home/workspace/user-workspace/junheon/data/task150/unique_CP_CD_count_wrt_PAYR_SEQ_10 exists!
window: 10, target: GODS_NM
/home/workspace/user-workspace/junheon/data/task150/unique_GODS_NM_count_wrt_PAYR_SEQ_10 exists!
window: 10, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/unique_PAYR_IP_count_wrt_PAYR_SEQ_10 exists!
window: 10, ta

In [14]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'}).iloc[indices]
    
    for window in WINDOW_SIZE:
        for target in TARGET_LIST:
            print(f"window: {window}, target: {target}")
            data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"

            count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
            df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
    
    df_train.drop(columns=["REQ_DD", COLUMN]).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

window: 3, target: CP_CD
window: 3, target: GODS_NM
window: 3, target: PAYR_IP
window: 3, target: MPHN_NO
window: 3, target: COMMC_CLF
window: 10, target: CP_CD
window: 10, target: GODS_NM
window: 10, target: PAYR_IP
window: 10, target: MPHN_NO
window: 10, target: COMMC_CLF
window: 30, target: CP_CD
window: 30, target: GODS_NM
window: 30, target: PAYR_IP
window: 30, target: MPHN_NO
window: 30, target: COMMC_CLF
window: 3, target: CP_CD
window: 3, target: GODS_NM
window: 3, target: PAYR_IP
window: 3, target: MPHN_NO
window: 3, target: COMMC_CLF
window: 10, target: CP_CD
window: 10, target: GODS_NM
window: 10, target: PAYR_IP
window: 10, target: MPHN_NO
window: 10, target: COMMC_CLF
window: 30, target: CP_CD
window: 30, target: GODS_NM
window: 30, target: PAYR_IP
window: 30, target: MPHN_NO
window: 30, target: COMMC_CLF
window: 3, target: CP_CD
window: 3, target: GODS_NM
window: 3, target: PAYR_IP
window: 3, target: MPHN_NO
window: 3, target: COMMC_CLF
window: 10, target: CP_CD
window: 1

In [None]:
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}unique_{target}_count_wrt_{COLUMN}_{window}"
        
        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
df_test.drop(columns=["REQ_DD", COLUMN]).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

# 마지막 거래일까지의 diff (window X)

In [15]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:21:49.739012


In [16]:
FILE_NAME = "last_transaction_diff"

In [17]:
df_train = train_df[['id', 'REQ_DD', "PAYR_SEQ", "MPHN_NO"]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', "PAYR_SEQ", "MPHN_NO"]].astype({'REQ_DD': 'int32'})

## PAYR_SEQ 기준 

In [18]:
COLUMN = "PAYR_SEQ"

In [19]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD']], test_df[['id', COLUMN, 'REQ_DD']]], 
               ignore_index=True)

In [20]:
prev_df = df.drop_duplicates(subset=[COLUMN, 'REQ_DD']).sort_values(by=[COLUMN, "REQ_DD"])

In [21]:
prev_df['datetime'] = pd.to_datetime(prev_df['REQ_DD'], format='%Y%m%d')

In [22]:
prev_df[f'prev_{COLUMN}'] = prev_df[COLUMN].shift(1)
prev_df[f'prev_datetime'] = prev_df['datetime'].shift(1)

In [23]:
prev_df[f'diff_wrt_{COLUMN}'] = (prev_df['datetime']-prev_df['prev_datetime']).dt.days

In [24]:
prev_df[f'diff_wrt_{COLUMN}'] = np.where((prev_df[COLUMN]==prev_df[f"prev_{COLUMN}"]), prev_df[f'diff_wrt_{COLUMN}'], 1000000)

In [25]:
prev_df = prev_df[[COLUMN, "REQ_DD", f"diff_wrt_{COLUMN}"]]

In [26]:
df_train = df_train.merge(prev_df, on=[f"{COLUMN}", "REQ_DD"], how='left').drop(columns=[COLUMN])
df_test = df_test.merge(prev_df, on=[f"{COLUMN}", "REQ_DD"], how='left').drop(columns=[COLUMN])

## MPHN_NO 기준 

In [27]:
COLUMN = "MPHN_NO"

In [28]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD']], test_df[['id', COLUMN, 'REQ_DD']]], 
               ignore_index=True)

In [29]:
prev_df = df.drop_duplicates(subset=[COLUMN, 'REQ_DD']).sort_values(by=[COLUMN, "REQ_DD"])

In [30]:
prev_df['datetime'] = pd.to_datetime(prev_df['REQ_DD'], format='%Y%m%d')

In [31]:
prev_df[f'prev_{COLUMN}'] = prev_df[COLUMN].shift(1)
prev_df[f'prev_datetime'] = prev_df['datetime'].shift(1)

In [32]:
prev_df[f'diff_wrt_{COLUMN}'] = (prev_df['datetime']-prev_df['prev_datetime']).dt.days

In [33]:
prev_df[f'diff_wrt_{COLUMN}'] = np.where((prev_df[COLUMN]==prev_df[f"prev_{COLUMN}"]), prev_df[f'diff_wrt_{COLUMN}'], 1000000)

In [34]:
prev_df = prev_df[[COLUMN, "REQ_DD", f"diff_wrt_{COLUMN}"]]

In [35]:
df_train = df_train.merge(prev_df, on=[f"{COLUMN}", "REQ_DD"], how='left').drop(columns=[COLUMN])
df_test = df_test.merge(prev_df, on=[f"{COLUMN}", "REQ_DD"], how='left').drop(columns=[COLUMN])

In [36]:
df_train = df_train.drop(columns=["REQ_DD"])
df_test = df_test.drop(columns=["REQ_DD"])

In [37]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

# IP AND SEQ 

## ip_count_wrt_seq

In [38]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:23:02.419519


In [39]:
WINDOW_SIZE = [3, 10, 30]

In [40]:
FILE_NAME = "ip_count_wrt_seq"
COLUMN = "PAYR_SEQ"
TARGET = "PAYR_IP"

In [41]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD', TARGET]], test_df[['id', COLUMN, 'REQ_DD', TARGET]]], 
               ignore_index=True)

In [42]:
for window in WINDOW_SIZE:
    print(f"window: {window}, target: {TARGET}")
    data_path = f"{data_dir}{FILE_NAME}_window{window}"
    if path.exists(data_path):
        print(f"{data_path} exists!")
        continue


    temp_df = df[[COLUMN, TARGET, 'REQ_DD']].sort_values(by=['REQ_DD'])
    temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
    temp_df = temp_df.reset_index(drop=True)

    count_list = []
    start_date = "2019-07-01"
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    while True:
        end_date = start_date + datetime.timedelta(days=window)

        if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
            break

        count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, TARGET]]
        count_df = count_df.groupby([COLUMN]).count().reset_index()
        count_df.columns = [COLUMN, f'count_{TARGET}_wrt_{COLUMN}_{window}']
        count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

        count_list.append(count_df)
        start_date = start_date + datetime.timedelta(days=1)
    count_df = pd.concat(count_list, axis=0)
    count_df.to_parquet(data_path)        

window: 3, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/ip_count_wrt_seq_window3 exists!
window: 10, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/ip_count_wrt_seq_window10 exists!
window: 30, target: PAYR_IP
/home/workspace/user-workspace/junheon/data/task150/ip_count_wrt_seq_window30 exists!


In [43]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'}).iloc[indices]
    
    for window in WINDOW_SIZE:
        print(f"window: {window}, target: {TARGET}")
        data_path = f"{data_dir}{FILE_NAME}_window{window}"

        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
    
    df_train.drop(columns=["REQ_DD", COLUMN]).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

window: 3, target: PAYR_IP
window: 10, target: PAYR_IP
window: 30, target: PAYR_IP
window: 3, target: PAYR_IP
window: 10, target: PAYR_IP
window: 30, target: PAYR_IP
window: 3, target: PAYR_IP
window: 10, target: PAYR_IP
window: 30, target: PAYR_IP
window: 3, target: PAYR_IP
window: 10, target: PAYR_IP
window: 30, target: PAYR_IP
window: 3, target: PAYR_IP
window: 10, target: PAYR_IP
window: 30, target: PAYR_IP


In [None]:
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
for window in WINDOW_SIZE:
    print(f"window: {window}, target: {TARGET}")
    data_path = data_path = f"{data_dir}{FILE_NAME}_window{window}"

    count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
    df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
df_test.drop(columns=["REQ_DD", COLUMN]).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

## seq_count_wrt_ip

In [44]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:44:34.042583


In [45]:
WINDOW_SIZE = [3, 10, 30]

In [46]:
FILE_NAME = "seq_count_wrt_ip"
COLUMN = "PAYR_IP"
TARGET = "PAYR_SEQ"

In [47]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD', TARGET]], test_df[['id', COLUMN, 'REQ_DD', TARGET]]], 
               ignore_index=True)

In [48]:
for window in WINDOW_SIZE:
    print(f"window: {window}, target: {TARGET}")
    data_path = f"{data_dir}{FILE_NAME}_window{window}"
    if path.exists(data_path):
        print(f"{data_path} exists!")
        continue


    temp_df = df[[COLUMN, TARGET, 'REQ_DD']].sort_values(by=['REQ_DD'])
    temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
    temp_df = temp_df.reset_index(drop=True)

    count_list = []
    start_date = "2019-07-01"
    start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    while True:
        end_date = start_date + datetime.timedelta(days=window)

        if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
            break

        count_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, TARGET]]
        count_df = count_df.groupby([COLUMN]).count().reset_index()
        count_df.columns = [COLUMN, f'count_{TARGET}_wrt_{COLUMN}_{window}']
        count_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

        count_list.append(count_df)
        start_date = start_date + datetime.timedelta(days=1)
    count_df = pd.concat(count_list, axis=0)
    count_df.to_parquet(data_path)        

window: 3, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/seq_count_wrt_ip_window3 exists!
window: 10, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/seq_count_wrt_ip_window10 exists!
window: 30, target: PAYR_SEQ
/home/workspace/user-workspace/junheon/data/task150/seq_count_wrt_ip_window30 exists!


In [49]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'}).iloc[indices]
    
    for window in WINDOW_SIZE:
        print(f"window: {window}, target: {TARGET}")
        data_path = f"{data_dir}{FILE_NAME}_window{window}"

        count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
    
    df_train.drop(columns=["REQ_DD", COLUMN]).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

window: 3, target: PAYR_SEQ
window: 10, target: PAYR_SEQ
window: 30, target: PAYR_SEQ
window: 3, target: PAYR_SEQ
window: 10, target: PAYR_SEQ
window: 30, target: PAYR_SEQ
window: 3, target: PAYR_SEQ
window: 10, target: PAYR_SEQ
window: 30, target: PAYR_SEQ
window: 3, target: PAYR_SEQ
window: 10, target: PAYR_SEQ
window: 30, target: PAYR_SEQ
window: 3, target: PAYR_SEQ
window: 10, target: PAYR_SEQ
window: 30, target: PAYR_SEQ


In [None]:
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
for window in WINDOW_SIZE:
    print(f"window: {window}, target: {TARGET}")
    data_path = data_path = f"{data_dir}{FILE_NAME}_window{window}"

    count_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
    df_test = df_test.merge(count_df, on=[COLUMN, 'REQ_DD'], how='left')
df_test.drop(columns=["REQ_DD", COLUMN]).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

# Newness

In [50]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:57:37.970057


In [51]:
FILE_NAME = "newness"

## PAYR_SEQ, CP_CD, PAYR_IP, MPHN_NO

In [52]:
TARGET_LIST = ["PAYR_SEQ", "CP_CD", "PAYR_IP", "MPHN_NO"]

In [53]:
for target in TARGET_LIST:
    prev_df = pd.concat([train_df[[target, 'REQ_DD']], test_df[[target, 'REQ_DD']]], ignore_index=True)
    prev_df = prev_df.drop_duplicates(subset=[target, 'REQ_DD']).sort_values(by=[target, "REQ_DD"])
    prev_df["cum_count"] = prev_df.groupby(target).cumcount()
    prev_df[f'newness_{target}'] = np.where((prev_df["cum_count"]>0), 0, 1)
    prev_df[[target, "REQ_DD", f"newness_{target}"]].to_parquet(f"{data_dir}newness_{target}.parquet")

In [54]:
df_train = train_df[['id', 'REQ_DD'] + TARGET_LIST].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD'] + TARGET_LIST].astype({'REQ_DD': 'int32'})

In [55]:
for target in TARGET_LIST:
    prev_df = pd.read_parquet(f"{data_dir}newness_{target}.parquet")
    df_train = df_train.merge(prev_df, on=["REQ_DD", target], how='left')
    df_test = df_test.merge(prev_df, on=["REQ_DD", target], how='left')

In [56]:
df_train = df_train.drop(columns=TARGET_LIST + ['REQ_DD'])
df_test = df_test.drop(columns=TARGET_LIST + ['REQ_DD'])

In [57]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

# EDA로부터 얻은 것들 

## CP_M_CLF_NM가 후원/기부 인지 여부 

In [58]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:59:21.341120


In [59]:
FILE_NAME = "filter_CP_M_CLF_NM"

In [60]:
donate_num = decoder['CP_M_CLF_NM']['후원/기부']

In [61]:
donate_num

9

In [62]:
df_train = train_df[['id', 'CP_M_CLF_NM']]
df_test = test_df[['id', 'CP_M_CLF_NM']]

In [63]:
df_train["is_donate"] = (df_train['CP_M_CLF_NM']==donate_num)
df_test["is_donate"] = (df_test['CP_M_CLF_NM']==donate_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_donate"] = (df_train['CP_M_CLF_NM']==donate_num)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_donate"] = (df_test['CP_M_CLF_NM']==donate_num)


In [64]:
df_train = df_train[['id', 'is_donate']]
df_test = df_test[['id', 'is_donate']]

In [65]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

## REMD_LMT_AMT가 60만원 이상인지 여부

In [66]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:59:22.507385


In [67]:
FILE_NAME = "filter_REMD_LMT_AMT"

In [68]:
df_train = train_df[['id', 'REMD_LMT_AMT']]
df_test = test_df[['id', 'REMD_LMT_AMT']]

In [69]:
df_train["big_REMD_LMT_AMT"] = (df_train['REMD_LMT_AMT'] > 600000)
df_test["big_REMD_LMT_AMT"] = (df_test['REMD_LMT_AMT'] > 600000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["big_REMD_LMT_AMT"] = (df_train['REMD_LMT_AMT'] > 600000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["big_REMD_LMT_AMT"] = (df_test['REMD_LMT_AMT'] > 600000)


In [70]:
df_train = df_train[['id', 'big_REMD_LMT_AMT']]
df_test = df_test[['id', 'big_REMD_LMT_AMT']]

In [71]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

## ACUM_RCPT_AMT가 1 미만인지 여부

In [72]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:59:23.576649


In [73]:
FILE_NAME = "filter_ACUM_RCPT_AMT"

In [74]:
df_train = train_df[['id', 'ACUM_RCPT_AMT']]
df_test = test_df[['id', 'ACUM_RCPT_AMT']]

In [75]:
df_train["negative_ACUM_RCPT_AMT"] = (df_train['ACUM_RCPT_AMT'] < 1)
df_test["negative_ACUM_RCPT_AMT"] = (df_test['ACUM_RCPT_AMT'] < 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["negative_ACUM_RCPT_AMT"] = (df_train['ACUM_RCPT_AMT'] < 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["negative_ACUM_RCPT_AMT"] = (df_test['ACUM_RCPT_AMT'] < 1)


In [76]:
df_train = df_train[['id', 'negative_ACUM_RCPT_AMT']]
df_test = df_test[['id', 'negative_ACUM_RCPT_AMT']]

In [77]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

## CP_S_CLF_NM이 high risk list에 포함되는지 여부

In [78]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:59:24.610860


In [79]:
FILE_NAME = "filter_CP_S_CLF_NM"

In [80]:
df_train = train_df[['id', 'CP_S_CLF_NM']]
df_test = test_df[['id', 'CP_S_CLF_NM']]

In [81]:
high_risk_list = train_df[train_df['target']==1]['CP_S_CLF_NM'].unique()

In [82]:
df_train["high_risk_CP_S_CLF_NM"] = df_train['CP_S_CLF_NM'].isin(high_risk_list)
df_test["high_risk_CP_S_CLF_NM"] = df_test['CP_S_CLF_NM'].isin(high_risk_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["high_risk_CP_S_CLF_NM"] = df_train['CP_S_CLF_NM'].isin(high_risk_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["high_risk_CP_S_CLF_NM"] = df_test['CP_S_CLF_NM'].isin(high_risk_list)


In [83]:
df_train = df_train[['id', 'high_risk_CP_S_CLF_NM']]
df_test = df_test[['id', 'high_risk_CP_S_CLF_NM']]

In [84]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

## IP_SUB_A가 high risk list에 포함되는지 여부

In [85]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 07:59:26.220557


In [86]:
FILE_NAME = "filter_SUB_IP_A"

In [87]:
df_train = train_df[['id', 'SUB_IP_A']]
df_test = test_df[['id', 'SUB_IP_A']]

In [88]:
high_risk_list = train_df[train_df['target']==1]['SUB_IP_A'].unique()

In [89]:
df_train["high_risk_SUB_IP_A"] = df_train['SUB_IP_A'].isin(high_risk_list)
df_test["high_risk_SUB_IP_A"] = df_test['SUB_IP_A'].isin(high_risk_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["high_risk_SUB_IP_A"] = df_train['SUB_IP_A'].isin(high_risk_list)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["high_risk_SUB_IP_A"] = df_test['SUB_IP_A'].isin(high_risk_list)


In [90]:
df_train = df_train[['id', 'high_risk_SUB_IP_A']]
df_test = df_test[['id', 'high_risk_SUB_IP_A']]

In [91]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [None]:
print(f"Ends at {datetime.datetime.now()}")

## AC_PAY_AMT == 49900, 50000, 99000, 100000

In [98]:
FILE_NAME = "bin_AC_PAY_AMT"

In [99]:
df_train = train_df[['id', 'AC_PAY_AMT']]
df_test = test_df[['id', 'AC_PAY_AMT']]

In [100]:
df_train["is_AC_PAY_AMT_49900"] = (df_train['AC_PAY_AMT'] == 49900)
df_test["is_AC_PAY_AMT_49900"] = (df_test['AC_PAY_AMT'] == 49900)
df_train["is_AC_PAY_AMT_50000"] = (df_train['AC_PAY_AMT'] == 50000)
df_test["is_AC_PAY_AMT_50000"] = (df_test['AC_PAY_AMT'] == 50000)
df_train["is_AC_PAY_AMT_99000"] = (df_train['AC_PAY_AMT'] == 99000)
df_test["is_AC_PAY_AMT_99000"] = (df_test['AC_PAY_AMT'] == 99000)
df_train["is_AC_PAY_AMT_100000"] = (df_train['AC_PAY_AMT'] == 100000)
df_test["is_AC_PAY_AMT_100000"] = (df_test['AC_PAY_AMT'] == 100000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_AC_PAY_AMT_49900"] = (df_train['AC_PAY_AMT'] == 49900)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_AC_PAY_AMT_49900"] = (df_test['AC_PAY_AMT'] == 49900)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_AC_PAY_AMT_50000"] = (df_train['AC_PAY_AMT'] == 50000)
A v

In [101]:
df_train = df_train[['id', 'is_AC_PAY_AMT_49900', 'is_AC_PAY_AMT_50000', 'is_AC_PAY_AMT_99000', 'is_AC_PAY_AMT_100000']]
df_test = df_test[['id', 'is_AC_PAY_AMT_49900', 'is_AC_PAY_AMT_50000', 'is_AC_PAY_AMT_99000', 'is_AC_PAY_AMT_100000']]

In [102]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [103]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## MM_LMT_AMT  == 600000, 40000 

In [104]:
FILE_NAME = "bin_MM_LMT_AMT"

In [105]:
df_train = train_df[['id', 'MM_LMT_AMT']]
df_test = test_df[['id', 'MM_LMT_AMT']]

In [106]:
df_train["is_MM_LMT_AMT_600000"] = (df_train['MM_LMT_AMT'] == 600000)
df_test["is_MM_LMT_AMT_600000"] = (df_test['MM_LMT_AMT'] == 600000)
df_train["is_MM_LMT_AMT_40000"] = (df_train['MM_LMT_AMT'] == 40000)
df_test["is_MM_LMT_AMT_40000"] = (df_test['MM_LMT_AMT'] == 40000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_MM_LMT_AMT_600000"] = (df_train['MM_LMT_AMT'] == 600000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_MM_LMT_AMT_600000"] = (df_test['MM_LMT_AMT'] == 600000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_MM_LMT_AMT_40000"] = (df_train['MM_LMT_AMT'] == 40000)

In [107]:
df_train = df_train[['id', 'is_MM_LMT_AMT_600000', 'is_MM_LMT_AMT_40000']]
df_test = df_test[['id', 'is_MM_LMT_AMT_600000', 'is_MM_LMT_AMT_40000']]

In [108]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## REMD_LMT_AMT == 30만원/60만원

In [109]:
FILE_NAME = "bin_REMD_LMT_AMT"

In [110]:
df_train = train_df[['id', 'REMD_LMT_AMT']]
df_test = test_df[['id', 'REMD_LMT_AMT']]

In [111]:
df_train["is_REMD_LMT_AMT_600000"] = (df_train['REMD_LMT_AMT'] == 600000)
df_test["is_REMD_LMT_AMT_600000"] = (df_test['REMD_LMT_AMT'] == 600000)
df_train["is_REMD_LMT_AMT_300000"] = (df_train['REMD_LMT_AMT'] == 40000)
df_test["is_REMD_LMT_AMT_300000"] = (df_test['REMD_LMT_AMT'] == 40000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_REMD_LMT_AMT_600000"] = (df_train['REMD_LMT_AMT'] == 600000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_REMD_LMT_AMT_600000"] = (df_test['REMD_LMT_AMT'] == 600000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_REMD_LMT_AMT_300000"] = (df_train['REMD_LMT_AM

In [112]:
df_train = df_train[['id', 'is_REMD_LMT_AMT_600000', 'is_REMD_LMT_AMT_300000']]
df_test = df_test[['id', 'is_REMD_LMT_AMT_600000', 'is_REMD_LMT_AMT_300000']]

In [113]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## AGE < 26 , 27<=AGE<=31

In [114]:
FILE_NAME = "bin_AGE"

In [115]:
df_train = train_df[['id', 'AGE']]
df_test = test_df[['id', 'AGE']]

In [116]:
df_train["AGE_under_26"] = (df_train['AGE'] < 26)
df_test["AGE_under_26"] = (df_test['AGE'] < 26)
df_train["AGE_27_31"] = ((df_train['AGE'] > 26) & (df_train['AGE'] < 32))
df_test["AGE_27_31"] = ((df_test['AGE'] > 26) & (df_test['AGE'] < 32))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["AGE_under_26"] = (df_train['AGE'] < 26)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["AGE_under_26"] = (df_test['AGE'] < 26)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["AGE_27_31"] = ((df_train['AGE'] > 26) & (df_train['AGE'] < 32))
A value is trying to be set on a 

In [117]:
df_train = df_train[['id', 'AGE_under_26', 'AGE_27_31']]
df_test = df_test[['id', 'AGE_under_26', 'AGE_27_31']]

In [118]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## AUTHTI_CLF_FLG == A

In [119]:
FILE_NAME = "bin_AUTHTI_CLF_FLG"

In [120]:
df_train = train_df[['id', 'AUTHTI_CLF_FLG']]
df_test = test_df[['id', 'AUTHTI_CLF_FLG']]

In [121]:
df_train["is_AUTHTI_CLF_FLG_A"] = (df_train['AUTHTI_CLF_FLG'] == "A")
df_test["is_AUTHTI_CLF_FLG_A"] = (df_test['AUTHTI_CLF_FLG'] == "A")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_AUTHTI_CLF_FLG_A"] = (df_train['AUTHTI_CLF_FLG'] == "A")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_AUTHTI_CLF_FLG_A"] = (df_test['AUTHTI_CLF_FLG'] == "A")


In [122]:
df_train = df_train[['id', 'is_AUTHTI_CLF_FLG_A']]
df_test = df_test[['id', 'is_AUTHTI_CLF_FLG_A']]

In [123]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## ACUM_RCPT_AMT == 0, 11000, 49900

In [124]:
FILE_NAME = "bin_ACUM_RCPT_AMT"

In [125]:
df_train = train_df[['id', 'ACUM_RCPT_AMT']]
df_test = test_df[['id', 'ACUM_RCPT_AMT']]

In [126]:
df_train["is_ACUM_RCPT_AMT_0"] = (df_train['ACUM_RCPT_AMT'] == 0)
df_test["is_ACUM_RCPT_AMT_0"] = (df_test['ACUM_RCPT_AMT'] == 0)
df_train["is_ACUM_RCPT_AMT_11000"] = (df_train['ACUM_RCPT_AMT'] == 11000)
df_test["is_ACUM_RCPT_AMT_11000"] = (df_test['ACUM_RCPT_AMT'] == 11000)
df_train["is_ACUM_RCPT_AMT_49900"] = (df_train['ACUM_RCPT_AMT'] == 49900)
df_test["is_ACUM_RCPT_AMT_49900"] = (df_test['ACUM_RCPT_AMT'] == 49900)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_ACUM_RCPT_AMT_0"] = (df_train['ACUM_RCPT_AMT'] == 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_ACUM_RCPT_AMT_0"] = (df_test['ACUM_RCPT_AMT'] == 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_ACUM_RCPT_AMT_11000"] = (df_train['ACUM_RCPT_AMT'] == 11000)
A

In [127]:
df_train = df_train[['id', 'is_ACUM_RCPT_AMT_0', 'is_ACUM_RCPT_AMT_11000', 'is_ACUM_RCPT_AMT_49900']]
df_test = df_test[['id', 'is_ACUM_RCPT_AMT_0', 'is_ACUM_RCPT_AMT_11000', 'is_ACUM_RCPT_AMT_49900']]

In [128]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## CP_M_CLF_NM  == 게임

In [129]:
FILE_NAME = "bin_CP_M_CLF_NM"

In [130]:
df_train = train_df[['id', 'CP_M_CLF_NM']]
df_test = test_df[['id', 'CP_M_CLF_NM']]

In [131]:
donate_num = decoder['CP_M_CLF_NM']['게임']

In [132]:
df_train["is_CP_M_CLF_NM_game"] = (df_train['CP_M_CLF_NM'] == donate_num)
df_test["is_CP_M_CLF_NM_game"] = (df_test['CP_M_CLF_NM'] == donate_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_CP_M_CLF_NM_game"] = (df_train['CP_M_CLF_NM'] == donate_num)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_CP_M_CLF_NM_game"] = (df_test['CP_M_CLF_NM'] == donate_num)


In [133]:
df_train = df_train[['id', 'is_CP_M_CLF_NM_game']]
df_test = df_test[['id', 'is_CP_M_CLF_NM_game']]

In [134]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## CP_S_CLF_NM == 게임 / 교통카드 충전 / 음악으로

In [135]:
FILE_NAME = "bin_CP_S_CLF_NM"

In [136]:
df_train = train_df[['id', 'CP_S_CLF_NM']]
df_test = test_df[['id', 'CP_S_CLF_NM']]

In [137]:
game_num = decoder['CP_S_CLF_NM']['게임']
bus_num = decoder['CP_S_CLF_NM']['교통카드 충전']
music_num = decoder['CP_S_CLF_NM']['음악']

In [139]:
df_train["is_CP_S_CLF_NM_game"] = (df_train['CP_S_CLF_NM'] == game_num)
df_test["is_CP_S_CLF_NM_game"] = (df_test['CP_S_CLF_NM'] == game_num)
df_train["is_CP_S_CLF_NM_bus"] = (df_train['CP_S_CLF_NM'] == bus_num)
df_test["is_CP_S_CLF_NM_bus"] = (df_test['CP_S_CLF_NM'] == bus_num)
df_train["is_CP_S_CLF_NM_music"] = (df_train['CP_S_CLF_NM'] == music_num)
df_test["is_CP_S_CLF_NM_music"] = (df_test['CP_S_CLF_NM'] == music_num)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_CP_S_CLF_NM_game"] = (df_train['CP_S_CLF_NM'] == game_num)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_CP_S_CLF_NM_game"] = (df_test['CP_S_CLF_NM'] == game_num)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_CP_S_CLF_NM_bus"] = (df_train['CP_S_CLF_NM'] == bu

In [140]:
df_train = df_train[['id', 'is_CP_S_CLF_NM_game', 'is_CP_S_CLF_NM_bus', 'is_CP_S_CLF_NM_music']]
df_test = df_test[['id', 'is_CP_S_CLF_NM_game', 'is_CP_S_CLF_NM_bus', 'is_CP_S_CLF_NM_music']]

In [141]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## NPAY_AMT_24M > 40000

In [142]:
FILE_NAME = "bin_NPAY_AMT_24M"

In [143]:
df_train = train_df[['id', 'NPAY_AMT_24M']]
df_test = test_df[['id', 'NPAY_AMT_24M']]

In [144]:
df_train["NPAY_AMT_24M_over_40000"] = (df_train['NPAY_AMT_24M'] > 40000)
df_test["NPAY_AMT_24M_over_40000"] = (df_test['NPAY_AMT_24M'] > 40000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["NPAY_AMT_24M_over_40000"] = (df_train['NPAY_AMT_24M'] > 40000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["NPAY_AMT_24M_over_40000"] = (df_test['NPAY_AMT_24M'] > 40000)


In [145]:
df_train = df_train[['id', 'NPAY_AMT_24M_over_40000']]
df_test = df_test[['id', 'NPAY_AMT_24M_over_40000']]

In [146]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## AVG_AMT_6M > 33000

In [147]:
FILE_NAME = "bin_AVG_AMT_6M"

In [148]:
df_train = train_df[['id', 'AVG_AMT_6M']]
df_test = test_df[['id', 'AVG_AMT_6M']]

In [149]:
df_train["AVG_AMT_6M_over_33000"] = (df_train['AVG_AMT_6M'] > 33000)
df_test["AVG_AMT_6M_over_33000"] = (df_test['AVG_AMT_6M'] > 33000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["AVG_AMT_6M_over_33000"] = (df_train['AVG_AMT_6M'] > 33000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["AVG_AMT_6M_over_33000"] = (df_test['AVG_AMT_6M'] > 33000)


In [150]:
df_train = df_train[['id', 'AVG_AMT_6M_over_33000']]
df_test = df_test[['id', 'AVG_AMT_6M_over_33000']]

In [151]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## MAX_LMT_3M_RT == 1

In [152]:
FILE_NAME = "bin_MAX_LMT_3M_RT"

In [153]:
df_train = train_df[['id', 'MAX_LMT_3M_RT']]
df_test = test_df[['id', 'MAX_LMT_3M_RT']]

In [154]:
df_train["is_MAX_LMT_3M_RT_1"] = (df_train['MAX_LMT_3M_RT'] == 1)
df_test["is_MAX_LMT_3M_RT_1"] = (df_test['MAX_LMT_3M_RT'] == 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_MAX_LMT_3M_RT_1"] = (df_train['MAX_LMT_3M_RT'] == 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_MAX_LMT_3M_RT_1"] = (df_test['MAX_LMT_3M_RT'] == 1)


In [155]:
df_train = df_train[['id', 'is_MAX_LMT_3M_RT_1']]
df_test = df_test[['id', 'is_MAX_LMT_3M_RT_1']]

In [156]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## NPAY_CNT_24M == 0

In [157]:
FILE_NAME = "bin_NPAY_CNT_24M"

In [158]:
df_train = train_df[['id', 'NPAY_CNT_24M']]
df_test = test_df[['id', 'NPAY_CNT_24M']]

In [159]:
df_train["is_NPAY_CNT_24M_0"] = (df_train['NPAY_CNT_24M'] == 0)
df_test["is_NPAY_CNT_24M_0"] = (df_test['NPAY_CNT_24M'] == 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_NPAY_CNT_24M_0"] = (df_train['NPAY_CNT_24M'] == 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_NPAY_CNT_24M_0"] = (df_test['NPAY_CNT_24M'] == 0)


In [160]:
df_train = df_train[['id', 'is_NPAY_CNT_24M_0']]
df_test = df_test[['id', 'is_NPAY_CNT_24M_0']]

In [161]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

## NPAY_AMT_60M >= 40000

In [162]:
FILE_NAME = "bin_NPAY_AMT_60M"

In [163]:
df_train = train_df[['id', 'NPAY_AMT_60M']]
df_test = test_df[['id', 'NPAY_AMT_60M']]

In [164]:
df_train["is_NPAY_AMT_60M_over_40000"] = (df_train['NPAY_AMT_60M'] > 40000)
df_test["is_NPAY_AMT_60M_over_40000"] = (df_test['NPAY_AMT_60M'] > 40000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["is_NPAY_AMT_60M_over_40000"] = (df_train['NPAY_AMT_60M'] > 40000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["is_NPAY_AMT_60M_over_40000"] = (df_test['NPAY_AMT_60M'] > 40000)


In [165]:
df_train = df_train[['id', 'is_NPAY_AMT_60M_over_40000']]
df_test = df_test[['id', 'is_NPAY_AMT_60M_over_40000']]

In [166]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [None]:
df_test.to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [3]:
! cp data/task150/backup_11140751/* data/task150/

## 고객별 거래금액 AC_PAY_AMT sum

In [17]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 11:55:09.286570


In [18]:
WINDOW_SIZE = [3, 10, 30]

In [19]:
FILE_NAME = "sum_pay_wrt_seq"
COLUMN = "PAYR_SEQ"
TARGET_LIST = ["AC_PAY_AMT"]

In [17]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [20]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        sum_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            sum_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            sum_df = sum_df.groupby([COLUMN]).sum().reset_index()
            sum_df.columns = [COLUMN, f'sum_{target}_wrt_{COLUMN}_{window}']
            sum_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            sum_list.append(sum_df)
            start_date = start_date + datetime.timedelta(days=1)
        sum_df = pd.concat(sum_list, axis=0)
        sum_df.to_parquet(data_path)        

window: 3, target: AC_PAY_AMT
window: 10, target: AC_PAY_AMT
window: 30, target: AC_PAY_AMT


In [None]:
del df
gc.collect()

In [20]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [21]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        
        sum_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(sum_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(sum_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: AC_PAY_AMT
window: 10, target: AC_PAY_AMT
window: 30, target: AC_PAY_AMT


In [22]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [23]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [24]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 12:01:29.325313


## 휴대폰별 거래금액 AC_PAY_AMT sum

In [8]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 12:30:01.533477


In [9]:
WINDOW_SIZE = [3, 10, 30]

In [10]:
FILE_NAME = "sum_pay_wrt_phone"
COLUMN = "MPHN_NO"
TARGET_LIST = ["AC_PAY_AMT"]

In [24]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [25]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        sum_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            sum_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            sum_df = sum_df.groupby([COLUMN]).sum().reset_index()
            sum_df.columns = [COLUMN, f'sum_{target}_wrt_{COLUMN}_{window}']
            sum_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            sum_list.append(sum_df)
            start_date = start_date + datetime.timedelta(days=1)
        sum_df = pd.concat(sum_list, axis=0)
        sum_df.to_parquet(data_path)        

window: 3, target: AC_PAY_AMT
window: 10, target: AC_PAY_AMT
window: 30, target: AC_PAY_AMT


In [26]:
del df
gc.collect()

15

In [11]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        
        sum_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(sum_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: AC_PAY_AMT
window: 10, target: AC_PAY_AMT
window: 30, target: AC_PAY_AMT


In [12]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [13]:
del df_train
gc.collect

<function gc.collect(generation=2)>

In [16]:
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        
        sum_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_test = df_test.merge(sum_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: AC_PAY_AMT
window: 10, target: AC_PAY_AMT
window: 30, target: AC_PAY_AMT


In [17]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [18]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 12:37:38.436752


# 최근 미납 횟수 NPAY_YN

In [7]:
print(f"Starts at {datetime.datetime.now()}")

Starts at 2020-11-14 12:12:45.995999


In [8]:
WINDOW_SIZE = [3, 10, 30]

In [9]:
FILE_NAME = "npay_wrt_seq"
COLUMN = "PAYR_SEQ"
TARGET_LIST = ["NPAY_YN"]

In [10]:
df = pd.concat([train_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST], test_df[['id', COLUMN, 'REQ_DD'] + TARGET_LIST]], 
               ignore_index=True)

In [11]:
df[COLUMN] = np.where((df[COLUMN]==1), 1, 0)

In [13]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        if path.exists(data_path):
            print(f"{data_path} exists!")
            continue
        
        
        temp_df = df[[COLUMN, target, 'REQ_DD']].sort_values(by=['REQ_DD'])
        temp_df['datetime'] = pd.to_datetime(temp_df['REQ_DD'], format='%Y%m%d')
        temp_df = temp_df.reset_index(drop=True)
        
        sum_list = []
        start_date = "2019-07-01"
        start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        while True:
            end_date = start_date + datetime.timedelta(days=window)

            if end_date > datetime.datetime.strptime("2019-12-31", "%Y-%m-%d"):
                break

            sum_df = temp_df[(temp_df['datetime']>=start_date)&(temp_df['datetime']<end_date)][[COLUMN, target]]
            sum_df = sum_df.groupby([COLUMN]).sum().reset_index()
            sum_df.columns = [COLUMN, f'sum_{target}_wrt_{COLUMN}_{window}']
            sum_df['REQ_DD'] = datetime.datetime.strftime(end_date, "%Y%m%d")

            sum_list.append(sum_df)
            start_date = start_date + datetime.timedelta(days=1)
        sum_df = pd.concat(sum_list, axis=0)
        sum_df.to_parquet(data_path)        

window: 3, target: NPAY_YN
window: 10, target: NPAY_YN
window: 30, target: NPAY_YN


In [14]:
del df
gc.collect()

19

In [15]:
df_train = train_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})
df_test = test_df[['id', 'REQ_DD', COLUMN]].astype({'REQ_DD': 'int32'})

In [16]:
for window in WINDOW_SIZE:
    for target in TARGET_LIST:
        print(f"window: {window}, target: {target}")
        data_path = f"{data_dir}sum_{target}_wrt_{COLUMN}_{window}"
        
        sum_df = pd.read_parquet(data_path).astype({'REQ_DD': 'int32'})
        df_train = df_train.merge(sum_df, on=[COLUMN, 'REQ_DD'], how='left')
        df_test = df_test.merge(sum_df, on=[COLUMN, 'REQ_DD'], how='left')

window: 3, target: NPAY_YN
window: 10, target: NPAY_YN
window: 30, target: NPAY_YN


In [17]:
for seed in range(bagging_size):
    with open(f"{data_dir}indices_{seed}.pkl", 'rb') as f:
        indices = joblib.load(f)
    df_train.iloc[indices].drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_{seed}.parquet")

In [18]:
df_test.drop(columns=[COLUMN, 'REQ_DD']).fillna(0).to_parquet(f"{data_dir}{FILE_NAME}_test.parquet")

In [19]:
print(f"Ends at {datetime.datetime.now()}")

Ends at 2020-11-14 12:17:04.043575
