In [1]:
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

In [2]:
# load data
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df = pd.read_csv(os.path.join(save_dir, 'final_inner_total_df.csv'))
total_df = pd.read_pickle(os.path.join(save_dir, 'total_df.pkl'))

In [3]:
print('inner_total: ', inner_total_df.shape)
print('total: ', total_df.shape)

inner_total:  (209488, 67)
total:  (209488, 1928)


In [4]:
# 대상자 선정
target_df = inner_total_df[inner_total_df['DEV_TARGET_FG'] == 1]
print('target: ', target_df.shape)

target:  (98669, 67)


In [5]:
# 모형 검증 data 분리
print(target_df['신청월'].unique())
seg1_df = target_df[target_df['FINAL_SEG'] == 1]
print('seg1: ', seg1_df.shape)

[201612 201701 201702 201703 201704 201705 201706 201707 201708 201709
 201710 201711 201712 201801 201802 201803 201804 201805 201806 201807
 201808 201809 201810 201908 201909 201910]
seg1:  (74012, 67)


In [6]:
seg1_model_valid_df = seg1_df[seg1_df['신청월'].isin([201908, 201909, 201910])]
seg1_dev_df = seg1_df[~(seg1_df['신청월'].isin([201908, 201909, 201910]))]

In [7]:
seg1_dev_df = seg1_dev_df[['no', 'BAD']].merge(total_df, on=['no'], how='left')
seg1_model_valid_df = seg1_model_valid_df[['no', 'BAD']].merge(total_df, on=['no'], how='left')

In [8]:
print('dev: ', seg1_dev_df.shape)
print('model_valid: ', seg1_model_valid_df.shape)

dev:  (62857, 1929)
model_valid:  (11155, 1929)


In [9]:
# 층화추출
need_cols = ['BAD', 'Grd_NK0200_000', 'Grd_RK0400_000', 'Grd_RK0400_700']
seg1_grp = seg1_dev_df.groupby(by=need_cols, as_index=False).agg({'no': ('count', 'unique')})
seg1_grp.columns = need_cols + ['cnt', 'no']

print(seg1_grp)

     BAD  Grd_NK0200_000  Grd_RK0400_000  Grd_RK0400_700  cnt  \
0      0               2               1               1    3   
1      0               2               1               2    9   
2      0               2               1               3    1   
3      0               2               2               1    7   
4      0               2               2               2   68   
..   ...             ...             ...             ...  ...   
421    1               7               8               7   18   
422    1               7               8               8    5   
423    1               7               8               9    1   
424    1               7               9               7    1   
425    1               7               9               8    1   

                                                    no  
0                                [27209, 31866, 79298]  
1    [14560, 21041, 24233, 30474, 43774, 50749, 779...  
2                                               

In [10]:
def _sampling(no, sample_pct):
    N = len(no)
    vv = np.random.choice(no, replace=False, size=int(N * sample_pct))
    return vv


def _drop(total, selected):
    rest = np.setdiff1d(total, selected)
    return rest

In [11]:
np.random.seed(111)
seg1_grp['tr_no'] = seg1_grp['no'].apply(lambda x: _sampling(x, sample_pct=0.6))
seg1_grp['val_tst_no'] = seg1_grp[['no', 'tr_no']].apply(lambda x: _drop(x['no'], x['tr_no']), axis=1)
np.random.seed(111)
seg1_grp['val_no'] = seg1_grp['val_tst_no'].apply(lambda x: _sampling(x, sample_pct=0.5))
seg1_grp['tst_no'] = seg1_grp[['val_tst_no', 'val_no']].apply(lambda x: _drop(x['val_tst_no'], x['val_no']), axis=1)

print(seg1_grp)

     BAD  Grd_NK0200_000  Grd_RK0400_000  Grd_RK0400_700  cnt  \
0      0               2               1               1    3   
1      0               2               1               2    9   
2      0               2               1               3    1   
3      0               2               2               1    7   
4      0               2               2               2   68   
..   ...             ...             ...             ...  ...   
421    1               7               8               7   18   
422    1               7               8               8    5   
423    1               7               8               9    1   
424    1               7               9               7    1   
425    1               7               9               8    1   

                                                    no  \
0                                [27209, 31866, 79298]   
1    [14560, 21041, 24233, 30474, 43774, 50749, 779...   
2                                            

In [12]:
def unique_one_to_train(tr_no, val_no, tst_no):
    if len(tr_no) == 0:
        a = tr_no.copy()
        b = tst_no.copy()
        tr_no = b
        tst_no = a

    return tr_no, val_no, tst_no

In [13]:
seg1_grp.head()

Unnamed: 0,BAD,Grd_NK0200_000,Grd_RK0400_000,Grd_RK0400_700,cnt,no,tr_no,val_tst_no,val_no,tst_no
0,0,2,1,1,3,"[27209, 31866, 79298]",[31866],"[27209, 79298]",[79298],[27209]
1,0,2,1,2,9,"[14560, 21041, 24233, 30474, 43774, 50749, 779...","[85333, 14560, 50749, 24233, 21041]","[30474, 43774, 77982, 116931]","[43774, 77982]","[30474, 116931]"
2,0,2,1,3,1,[1685],[],[1685],[],[1685]
3,0,2,2,1,7,"[19983, 25093, 39881, 57762, 64301, 72225, 93683]","[57762, 93683, 72225, 39881]","[19983, 25093, 64301]",[19983],"[25093, 64301]"
4,0,2,2,2,68,"[1897, 3689, 5010, 5809, 10017, 11090, 11530, ...","[43343, 122891, 73232, 28716, 117679, 31756, 4...","[10017, 11090, 11530, 16650, 16770, 18854, 203...","[16770, 97201, 10017, 76125, 16650, 47742, 123...","[11090, 11530, 18854, 22079, 30425, 35839, 362..."


In [14]:
tr_no_arr = np.concatenate(seg1_grp['tr_no'])
val_no_arr = np.concatenate(seg1_grp['val_no'])
tst_no_arr = np.concatenate(seg1_grp['tst_no'])

tr = seg1_dev_df[seg1_dev_df['no'].isin(tr_no_arr)]
val = seg1_dev_df[seg1_dev_df['no'].isin(val_no_arr)]
tst = seg1_dev_df[seg1_dev_df['no'].isin(tst_no_arr)]

In [15]:
print('train: ', tr.shape)
print('valid: ', val.shape)
print('test: ', tst.shape)

print(len(set(tr['no']).intersection(val['no'])))
print(len(set(tr['no']).intersection(tst['no'])))
print(len(set(val['no']).intersection(tst['no'])))

train:  (37539, 1929)
valid:  (12566, 1929)
test:  (12752, 1929)
0
0
0


In [16]:
tr.head(2)

Unnamed: 0,no,BAD,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,AS0000136,AS0000137,AS0000138,...,P2O000500_1_s6,LA0000001_1_s9,LA0000020_1_s9,LA0000227_1_s9,P2E000500_1_s9,LA0000001_1_s12,LA0000020_1_s12,LA0000204_1_s12,LA0000227_1_s12,P2O000500_1_s12
1,2,0,201612,,2.0,2.0,0.0,-1,31,1,...,0.0,0.16,0.16,453.519989,0.0,0.1,0.1,447.5,447.5,0.0
2,3,0,201612,,5.0,3.0,0.0,-1,47,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
print(tr['BAD'].value_counts(normalize=True))
print(val['BAD'].value_counts(normalize=True))
print(tst['BAD'].value_counts(normalize=True))

0    0.597592
1    0.402408
Name: BAD, dtype: float64
0    0.597087
1    0.402913
Name: BAD, dtype: float64
0    0.595514
1    0.404486
Name: BAD, dtype: float64


In [None]:
# 저장
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
seg1_model_valid_df.to_csv(os.path.join(save_dir, 'model_valid_df.csv'), index=False, encoding='utf-8')
tr.to_csv(os.path.join(save_dir, 'seg1_train_df.csv'), index=False, encoding='utf-8')
val.to_csv(os.path.join(save_dir, 'seg1_valid_df.csv'), index=False, encoding='utf-8')
tst.to_csv(os.path.join(save_dir, 'seg1_test_df.csv'), index=False, encoding='utf-8')