In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)

In [2]:
# load data
# NOTE: hard-coded local directory; adjust it to the machine running the notebook.
path = r'D:\Seminar Documents\신용평가 세미나\pickle데이터'
# os.listdir returns entries in arbitrary order and includes every file type,
# so keep only the pickle files and sort them for a deterministic load order.
filename_list = sorted(name for name in os.listdir(path) if name.endswith('.pkl'))
print(filename_list)

['destruct_17_df.pkl', 'destruct_18_df.pkl', 'destruct_19_df.pkl', 'execute_17_df.pkl', 'execute_18_df.pkl', 'execute_19_df.pkl']


In [3]:
# Load every pickle into a dict keyed by the file name without its extension.
data_dict = dict()
for file in filename_list:
    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(os.path.join(path, file))
    # splitext removes only the trailing extension, whereas str.replace would
    # also strip a '.pkl' substring appearing elsewhere in the name.
    keyname = os.path.splitext(file)[0]
    data_dict[keyname] = df
    print(keyname, df.shape)

destruct_17_df (69196, 1933)
destruct_18_df (57160, 1933)
destruct_19_df (72992, 1933)
execute_17_df (4920, 1933)
execute_18_df (2786, 1933)
execute_19_df (2434, 1933)


In [4]:
# Build internal fields
# Add the execute / destruct indicator columns
for name, frame in data_dict.items():
    # Keys containing 'exec' are flagged as executed; every other key is flagged as destructed.
    executed = int('exec' in name)
    frame['EXECUTE_FG'] = executed
    frame['DESTRUCT_FG'] = 1 - executed

In [5]:
# Add the one-hot year indicator columns
for name, frame in data_dict.items():
    # '17' in the key -> 2017, '18' -> 2018, anything else falls through to 2019.
    if '17' in name:
        year = 2017
    elif '18' in name:
        year = 2018
    else:
        year = 2019
    for candidate in (2017, 2018, 2019):
        frame['Y_%d_FG' % candidate] = int(candidate == year)

In [6]:
# Assign a unique row number
# Stack all frames, order them by application month ('신청월') and number the rows 1..N.
# NOTE(review): rows sharing the same month are ordered by the sort algorithm's
# tie-breaking, so 'no' within a month is not tied to any business key — confirm this is acceptable.
total_df = pd.concat(data_dict.values(), axis=0).sort_values(by=['신청월'], ascending=True)
total_df['no'] = np.arange(1, len(total_df) + 1, dtype='int64')

In [7]:
# Split the numbered frame back into the six (kind, year) groups.
# Each group is an explicit copy because a later cell adds columns to these
# frames in place; without .copy() that assignment targets a slice of total_df
# (the resulting pandas warning is hidden by warnings.filterwarnings('ignore')).
for kind_prefix, kind_flag in (('execute', 'EXECUTE_FG'), ('destruct', 'DESTRUCT_FG')):
    for yy in ('17', '18', '19'):
        group_mask = (total_df[kind_flag] == 1) & (total_df['Y_20%s_FG' % yy] == 1)
        data_dict['%s_%s_df' % (kind_prefix, yy)] = total_df[group_mask].copy()
# Release the combined frame; it is rebuilt after the duplicate flags are added.
del total_df

In [8]:
# Sanity check: report the shape of every group after the re-split.
for name in data_dict:
    print(name, data_dict[name].shape)

destruct_17_df (69196, 1933)
destruct_18_df (57160, 1933)
destruct_19_df (72992, 1933)
execute_17_df (4920, 1933)
execute_18_df (2786, 1933)
execute_19_df (2434, 1933)


In [9]:
# Create the duplicate-flag column
# Inspect the column layout first; the next cell selects the comparison columns by position.
print(data_dict['execute_17_df'].columns)

Index(['no', '신청월', '직군그룹', '근속기간1', '웰컴_소득구간', '심사원장_소득구간', 'AS0000136',
       'AS0000137', 'AS0000138', 'AS0000139',
       ...
       'LA0000001_1_s12', 'LA0000020_1_s12', 'LA0000204_1_s12',
       'LA0000227_1_s12', 'P2O000500_1_s12', 'EXECUTE_FG', 'DESTRUCT_FG',
       'Y_2017_FG', 'Y_2018_FG', 'Y_2019_FG'],
      dtype='object', length=1933)


In [10]:
# Columns compared when looking for duplicates: everything except the first two
# columns and the last five. Per the printed Index these are 'no', '신청월' and
# the five *_FG flags; the slice is positional, so it depends on that column order.
duplicate_chk_cols_list = data_dict['execute_17_df'].columns[2:-5].tolist()

In [11]:
# Flag duplicates inside each group: every row that has a later identical row
# (on the comparison columns) gets 1; the last occurrence keeps 0.
for frame in data_dict.values():
    is_dup = frame.duplicated(subset=duplicate_chk_cols_list, keep='last')
    frame['DUP_FG_1'] = is_dup.map({True: 1, False: 0})

In [12]:
# Report the within-group duplicate count per group and the overall total.
dup_counts = {name: frame['DUP_FG_1'].sum() for name, frame in data_dict.items()}
for name, count in dup_counts.items():
    print(name, 'dup: ', count)
print('dup_sum: ', np.sum(list(dup_counts.values())))

destruct_17_df dup:  1263
destruct_18_df dup:  1833
destruct_19_df dup:  7741
execute_17_df dup:  0
execute_18_df dup:  0
execute_19_df dup:  64
dup_sum:  10901


In [13]:
# Combine all data
# Rows are ordered by the execute/destruct flags first and application month second.
total_df = (
    pd.concat(data_dict.values(), axis=0)
    .sort_values(by=['EXECUTE_FG', 'DESTRUCT_FG', '신청월'], ascending=True)
)

In [14]:
# Preview the first two rows of the combined frame.
total_df.head(2)

Unnamed: 0,no,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,AS0000136,AS0000137,AS0000138,AS0000139,B12000100,B12000200,B14000600,B22000100,B22000200,B22000300,B24000600,B28000001,B29000110,B32000200,B34000600,B41090101,B41100100,B41100200,B41100300,B41100400,B41110200,B41110400,B4E000002,BE0000021,BS0000058,BS0000114,BS0000145,BS0000146,BS0000171,BS0000917,C00000029,C11060700,C11060600,C11060400,C11060500,C11060800,CA0000601,CA0000602,CA0000603,CA0000604,CF0300309,CF0600309,CF1200309,CF0200309,...,GU0024201_1_3,GU0024001_1_3,LU0024002_1_4,LU0024202_1_4,LU0024608_1_4,GU0024201_1_4,GU0024001_1_4,P2O000500_1_s9,P2O000500_1_s3,P2E000500_1_s6,P2E000500_1_s3,P2E000500_1_s12,P27000100_1_s9,P27000100_1_s6,P27000100_1_s3,P27000100_1_s12,P21010500_1_s9,P21010500_1_s6,P21010500_1_s12,LA0000227_1_s6,LA0000227_1_s3,LA0000222_1_s9,LA0000222_1_s6,LA0000222_1_s3,LA0000222_1_s12,LA0000204_1_s9,LA0000203_1_s9,LA0000203_1_s6,LA0000001_1_s3,LA0000020_1_s3,LA0000203_1_s3,P21010500_1_s3,LA0000001_1_s6,LA0000204_1_s6,P2O000500_1_s6,LA0000001_1_s9,LA0000020_1_s9,LA0000227_1_s9,P2E000500_1_s9,LA0000001_1_s12,LA0000020_1_s12,LA0000204_1_s12,LA0000227_1_s12,P2O000500_1_s12,EXECUTE_FG,DESTRUCT_FG,Y_2017_FG,Y_2018_FG,Y_2019_FG,DUP_FG_1
0,1,201612,,5.0,5.0,5.0,23,-1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1382,104,0,0,0,0,...,0,0,0,0,113,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-318.640015,0.0,-339.390015,-321.429993,0.0,-236.259995,-204.740005,-339.390015,-321.429993,0.0,0.0,0.0,0.0,-0.11,-318.640015,0.0,-0.15,-0.15,-204.740005,0.0,-0.12,-0.12,-166.75,-166.75,0.0,0,1,1,0,0,0
24,2,201612,,2.0,2.0,0.0,-1,31,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2814,0,0,0,0,0,3347,2094,3347,742,1856,2825,2253,1502,...,0,0,3,9000,10,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,537.609985,-15.3,-548.669983,-6.32,-17.700001,-375.570007,453.519989,-548.669983,-6.32,0.0,0.0,-17.700001,0.0,0.25,537.609985,0.0,0.16,0.16,453.519989,0.0,0.1,0.1,447.5,447.5,0.0,0,1,1,0,0,0


In [15]:
# Create the duplicate flag across the execute / destruct groups
# Only rows that are not already within-group duplicates are compared; the 'no'
# of every row that has a later identical row is collected.
not_dup_within = total_df.loc[total_df['DUP_FG_1'] == 0]
cross_dup_mask = not_dup_within.duplicated(subset=duplicate_chk_cols_list, keep='last')
duplicate_no_list = not_dup_within.loc[cross_dup_mask, 'no'].tolist()

In [16]:
# 1 for rows whose 'no' was collected as a cross-group duplicate, 0 otherwise.
total_df['DUP_FG_2'] = total_df['no'].isin(duplicate_no_list).astype('int64')

In [17]:
# A row is a final duplicate when either duplicate flag is set.
dup_votes = total_df['DUP_FG_1'] + total_df['DUP_FG_2']
total_df['FINAL_DUP_FG'] = (dup_votes > 0).astype('int64')
print('final_dup: ', total_df['FINAL_DUP_FG'].sum())

final_dup:  10939


In [18]:
# Save (preparation): restore row-number order and replace the index with 0..N-1
total_df = total_df.sort_values(by=['no'], ascending=True).reset_index(drop=True)

In [19]:
# Columns kept in the reduced frame: the first six columns plus every column
# whose name contains 'FG'.
need_cols = list(total_df.columns[:6])
fg_cols = [col for col in total_df.columns if 'FG' in col]
final_need_cols = need_cols + fg_cols
print(final_need_cols)

['no', '신청월', '직군그룹', '근속기간1', '웰컴_소득구간', '심사원장_소득구간', 'EXECUTE_FG', 'DESTRUCT_FG', 'Y_2017_FG', 'Y_2018_FG', 'Y_2019_FG', 'DUP_FG_1', 'DUP_FG_2', 'FINAL_DUP_FG']


In [20]:
# Reduced frame holding only the identifier columns and the flags.
# It is an explicit copy because later cells add many columns to it; assigning
# into a plain slice of total_df triggers pandas' SettingWithCopyWarning
# (hidden by the warnings filter at the top of the notebook).
inner_total_df = total_df[final_need_cols].copy()
# Keep the row-number ordering without mutating in place.
inner_total_df = inner_total_df.sort_values(by=['no'], ascending=True)

In [None]:
# Save the reduced frame as CSV and the full frame (without the flag columns) as pickle.
# NOTE: hard-coded local directory; adjust it to the machine running the notebook.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df.to_csv(os.path.join(save_dir, 'temp1_inner_total_df.csv'), index=False)
# errors='ignore' keeps this cell re-runnable: on a second run the flag columns
# are already gone and a plain drop would raise KeyError.
total_df = total_df.drop(columns=fg_cols, errors='ignore')
total_df.to_pickle(os.path.join(save_dir, 'total_df.pkl'))

In [22]:
# Build internal fields
# Create the grade-0 (CB, SP) column
def grade_zero(cb, sp):
    """Return 1 when either grade equals 0, otherwise 0.

    Args:
        cb: first grade value (CB).
        sp: second grade value (SP).

    Returns:
        int: 1 if cb == 0 or sp == 0, else 0 (NaN never equals 0, so it yields 0).
    """
    return int(cb == 0 or sp == 0)

In [23]:
# Apply grade_zero row by row; each row carries the two grade columns in input_cols order.
input_cols = ['Grd_RK0400_000', 'Grd_RK0400_700']
total_df['GRADE_ZERO_FG'] = total_df[input_cols].apply(lambda row: grade_zero(*row), axis=1)
print('total_grade_0: ', total_df['GRADE_ZERO_FG'].sum())

total_grade_0:  36


In [24]:
# Copy the flag into the reduced frame and re-check the count there.
inner_total_df['GRADE_ZERO_FG'] = total_df['GRADE_ZERO_FG']
print('grade_0: ', inner_total_df['GRADE_ZERO_FG'].sum())

grade_0:  36


In [25]:
# Create the prior-default (pre-bad) columns
def pre_bad(col):
    """Return 1 when the value is strictly positive, otherwise 0.

    Args:
        col: a single numeric cell value.

    Returns:
        int: 1 if col > 0, else 0 (zero, negatives and NaN yield 0).
    """
    return int(col > 0)

In [26]:
# Source columns and the flag column each of them produces (paired by position).
need_cols = ['BS0000145', 'BS0000172', 'BS0000169', 'BS0000171', 'B12000200', 'B22000200', 'PS0000296_1', 'P27000100']
result_cols = ['PRE_BAD_FG_1', 'PRE_BAD_FG_2', 'PRE_BAD_FG_3', 'PRE_BAD_FG_4', 'PRE_BAD_FG_5', 'PRE_BAD_FG_6','PRE_BAD_FG_7', 'PRE_BAD_FG_8']

for src_col, flag_col in zip(need_cols, result_cols):
    total_df[flag_col] = total_df[src_col].apply(pre_bad)
print(total_df[result_cols].sum())
# Mirror the eight flags into the reduced frame.
inner_total_df[result_cols] = total_df[result_cols]

PRE_BAD_FG_1     3192
PRE_BAD_FG_2      752
PRE_BAD_FG_3     1521
PRE_BAD_FG_4      922
PRE_BAD_FG_5    15426
PRE_BAD_FG_6     4042
PRE_BAD_FG_7     4596
PRE_BAD_FG_8    19875
dtype: int64


In [27]:
# Overall prior-default flag: 1 when at least one of the eight flags is set.
need_cols = ['PRE_BAD_FG_1', 'PRE_BAD_FG_2', 'PRE_BAD_FG_3', 'PRE_BAD_FG_4', 'PRE_BAD_FG_5', 'PRE_BAD_FG_6','PRE_BAD_FG_7', 'PRE_BAD_FG_8']

any_pre_bad = total_df[need_cols].sum(axis=1) > 0
total_df['TOTAL_PRE_BAD_FG'] = any_pre_bad.astype('int64')
print('total_pre_bad: ', total_df['TOTAL_PRE_BAD_FG'].sum())

total_pre_bad:  32681


In [28]:
# Mirror the overall prior-default flag into the reduced frame.
inner_total_df['TOTAL_PRE_BAD_FG'] = total_df['TOTAL_PRE_BAD_FG']

In [None]:
# Save
# Checkpoint of the reduced frame after the grade-0 and prior-default flags were added.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df.to_csv(os.path.join(save_dir, 'temp2_inner_total_df.csv'), index=False)

In [30]:
# Create the mortgage-loan columns
# Each pair is (source column, flag column derived from it).
house_flag_map = [
    ('LA0000126', 'TOTAL_HOUSE_FG_1'),
    ('L23001001', 'TOTAL_HOUSE_FG_2'),
    ('A5WC0000008500', 'TOTAL_HOUSE_FG_3'),
    ('A5WC0000008400', 'TOTAL_HOUSE_FG_4'),
    ('LA0001016', 'BANK_HOUSE_FG'),
    ('LA0005008', 'INS_HOUSE_FG'),
    ('LA0006011', 'CARD_HOUSE_FG'),
    ('LA0007201', 'LEASE_HOUSE_FG'),
    ('LA0008008', 'SB_HOUSE_FG'),
    ('LA0012005', 'COOP_HOUSE_FG'),
    ('LA0012208', 'CREDUNION_HOUSE_FG'),
    ('LA0014008', 'CAP_HOUSE_FG'),
    ('LA0029203', 'MG_HOUSE_FG'),
    ('LA0099252', 'ETC_HOUSE_FG'),
    ('LU0021006_1', 'ACC_SB_HOUSE_FG'),
    ('LU0024013_1', 'ACC_ML_HOUSE_FG'),
    ('LU0025004_1', 'ACC_P2P_HOUSE_FG'),
]
house_cols = [source for source, _ in house_flag_map]
house_cols_name = [flag for _, flag in house_flag_map]

In [31]:
# Each flag is 1 when its source value is strictly positive, 0 otherwise.
for src_col, flag_col in zip(house_cols, house_cols_name):
    total_df[flag_col] = total_df[src_col].apply(lambda value: int(value > 0))

# Check the two flags that are recomputed with a different rule in a later cell.
print(total_df[['TOTAL_HOUSE_FG_3', 'TOTAL_HOUSE_FG_4']].sum())

TOTAL_HOUSE_FG_3    209488
TOTAL_HOUSE_FG_4    209488
dtype: int64


In [32]:
def welc_house_fg(welc_house_col):
    """Return 1 only when the value equals exactly 1, otherwise 0.

    Args:
        welc_house_col: a single cell value.

    Returns:
        int: 1 if welc_house_col == 1, else 0.
    """
    return int(welc_house_col == 1)

In [33]:
# Recompute flags 3 and 4 with the "equals 1" rule, replacing the "> 0" result
# (which flagged every row, per the sums printed above).
need_cols = ['A5WC0000008500', 'A5WC0000008400']
result_cols = ['TOTAL_HOUSE_FG_3', 'TOTAL_HOUSE_FG_4']
for src_col, flag_col in zip(need_cols, result_cols):
    total_df[flag_col] = total_df[src_col].apply(welc_house_fg)

In [34]:
# Mirror all mortgage-loan flags into the reduced frame.
inner_total_df[house_cols_name] = total_df[house_cols_name]

In [35]:
# Overall mortgage flag: 1 when at least one of the four TOTAL_HOUSE_FG_* flags is set.
need_cols = ['TOTAL_HOUSE_FG_1', 'TOTAL_HOUSE_FG_2', 'TOTAL_HOUSE_FG_3', 'TOTAL_HOUSE_FG_4']
any_house = total_df[need_cols].sum(axis=1) > 0
total_df['TOTAL_HOUSE_FG'] = any_house.astype('int64')
print('total_house: ', total_df['TOTAL_HOUSE_FG'].sum())

total_house:  16557


In [36]:
# Mirror the overall mortgage flag into the reduced frame.
inner_total_df['TOTAL_HOUSE_FG'] = total_df['TOTAL_HOUSE_FG']

In [None]:
# Save
# Checkpoint of the reduced frame after the mortgage-loan flags were added.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df.to_csv(os.path.join(save_dir, 'temp3_inner_total_df.csv'), index=False)

In [38]:
# 내부항목 생성
# 연소득 column 생성
def _get_year_income(A5WC0000008000, A5WC0000000200, IE0300007):
    year_income = 0
    if A5WC0000008000 > 0:
        year_income = A5WC0000008000
    elif A5WC0000000200 > 0:
        year_income = A5WC0000000200
    elif IE0300007 > 0:
        year_income = IE0300007 * 10
    else:
        year_income = 0
    return year_income

In [39]:
# Derive INCOME row by row; each row holds the three sources in need_cols order,
# which matches the parameter order of _get_year_income.
need_cols = ['A5WC0000008000', 'A5WC0000000200', 'IE0300007']
total_df['INCOME'] = total_df[need_cols].apply(lambda row: _get_year_income(*row), axis=1)

In [40]:
# Mirror the annual income into the reduced frame.
inner_total_df['INCOME'] = total_df['INCOME']

In [41]:
# Create the annual-income bracket column
# Brackets are left-closed / right-open: [0, 12000), [12000, 16000), ..., [60000, inf).
income_edge_list = [
    0, 12000, 16000, 20000, 24000, 27000, 30000,
    33000, 35000, 40000, 45000, 50000, 55000, 60000,
    np.inf,
]
income_interval = pd.cut(x=total_df['INCOME'], bins=income_edge_list, right=False)

In [42]:
# Store the bracket as a 1-based code (category codes start at 0, hence the +1).
income_interval_categories = income_interval.cat.categories.values
income_interval_codes = income_interval.cat.codes.values + 1
total_df['INCOME_INTERVAL'] = income_interval_codes

In [43]:
# Mirror the income bracket code into the reduced frame.
inner_total_df['INCOME_INTERVAL'] = total_df['INCOME_INTERVAL']

In [44]:
# UDIR column 생성
def _get_udir_interval(A5RCLSRL013400):
    udir_interval = 0

    if 0 <= A5RCLSRL013400 < 50:
        udir_interval = 1
    elif 50 <= A5RCLSRL013400 < 100:
        udir_interval = 2
    elif 100 <= A5RCLSRL013400 < 150:
        udir_interval = 3
    elif 150 <= A5RCLSRL013400 < 200:
        udir_interval = 4
    elif 200 <= A5RCLSRL013400 < 250:
        udir_interval = 5
    elif 250 <= A5RCLSRL013400 <= 300:
        udir_interval = 6
    elif 300 < A5RCLSRL013400:
        udir_interval = 7

    return udir_interval

In [45]:
# Keep the raw UDIR value in the reduced frame and store its bracket code in the full frame.
inner_total_df['UDIR'] = total_df['A5RCLSRL013400']
total_df['UDIR_INTERVAL'] = total_df['A5RCLSRL013400'].apply(_get_udir_interval)

In [46]:
# Mirror the UDIR bracket code into the reduced frame.
inner_total_df['UDIR_INTERVAL'] = total_df['UDIR_INTERVAL']

In [47]:
# SEG column 생성
def _get_seg(year_income_interval, udir_interval):
    seg = 0
    if (year_income_interval in [1, 2, 3, 4]) and (udir_interval in [1, 2, 3, 4]):
        seg = 1
    elif (year_income_interval in [5, 6, 7, 8]) and (udir_interval in [1, 2, 3, 4, 5]):
        seg = 2
    elif (year_income_interval in [9, 10, 11, 12, 13, 14]) and (udir_interval in [1, 2, 3, 4, 5, 6]):
        seg = 3

    return seg

In [48]:
# Derive SEG row by row; need_cols order matches the parameter order of _get_seg.
need_cols = ['INCOME_INTERVAL', 'UDIR_INTERVAL']
total_df['SEG'] = total_df[need_cols].apply(lambda row: _get_seg(*row), axis=1)

In [49]:
# Mirror the segment into the reduced frame.
inner_total_df['SEG'] = total_df['SEG']

In [50]:
def _get_final_seg(seg):
    final_seg = 0

    if seg == 1 or seg == 2:
        final_seg = 1
    elif seg == 3:
        final_seg = 2
    return final_seg

In [51]:
# Collapse SEG into the two-level final segment.
total_df['FINAL_SEG'] = total_df['SEG'].apply(_get_final_seg)

In [52]:
# Mirror the final segment into the reduced frame.
inner_total_df['FINAL_SEG'] = total_df['FINAL_SEG']

In [None]:
# Save
# Checkpoint of the reduced frame after the income, UDIR and segment columns were added.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df.to_csv(os.path.join(save_dir, 'temp4_inner_total_df.csv'), index=False)

In [54]:
# Create the minimum-cost-of-living (CAP_34) column
def cap34(job, income):
    """Return 1 when the job group is not 'D' and income is at most 12288.

    Args:
        job: job-group code.
        income: annual income value.

    Returns:
        int: 1 if job != 'D' and income <= 12288, else 0.
    """
    return int(job != 'D' and income <= 12288)

In [55]:
# Derive CAP_34 row by row; input_cols order matches the parameter order of cap34.
input_cols = ['직군그룹', 'INCOME']
inner_total_df['CAP_34'] = inner_total_df[input_cols].apply(lambda row: cap34(*row), axis=1)
print('cap34: ', inner_total_df['CAP_34'].sum())

cap34:  14900


In [56]:
# Create the deceased-person column
# THE_DEAD is copied as-is from LIVESTAT2.
# NOTE(review): presumably 1 marks a deceased applicant — confirm the encoding.
inner_total_df['THE_DEAD'] = total_df['LIVESTAT2']
print('dead: ', inner_total_df['THE_DEAD'].sum())

dead:  1


In [57]:
# Create the development-period column
def not_dev_date(ym):
    """Return 1 when the YYYYMM value is outside the development period.

    Excluded months are 201811 through 201907 (inclusive) and 201911 onwards.

    Returns:
        int: 1 for an excluded month, 0 otherwise.
    """
    return int(201811 <= ym <= 201907 or ym >= 201911)

In [58]:
# Flag application months that fall outside the development period.
# The printed figure is the number of rows flagged as NOT in the period.
inner_total_df['NOT_DEV_DATE_FG'] = inner_total_df['신청월'].apply(not_dev_date)
print('dev_date: ', inner_total_df['NOT_DEV_DATE_FG'].sum())

dev_date:  58332


In [59]:
# Create the development-target column
def final_dev_fg(dup, grade_zero, pre_bad, house, seg, cap, dead, date):
    """Return 1 when a row qualifies as a development target.

    A row qualifies when every exclusion flag (dup, grade_zero, pre_bad, house,
    cap, dead, date) equals 0 and seg is non-zero.

    Returns:
        int: 1 if the row qualifies, else 0.
    """
    exclusion_flags = (dup, grade_zero, pre_bad, house, cap, dead, date)
    return int(seg != 0 and all(flag == 0 for flag in exclusion_flags))

In [60]:
# input_cols is ordered exactly like the parameters of final_dev_fg
# (dup, grade_zero, pre_bad, house, seg, cap, dead, date), so each row can be unpacked.
input_cols = ['FINAL_DUP_FG', 'GRADE_ZERO_FG', 'TOTAL_PRE_BAD_FG', 'TOTAL_HOUSE_FG', 'FINAL_SEG', 'CAP_34', 'THE_DEAD', 'NOT_DEV_DATE_FG']

inner_total_df['DEV_TARGET_FG'] = inner_total_df[input_cols].apply(
    lambda row: final_dev_fg(*row), axis=1)

print('dev_target: ', inner_total_df['DEV_TARGET_FG'].sum())

dev_target:  98669


In [None]:
# Save
# Checkpoint of the reduced frame after the development-target flag was added.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df.to_csv(os.path.join(save_dir, 'temp5_inner_total_df.csv'), index=False)

In [62]:
# 부도 정의
def _get_bad_common(TPERF0003, TPERF0005):
    bad_common = 0
    if TPERF0003 > 0 or TPERF0005 > 0:
        bad_common = 1
    else:
        bad_common = 0
    return bad_common

In [64]:
# Derive the 'BAD_공공' flag row by row from the two TPERF columns.
bad_need_cols = ['TPERF0003', 'TPERF0005']
total_df['BAD_공공'] = total_df[bad_need_cols].apply(lambda row: _get_bad_common(*row), axis=1)
print('BAD_공공: ', total_df['BAD_공공'].sum())

BAD_공공:  39220


In [65]:
# Mirror the flag into the reduced frame.
inner_total_df['BAD_공공'] = total_df['BAD_공공']

In [66]:
def _get_bad_finan(TPERF0013, DQ_BANK_12, DQ_LEASE_12, DQ_CARD_12, DQ_CAPITAL_12, DQ_SB_12):
    bad_finan = 0
    if (TPERF0013 >= 60 or DQ_BANK_12 >= 60 or DQ_LEASE_12 >= 60 or DQ_CARD_12 >= 30 or DQ_CAPITAL_12 >= 30 or DQ_SB_12 >= 30):
        bad_finan = 1
    else:
        bad_finan = 0
    return bad_finan

In [67]:
# bad_need_cols is ordered like the parameters of _get_bad_finan, so each row can be unpacked.
bad_need_cols = ['TPERF0013', 'DQ_BANK_12', 'DQ_LEASE_12', 'DQ_CARD_12','DQ_CAPITAL_12', 'DQ_SB_12']

total_df['BAD_금융_12'] = total_df[bad_need_cols].apply(lambda row: _get_bad_finan(*row), axis=1)

print('BAD_금융_12: ', total_df['BAD_금융_12'].sum())

BAD_금융_12:  58075


In [68]:
# Mirror the flag into the reduced frame.
inner_total_df['BAD_금융_12'] = total_df['BAD_금융_12']

In [69]:
def _get_bad_busi(TPERF0013, DQ_BANK_12, DQ_LEASE_12, DQ_CARD_12, DQ_CAPITAL_12, DQ_SB_12):
    bad_short = 0
    bad_bank = 0
    bad_lease = 0
    bad_card = 0
    bad_capital = 0
    bad_sb = 0
    if TPERF0013 >= 60:
        bad_short = 1
    if DQ_BANK_12 >= 60:
        bad_bank = 1
    if DQ_LEASE_12 >= 60:
        bad_card = 1
    if DQ_CARD_12 >= 30:
        bad_lease = 1
    if DQ_CAPITAL_12 >= 30:
        bad_capital = 1
    if DQ_SB_12 >= 30:
        bad_sb = 1
    return bad_short, bad_bank, bad_lease, bad_card, bad_capital, bad_sb

In [70]:
# The six flags returned by _get_bad_busi are expanded into six columns and
# assigned to result_cols by position.
bad_need_cols = ['TPERF0013', 'DQ_BANK_12', 'DQ_LEASE_12', 'DQ_CARD_12', 'DQ_CAPITAL_12', 'DQ_SB_12']
result_cols = ['BAD_금융_단기_12','BAD_금융_은행_12','BAD_금융_리스_12', 'BAD_금융_카드_12','BAD_금융_캐피탈_12','BAD_금융_저축은행_12']

total_df[result_cols] = total_df[bad_need_cols].apply(
    lambda row: _get_bad_busi(*row), axis=1, result_type='expand')

for flag_col in result_cols:
    print(flag_col, ': ', total_df[flag_col].sum())

BAD_금융_단기_12 :  48555
BAD_금융_은행_12 :  14183
BAD_금융_리스_12 :  38436
BAD_금융_카드_12 :  903
BAD_금융_캐피탈_12 :  16868
BAD_금융_저축은행_12 :  26423


In [71]:
# Mirror the per-sector flags (the current result_cols) into the reduced frame.
inner_total_df[result_cols] = total_df[result_cols]

In [72]:
def _get_bad_ml(DQ_DB_12):
    bad_ml = 0
    if DQ_DB_12 >= 30:
        bad_ml = 1
    return bad_ml

In [73]:
# Flag rows whose DQ_DB_12 value reaches the threshold used by _get_bad_ml.
total_df['BAD_대부_12'] = total_df['DQ_DB_12'].apply(_get_bad_ml)
print('BAD_대부_12: ', total_df['BAD_대부_12'].sum())

BAD_대부_12:  26528


In [74]:
# Mirror the flag into the reduced frame.
inner_total_df['BAD_대부_12'] = total_df['BAD_대부_12']

In [75]:
def _get_bad_finan2(DQ_CARD_6, DQ_CAPITAL_6, DQ_SB_6):
    bad_finan = 0
    if (DQ_CARD_6 > 10 or DQ_CAPITAL_6 > 10 or DQ_SB_6 > 10):
        bad_finan = 1
    else:
        bad_finan = 0
    return bad_finan

In [76]:
# bad_need_cols is ordered like the parameters of _get_bad_finan2, so each row can be unpacked.
bad_need_cols = ['DQ_CARD_6', 'DQ_CAPITAL_6', 'DQ_SB_6']

total_df['BAD_금융_6'] = total_df[bad_need_cols].apply(lambda row: _get_bad_finan2(*row), axis=1)

print('BAD_금융_6: ', total_df['BAD_금융_6'].sum())

BAD_금융_6:  57947


In [77]:
# Mirror the flag into the reduced frame.
inner_total_df['BAD_금융_6'] = total_df['BAD_금융_6']

In [78]:
def _get_bad_busi2(DQ_CARD_6, DQ_CAPITAL_6, DQ_SB_6):
    bad_card = 0
    bad_capital = 0
    bad_sb = 0
    if DQ_CARD_6 > 10:
        bad_card = 1
    if DQ_CAPITAL_6 > 10:
        bad_capital = 1
    if DQ_SB_6 > 10:
        bad_sb = 1
    return bad_card, bad_capital, bad_sb

In [79]:
# The three flags returned by _get_bad_busi2 are expanded into three columns and
# assigned to result_cols by position.
bad_need_cols = ['DQ_CARD_6', 'DQ_CAPITAL_6', 'DQ_SB_6']
result_cols = ['BAD_금융_카드_6', 'BAD_금융_캐피탈_6', 'BAD_금융_저축은행_6']

total_df[result_cols] = total_df[bad_need_cols].apply(
    lambda row: _get_bad_busi2(*row), axis=1, result_type='expand')

for flag_col in result_cols:
    print(flag_col, ': ', total_df[flag_col].sum())

BAD_금융_카드_6 :  42651
BAD_금융_캐피탈_6 :  17089
BAD_금융_저축은행_6 :  26871


In [80]:
# Mirror the per-sector flags (the current result_cols) into the reduced frame.
inner_total_df[result_cols] = total_df[result_cols]

In [81]:
def _get_bad_ml2(DQ_DB_6):
    bad_ml = 0
    if DQ_DB_6 > 10:
        bad_ml = 1
    return bad_ml

In [82]:
# Flag rows whose DQ_DB_6 value exceeds the threshold used by _get_bad_ml2.
total_df['BAD_대부_6'] = total_df['DQ_DB_6'].apply(_get_bad_ml2)
print('BAD_대부_6: ', total_df['BAD_대부_6'].sum())

BAD_대부_6:  22811


In [83]:
# Mirror the flag into the reduced frame.
inner_total_df['BAD_대부_6'] = total_df['BAD_대부_6']

In [84]:
def _get_bad(bad_common, bad_finan_12, bad_ml_12, bad_finan_6, bad_ml_6):
    bad = 0
    if (bad_common > 0 or bad_finan_12 > 0 or bad_ml_12 > 0 or bad_finan_6 > 0 or bad_ml_6 > 0):
        bad = 1
    return bad

In [85]:
# bad_need_cols is ordered like the parameters of _get_bad, so each row can be unpacked.
bad_need_cols = ['BAD_공공', 'BAD_금융_12', 'BAD_대부_12', 'BAD_금융_6','BAD_대부_6']
total_df['BAD'] = total_df[bad_need_cols].apply(lambda row: _get_bad(*row), axis=1)
print('bad: ', total_df['BAD'].sum())

bad:  79933


In [86]:
# Mirror the overall bad flag into the reduced frame.
inner_total_df['BAD'] = total_df['BAD']

In [None]:
# Save
# Final version of the reduced frame, including all bad-definition flags.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
inner_total_df.to_csv(os.path.join(save_dir, 'final_inner_total_df.csv'), index=False)

In [87]:
# Sanity check: both frames should have the same number of rows.
print('inner_total: ', inner_total_df.shape)
print('total: ', total_df.shape)

inner_total:  (209488, 67)
total:  (209488, 1984)


In [88]:
# Split development / validation data
# Keep only rows that are neither final duplicates nor grade-0.
not_duplicate = inner_total_df['FINAL_DUP_FG'] == 0
not_grade_zero = inner_total_df['GRADE_ZERO_FG'] == 0
dev_inner_df = inner_total_df[not_duplicate & not_grade_zero]
print('dev_inner: ', dev_inner_df.shape)

dev_inner:  (198513, 67)


In [89]:
# Select the same rows from the full frame by matching on the row number 'no'.
dev_no_list = dev_inner_df['no'].tolist()
is_dev_row = total_df['no'].isin(dev_no_list)
dev_cb_df = total_df[is_dev_row]
print('dev_cb: ', dev_cb_df.shape)

dev_cb:  (198513, 1984)


In [90]:
# Development set: application months before 201811.
# Validation set: application months 201908 through 201910.
valid_months = [201908, 201909, 201910]
inner_month = dev_inner_df['신청월']
model_dev_inner_df = dev_inner_df[inner_month < 201811]
model_valid_inner_df = dev_inner_df[inner_month.isin(valid_months)]

In [91]:
# Same month-based split applied to the full frame:
# development = months before 201811, validation = months 201908 through 201910.
cb_month = dev_cb_df['신청월']
model_dev_cb_df = dev_cb_df[cb_month < 201811]
model_valid_cb_df = dev_cb_df[cb_month.isin([201908, 201909, 201910])]

In [None]:
# Save
# save_dir is defined here, as in every other save cell, so this cell does not
# depend on an earlier save cell having been executed in the same session.
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
model_outputs = {
    'model_dev_inner_df.csv': model_dev_inner_df,
    'model_valid_inner_df.csv': model_valid_inner_df,
    'model_dev_cb_df.csv': model_dev_cb_df,
    'model_valid_cb_df.csv': model_valid_cb_df,
}
for csv_name, frame in model_outputs.items():
    frame.to_csv(os.path.join(save_dir, csv_name), index=False)