In [5]:
import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# <font color='orange'>01. load data</font>

In [6]:
# Directory that holds the input CSV files read by the cells below.
# NOTE(review): this is a machine-specific absolute Windows path — adjust it to your local copy of the data.
data_rootpath = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\dev_data'

In [12]:
# List the files available in the data directory.
os.listdir(data_rootpath)

['model_dev_cb_df.csv',
 'model_dev_inner_df.csv',
 'model_valid_cb_df.csv',
 'model_valid_inner_df.csv']

In [13]:
# Load the development and validation "inner" CSV files from the data directory.
dev_inner_path = os.path.join(data_rootpath, 'model_dev_inner_df.csv')
valid_inner_path = os.path.join(data_rootpath, 'model_valid_inner_df.csv')

dev_inner_df = pd.read_csv(dev_inner_path)
valid_inner_df = pd.read_csv(valid_inner_path)

In [14]:
# Keep both frames in one dictionary keyed by dataset name,
# so later cells can loop over the datasets uniformly.
df_dict = {
    'dev_inner': dev_inner_df,
    'valid_inner': valid_inner_df,
}

In [15]:
# Print the (rows, columns) shape of every loaded dataset.
for key, df in df_dict.items():
    print(f"{key} {df.shape}")

dev_inner (240016, 67)
valid_inner (40942, 67)


In [17]:
# Preview the first two rows of the development dataset.
df_dict['dev_inner'].head(2)

Unnamed: 0,no,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,EXECUTE_FG,DESTRUCT_FG,Y_2017_FG,Y_2018_FG,...,BAD_금융_카드_12,BAD_금융_캐피탈_12,BAD_금융_저축은행_12,BAD_대부_12,BAD_금융_6,BAD_금융_카드_6,BAD_금융_캐피탈_6,BAD_금융_저축은행_6,BAD_대부_6,BAD
0,1,201612,,5.0,5.0,0.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,201612,D,5.0,2.0,0.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


# <font color='orange'>02. 요건별 BAD 현황파악</font>

In [62]:
# Bad rate of the model development data:
# relative frequency of each value in the "BAD" column.
df_dict['dev_inner']["BAD"].value_counts(normalize = True)

0    0.553109
1    0.446891
Name: BAD, dtype: float64

In [36]:
# Collect the bad-flag columns of the development dataset.
# The regex '^BAD' anchors the match at the start of the column name.
dev_columns = df_dict['dev_inner'].columns
is_bad_col = dev_columns.str.contains('^BAD')
bad_cols = dev_columns[is_bad_col]

In [37]:
# Show the selected bad-flag columns.
bad_cols

Index(['BAD_공공', 'BAD_금융_12', 'BAD_금융_단기_12', 'BAD_금융_은행_12', 'BAD_금융_리스_12',
       'BAD_금융_카드_12', 'BAD_금융_캐피탈_12', 'BAD_금융_저축은행_12', 'BAD_대부_12',
       'BAD_금융_6', 'BAD_금융_카드_6', 'BAD_금융_캐피탈_6', 'BAD_금융_저축은행_6', 'BAD_대부_6',
       'BAD'],
      dtype='object')

In [38]:
# Per-dataset summary of every bad-flag column: one row per column in
# `bad_cols`, with the number of flagged rows ('불량자', value == 1) and the
# number of non-flagged rows ('우량자', value == 0).
bad_dict = dict()

for key, df in df_dict.items():
    print('======',key,'======')
    # Count each class explicitly instead of relying on the order/length of
    # value_counts(): a column holding only 0s (or only 1s) would otherwise
    # produce a single value and end up under the wrong label.
    # Assumes the flag columns are coded 0/1 — TODO confirm.
    records = [
        {
            '불량자': int((df[col] == 1).sum()),
            '우량자': int((df[col] == 0).sum()),
        }
        for col in bad_cols
    ]
    # Build the frame once from the collected records; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    bad_dict[key] = pd.DataFrame(records, index=bad_cols, columns=['불량자', '우량자'])
        
    



In [48]:
# Show the per-column bad/good counts for the validation dataset.
bad_dict['valid_inner']

Unnamed: 0,불량자,우량자
BAD_공공,572,40370
BAD_금융_12,3803,37139
BAD_금융_단기_12,3439,37503
BAD_금융_은행_12,236,40706
BAD_금융_리스_12,1462,39480
BAD_금융_카드_12,12,40930
BAD_금융_캐피탈_12,357,40585
BAD_금융_저축은행_12,735,40207
BAD_대부_12,375,40567
BAD_금융_6,5161,35781


In [50]:
# Save all summaries into a single Excel workbook, one sheet per dataset.
save_dir = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\부도현황'
save_path = os.path.join(save_dir, '요건별_부도현황_결과.xlsx')

with pd.ExcelWriter(save_path) as writer:
    # The dataset key doubles as the sheet name.
    for key, result in bad_dict.items():
        result.to_excel(writer, sheet_name=key)

# <font color='orange'>03. 요건별 누적 BAD 현황</font>

In [52]:
# Rebuild the full list of columns whose name starts with 'BAD'.
dev_columns = df_dict['dev_inner'].columns
bad_cols = dev_columns[dev_columns.str.contains('^BAD')]

In [53]:
# Show the selected bad-flag columns.
bad_cols

Index(['BAD_공공', 'BAD_금융_12', 'BAD_금융_단기_12', 'BAD_금융_은행_12', 'BAD_금융_리스_12',
       'BAD_금융_카드_12', 'BAD_금융_캐피탈_12', 'BAD_금융_저축은행_12', 'BAD_대부_12',
       'BAD_금융_6', 'BAD_금융_카드_6', 'BAD_금융_캐피탈_6', 'BAD_금융_저축은행_6', 'BAD_대부_6',
       'BAD'],
      dtype='object')

In [54]:
# For the cumulative view, exclude the per-category final BAD columns.
excluded_cols = ['BAD_금융_12', 'BAD_금융_6', 'BAD']
bad_cols = bad_cols.drop(excluded_cols)
bad_cols

Index(['BAD_공공', 'BAD_금융_단기_12', 'BAD_금융_은행_12', 'BAD_금융_리스_12',
       'BAD_금융_카드_12', 'BAD_금융_캐피탈_12', 'BAD_금융_저축은행_12', 'BAD_대부_12',
       'BAD_금융_카드_6', 'BAD_금융_캐피탈_6', 'BAD_금융_저축은행_6', 'BAD_대부_6'],
      dtype='object')

In [55]:
# Column order used for the cumulative count.
# Per the original note: arranged from lower-tier to higher-tier sectors.
tmp_cols = [
    'BAD_공공',
    'BAD_금융_단기_12',
    'BAD_금융_저축은행_12',
    'BAD_금융_캐피탈_12',
    'BAD_금융_카드_12',
    'BAD_금융_리스_12',
    'BAD_금융_은행_12',
    'BAD_대부_12',
    'BAD_금융_카드_6',
    'BAD_금융_캐피탈_6',
    'BAD_금융_저축은행_6',
    'BAD_대부_6',
]

In [57]:
# Cumulative bad status per dataset. Columns are processed in `tmp_cols`
# order; a row flagged (== 1) by one column is counted there and removed
# before the next column is evaluated, so each row is counted at most once.
cum_bad_dict = dict()
for key, df in df_dict.items():
    # '비대상자' is measured against the full dataset size, not against the
    # shrinking remainder.
    total_cnt = df.shape[0]
    # Work on a separate name so the loop variable `df` is not rebound.
    remaining = df
    records = []
    for col in tmp_cols:
        is_bad = remaining[col] == 1
        bad_cnt = int(is_bad.sum())
        records.append({'대상자': bad_cnt,
                        '비대상자': total_cnt - bad_cnt})

        # Drop every row whose 'no' was flagged by this column.
        remove_idx = remaining.loc[is_bad, 'no'].values
        remaining = remaining[~remaining['no'].isin(remove_idx)]

    # Build the frame once from the collected records; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic.
    cum_bad_dict[key] = pd.DataFrame(records, index=tmp_cols, columns=['대상자', '비대상자'])
    
        

In [58]:
# Show the cumulative counts for the development dataset.
cum_bad_dict['dev_inner']

Unnamed: 0,대상자,비대상자
BAD_공공,61086,178930
BAD_금융_단기_12,15206,224810
BAD_금융_저축은행_12,3996,236020
BAD_금융_캐피탈_12,1964,238052
BAD_금융_카드_12,0,240016
BAD_금융_리스_12,7208,232808
BAD_금융_은행_12,10,240006
BAD_대부_12,1928,238088
BAD_금융_카드_6,10040,229976
BAD_금융_캐피탈_6,1637,238379


In [60]:
# Save all cumulative summaries into a single Excel workbook, one sheet per dataset.
save_dir = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\부도현황'
save_path = os.path.join(save_dir, '요건별_누적_부도현황_결과.xlsx')

with pd.ExcelWriter(save_path) as writer:
    # The dataset key doubles as the sheet name.
    for key, result in cum_bad_dict.items():
        result.to_excel(writer, sheet_name=key)