In [1]:
import pandas as pd
import numpy as np
import os
import warnings 
warnings.filterwarnings('ignore')
from datetime import datetime

In [2]:
# Let pandas display up to 100 columns before truncating wide frames with "...".
pd.set_option('display.max_columns', 100)

# <font color='orange'>01. load data</font>

In [3]:
# Directory whose entries are listed and loaded as the raw input CSV files.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs as-is
# on a machine that has this exact folder; consider making it configurable.
path = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\raw_cb_data'

In [4]:
# Build the list of files to load.
# os.listdir returns every entry of the directory in arbitrary order, so keep only
# the '.csv' names (the loader passes each entry straight to pd.read_csv) and sort
# them so the load order is the same on every run.
file_list = sorted(name for name in os.listdir(path) if name.endswith('.csv'))
file_list

['destruct_17_df.csv',
 'destruct_18_df.csv',
 'destruct_19_df.csv',
 'execute_17_df.csv',
 'execute_18_df.csv',
 'execute_19_df.csv']

In [5]:
# Load every listed file into a dictionary of DataFrames.
total_df_dict = {}

for csv_name in file_list:
    # The dictionary key is the file name with '.csv' removed.
    dataset_key = csv_name.replace('.csv', '')
    frame = pd.read_csv(os.path.join(path, csv_name))
    total_df_dict[dataset_key] = frame
    # Report what was loaded: key and (rows, columns).
    print(dataset_key, frame.shape)

destruct_17_df (138393, 1933)
destruct_18_df (114319, 1933)
destruct_19_df (145985, 1933)
execute_17_df (9840, 1933)
execute_18_df (5573, 1933)
execute_19_df (4869, 1933)


# <font color='orange'>02. 실행, 파기 구분 항목 생성</font>

In [6]:
# Add the executed / destructed indicator columns to every dataset.

for dataset_key, frame in total_df_dict.items():
    # A key containing 'exec' marks an executed-loan dataset;
    # every other key is treated as a destructed (cancelled) dataset.
    is_executed = 'exec' in dataset_key
    frame['EXECUTE_FG'] = 1 if is_executed else 0
    frame['DESTRUCT_FG'] = 0 if is_executed else 1

# <font color='orange'>03. 연도구분 컬럼 생성</font>

In [7]:
# Add one indicator column per year (Y_2017_FG / Y_2018_FG / Y_2019_FG) to every dataset.

for dataset_key, frame in total_df_dict.items():
    # The year comes from the two-digit tag in the key;
    # a key with neither '17' nor '18' is treated as 2019.
    if '17' in dataset_key:
        dataset_year = '2017'
    elif '18' in dataset_key:
        dataset_year = '2018'
    else:
        dataset_year = '2019'

    # Exactly one of the three columns is 1, the other two are 0.
    for year in ('2017', '2018', '2019'):
        frame['Y_' + year + '_FG'] = 1 if year == dataset_year else 0

# <font color='orange'>04. 6개의 데이터 set 결합후 "no" 생성 후 다시 분리</font>

* 신청번호, 고객번호 등의 key값이 없으므로 고유한 key값이 필요
  * 데이터 pool에서 고유한 key값을 생성하기 위해 결합 후 "no" 생성

In [8]:
# Stack all loaded datasets row-wise into one frame.
# Each file is read without an index column, so every frame carries its own 0..n-1
# labels; ignore_index=True gives the combined frame a unique 0..N-1 index instead
# of repeated row labels.
total_df = pd.concat(list(total_df_dict.values()), axis = 0, ignore_index = True)

In [9]:
# Show (rows, columns) of the combined frame.
total_df.shape

(418979, 1933)

In [10]:
# Order the combined frame by application month ('신청월'), earliest first.
# NOTE(review): the default sort algorithm is not stable, so rows sharing the same
# month are not guaranteed to keep their previous relative order.
total_df = total_df.sort_values('신청월')

In [11]:
# Assign a unique sequential key 1..len(total_df) that follows the current row order.
row_numbers = list(range(1, total_df.shape[0] + 1))
if 'no' in total_df.columns:
    # The column already exists: overwrite its values and keep its position.
    total_df['no'] = row_numbers
else:
    # Keep 'no' as the leading column, matching the column layout the rest of the
    # notebook displays; a plain assignment would append it after the flag columns.
    total_df.insert(0, 'no', row_numbers)

In [12]:
# Split the combined frame back into the six type/year datasets so duplicates can be
# flagged within each dataset.
# .copy() makes every dataset an independent frame: later cells add columns to these
# frames, and doing that on a bare filter result is a write to a slice of total_df
# (pandas SettingWithCopyWarning, hidden here by the global warnings filter).
for year_tag in ('17', '18', '19'):
    year_mask = total_df['Y_20' + year_tag + '_FG'] == 1
    for type_name, type_flag in (('execute', 'EXECUTE_FG'), ('destruct', 'DESTRUCT_FG')):
        dataset_key = type_name + '_' + year_tag + '_df'
        total_df_dict[dataset_key] = total_df[(total_df[type_flag] == 1) & year_mask].copy()

In [13]:
# Print the key and (rows, columns) of every dataset after the split.
for dataset_key in total_df_dict:
    print(dataset_key, total_df_dict[dataset_key].shape)

destruct_17_df (138393, 1933)
destruct_18_df (114319, 1933)
destruct_19_df (145985, 1933)
execute_17_df (9840, 1933)
execute_18_df (5573, 1933)
execute_19_df (4869, 1933)


In [14]:
# Remove the combined frame's name; the following cells work on total_df_dict.
del total_df

# <font color='orange'>05. 1차 중복 체크</font>

* CB항목(신청정보 포함)을 기준으로 각 6개의 데이터별 중복 여부 항목 생성

In [15]:
# Inspect the column layout of one dataset.
total_df_dict['execute_17_df'].columns

Index(['no', '신청월', '직군그룹', '근속기간1', '웰컴_소득구간', '심사원장_소득구간', 'AS0000136',
       'AS0000137', 'AS0000138', 'AS0000139',
       ...
       'LA0000001_1_s12', 'LA0000020_1_s12', 'LA0000204_1_s12',
       'LA0000227_1_s12', 'P2O000500_1_s12', 'EXECUTE_FG', 'DESTRUCT_FG',
       'Y_2017_FG', 'Y_2018_FG', 'Y_2019_FG'],
      dtype='object', length=1933)

In [18]:
# Columns used for duplicate detection: everything except the generated key 'no',
# the application month '신청월', and the flag columns created in this notebook.
# The exclusion is by name rather than by position so the result does not depend on
# where 'no' and the flag columns sit, or on whether 'DUP_FG_1' already exists when
# this cell is re-run.
non_cb_cols = {'no', '신청월',
               'EXECUTE_FG', 'DESTRUCT_FG',
               'Y_2017_FG', 'Y_2018_FG', 'Y_2019_FG',
               'DUP_FG_1'}
dup_chk_cols = [col for col in total_df_dict['execute_17_df'].columns if col not in non_cb_cols]

In [19]:
# Number of columns that take part in the duplicate check.
len(dup_chk_cols)

1926

In [21]:
# First-pass duplicate flag, computed separately inside each dataset.
for frame in total_df_dict.values():
    # keep='last' marks every occurrence except the last one in the current row order.
    # The frames are split from a month-ascending frame, so the unflagged row of each
    # duplicate group is the most recent one.
    is_duplicate = frame.duplicated(subset = dup_chk_cols, keep = 'last')
    # Store the flag as 1 (duplicate, to be excluded) / 0 (kept).
    frame['DUP_FG_1'] = is_duplicate.astype('int64')

In [22]:
# Report how many rows the first pass flagged, per dataset and in total.
dup_counts = {dataset_key: frame['DUP_FG_1'].sum() for dataset_key, frame in total_df_dict.items()}

for dataset_key, flagged in dup_counts.items():
    print(dataset_key)
    print('중복 제외 처리 건 : ',flagged)

# Grand total over all datasets.
print('전체 데이터 pool 중복 제외 처리 건 : ',np.sum(list(dup_counts.values())))

destruct_17_df
중복 제외 처리 건 :  5026
destruct_18_df
중복 제외 처리 건 :  7364
destruct_19_df
중복 제외 처리 건 :  27560
execute_17_df
중복 제외 처리 건 :  0
execute_18_df
중복 제외 처리 건 :  0
execute_19_df
중복 제외 처리 건 :  201
전체 데이터 pool 중복 제외 처리 건 :  40151


# <font color='orange'>06. 전체 데이터 결합</font>

In [23]:
# Recombine the datasets and order them by executed flag, destructed flag and
# application month, all ascending. Executed rows (EXECUTE_FG == 1) therefore come
# after destructed rows.
final_sort_keys = ['EXECUTE_FG','DESTRUCT_FG','신청월']
final_total_df = (
    pd.concat(total_df_dict.values(), axis = 0)
    .sort_values(by = final_sort_keys, ascending = True)
)

In [24]:
# Preview the first two rows of the recombined frame.
final_total_df.head(2)

Unnamed: 0,no,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,AS0000136,AS0000137,AS0000138,AS0000139,B12000100,B12000200,B14000600,B22000100,B22000200,B22000300,B24000600,B28000001,B29000110,B32000200,B34000600,B41090101,B41100100,B41100200,B41100300,B41100400,B41110200,B41110400,B4E000002,BE0000021,BS0000058,BS0000114,BS0000145,BS0000146,BS0000171,BS0000917,C00000029,C11060700,C11060600,C11060400,C11060500,C11060800,CA0000601,CA0000602,CA0000603,CA0000604,CF0300309,CF0600309,CF1200309,CF0200309,...,GU0024201_1_3,GU0024001_1_3,LU0024002_1_4,LU0024202_1_4,LU0024608_1_4,GU0024201_1_4,GU0024001_1_4,P2O000500_1_s9,P2O000500_1_s3,P2E000500_1_s6,P2E000500_1_s3,P2E000500_1_s12,P27000100_1_s9,P27000100_1_s6,P27000100_1_s3,P27000100_1_s12,P21010500_1_s9,P21010500_1_s6,P21010500_1_s12,LA0000227_1_s6,LA0000227_1_s3,LA0000222_1_s9,LA0000222_1_s6,LA0000222_1_s3,LA0000222_1_s12,LA0000204_1_s9,LA0000203_1_s9,LA0000203_1_s6,LA0000001_1_s3,LA0000020_1_s3,LA0000203_1_s3,P21010500_1_s3,LA0000001_1_s6,LA0000204_1_s6,P2O000500_1_s6,LA0000001_1_s9,LA0000020_1_s9,LA0000227_1_s9,P2E000500_1_s9,LA0000001_1_s12,LA0000020_1_s12,LA0000204_1_s12,LA0000227_1_s12,P2O000500_1_s12,EXECUTE_FG,DESTRUCT_FG,Y_2017_FG,Y_2018_FG,Y_2019_FG,DUP_FG_1
0,1,201612,,5.0,5.0,0.0,26,-1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,887,0,0,0,0,0,3113,887,3113,887,1806,1416,1594,1773,...,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,642.86,1500.0,345.45,642.86,1500.0,214.29,345.45,345.45,642.86,0.7,0.7,1500.0,0.0,0.29,642.86,0.0,0.15,0.15,345.45,0.0,0.09,0.09,214.29,214.29,0.0,0,1,1,0,0,0
57,2,201612,D,5.0,2.0,0.0,-1,32,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1222,0,0,0,0,0,3477,1144,3477,47,859,797,803,615,...,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1,0,0,0


# <font color='orange'>07. 2차 중복 처리</font>

* 실행/파기 정보에서 중복이 발생하는 경우
  * 실행정보는 남기고 파기정보를 제외

In [22]:
# The second pass only looks at rows that were not flagged by the first pass.
first_pass_kept = final_total_df[final_total_df['DUP_FG_1'] == 0]

# "no" keys of the rows to exclude in the second pass: every occurrence of a
# duplicate group except the last one in the current row order.
second_pass_dup = first_pass_kept.duplicated(subset = dup_chk_cols, keep = 'last')
dup_no_list = first_pass_kept.loc[second_pass_dup, 'no'].tolist()

In [23]:
# Second-pass flag: 1 for rows whose "no" is in the exclusion list, otherwise 0.
final_total_df['DUP_FG_2'] = final_total_df['no'].isin(dup_no_list).astype('int64')

# Final flag: 1 when the row was flagged by the first or the second pass, otherwise 0.
flag_total = final_total_df['DUP_FG_1'] + final_total_df['DUP_FG_2']
final_total_df['FINAL_DUP_FG'] = flag_total.gt(0).astype('int64')

In [24]:
# Total number of rows flagged for exclusion by either pass.
final_total_df['FINAL_DUP_FG'].sum()

40266

In [25]:
# Preview the first two rows, now including DUP_FG_2 and FINAL_DUP_FG.
final_total_df.head(2)

Unnamed: 0,no,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,AS0000136,AS0000137,AS0000138,AS0000139,B12000100,B12000200,B14000600,B22000100,B22000200,B22000300,B24000600,B28000001,B29000110,B32000200,B34000600,B41090101,B41100100,B41100200,B41100300,B41100400,B41110200,B41110400,B4E000002,BE0000021,BS0000058,BS0000114,BS0000145,BS0000146,BS0000171,BS0000917,C00000029,C11060700,C11060600,C11060400,C11060500,C11060800,CA0000601,CA0000602,CA0000603,CA0000604,CF0300309,CF0600309,CF1200309,CF0200309,...,LU0024002_1_4,LU0024202_1_4,LU0024608_1_4,GU0024201_1_4,GU0024001_1_4,P2O000500_1_s9,P2O000500_1_s3,P2E000500_1_s6,P2E000500_1_s3,P2E000500_1_s12,P27000100_1_s9,P27000100_1_s6,P27000100_1_s3,P27000100_1_s12,P21010500_1_s9,P21010500_1_s6,P21010500_1_s12,LA0000227_1_s6,LA0000227_1_s3,LA0000222_1_s9,LA0000222_1_s6,LA0000222_1_s3,LA0000222_1_s12,LA0000204_1_s9,LA0000203_1_s9,LA0000203_1_s6,LA0000001_1_s3,LA0000020_1_s3,LA0000203_1_s3,P21010500_1_s3,LA0000001_1_s6,LA0000204_1_s6,P2O000500_1_s6,LA0000001_1_s9,LA0000020_1_s9,LA0000227_1_s9,P2E000500_1_s9,LA0000001_1_s12,LA0000020_1_s12,LA0000204_1_s12,LA0000227_1_s12,P2O000500_1_s12,EXECUTE_FG,DESTRUCT_FG,Y_2017_FG,Y_2018_FG,Y_2019_FG,DUP_FG_1,DUP_FG_2,FINAL_DUP_FG
0,1,201612,,5.0,5.0,0.0,26,-1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,887,0,0,0,0,0,3113,887,3113,887,1806,1416,1594,1773,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,642.86,1500.0,345.45,642.86,1500.0,214.29,345.45,345.45,642.86,0.7,0.7,1500.0,0.0,0.29,642.86,0.0,0.15,0.15,345.45,0.0,0.09,0.09,214.29,214.29,0.0,0,1,1,0,0,0,0,0
57,2,201612,D,5.0,2.0,0.0,-1,32,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1222,0,0,0,0,0,3477,1144,3477,47,859,797,803,615,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1,0,0,0,0,0


In [26]:
# Put the rows back in ascending "no" order.
final_total_df = final_total_df.sort_values(by = ['no'], ascending = True)

In [27]:
# Replace the row labels with a fresh 0..N-1 index, discarding the old labels.
final_total_df = final_total_df.reset_index(drop = True)

In [28]:
# Internal flag columns: every column whose name contains "FG".
fg_cols = [col for col in final_total_df.columns if 'FG' in col]

In [29]:
# Number of internal flag columns found.
len(fg_cols)

8

In [30]:
# Key and application-information columns.
# They are listed by name instead of taking the first six columns by position, so
# the selection does not depend on the column order of the frame.
need_cols = ['no', '신청월', '직군그룹', '근속기간1', '웰컴_소득구간', '심사원장_소득구간']

In [31]:
# Application-information columns followed by the internal flag columns.
final_need_cols = [*need_cols, *fg_cols]
final_need_cols

['no',
 '신청월',
 '직군그룹',
 '근속기간1',
 '웰컴_소득구간',
 '심사원장_소득구간',
 'EXECUTE_FG',
 'DESTRUCT_FG',
 'Y_2017_FG',
 'Y_2018_FG',
 'Y_2019_FG',
 'DUP_FG_1',
 'DUP_FG_2',
 'FINAL_DUP_FG']

In [32]:
# Frame holding only the application-information and internal flag columns.
# .copy() makes it independent of final_total_df: the frame is sorted in place
# afterwards, which on a bare column selection is a write to a slice
# (pandas SettingWithCopyWarning, hidden here by the global warnings filter).
inner_total_df = final_total_df[final_need_cols].copy()

In [33]:
# Order the internal-flag frame by ascending "no".
inner_total_df = inner_total_df.sort_values(by = ['no'], ascending = True)

# <font color='orange'>08. 저장</font>

In [34]:
# Directory the output files are written to.
# NOTE(review): absolute, machine-specific Windows path — consider making it configurable.
save_path = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\temp'

In [35]:
# Save the internal-flag frame as CSV, without the index.
# Create the output directory first so the write does not fail when it is missing.
os.makedirs(save_path, exist_ok = True)
inner_total_df.to_csv(os.path.join(save_path, 'temp1_inner_total_df.csv'), index = False)

In [36]:
# Remove the internal flag columns so only the remaining columns are saved.
final_total_df = final_total_df.drop(columns = fg_cols)

In [37]:
# Save the frame without the flag columns as a pickle file.
total_pickle_path = os.path.join(save_path, 'total_df.pkl')
final_total_df.to_pickle(total_pickle_path)