In [1]:
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime
import warnings

warnings.filterwarnings('ignore')

In [2]:
# load data
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
CB_trn = pd.read_csv(os.path.join(save_dir, 'seg1_train_df.csv'))
CB_val = pd.read_csv(os.path.join(save_dir, 'seg1_valid_df.csv'))
CB_tst = pd.read_csv(os.path.join(save_dir, 'seg1_test_df.csv'))
CB_model_valid = pd.read_csv(os.path.join(save_dir, 'model_valid_df.csv'))

In [3]:
print('CB_trn: ', CB_trn.shape)
print('CB_val: ', CB_val.shape)
print('CB_tst: ', CB_tst.shape)
print('CB_model_valid: ', CB_model_valid.shape)

CB_trn:  (37539, 1929)
CB_val:  (12566, 1929)
CB_tst:  (12752, 1929)
CB_model_valid:  (11155, 1929)


In [4]:
print(CB_trn.head(2))

   no  BAD     신청월 직군그룹  근속기간1  웰컴_소득구간  심사원장_소득구간  AS0000136  AS0000137  \
0   2    0  201612  NaN    2.0      2.0        0.0         -1         31   
1   3    0  201612  NaN    5.0      3.0        0.0         -1         47   

   AS0000138  ...  P2O000500_1_s6  LA0000001_1_s9  LA0000020_1_s9  \
0          1  ...             0.0            0.16            0.16   
1          1  ...             0.0            0.00            0.00   

   LA0000227_1_s9  P2E000500_1_s9  LA0000001_1_s12  LA0000020_1_s12  \
0          453.52             0.0              0.1              0.1   
1            0.00             0.0              0.0              0.0   

   LA0000204_1_s12  LA0000227_1_s12  P2O000500_1_s12  
0            447.5            447.5              0.0  
1              0.0              0.0              0.0  

[2 rows x 1929 columns]


In [5]:
def set_index_and_sort(df, index_col):
    df.set_index(index_col, inplace=True, drop=True, verify_integrity=True)
    df.sort_index(inplace=True)

    return df

In [6]:
CB_trn = set_index_and_sort(CB_trn, 'no')
CB_val = set_index_and_sort(CB_val, 'no')
CB_tst = set_index_and_sort(CB_tst, 'no')
CB_model_valid = set_index_and_sort(CB_model_valid, 'no')

In [7]:
print(CB_trn.head(2))

    BAD     신청월 직군그룹  근속기간1  웰컴_소득구간  심사원장_소득구간  AS0000136  AS0000137  \
no                                                                      
2     0  201612  NaN    2.0      2.0        0.0         -1         31   
3     0  201612  NaN    5.0      3.0        0.0         -1         47   

    AS0000138  AS0000139  ...  P2O000500_1_s6  LA0000001_1_s9  LA0000020_1_s9  \
no                        ...                                                   
2           1          0  ...             0.0            0.16            0.16   
3           1          0  ...             0.0            0.00            0.00   

    LA0000227_1_s9  P2E000500_1_s9  LA0000001_1_s12  LA0000020_1_s12  \
no                                                                     
2           453.52             0.0              0.1              0.1   
3             0.00             0.0              0.0              0.0   

    LA0000204_1_s12  LA0000227_1_s12  P2O000500_1_s12  
no                                   

In [8]:
# load layout
layout_path = r'D:\Seminar Documents\신용평가 세미나\설명자료'
col_df = pd.read_excel(os.path.join(layout_path, '(SD)AS프로젝트_SEG1_CB_레이아웃_200211_v2.0.xlsx'))

In [9]:
print(col_df.head(2))

   NO Variable aggvar_name group_code1 group_code2  group_2  min_value  \
0   1       no         NaN         NaN         NaN      NaN        NaN   
1   2      신청월         NaN         NaN         NaN      NaN        NaN   

   max_value  median  is_na  min_check  max_check  date_var  cate_var  \
0        NaN     NaN      0          0          0         0         1   
1        NaN     NaN      0          0          0         1         0   

  welc_can_use  
0        활용 불가  
1        활용 불가  


In [10]:
# layout 수정
dev_cols_df = pd.DataFrame({'Variable': CB_trn.columns.tolist()})
col_df = col_df.merge(dev_cols_df, on=['Variable'], how='inner')

In [11]:
print(col_df.shape)
print(col_df.head(2))

(1928, 15)
   NO Variable aggvar_name group_code1 group_code2  group_2  min_value  \
0   2      신청월         NaN         NaN         NaN      NaN        NaN   
1   3     직군그룹         NaN        내부변수        웰컴정보      6.0        NaN   

   max_value  median  is_na  min_check  max_check  date_var  cate_var  \
0        NaN     NaN      0          0          0         1         0   
1        NaN     NaN      1          0          0         0         1   

  welc_can_use  
0        활용 불가  
1        활용 불가  


In [12]:
# 활용 불가 변수 제외
not_use_col_df = col_df[col_df['welc_can_use'] == '활용 불가']
use_col_df = col_df[col_df['welc_can_use'] == '활용 가능']

not_use_col_df['group_code2'].value_counts().sort_index()
use_col_df['group_code2'].value_counts().sort_index()

group_col_df = col_df[(col_df['group_2'] == 3) | (col_df['welc_can_use'] == '활용 가능')]
group_cols = group_col_df['Variable'].tolist()
need_cols = ['BAD'] + group_cols

CB_trn = CB_trn[need_cols]
CB_val = CB_val[need_cols]
CB_tst = CB_tst[need_cols]
CB_model_valid = CB_model_valid[need_cols]

In [13]:
print('CB_trn: ', CB_trn.shape)
print('CB_val: ', CB_val.shape)
print('CB_tst: ', CB_tst.shape)

CB_trn:  (37539, 1692)
CB_val:  (12566, 1692)
CB_tst:  (12752, 1692)


In [14]:
print(CB_trn.head(2))

    BAD  AS0000136  AS0000137  AS0000138  AS0000139  B12000100  B12000200  \
no                                                                          
2     0         -1         31          1          0          0          0   
3     0         -1         47          1          0          0          0   

    B14000600  B22000100  B22000200  ...  P2O000500_1_s6  LA0000001_1_s9  \
no                                   ...                                   
2           0          0          0  ...             0.0            0.16   
3           0          0          0  ...             0.0            0.00   

    LA0000020_1_s9  LA0000227_1_s9  P2E000500_1_s9  LA0000001_1_s12  \
no                                                                    
2             0.16          453.52             0.0              0.1   
3             0.00            0.00             0.0              0.0   

    LA0000020_1_s12  LA0000204_1_s12  LA0000227_1_s12  P2O000500_1_s12  
no                          

In [15]:
# 범주형(categorical) 변수 column
cate_cols = group_col_df.loc[group_col_df['cate_var'] == 1, 'Variable'].tolist()
print(cate_cols)

['AS0000138', 'BS0000114', 'BS0000145', 'CA0000801', 'EW0002801', 'PS0000388', 'PS0000392', 'BE0000801', 'BE0000802', 'BS0000113', 'LA0200801', 'PE0000145', 'KC1400010', 'KC1400011', 'KC1400012', 'KC1400013', 'KC1400014', 'KC1400015', 'KC1400016', 'AS0000140', 'BE0000801_1', 'A5RCLSRL025700', 'Thinfile_final', 'S_3323', 'cap2', 'S_3318', 'C_3319', 'cap3', 'S_3317', 'cap1', 'S23', 'S24', 'cap4', 'C_3318', 'S_3319', 'C_3317', 'C_3311', 'A5RCLSRL047500', 'A5RCLSRL047600', 'A5RCLSRL047700']


In [16]:
cate_cols = list(set(CB_trn.columns).intersection(cate_cols))
print(cate_cols)

['cap4', 'A5RCLSRL047600', 'AS0000140', 'BE0000801_1', 'cap1', 'PE0000145', 'BS0000114', 'A5RCLSRL047500', 'S_3323', 'C_3318', 'KC1400011', 'BS0000145', 'BE0000802', 'A5RCLSRL025700', 'PS0000388', 'AS0000138', 'cap3', 'KC1400010', 'EW0002801', 'C_3319', 'KC1400015', 'A5RCLSRL047700', 'CA0000801', 'S24', 'KC1400016', 'BS0000113', 'KC1400013', 'LA0200801', 'S_3317', 'C_3311', 'S_3318', 'S_3319', 'KC1400012', 'PS0000392', 'KC1400014', 'cap2', 'S23', 'C_3317', 'BE0000801', 'Thinfile_final']


In [17]:
# 전처리
def na_check(data):
    result = np.sum(data.isna())

    return result

In [18]:
na_df = CB_trn.apply(na_check)
print(na_df)

BAD                0
AS0000136          0
AS0000137          0
AS0000138          0
AS0000139          0
                  ..
LA0000001_1_s12    0
LA0000020_1_s12    0
LA0000204_1_s12    0
LA0000227_1_s12    0
P2O000500_1_s12    0
Length: 1692, dtype: int64


In [19]:
# min_value check
need_cols = ['Variable', 'min_value', 'min_check']
min_check_df = group_col_df[need_cols][group_col_df['min_check'] == 1]
print(min_check_df.head(2))

     Variable  min_value  min_check
60  CL1200918     -999.0          1
61  CL0600919     -999.0          1


In [20]:
min_check_var = min_check_df['Variable'].values.tolist()
min_check_value = min_check_df['min_value'].values.tolist()

for idx, min_value in enumerate(np.unique(min_check_value)):
    if idx == 0:
        value1 = np.sum(CB_trn == min_value)
        min_check_result = value1
    else:
        value2 = np.sum(CB_trn == min_value)
        min_check_result = value2 + min_check_result

print(min_check_result)

BAD                0
AS0000136          0
AS0000137          0
AS0000138          0
AS0000139          0
                  ..
LA0000001_1_s12    0
LA0000020_1_s12    0
LA0000204_1_s12    0
LA0000227_1_s12    0
P2O000500_1_s12    0
Length: 1692, dtype: int64


In [21]:
# max_value check
need_cols = ['Variable', 'max_value', 'max_check']
max_check_df = group_col_df[need_cols][group_col_df['max_check'] == 1]
print(max_check_df.head(2))

     Variable    max_value  max_check
59  CL0000002  999999992.0          1
60  CL1200918  999999991.0          1


In [22]:
max_check_var = max_check_df['Variable'].values.tolist()
max_check_value = max_check_df['max_value'].values.tolist()

for idx, max_value in enumerate(np.unique(max_check_value)):
    if idx == 0:
        value1 = np.sum(CB_trn == max_value)
        max_check_result = value1
    else:
        value2 = np.sum(CB_trn == max_value)
        max_check_result = value2 + max_check_result

print(max_check_result)

BAD                0
AS0000136          0
AS0000137          0
AS0000138          0
AS0000139          0
                  ..
LA0000001_1_s12    0
LA0000020_1_s12    0
LA0000204_1_s12    0
LA0000227_1_s12    0
P2O000500_1_s12    0
Length: 1692, dtype: int64


In [23]:
# NaN >= 98% check
total_na_df = na_df + min_check_result + max_check_result
print(total_na_df)

BAD                0
AS0000136          0
AS0000137          0
AS0000138          0
AS0000139          0
                  ..
LA0000001_1_s12    0
LA0000020_1_s12    0
LA0000204_1_s12    0
LA0000227_1_s12    0
P2O000500_1_s12    0
Length: 1692, dtype: int64


In [24]:
total_na_p_df = total_na_df / CB_trn.shape[0]
print(total_na_p_df)

BAD                0.0
AS0000136          0.0
AS0000137          0.0
AS0000138          0.0
AS0000139          0.0
                  ... 
LA0000001_1_s12    0.0
LA0000020_1_s12    0.0
LA0000204_1_s12    0.0
LA0000227_1_s12    0.0
P2O000500_1_s12    0.0
Length: 1692, dtype: float64


In [25]:
too_many_na_df = total_na_p_df[total_na_p_df > 0.98]

too_many_na_cols = too_many_na_df.index
print(too_many_na_cols)

Index([], dtype='object')


In [26]:
na_df = pd.DataFrame({'remove_col': too_many_na_cols})
print(na_df)

Empty DataFrame
Columns: [remove_col]
Index: []


In [27]:
CB_trn = CB_trn.drop(too_many_na_cols, axis=1)
CB_val = CB_val.drop(too_many_na_cols, axis=1)
CB_tst = CB_tst.drop(too_many_na_cols, axis=1)
CB_model_valid = CB_model_valid.drop(too_many_na_cols, axis=1)

In [28]:
print('CB_trn: ', CB_trn.shape)
print('CB_val: ', CB_val.shape)
print('CB_tst: ', CB_tst.shape)
print('CB_model_valid: ', CB_model_valid.shape)

CB_trn:  (37539, 1692)
CB_val:  (12566, 1692)
CB_tst:  (12752, 1692)
CB_model_valid:  (11155, 1692)


In [29]:
group_col_df = group_col_df[group_col_df['Variable'].isin(too_many_na_cols) == False]
print(group_col_df.shape)

(1691, 15)


In [30]:
# NaN 변환
# column 정의
cols = CB_trn.columns
cate_cols = group_col_df.loc[group_col_df['cate_var'] == 1, 'Variable'].tolist()
num_cols = list(set(cols).difference(cate_cols))

In [31]:
# NaN 0 변환
CB_trn[num_cols] = CB_trn[num_cols].fillna(0)
CB_val[num_cols] = CB_val[num_cols].fillna(0)
CB_tst[num_cols] = CB_tst[num_cols].fillna(0)
CB_model_valid[num_cols] = CB_model_valid[num_cols].fillna(0)

In [32]:
# NaN NULL 변환
CB_trn[cate_cols] = CB_trn[cate_cols].fillna('NULL')
CB_val[cate_cols] = CB_val[cate_cols].fillna('NULL')
CB_tst[cate_cols] = CB_tst[cate_cols].fillna('NULL')
CB_model_valid[cate_cols] = CB_model_valid[cate_cols].fillna('NULL')

In [33]:
# min_value 0 변환
def na_change(data, value):
    data = np.where(data == value, 0, data)

    return data

In [34]:
need_cols = ['Variable', 'min_value', 'min_check']
min_check_df = group_col_df[need_cols][group_col_df['min_check'] == 1]
print(min_check_df.head(2))

     Variable  min_value  min_check
60  CL1200918     -999.0          1
61  CL0600919     -999.0          1


In [35]:
min_check_var = min_check_df['Variable'].values.tolist()
min_check_value = min_check_df['min_value'].values.tolist()

for min_value in np.unique(min_check_value):
    CB_trn = CB_trn.apply(na_change, value=min_value)
    CB_val = CB_val.apply(na_change, value=min_value)
    CB_tst = CB_tst.apply(na_change, value=min_value)
    CB_model_valid = CB_model_valid.apply(na_change, value=min_value)

In [36]:
# max_value 0 변환
need_cols = ['Variable', 'max_value', 'max_check']
max_check_df = group_col_df[need_cols][group_col_df['max_check'] == 1]
print(max_check_df.head(2))

     Variable    max_value  max_check
59  CL0000002  999999992.0          1
60  CL1200918  999999991.0          1


In [37]:
max_check_var = max_check_df['Variable'].values.tolist()
max_check_value = max_check_df['max_value'].values.tolist()

for max_value in np.unique(max_check_value):
    CB_trn = CB_trn.apply(na_change, value=max_value)
    CB_val = CB_val.apply(na_change, value=max_value)
    CB_tst = CB_tst.apply(na_change, value=max_value)
    CB_model_valid = CB_model_valid.apply(na_change, value=max_value)

In [38]:
# 변수 type
CB_trn[cate_cols] = CB_trn[cate_cols].applymap(str)
CB_val[cate_cols] = CB_val[cate_cols].applymap(str)
CB_tst[cate_cols] = CB_tst[cate_cols].applymap(str)
CB_model_valid[cate_cols] = CB_model_valid[cate_cols].applymap(str)

In [39]:
# date_var drop
date_var = group_col_df[group_col_df['date_var'] == 1]['Variable'].values.tolist()
print('date_var: ', len(date_var))
print('cols: ', len(CB_trn.columns[CB_trn.columns.isin(date_var)]))

date_var:  10
cols:  10


In [40]:
CB_trn.drop(date_var, axis=1, inplace=True)
CB_val.drop(date_var, axis=1, inplace=True)
CB_tst.drop(date_var, axis=1, inplace=True)
CB_model_valid.drop(date_var, axis=1, inplace=True)

In [41]:
print('CB_trn: ', CB_trn.shape)

CB_trn:  (37539, 1682)


In [42]:
group_col_df = group_col_df[group_col_df['Variable'].isin(date_var) == False]
print(group_col_df.shape)

(1681, 15)


In [43]:
# column 분류
num_cols = CB_trn.columns[CB_trn.dtypes != object].tolist()
num_cols.remove('BAD')
cate_cols = CB_trn.columns[CB_trn.dtypes == object].tolist()

In [44]:
# 분산이 0인 column drop
std = CB_trn[num_cols].apply(np.std)
print(std[std == 0])
std = std[std != 0]

B12000200         0.0
B22000100         0.0
B22000200         0.0
B22000300         0.0
B24000600         0.0
                 ... 
A5WC0000001300    0.0
A5WC0000001200    0.0
A5WC0000004400    0.0
A5WC0000008400    0.0
A5RCLSRL039100    0.0
Length: 156, dtype: float64


In [45]:
# 범주가 1개인 column drop
cate_std = CB_trn[cate_cols].apply(lambda x: len(pd.unique(x)))
print(len(cate_std[cate_std == 1]))
print(cate_std[cate_std == 1])
cate_std = cate_std[cate_std != 1]

6
BS0000145         1
PE0000145         1
KC1400011         1
KC1400015         1
KC1400016         1
A5RCLSRL025700    1
dtype: int64


In [46]:
# column 분류 update
num_cols = list(std.keys())
cate_cols = list(cate_std.keys())

In [47]:
X_cols = cate_cols + num_cols
print('x_cols: ', len(X_cols))
print('num_cols: ', len(num_cols))
print('cate_cols: ', len(cate_cols))

x_cols:  1519
num_cols:  1485
cate_cols:  34


In [48]:
# X,Y 분리
CB_trn_y = CB_trn[['BAD']]
CB_val_y = CB_val[['BAD']]
CB_tst_y = CB_tst[['BAD']]
CB_model_valid_y = CB_model_valid[['BAD']]

In [49]:
print('CB_trn_y: ', CB_trn_y.shape)
print('CB_val_y: ', CB_val_y.shape)
print('CB_tst_y: ', CB_tst_y.shape)

CB_trn_y:  (37539, 1)
CB_val_y:  (12566, 1)
CB_tst_y:  (12752, 1)


In [50]:
CB_trn_X = CB_trn[X_cols]
CB_val_X = CB_val[X_cols]
CB_tst_X = CB_tst[X_cols]
CB_model_valid_X = CB_model_valid[X_cols]

In [51]:
use_var = CB_trn_X.columns
use_var = list(use_var)

In [52]:
CB_uni_trn_X = CB_trn_X[use_var]
CB_uni_val_X = CB_val_X[use_var]
CB_uni_tst_X = CB_tst_X[use_var]
CB_uni_model_valid_X = CB_model_valid_X[use_var]

In [53]:
print('CB_uni_trn_X: ', CB_uni_trn_X.shape)
print('CB_uni_val_X: ', CB_uni_val_X.shape)
print('CB_uni_tst_X: ', CB_uni_tst_X.shape)

CB_uni_trn_X:  (37539, 1519)
CB_uni_val_X:  (12566, 1519)
CB_uni_tst_X:  (12752, 1519)


In [54]:
cate_cols = CB_trn_X.columns[CB_trn_X.dtypes == object]
num_cols = CB_trn_X.columns.drop(cate_cols)

In [55]:
print('num_cols: ', len(num_cols))
print('cate_cols: ', len(cate_cols))

num_cols:  1485
cate_cols:  34


In [56]:
print(CB_trn_X.head(2))

   AS0000138 BS0000114 CA0000801 EW0002801 PS0000388 PS0000392 BE0000801  \
no                                                                         
2          1         0         1         1         0         0         0   
3          1         0         1         8         0         0         0   

   BE0000802 BS0000113 LA0200801  ... P2O000500_1_s6 LA0000001_1_s9  \
no                                ...                                 
2          0         0         0  ...            0.0           0.16   
3          0         0         0  ...            0.0           0.00   

   LA0000020_1_s9 LA0000227_1_s9 P2E000500_1_s9 LA0000001_1_s12  \
no                                                                
2            0.16         453.52            0.0             0.1   
3            0.00           0.00            0.0             0.0   

   LA0000020_1_s12 LA0000204_1_s12 LA0000227_1_s12 P2O000500_1_s12  
no                                                                  
2  

In [57]:
# 단변량 분석
# binned df 생성
binned_trn_X = CB_uni_trn_X.copy()
binned_val_X = CB_uni_val_X.copy()
binned_tst_X = CB_uni_tst_X.copy()
binned_model_valid_X = CB_uni_model_valid_X.copy()

In [58]:
# import modules(univariate)
from univariate.fine_classing import fine_classing, parallelize_fine_classing
from univariate.coarse_classing import coarse_classing, parallelize_coarse_classing
from univariate.calc_variable_statistics import calc_variable_stats, parallelize_calc_variable_stats
from univariate.performance_table import base_performance_table
from univariate.univariate_analysis_other_psi import select_candidate_variable

In [59]:
# fine classing
fine_start = datetime.now()
q = np.linspace(0, 1, 51)
fine_classing_result = parallelize_fine_classing(x_train_df=binned_trn_X,
                                                 x_valid_df=binned_val_X,
                                                 x_test_df=binned_tst_X,
                                                 x_model_valid_df=binned_model_valid_X,
                                                 numeric_columns_list=num_cols,
                                                 bin_cut=q,
                                                 func=fine_classing,
                                                 processes=6)

binned_trn_X = binned_trn_X.drop(num_cols, axis=1)
binned_trn_X[num_cols] = fine_classing_result[0]

binned_val_X = binned_val_X.drop(num_cols, axis=1)
binned_val_X[num_cols] = fine_classing_result[1]

binned_tst_X = binned_tst_X.drop(num_cols, axis=1)
binned_tst_X[num_cols] = fine_classing_result[2]

binned_model_valid_X = binned_model_valid_X.drop(num_cols, axis=1)
binned_model_valid_X[num_cols] = fine_classing_result[3]

fine_end = datetime.now()
fine_time = (fine_end - fine_start).total_seconds()
print(f'fine classing time: {fine_time // 60}분 {fine_time % 60}초')

fine classing time: 1.0분 26.771994000000007초


In [60]:
print('binned_trn_X: ', binned_trn_X.shape)
print('fine_classing_result 0: ', fine_classing_result[0].shape)

binned_trn_X:  (37539, 1519)
fine_classing_result 0:  (37539, 1485)


In [61]:
# bin==1 변수 drop
bin_cnt = binned_trn_X.apply(lambda x: len(pd.unique(x)))
remove_vars = bin_cnt[bin_cnt == 1].keys().tolist()
print('remove_vars: ', len(remove_vars))

remove_vars:  326


In [62]:
binned_trn_X = binned_trn_X.drop(remove_vars, axis=1)
binned_val_X = binned_val_X.drop(remove_vars, axis=1)
binned_tst_X = binned_tst_X.drop(remove_vars, axis=1)
binned_model_valid_X = binned_model_valid_X.drop(remove_vars, axis=1)

use_var = binned_trn_X.columns.tolist()

In [63]:
print(binned_trn_X.head())

   AS0000138 BS0000114 CA0000801 EW0002801 PS0000388 PS0000392 BE0000801  \
no                                                                         
2          1         0         1         1         0         0         0   
3          1         0         1         8         0         0         0   
5          1         0         1         0         0         0         0   
6          1         0         1         1         0         0         0   
7          1         1         1         0         0         0         0   

   BE0000802 BS0000113 LA0200801  ... P2O000500_1_s6 LA0000001_1_s9  \
no                                ...                                 
2          0         0         0  ...    (-inf, 0.0]   (0.15, 0.19]   
3          0         0         0  ...    (-inf, 0.0]   (-0.05, 0.0]   
5          0         0         0  ...    (-inf, 0.0]    (0.0, 0.05]   
6          0         0         0  ...    (-inf, 0.0]   (-0.05, 0.0]   
7          0         0         0  ...    

In [64]:
def intersection(lst1, lst2): 
    lst3 = list(set(lst1).intersection(set(lst2)))
    return lst3 

In [65]:
num_cols = intersection(use_var, num_cols)
print('num_cols: ', len(num_cols))

num_cols:  1159


In [66]:
# coarse classing
bin_cnt = binned_trn_X.apply(lambda x: len(pd.unique(x)))
coarse_variable_list = bin_cnt[bin_cnt > 10].keys().tolist()
print('coarse_variable: ', len(coarse_variable_list))

coarse_variable:  369


In [67]:
coarse_start = datetime.now()
coarse_classing_result = parallelize_coarse_classing(x_binned_train_df = binned_trn_X,
                                                     y_train_df = CB_trn_y,
                                                     x_binned_valid_df = binned_val_X,
                                                     x_binned_test_df = binned_tst_X,
                                                     x_binned_model_valid_df = binned_model_valid_X,
                                                     y_column_name = 'BAD',
                                                     func = coarse_classing,
                                                     processes = 6,
)

binned_trn_X = coarse_classing_result[0]
binned_val_X = coarse_classing_result[1]
binned_tst_X = coarse_classing_result[2]
binned_model_valid_X = coarse_classing_result[3]

coarse_end= datetime.now()
coarse_time = (coarse_end-coarse_start).total_seconds()
print(f'coarse classing 수행시간 : {coarse_time // 60}분 {coarse_time % 60}초')

coarse classing 수행시간 : 2.0분 49.443659999999994초


In [68]:
# concatenate binned_df
trn_binned_df = pd.concat([CB_trn_y, binned_trn_X], axis=1)
val_binned_df = pd.concat([CB_val_y, binned_val_X], axis=1)
tst_binned_df = pd.concat([CB_tst_y, binned_tst_X], axis=1)
model_valid_binned_df = pd.concat([CB_model_valid_y, binned_model_valid_X], axis=1)

In [70]:
# calculate variable statistics
calc_start = datetime.now()
trn_stats_dict = parallelize_calc_variable_stats(x_binned_df = binned_trn_X,
                                                 y_df = CB_trn_y,
                                                 use_columns_list = use_var,
                                                 y_column_name = 'BAD',
                                                 processes = 6,
                                                 func = calc_variable_stats,)

val_stats_dict = parallelize_calc_variable_stats(x_binned_df = binned_val_X,
                                                 y_df = CB_val_y,
                                                 use_columns_list = use_var,
                                                 y_column_name = 'BAD',
                                                 processes = 6,
                                                 func = calc_variable_stats,)

tst_stats_dict = parallelize_calc_variable_stats(x_binned_df = binned_tst_X,
                                                 y_df = CB_tst_y,
                                                 use_columns_list = use_var,
                                                 y_column_name = 'BAD',
                                                 processes = 6,
                                                 func = calc_variable_stats,)

calc_end = datetime.now()
calc_time = (calc_end-calc_start).total_seconds()
print(f'calculate variable statistics 수행시간 : {calc_time // 60}분 {calc_time % 60}초')

calculate variable statistics 수행시간 : 2.0분 32.005255000000005초


In [71]:
# basic performance table
perf_start = datetime.now()
performance_table = base_performance_table(train_calc_stats_dict = trn_stats_dict, 
                                            valid_calc_stats_dict = val_stats_dict, 
                                            test_calc_stats_dict = tst_stats_dict, 
                                            layout = col_df)
perf_end = datetime.now()
perf_time = (perf_end-calc_start).total_seconds()
print(f'basic performance table 수행시간 : {perf_time // 60}분 {perf_time % 60}초')

basic performance table 수행시간 : 2.0분 37.114667초


In [72]:
print(performance_table.head(2))

                  NO  aggvar_name group_code1 group_code2  group_2  min_value  \
Variable                                                                        
A5RCLSDT011100  1488        총대출공여        웰컴항목        웰컴정보      6.0        1.0   
A5RCLSDT011300  1514  제2금융기업총대출공여        웰컴항목        웰컴정보      6.0        1.0   

                max_value  median  is_na  min_check  ...  welc_can_use  \
Variable                                             ...                 
A5RCLSDT011100        6.0     1.0      0          0  ...         활용 가능   
A5RCLSDT011300        6.0     1.0      0          0  ...         활용 가능   

                  ks_trn    ks_val    ks_tst    ar_trn    ar_val    ar_tst  \
Variable                                                                     
A5RCLSDT011100  0.133226  0.043998  0.152178  0.133226  0.043998  0.152178   
A5RCLSDT011300  0.527653  1.027541  0.771895  0.527653  1.027541  0.771895   

                  iv_trn    iv_val    iv_tst  
Variable          

In [73]:
# 후보변수 선정
age_cols = ['AS0000136', 'AS0000137', 'AS0000140']

trn_binned_df.drop(age_cols, axis=1, inplace=True)
val_binned_df.drop(age_cols, axis=1, inplace=True)
tst_binned_df.drop(age_cols, axis=1, inplace=True)
model_valid_binned_df.drop(age_cols, axis=1, inplace=True)

In [75]:
print('trn_binned_df: ', trn_binned_df.shape)

trn_binned_df:  (37539, 1191)


In [76]:
performance_table.drop(age_cols, axis=0, inplace=True)
print('performance_table: ', performance_table.shape)

performance_table:  (1190, 23)


In [77]:
use_var = trn_binned_df.columns.tolist()
use_var.remove('BAD')

In [78]:
cate_cols = performance_table[performance_table['cate_var'] == 1].index.tolist()
print('cate_cols: ', len(cate_cols))

cate_cols:  33


In [79]:
print(trn_binned_df.head(2))

    BAD AS0000138 BS0000114 CA0000801 EW0002801 PS0000388 PS0000392 BE0000801  \
no                                                                              
2     0         1         0         1         1         0         0         0   
3     0         1         0         1         8         0         0         0   

   BE0000802 BS0000113  ... P2O000500_1_s6 LA0000001_1_s9 LA0000020_1_s9  \
no                      ...                                                
2          0         0  ...    (-inf, 0.0]   (0.15, 0.33]   (0.15, 0.21]   
3          0         0  ...    (-inf, 0.0]   (-0.05, 0.0]   (-0.04, 0.0]   

       LA0000227_1_s9 P2E000500_1_s9 LA0000001_1_s12 LA0000020_1_s12  \
no                                                                     
2   (435.556, 545.45]    (-inf, 0.0]    (0.07, 0.11]    (0.07, 0.11]   
3    (-16.58, 22.454]    (-inf, 0.0]    (-0.02, 0.0]    (-0.01, 0.0]   

       LA0000204_1_s12      LA0000227_1_s12 P2O000500_1_s12  
no                 

In [80]:
# 필터링 항목 drop
filter_path = r'D:\Seminar Documents\신용평가 세미나'
near_filter = pd.read_excel(os.path.join(filter_path, 'AS필터링항목_최종_레이아웃_200206_v1.0.xlsx'),
                           sheet_name = 'Near-Prime 필터링 필요항목 레이아웃')
sub_filter = pd.read_excel(os.path.join(filter_path, 'AS필터링항목_최종_레이아웃_200206_v1.0.xlsx'),
                           sheet_name = 'Sub-Prime 필터링 필요항목 레이아웃')

In [81]:
near_var = near_filter['항목코드'].tolist()
sub_var = sub_filter['항목코드'].tolist()
total_var = list(set(near_var).union(sub_var))

In [83]:
print('near_var: ', len(near_var))
print('sub_var: ', len(sub_var))
print('total_var: ', len(total_var))

near_var:  82
sub_var:  86
total_var:  97


In [84]:
df_cols = trn_binned_df.columns.tolist()
need_var = list(set(df_cols).difference(total_var))
print('df_cols: ', len(df_cols))
print('need_var: ', len(need_var))

df_cols:  1191
need_var:  1130


In [85]:
trn_binned_df = trn_binned_df[need_var]
val_binned_df = val_binned_df[need_var]
tst_binned_df = tst_binned_df[need_var]
model_valid_binned_df = model_valid_binned_df[need_var]

In [86]:
print('trn_binned_df: ', trn_binned_df.shape)

trn_binned_df:  (37539, 1130)


In [87]:
print('performance_table: ', performance_table.shape)

performance_table:  (1190, 23)


In [88]:
use_var = trn_binned_df.columns.tolist()
use_var.remove('BAD')

In [89]:
cate_cols = performance_table[performance_table['cate_var'] == 1].index.tolist()
print('cate_cols: ', len(cate_cols))

cate_cols:  33


In [90]:
other_columns_list = performance_table[performance_table['group_2'] == 3].index.tolist()
print('other_columns: ', len(other_columns_list))

other_columns:  188


In [91]:
# select candidate variable
select_start = datetime.now()
final_result = select_candidate_variable(
                                        binned_train_df = trn_binned_df,
                                        binned_valid_df = val_binned_df,
                                        binned_test_df = tst_binned_df,
                                        binned_model_valid_df = model_valid_binned_df,
                                        performance_table = performance_table,
                                        use_columns_list = use_var,
                                        category_columns_list = cate_cols,
                                        other_columns_list = other_columns_list,
                                        other_psi_value=0.05,
                                        performance_item_list = ['iv'],
                                        group_divide_col_name = 'group_2',
                                        base_psi_value = 0.1,
                                        top_value = 0.1,
                                        stable_value = 0.3,
                                        corr_value = 0.8
                                                   )

select_end = datetime.now()
select_time = (select_end-select_start).total_seconds()
print(f'select candidate variable 수행시간 : {select_time//60}(분) {select_time%60}(초)')

select candidate variable 수행시간 : 1.0(분) 51.474988999999994(초)


In [92]:
final_performance_table = final_result[0]
corr_result_dict = final_result[1]
remove_corr_result_dict = final_result[2]

In [93]:
pd.set_option('display.max_columns', 100)

In [94]:
final_performance_table.head(2)

Unnamed: 0_level_0,NO,aggvar_name,group_code1,group_code2,group_2,min_value,max_value,median,is_na,min_check,max_check,date_var,cate_var,welc_can_use,ks_trn,ks_val,ks_tst,ar_trn,ar_val,ar_tst,iv_trn,iv_val,iv_tst,val_psi,val_psi_안정성,tst_psi,tst_psi_안정성,val_2019_psi,val_2019_psi_안정성,PSI_선정여부,iv_top_YN,변별력,iv_stable_YN,안정성,상관분석
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
A5RCLSDT011100,1488,총대출공여,웰컴항목,웰컴정보,6.0,1.0,6.0,1.0,0,0,0,0,0,활용 가능,0.133226,0.043998,0.152178,0.133226,0.043998,0.152178,0.000901,0.0001,0.001029,3.42576e-08,Y,3.8e-05,Y,0.002853,Y,Y,N,N,N,N,N
A5RCLSDT011300,1514,제2금융기업총대출공여,웰컴항목,웰컴정보,6.0,1.0,6.0,1.0,0,0,0,0,0,활용 가능,0.527653,1.027541,0.771895,0.527653,1.027541,0.771895,0.001424,0.005207,0.00311,1.733874e-05,Y,1.3e-05,Y,0.000129,Y,Y,N,N,N,N,N


In [95]:
remove_corr_result_dict['1.0'].head(2)

Variable,LU0000902_1,LA0000203_s12,LA0000222_s12,LA0000203_s6,LA0000204_s3,LS0000058,LA0000020_s12,LS0000607,LA0000001_s9,LH000000E,LA0000227_s9,LC0000901,LA2400901,LC009920E,LA1214204,LA1200203,LA0600203,LA1299203
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
LU0000902_1,0.0,0.048265,0.024924,0.048588,0.013481,0.031327,0.030783,0.146632,0.047194,0.158563,0.041847,0.697336,0.124474,0.090916,0.068714,0.071207,0.073225,0.015429
LA0000203_s12,0.0,0.0,0.799158,0.666606,0.082623,0.578902,0.649058,0.083408,0.688541,0.128725,0.579385,0.13571,0.187969,0.252428,0.300048,0.725277,0.595849,0.479285


In [96]:
# 저장
save_dir = r'D:\Seminar Documents\신용평가 세미나\복습'
final_performance_table.to_excel(os.path.join(save_dir,'단변량모듈사용_단변량결과.xlsx'))