In [128]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

# <font color='orange'>01. load data</font>

In [129]:
# Directory that holds the input files read by this notebook.
# NOTE(review): absolute, machine-specific Windows path — must be changed to run anywhere else.
rootpath = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\temp'

In [130]:
os.listdir(rootpath)

['temp1_inner_total_df.csv',
 'temp2_inner_total_df.csv',
 'temp3_inner_total_df.csv',
 'total_df.pkl']

In [131]:
# Load total_df from the pickled file under rootpath.
# NOTE(review): unpickling can execute arbitrary code — only load pickles from a trusted source.
total_df = pd.read_pickle(os.path.join(rootpath, 'total_df.pkl'), )

In [132]:
# Load inner_total_df from temp3_inner_total_df.csv; the derived columns built below are
# appended to this frame, which is written back out as temp4 at the end of the notebook.
inner_total_df = pd.read_csv(os.path.join(rootpath, 'temp3_inner_total_df.csv'), )

In [133]:
total_df.head(2)

Unnamed: 0,no,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,AS0000136,AS0000137,AS0000138,AS0000139,...,P2O000500_1_s6,LA0000001_1_s9,LA0000020_1_s9,LA0000227_1_s9,P2E000500_1_s9,LA0000001_1_s12,LA0000020_1_s12,LA0000204_1_s12,LA0000227_1_s12,P2O000500_1_s12
0,1,201612,,5.0,5.0,0.0,26,-1,1,0,...,0.0,0.15,0.15,345.45,0.0,0.09,0.09,214.29,214.29,0.0
1,2,201612,D,5.0,2.0,0.0,-1,32,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
inner_total_df.head(2)

Unnamed: 0,no,신청월,직군그룹,근속기간1,웰컴_소득구간,심사원장_소득구간,EXECUTE_FG,DESTRUCT_FG,Y_2017_FG,Y_2018_FG,...,SB_HOUSE_FG,COOP_HOUSE_FG,CREDUNION_HOUSE_FG,CAP_HOUSE_FG,MG_HOUSE_FG,ETC_HOUSE_FG,ACC_SB_HOUSE_FG,ACC_ML_HOUSE_FG,ACC_P2P_HOUSE_FG,TOTAL_HOUSE_FG
0,1,201612,,5.0,5.0,0.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,201612,D,5.0,2.0,0.0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


# <font color='orange'>02. 연소득 컬럼 생성</font>

* A5WC0000008000 웰컴크레디라인대부최종연소득
* A5WC0000000200 웰컴크레디라인대부최종연소득
* IE0300007 NICE최종연소득금액_IE0305_000__금융CB

 

    IF A5WC0000008000 > 0 THEN INCOME = A5WC0000008000 ;
    ELSE IF A5WC0000000200 > 0 THEN INCOME = A5WC0000000200 ;
    ELSE IF IE0300007 > 0 THEN INCOME = IE0300007*10 ;
    ELSE INCOME = 0 ;



* 연소득 컬럼 명명
  * INCOME(천원단위)

In [135]:
def _get_year_income(A5WC0000008000, A5WC0000000200, IE0300007):
    year_income = 0
    if A5WC0000008000 > 0:
        # 천원단위
        year_income = A5WC0000008000 ;
    elif A5WC0000000200 > 0:
        # 천원단위
        year_income = A5WC0000000200 ;
    elif IE0300007 > 0:
        # 만원단위여서 * 10
        year_income = IE0300007*10 ;
    else:
        year_income = 0 ;
        
    return year_income

In [136]:
need_cols = ['A5WC0000008000', 'A5WC0000000200', 'IE0300007']

In [137]:
total_df[need_cols].head()

Unnamed: 0,A5WC0000008000,A5WC0000000200,IE0300007
0,30000,30000,3000
1,19000,19000,1900
2,24000,24000,2400
3,36000,36000,3600
4,33000,33000,3300


In [138]:
# Build INCOME row by row: the three source columns of each row are handed to
# _get_year_income, which picks the first positive one (see its definition above).
# NOTE(review): apply(axis=1) makes one Python call per row; a vectorised form would be
# faster, but confirm the resulting dtype is unchanged before switching.
total_df['INCOME'] = total_df[need_cols].apply(lambda x: _get_year_income(x['A5WC0000008000'],
                                                                          x['A5WC0000000200'],
                                                                          x['IE0300007']), axis = 1)

In [139]:
# Copy INCOME onto inner_total_df. Assigning a Series aligns on index labels, not on position.
# NOTE(review): assumes inner_total_df (read from CSV) and total_df (read from pickle) describe
# the same rows under the same index labels — TODO confirm, e.g. by comparing the `no` columns.
inner_total_df['INCOME'] = total_df['INCOME']

# <font color='orange'>03. 연소득 컬럼 구간 interval 컬럼 생성</font>

```
SEG 1
    0~12000미만 = 1
    12000~16000미만 = 2
    16000~20000미만 = 3
    20000~24000미만 = 4

SEG 2
    24000~27000미만 = 5
    27000~30000미만 = 6
    30000~33000미만 = 7
    33000~35000미만 = 8

SEG 3
    35000~40000미만 = 9
    40000~45000미만 = 10
    45000~50000미만 = 11
    50000~55000미만 = 12
    55000~60000미만 = 13
    60000이상 = 14
```

* 연소득 구간화 컬럼 명명
  * INCOME_INTERVAL

### 방법1. pandas cut 사용

In [140]:
# Bin edges for INCOME: 15 edges define 14 consecutive bins, and the final
# np.inf edge leaves the last bin (60000 and above) open-ended.
income_edge_list= [0,12000,16000,20000,24000,
                   27000,30000,33000,35000,
                  40000,45000,50000,55000,60000,np.inf]

In [141]:
# Assign each row's INCOME to its interval
## Bin boundaries: lower edge inclusive, upper edge exclusive (right=False)
income_interval = pd.cut(total_df['INCOME'], bins = income_edge_list, right=False, )

In [142]:
income_interval

0         [30000.0, 33000.0)
1         [16000.0, 20000.0)
2         [24000.0, 27000.0)
3         [35000.0, 40000.0)
4         [33000.0, 35000.0)
                 ...        
418974    [35000.0, 40000.0)
418975    [27000.0, 30000.0)
418976    [24000.0, 27000.0)
418977    [12000.0, 16000.0)
418978    [27000.0, 30000.0)
Name: INCOME, Length: 418979, dtype: category
Categories (14, interval[float64]): [[0.0, 12000.0) < [12000.0, 16000.0) < [16000.0, 20000.0) < [20000.0, 24000.0) ... [45000.0, 50000.0) < [50000.0, 55000.0) < [55000.0, 60000.0) < [60000.0, inf)]

In [143]:
# The distinct interval categories
income_interval_categories = income_interval.cat.categories.values

# Integer category code of each row
income_interval_codes = income_interval.cat.codes.values
# Shift by +1 so the interval codes start at 1 instead of 0
# (pandas uses code -1 for a row without a category, so such a row ends up as 0)
income_interval_codes = income_interval_codes + 1

In [144]:
total_df['INCOME_INTERVAL'] = income_interval_codes

In [145]:
# Copy the pd.cut-based interval codes onto inner_total_df (Series assignment aligns on index labels).
inner_total_df['INCOME_INTERVAL'] = total_df['INCOME_INTERVAL']

### 방법2. 함수 정의

In [146]:
def _get_year_income_interval(year_income):
    year_income_interval = 0
    # SEG 1 
    if 0 <= year_income < 12000:
        year_income_interval = 1
    elif 12000 <= year_income < 16000:
        year_income_interval = 2
    elif 16000 <= year_income < 20000:
        year_income_interval = 3
    elif 20000 <= year_income < 24000:
        year_income_interval = 4
        
    # SEG 2
    elif 24000 <= year_income < 27000:
        year_income_interval = 5
    elif 27000 <= year_income < 30000:
        year_income_interval = 6
    elif 30000 <= year_income < 33000:
        year_income_interval = 7
    elif 33000 <= year_income < 35000:
        year_income_interval = 8
    
    # SEG 3
    elif 35000 <= year_income < 40000:
        year_income_interval = 9
    elif 40000 <= year_income < 45000:
        year_income_interval = 10
    elif 45000 <= year_income < 50000:
        year_income_interval = 11
    elif 50000 <= year_income < 55000:
        year_income_interval = 12
    elif 55000 <= year_income < 60000:
        year_income_interval = 13
    else:
        year_income_interval = 14
        
    return year_income_interval

In [147]:
# Method 2: recompute INCOME_INTERVAL with the plain-Python mapper, one value at a time.
# This overwrites the INCOME_INTERVAL column written by method 1 (pd.cut) earlier in the notebook.
total_df['INCOME_INTERVAL'] = total_df['INCOME'].apply(lambda x: _get_year_income_interval(x))

In [148]:
# Copy the function-based interval codes onto inner_total_df (Series assignment aligns on index labels).
inner_total_df['INCOME_INTERVAL'] = total_df['INCOME_INTERVAL']

# <font color='orange'>04. UDIR 구간화 컬럼 생성</font>

* A5RCLSRL013400 웰컴용UDIR 

```
    0~50미만 = 1
    50~100미만 = 2
    100~150미만 = 3
    150~200미만 = 4
    200~250미만 = 5
    250~300이하 = 6
    300초과 = 7
```


* UDIR 구간화 컬럼 명명
  * UDIR_INTERVAL

In [149]:
def _get_udir_interval(A5RCLSRL013400):
    udir_interval = 0
    
    if 0 <= A5RCLSRL013400 < 50:
        udir_interval = 1
    elif 50 <= A5RCLSRL013400 < 100:
        udir_interval = 2
    elif 100 <= A5RCLSRL013400 < 150:
        udir_interval = 3
    elif 150 <= A5RCLSRL013400 < 200:
        udir_interval = 4
    elif 200 <= A5RCLSRL013400 < 250:
        udir_interval = 5
    elif 250 <= A5RCLSRL013400 <= 300:
        udir_interval = 6
    elif 300 < A5RCLSRL013400:
        udir_interval = 7
        
    return udir_interval

In [150]:
# Bucket the UDIR column (A5RCLSRL013400) into UDIR_INTERVAL, one value at a time.
total_df['UDIR_INTERVAL'] = total_df['A5RCLSRL013400'].apply(lambda x: _get_udir_interval(x))

In [151]:
# Carry both the raw UDIR value (under the friendlier name UDIR) and its interval code
# over to inner_total_df; Series assignment aligns on index labels.
inner_total_df['UDIR'] = total_df['A5RCLSRL013400']
inner_total_df['UDIR_INTERVAL'] = total_df['UDIR_INTERVAL']

# <font color='orange'>05. SEG 생성</font>

In [152]:
def _get_seg(year_income_interval, udir_interval):
    seg = 0
    if (year_income_interval in [1,2,3,4]) and (udir_interval in [1,2,3,4]):
        seg = 1
    elif (year_income_interval in [5,6,7,8]) and (udir_interval in [1,2,3,4,5]): 
        seg = 2
    elif (year_income_interval in [9,10,11,12,13,14]) and (udir_interval in [1,2,3,4,5,6]):
        seg = 3
        
    return seg

In [153]:
# Rebinds need_cols (earlier: the three income source columns) to the two inputs of _get_seg.
need_cols = ['INCOME_INTERVAL','UDIR_INTERVAL']

In [154]:
# Derive SEG row by row from the income and UDIR interval codes via _get_seg.
total_df['SEG'] = total_df[need_cols].apply(lambda x: _get_seg(x['INCOME_INTERVAL'],
                                                  x['UDIR_INTERVAL']), axis = 1)

In [155]:
# Copy SEG onto inner_total_df (Series assignment aligns on index labels).
inner_total_df['SEG'] = total_df['SEG']

# <font color='orange'>06. SEG 재구성</font>

* SEG 1, 2 -> FINAL_SEG 1
* SEG 3 -> FINAL_SEG 2

In [156]:
def _get_final_seg(seg):
    final_seg = 0
    
    if seg == 1 or seg == 2:
        final_seg = 1
    elif seg == 3:
        final_seg = 2
    return final_seg

In [157]:
# Collapse SEG into FINAL_SEG, one value at a time, via _get_final_seg.
total_df['FINAL_SEG'] = total_df['SEG'].apply(lambda x: _get_final_seg(x))

In [158]:
# Copy FINAL_SEG onto inner_total_df (Series assignment aligns on index labels).
inner_total_df['FINAL_SEG']=total_df['FINAL_SEG'] 

# <font color='orange'>07. 저장</font>

In [159]:
# Output directory — the same location as rootpath at the top of the notebook.
# NOTE(review): absolute, machine-specific Windows path — must be changed to run anywhere else.
save_path = r'C:\Users\w10\Desktop\신용평가모형 세미나\2주차\데이터\temp'

In [160]:
# Write inner_total_df, including the columns added in this notebook, as temp4;
# index=False keeps the DataFrame index out of the file.
inner_total_df.to_csv(os.path.join(save_path, 'temp4_inner_total_df.csv'), index = False)