1. 작성 함수

type_transform(): int 타입 변수를 float형태로 변환

replace_work_period(): 근로기간에서 같은 의미를 담고 있지만 다르게 작성된 부분을 통일하는 함수

simple_imputation(): 단순 확률 대치법으로 nan값을 대체하는 함수 

loan_purpose(): 대출 목적 피쳐를 변환하고 통일하는 함수

2. 주요 전처리 사항

근로기간에서 같은 의미를 담고 있지만 다르게 작성된 부분을 통일

대출 목적에서 중요 목적을 제외한 나머지 클래스들을 '기타'로 통일

In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

plt.rc("font", family = "Malgun Gothic")
sns.set(font="Malgun Gothic", 
rc={"axes.unicode_minus":False}, style='white')




In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.drop(['ID'], axis=1, inplace=True)
test.drop(['ID'], axis=1, inplace=True)

In [3]:
train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,12480000,36 months,6 years,RENT,72000000,18.90,15,부채 통합,0,0,0.0,0.0,0.0,C
1,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,12000000,36 months,5 years,MORTGAGE,96000000,8.60,14,부채 통합,0,928644,151944.0,0.0,0.0,A
3,14400000,36 months,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C
4,18000000,60 months,Unknown,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,14400000,36 months,10+ years,MORTGAGE,210000000,9.33,33,신용 카드,0,974580,492168.0,0.0,0.0,C
96290,28800000,60 months,10+ years,MORTGAGE,132000000,5.16,25,주택 개선,0,583728,855084.0,0.0,0.0,E
96291,14400000,36 months,1 year,MORTGAGE,84000000,11.24,22,신용 카드,0,1489128,241236.0,0.0,0.0,A
96292,15600000,36 months,5 years,MORTGAGE,66330000,17.30,21,부채 통합,2,1378368,818076.0,0.0,0.0,D


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96294 entries, 0 to 96293
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   대출금액          96294 non-null  int64  
 1   대출기간          96294 non-null  object 
 2   근로기간          96294 non-null  object 
 3   주택소유상태        96294 non-null  object 
 4   연간소득          96294 non-null  int64  
 5   부채_대비_소득_비율   96294 non-null  float64
 6   총계좌수          96294 non-null  int64  
 7   대출목적          96294 non-null  object 
 8   최근_2년간_연체_횟수  96294 non-null  int64  
 9   총상환원금         96294 non-null  int64  
 10  총상환이자         96294 non-null  float64
 11  총연체금액         96294 non-null  float64
 12  연체계좌수         96294 non-null  float64
 13  대출등급          96294 non-null  object 
dtypes: float64(4), int64(5), object(5)
memory usage: 10.3+ MB


In [5]:
X = train.drop(['대출등급'], axis=1)
y = train['대출등급']

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
y_train.value_counts()

B    23010
C    22129
A    13501
D    10612
E     5906
F     1537
G      340
Name: 대출등급, dtype: int64

In [8]:
y_val.value_counts()

B    5807
C    5494
A    3271
D    2742
E    1448
F     417
G      80
Name: 대출등급, dtype: int64

In [9]:
X_train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000,36 months,10+ years,MORTGAGE,120000000,10.10,20,신용 카드,0,2914824,625428.0,0.0,0.0
20086,2880000,36 months,2 years,RENT,16200000,33.78,16,신용 카드,0,65172,28428.0,0.0,0.0
43318,14400000,36 months,5 years,RENT,240000000,3.76,37,부채 통합,0,338304,125664.0,0.0,0.0
46440,7800000,36 months,10+ years,RENT,40149600,22.92,28,신용 카드,0,385116,97944.0,0.0,0.0
9774,17760000,60 months,1 year,RENT,54000000,13.41,29,부채 통합,0,442560,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000,36 months,3 years,MORTGAGE,87000000,22.35,47,부채 통합,0,389088,74544.0,0.0,0.0
54886,42000000,60 months,10+ years,MORTGAGE,168000000,23.90,36,부채 통합,0,1089564,648168.0,0.0,0.0
76820,18000000,36 months,Unknown,RENT,48000000,18.06,26,신용 카드,0,1812528,411972.0,0.0,0.0
860,24000000,36 months,10+ years,MORTGAGE,123600000,21.88,73,주택 개선,0,2395752,597180.0,0.0,0.0


In [10]:
X_val

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000,60 months,8 years,MORTGAGE,72000000,33.74,18,신용 카드,0,530268,427224.0,0.0,0.0
61621,34830000,36 months,10+ years,OWN,79200000,9.67,41,기타,3,741732,475968.0,0.0,0.0
127,14400000,36 months,1 year,RENT,91200000,9.81,12,신용 카드,1,309456,176700.0,0.0,0.0
65562,21900000,36 months,< 1 year,MORTGAGE,132000000,17.72,38,부채 통합,0,982080,510948.0,0.0,0.0
26735,12000000,36 months,10+ years,MORTGAGE,102000000,13.64,30,소규모 사업,0,559536,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000,60 months,3 years,RENT,66000000,19.18,12,부채 통합,0,1268280,1549476.0,0.0,0.0
43079,5760000,36 months,10+ years,RENT,53697600,16.06,25,휴가,0,527880,250512.0,0.0,0.0
54846,15570000,60 months,< 1 year,RENT,134400000,4.82,15,신용 카드,0,357120,365328.0,0.0,0.0
88333,28800000,36 months,1 year,MORTGAGE,74400000,14.36,49,신용 카드,2,2110164,614856.0,0.0,0.0


In [11]:
cate_column = [var for var in X_train if X_train[var].dtype == 'O']
num_column = [var for var in X_train if X_train[var].dtype != 'O'] 

In [12]:
# int 타입 변수들을 모두 float 형태로 변환하는 함수 작성 
def type_transform(data_set):
    for var in num_column:
        if data_set[var].dtype == 'int64':
            data_set[var] = data_set[var].astype(float)

type_transform(X_train)
type_transform(X_val)

In [13]:
X_train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000.0,36 months,10+ years,MORTGAGE,120000000.0,10.10,20.0,신용 카드,0.0,2914824.0,625428.0,0.0,0.0
20086,2880000.0,36 months,2 years,RENT,16200000.0,33.78,16.0,신용 카드,0.0,65172.0,28428.0,0.0,0.0
43318,14400000.0,36 months,5 years,RENT,240000000.0,3.76,37.0,부채 통합,0.0,338304.0,125664.0,0.0,0.0
46440,7800000.0,36 months,10+ years,RENT,40149600.0,22.92,28.0,신용 카드,0.0,385116.0,97944.0,0.0,0.0
9774,17760000.0,60 months,1 year,RENT,54000000.0,13.41,29.0,부채 통합,0.0,442560.0,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000.0,36 months,3 years,MORTGAGE,87000000.0,22.35,47.0,부채 통합,0.0,389088.0,74544.0,0.0,0.0
54886,42000000.0,60 months,10+ years,MORTGAGE,168000000.0,23.90,36.0,부채 통합,0.0,1089564.0,648168.0,0.0,0.0
76820,18000000.0,36 months,Unknown,RENT,48000000.0,18.06,26.0,신용 카드,0.0,1812528.0,411972.0,0.0,0.0
860,24000000.0,36 months,10+ years,MORTGAGE,123600000.0,21.88,73.0,주택 개선,0.0,2395752.0,597180.0,0.0,0.0


In [14]:
X_val

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,60 months,8 years,MORTGAGE,72000000.0,33.74,18.0,신용 카드,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,36 months,10+ years,OWN,79200000.0,9.67,41.0,기타,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,36 months,1 year,RENT,91200000.0,9.81,12.0,신용 카드,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,36 months,< 1 year,MORTGAGE,132000000.0,17.72,38.0,부채 통합,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,36 months,10+ years,MORTGAGE,102000000.0,13.64,30.0,소규모 사업,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,60 months,3 years,RENT,66000000.0,19.18,12.0,부채 통합,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,36 months,10+ years,RENT,53697600.0,16.06,25.0,휴가,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,60 months,< 1 year,RENT,134400000.0,4.82,15.0,신용 카드,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,36 months,1 year,MORTGAGE,74400000.0,14.36,49.0,신용 카드,2.0,2110164.0,614856.0,0.0,0.0


In [15]:
def replace_work_period(data_set):

    data_set.replace({'근로기간' : '<1 year'}, '< 1 year', inplace=True)
    data_set.replace({'근로기간' : '10+years'}, '10+ years', inplace=True)
    data_set.replace({'근로기간' : '1 years'}, '1 year', inplace=True)
    data_set.replace({'근로기간' : '3'}, '3 years', inplace=True)
    data_set.replace({'근로기간' : 'Unknown'}, np.NaN, inplace=True)
    data_set.replace({'주택소유상태': 'ANY'}, 'MORTGAGE', inplace=True)

replace_work_period(X_train)
replace_work_period(X_val)

In [16]:
for var in X_train[cate_column]:
    print(var, X_train[var].unique())

대출기간 [' 36 months' ' 60 months']
근로기간 ['10+ years' '2 years' '5 years' '1 year' '4 years' '7 years' '< 1 year'
 '3 years' '8 years' '9 years' '6 years' nan]
주택소유상태 ['MORTGAGE' 'RENT' 'OWN']
대출목적 ['신용 카드' '부채 통합' '주요 구매' '기타' '이사' '주택 개선' '휴가' '소규모 사업' '재생 에너지' '주택'
 '의료' '자동차']


In [17]:
for var in X_train[cate_column]:
    print(var, X_val[var].unique())

대출기간 [' 60 months' ' 36 months']
근로기간 ['8 years' '10+ years' '1 year' '< 1 year' '2 years' nan '7 years'
 '6 years' '4 years' '5 years' '3 years' '9 years']
주택소유상태 ['MORTGAGE' 'OWN' 'RENT']
대출목적 ['신용 카드' '기타' '부채 통합' '소규모 사업' '주요 구매' '주택 개선' '의료' '자동차' '휴가' '이사' '주택'
 '재생 에너지']


In [18]:
def simple_imputation(df, column_name):
    # 결측치가 있는 열을 선택
    missing_values = df[column_name].isnull()
    
    # 결측치가 아닌 값들의 확률 분포 계산
    non_missing_values = df.loc[~missing_values, column_name]
    probabilities = non_missing_values.value_counts(normalize=True)
    
    # 결측치 대체
    imputed_values = np.random.choice(probabilities.index, size=missing_values.sum(), p=probabilities.values)
    
    # 대체된 값으로 결측치 채우기
    df.loc[missing_values, column_name] = imputed_values
    
    return df


# 단순 확률 대치법 적용
column_to_impute = '근로기간'
simple_imputation(X_train, column_to_impute)
simple_imputation(X_val, column_to_impute)

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,60 months,8 years,MORTGAGE,72000000.0,33.74,18.0,신용 카드,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,36 months,10+ years,OWN,79200000.0,9.67,41.0,기타,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,36 months,1 year,RENT,91200000.0,9.81,12.0,신용 카드,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,36 months,< 1 year,MORTGAGE,132000000.0,17.72,38.0,부채 통합,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,36 months,10+ years,MORTGAGE,102000000.0,13.64,30.0,소규모 사업,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,60 months,3 years,RENT,66000000.0,19.18,12.0,부채 통합,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,36 months,10+ years,RENT,53697600.0,16.06,25.0,휴가,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,60 months,< 1 year,RENT,134400000.0,4.82,15.0,신용 카드,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,36 months,1 year,MORTGAGE,74400000.0,14.36,49.0,신용 카드,2.0,2110164.0,614856.0,0.0,0.0


In [19]:
X_train.isnull().sum()

대출금액            0
대출기간            0
근로기간            0
주택소유상태          0
연간소득            0
부채_대비_소득_비율     0
총계좌수            0
대출목적            0
최근_2년간_연체_횟수    0
총상환원금           0
총상환이자           0
총연체금액           0
연체계좌수           0
dtype: int64

In [20]:
X_val.isnull().sum()

대출금액            0
대출기간            0
근로기간            0
주택소유상태          0
연간소득            0
부채_대비_소득_비율     0
총계좌수            0
대출목적            0
최근_2년간_연체_횟수    0
총상환원금           0
총상환이자           0
총연체금액           0
연체계좌수           0
dtype: int64

In [21]:
def loan_purpose(data_set):

    data_set.replace({'대출목적' : '주택 개선'}, '주택', inplace=True)
    data_set.replace({'대출목적' : '이사'}, '주택', inplace=True)
    data_set.replace({'대출목적' : '주요 구매'}, '생활비', inplace=True)
    data_set.replace({'대출목적' : '휴가'}, '생활비', inplace=True)
    data_set.replace({'대출목적':'의료'}, '생활비', inplace=True)
    data_set.replace({'대출목적':'자동차'}, '생활비', inplace=True)
    data_set.replace({'대출목적':'소규모 사업'}, '기타', inplace=True)
    data_set.replace({'대출목적':'재생 에너지'}, '기타', inplace=True)
    data_set.replace({'대출목적':'결혼'}, '기타', inplace = True)
loan_purpose(X_train)
loan_purpose(X_val)

In [22]:
X_train['대출목적'].value_counts()

부채 통합    44132
신용 카드    19673
주택        5573
기타        4379
생활비       3278
Name: 대출목적, dtype: int64

In [23]:
X_val['대출목적'].value_counts()

부채 통합    11018
신용 카드     4827
주택        1394
기타        1193
생활비        827
Name: 대출목적, dtype: int64

In [24]:
# 다중공선성 확인 1

from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = data.columns
    vif_data["VIF"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
    return vif_data

# 독립 변수만 선택
independent_vars = X_train[num_column]

# VIF 계산
vif_result = calculate_vif(independent_vars)
print(vif_result)

       Variable       VIF
0          대출금액  6.493396
1          연간소득  2.058218
2   부채_대비_소득_비율  1.244949
3          총계좌수  3.695351
4  최근_2년간_연체_횟수  1.176848
5         총상환원금  2.124199
6         총상환이자  3.379988
7         총연체금액  1.005756
8         연체계좌수  1.024213


In [25]:
X_train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000.0,36 months,10+ years,MORTGAGE,120000000.0,10.10,20.0,신용 카드,0.0,2914824.0,625428.0,0.0,0.0
20086,2880000.0,36 months,2 years,RENT,16200000.0,33.78,16.0,신용 카드,0.0,65172.0,28428.0,0.0,0.0
43318,14400000.0,36 months,5 years,RENT,240000000.0,3.76,37.0,부채 통합,0.0,338304.0,125664.0,0.0,0.0
46440,7800000.0,36 months,10+ years,RENT,40149600.0,22.92,28.0,신용 카드,0.0,385116.0,97944.0,0.0,0.0
9774,17760000.0,60 months,1 year,RENT,54000000.0,13.41,29.0,부채 통합,0.0,442560.0,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000.0,36 months,3 years,MORTGAGE,87000000.0,22.35,47.0,부채 통합,0.0,389088.0,74544.0,0.0,0.0
54886,42000000.0,60 months,10+ years,MORTGAGE,168000000.0,23.90,36.0,부채 통합,0.0,1089564.0,648168.0,0.0,0.0
76820,18000000.0,36 months,10+ years,RENT,48000000.0,18.06,26.0,신용 카드,0.0,1812528.0,411972.0,0.0,0.0
860,24000000.0,36 months,10+ years,MORTGAGE,123600000.0,21.88,73.0,주택,0.0,2395752.0,597180.0,0.0,0.0


In [26]:
X_val

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,60 months,8 years,MORTGAGE,72000000.0,33.74,18.0,신용 카드,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,36 months,10+ years,OWN,79200000.0,9.67,41.0,기타,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,36 months,1 year,RENT,91200000.0,9.81,12.0,신용 카드,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,36 months,< 1 year,MORTGAGE,132000000.0,17.72,38.0,부채 통합,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,36 months,10+ years,MORTGAGE,102000000.0,13.64,30.0,기타,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,60 months,3 years,RENT,66000000.0,19.18,12.0,부채 통합,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,36 months,10+ years,RENT,53697600.0,16.06,25.0,생활비,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,60 months,< 1 year,RENT,134400000.0,4.82,15.0,신용 카드,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,36 months,1 year,MORTGAGE,74400000.0,14.36,49.0,신용 카드,2.0,2110164.0,614856.0,0.0,0.0


In [27]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)
X_train_encoded = encoder.fit_transform(X_train[cate_column])
X_val_encoded = encoder.fit_transform(X_val[cate_column])



In [28]:
X_train_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(cate_column),index=X_train.index)
X_val_df = pd.DataFrame(X_val_encoded, columns=encoder.get_feature_names_out(cate_column), index=X_val.index)

In [29]:
X_train_df

Unnamed: 0,대출기간_ 60 months,근로기간_10+ years,근로기간_2 years,근로기간_3 years,근로기간_4 years,근로기간_5 years,근로기간_6 years,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
56034,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20086,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
43318,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
46440,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9774,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
54886,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
76820,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
860,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [30]:
X_val_df

Unnamed: 0,대출기간_ 60 months,근로기간_10+ years,근로기간_2 years,근로기간_3 years,근로기간_4 years,근로기간_5 years,근로기간_6 years,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
79749,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
61621,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
65562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
26735,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
43079,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
54846,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
88333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
X_train[num_column]

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000.0,120000000.0,10.10,20.0,0.0,2914824.0,625428.0,0.0,0.0
20086,2880000.0,16200000.0,33.78,16.0,0.0,65172.0,28428.0,0.0,0.0
43318,14400000.0,240000000.0,3.76,37.0,0.0,338304.0,125664.0,0.0,0.0
46440,7800000.0,40149600.0,22.92,28.0,0.0,385116.0,97944.0,0.0,0.0
9774,17760000.0,54000000.0,13.41,29.0,0.0,442560.0,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
6265,15360000.0,87000000.0,22.35,47.0,0.0,389088.0,74544.0,0.0,0.0
54886,42000000.0,168000000.0,23.90,36.0,0.0,1089564.0,648168.0,0.0,0.0
76820,18000000.0,48000000.0,18.06,26.0,0.0,1812528.0,411972.0,0.0,0.0
860,24000000.0,123600000.0,21.88,73.0,0.0,2395752.0,597180.0,0.0,0.0


In [32]:
X_val[num_column]

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,72000000.0,33.74,18.0,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,79200000.0,9.67,41.0,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,91200000.0,9.81,12.0,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,132000000.0,17.72,38.0,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,102000000.0,13.64,30.0,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
50679,29040000.0,66000000.0,19.18,12.0,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,53697600.0,16.06,25.0,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,134400000.0,4.82,15.0,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,74400000.0,14.36,49.0,2.0,2110164.0,614856.0,0.0,0.0


In [33]:
X_train_last = pd.concat([X_train[num_column], X_train_df], axis=1)
X_train_last

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출기간_ 60 months,...,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
56034,28800000.0,120000000.0,10.10,20.0,0.0,2914824.0,625428.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
20086,2880000.0,16200000.0,33.78,16.0,0.0,65172.0,28428.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
43318,14400000.0,240000000.0,3.76,37.0,0.0,338304.0,125664.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
46440,7800000.0,40149600.0,22.92,28.0,0.0,385116.0,97944.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9774,17760000.0,54000000.0,13.41,29.0,0.0,442560.0,407424.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000.0,87000000.0,22.35,47.0,0.0,389088.0,74544.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
54886,42000000.0,168000000.0,23.90,36.0,0.0,1089564.0,648168.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
76820,18000000.0,48000000.0,18.06,26.0,0.0,1812528.0,411972.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
860,24000000.0,123600000.0,21.88,73.0,0.0,2395752.0,597180.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
X_val_last = pd.concat([X_val[num_column], X_val_df], axis=1)
X_val_last

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출기간_ 60 months,...,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
79749,14400000.0,72000000.0,33.74,18.0,0.0,530268.0,427224.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
61621,34830000.0,79200000.0,9.67,41.0,3.0,741732.0,475968.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
127,14400000.0,91200000.0,9.81,12.0,1.0,309456.0,176700.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
65562,21900000.0,132000000.0,17.72,38.0,0.0,982080.0,510948.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
26735,12000000.0,102000000.0,13.64,30.0,0.0,559536.0,226152.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,66000000.0,19.18,12.0,0.0,1268280.0,1549476.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
43079,5760000.0,53697600.0,16.06,25.0,0.0,527880.0,250512.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
54846,15570000.0,134400000.0,4.82,15.0,0.0,357120.0,365328.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
88333,28800000.0,74400000.0,14.36,49.0,2.0,2110164.0,614856.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [35]:
from sklearn.preprocessing import RobustScaler, StandardScaler

scaler = RobustScaler()
X_train_last[num_column] = scaler.fit_transform(X_train_last[num_column])
X_val_last[num_column] = scaler.transform(X_val_last[num_column])


In [36]:
X_val_last

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출기간_ 60 months,...,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
79749,-0.173913,-0.108696,1.161640,-0.400000,0.0,-0.090251,0.324784,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
61621,1.306522,0.021739,-0.699923,1.133333,3.0,0.192788,0.436986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
127,-0.173913,0.239130,-0.689095,-0.800000,1.0,-0.385801,-0.251889,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
65562,0.369565,0.978261,-0.077340,0.933333,0.0,0.514488,0.517506,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
26735,-0.347826,0.434783,-0.392885,0.400000,0.0,-0.051076,-0.138057,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,0.886957,-0.217391,0.035576,-0.800000,0.0,0.897559,2.908059,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
43079,-0.800000,-0.440261,-0.205723,0.066667,0.0,-0.093447,-0.081983,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
54846,-0.089130,1.021739,-1.075019,-0.600000,0.0,-0.322004,0.182308,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
88333,0.869565,-0.065217,-0.337200,1.666667,2.0,2.024398,0.756688,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [37]:
# 초기 랜덤포레스트 모델 생성 후 정확도 측정

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_last, y_train)


# 검증 데이터 정확도 계산
y_val_pred = rf.predict(X_val_last)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 데이터 정확도:", val_accuracy)

검증 데이터 정확도: 0.7686795783789397


In [38]:
# 초기 DT 모델 생성 후 정확도 측정

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_last, y_train)

y_val_pred = dt.predict(X_val_last)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 데이터 정확도:", val_accuracy)

검증 데이터 정확도: 0.8251207227789605


In [39]:
test

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,16800000,36 months,8 years,MORTGAGE,132000000,19.64,12,주택 개선,0,394692,146604.0,0.0,0.0
1,8400000,36 months,5 years,RENT,89971200,15.84,25,부채 통합,0,0,0.0,0.0,0.0
2,17280000,36 months,6 years,RENT,150000000,8.41,20,신용 카드,0,1786980,281820.0,0.0,0.0
3,14400000,36 months,5 years,MORTGAGE,66000000,13.72,30,신용 카드,1,669024,281724.0,0.0,0.0
4,27600000,36 months,5 years,RENT,55200000,30.50,12,신용 카드,0,1250052,614844.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,30000000,36 months,3 years,MORTGAGE,78000000,22.08,27,부채 통합,2,1307532,763380.0,0.0,0.0
64193,30000000,60 months,10+ years,MORTGAGE,109200000,12.06,26,부채 통합,0,960612,1245252.0,0.0,0.0
64194,6120000,36 months,10+ years,RENT,39600000,28.80,33,부채 통합,0,131520,80880.0,0.0,0.0
64195,11520000,36 months,10+ years,MORTGAGE,66000000,25.44,41,부채 통합,1,1339536,601872.0,0.0,0.0


### 예측하고자 하는 데이터셋에 대한 전처리

In [40]:
type_transform(test)
replace_work_period(test)
simple_imputation(test, column_to_impute)
loan_purpose(test)

In [41]:
test

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,16800000.0,36 months,8 years,MORTGAGE,132000000.0,19.64,12.0,주택,0.0,394692.0,146604.0,0.0,0.0
1,8400000.0,36 months,5 years,RENT,89971200.0,15.84,25.0,부채 통합,0.0,0.0,0.0,0.0,0.0
2,17280000.0,36 months,6 years,RENT,150000000.0,8.41,20.0,신용 카드,0.0,1786980.0,281820.0,0.0,0.0
3,14400000.0,36 months,5 years,MORTGAGE,66000000.0,13.72,30.0,신용 카드,1.0,669024.0,281724.0,0.0,0.0
4,27600000.0,36 months,5 years,RENT,55200000.0,30.50,12.0,신용 카드,0.0,1250052.0,614844.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,30000000.0,36 months,3 years,MORTGAGE,78000000.0,22.08,27.0,부채 통합,2.0,1307532.0,763380.0,0.0,0.0
64193,30000000.0,60 months,10+ years,MORTGAGE,109200000.0,12.06,26.0,부채 통합,0.0,960612.0,1245252.0,0.0,0.0
64194,6120000.0,36 months,10+ years,RENT,39600000.0,28.80,33.0,부채 통합,0.0,131520.0,80880.0,0.0,0.0
64195,11520000.0,36 months,10+ years,MORTGAGE,66000000.0,25.44,41.0,부채 통합,1.0,1339536.0,601872.0,0.0,0.0


In [42]:
test_encoded = encoder.fit_transform(test[cate_column])
test_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(cate_column), index=test.index)
test_df



Unnamed: 0,대출기간_ 60 months,근로기간_10+ years,근로기간_2 years,근로기간_3 years,근로기간_4 years,근로기간_5 years,근로기간_6 years,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64193,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64194,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
64195,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [43]:
test_last = pd.concat([test[num_column], test_df], axis=1)
test_last

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출기간_ 60 months,...,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,16800000.0,132000000.0,19.64,12.0,0.0,394692.0,146604.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8400000.0,89971200.0,15.84,25.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,17280000.0,150000000.0,8.41,20.0,0.0,1786980.0,281820.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,14400000.0,66000000.0,13.72,30.0,1.0,669024.0,281724.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,27600000.0,55200000.0,30.50,12.0,0.0,1250052.0,614844.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,30000000.0,78000000.0,22.08,27.0,2.0,1307532.0,763380.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64193,30000000.0,109200000.0,12.06,26.0,0.0,960612.0,1245252.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64194,6120000.0,39600000.0,28.80,33.0,0.0,131520.0,80880.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
64195,11520000.0,66000000.0,25.44,41.0,1.0,1339536.0,601872.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [44]:
test_last[num_column] = scaler.fit_transform(test_last[num_column])
test_last

Unnamed: 0,대출금액,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출기간_ 60 months,...,근로기간_7 years,근로기간_8 years,근로기간_9 years,근로기간_< 1 year,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,0.000000,0.978240,0.067285,-0.800000,0.0,-0.270784,-0.318157,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.603448,0.216865,-0.226605,0.066667,0.0,-0.794718,-0.649044,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.034483,1.304319,-0.801237,-0.266667,0.0,1.577409,-0.012973,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.172414,-0.217387,-0.390565,0.400000,1.0,0.093378,-0.013190,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.775862,-0.413034,0.907193,-0.800000,0.0,0.864664,0.738665,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,0.948276,0.000000,0.255994,0.200000,2.0,0.940966,1.073913,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64193,0.948276,0.565205,-0.518948,0.133333,0.0,0.480447,2.161503,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64194,-0.767241,-0.695637,0.775715,0.600000,0.0,-0.620132,-0.466497,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
64195,-0.379310,-0.217387,0.515855,1.133333,1.0,0.983449,0.709387,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [45]:
sub = pd.read_csv('sample_submission.csv')
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,G
1,TEST_00001,G
2,TEST_00002,G
3,TEST_00003,G
4,TEST_00004,G
...,...,...
64192,TEST_64192,G
64193,TEST_64193,G
64194,TEST_64194,G
64195,TEST_64195,G


In [46]:
test_pred = dt.predict(test_last)
test_pred

array(['B', 'C', 'A', ..., 'C', 'C', 'A'], dtype=object)

In [47]:
sub['대출등급'] = test_pred
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,C
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,C
64195,TEST_64195,C


In [48]:
sub.to_csv('0128_DT.csv', index=False)

초기 dt 모델을 통해 test데이터에 대한 대출등급을 예측한 결과 0.7149점 나옴

In [49]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# 탐색할 하이퍼파라미터 분포를 설정
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': randint(1, 30),  
    'min_samples_split': randint(2, 11), 
    'min_samples_leaf': randint(1, 5)  
}

dt = DecisionTreeClassifier(random_state=42)

# RandomizedSearchCV를 사용
random_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train_last, y_train)

print("최적의 하이퍼파라미터:", random_search.best_params_)

best_dt = random_search.best_estimator_
y_val_pred = best_dt.predict(X_val_last)

val_accuracy = accuracy_score(y_val, y_val_pred)
print("최적 모델의 검증 데이터 정확도:", val_accuracy)

최적의 하이퍼파라미터: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'splitter': 'best'}
최적 모델의 검증 데이터 정확도: 0.8457863855859599


In [62]:
# 수정한 하이퍼 파라미터 적용한 DT모델

tun_dt = DecisionTreeClassifier(criterion='gini', max_depth=20, min_samples_leaf = 4, min_samples_split=10, splitter='best', random_state=42)

tun_dt.fit(X_train_last, y_train)
y_val_pred_dt = tun_dt.predict(X_val_last)
pred_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 데이터 정확도:", pred_accuracy)

검증 데이터 정확도: 0.8457863855859599


In [53]:
tun_dt_pred = tun_dt.predict(test_last)
tun_dt_pred

array(['B', 'B', 'A', ..., 'D', 'C', 'A'], dtype=object)

In [56]:
sub['대출등급'] = tun_dt_pred
sub.to_csv('tun_dt_model.csv', index=False)

tun_dt model: 0.7470으로 test셋 예측 정확도 상승. 이전대비 0.03점 UP 

Feature Selection

In [63]:
# confusion metrics 생성

from sklearn.metrics import confusion_matrix
cf=confusion_matrix(y_val,y_val_pred_dt)
cf

array([[2912,  339,   18,    1,    1,    0,    0],
       [ 277, 5132,  372,   21,    2,    3,    0],
       [  46,  489, 4721,  213,   22,    3,    0],
       [  13,  100,  301, 2130,  191,    6,    1],
       [   3,   18,   55,  262, 1052,   51,    7],
       [   0,   10,   13,   18,   59,  295,   22],
       [   0,    0,    3,    2,    6,   22,   47]], dtype=int64)

In [65]:
print(np.round(tun_dt.feature_importances_, 3))

[0.04  0.019 0.009 0.007 0.002 0.46  0.414 0.    0.    0.04  0.001 0.
 0.    0.    0.    0.    0.    0.    0.    0.001 0.    0.001 0.001 0.
 0.001 0.   ]


In [68]:
feat_importances = pd.Series(tun_dt.feature_importances_, index=X_train_last.columns)
feat_importances_sort = feat_importances.sort_values(ascending=False)
feat_importances_sort

총상환원금              0.460314
총상환이자              0.414198
대출금액               0.040387
대출기간_ 60 months    0.039721
연간소득               0.019346
부채_대비_소득_비율        0.008908
총계좌수               0.007476
최근_2년간_연체_횟수       0.001718
대출목적_신용 카드         0.001479
주택소유상태_RENT        0.001420
근로기간_10+ years     0.001226
대출목적_부채 통합         0.000631
근로기간_< 1 year      0.000554
주택소유상태_OWN         0.000465
근로기간_4 years       0.000273
근로기간_2 years       0.000263
근로기간_6 years       0.000261
대출목적_생활비           0.000251
근로기간_3 years       0.000237
근로기간_9 years       0.000234
근로기간_5 years       0.000189
근로기간_8 years       0.000165
근로기간_7 years       0.000137
대출목적_주택            0.000092
총연체금액              0.000056
연체계좌수              0.000000
dtype: float64

In [69]:
feat_importances_sort[:10].index

Index(['총상환원금', '총상환이자', '대출금액', '대출기간_ 60 months', '연간소득', '부채_대비_소득_비율',
       '총계좌수', '최근_2년간_연체_횟수', '대출목적_신용 카드', '주택소유상태_RENT'],
      dtype='object')

In [87]:
X_train_new = X_train_last[feat_importances_sort[:4].index]
X_val_new = X_val_last[feat_importances_sort[:4].index]

X_train_new

Unnamed: 0,총상환원금,총상환이자,대출금액,대출기간_ 60 months
56034,3.101413,0.781023,0.869565,0.0
20086,-0.712769,-0.593191,-1.008696,0.0
43318,-0.347189,-0.369367,-0.173913,0.0
46440,-0.284533,-0.433175,-0.652174,0.0
9774,-0.207645,0.279207,0.069565,1.0
...,...,...,...,...
6265,-0.279216,-0.487038,-0.104348,0.0
54886,0.658352,0.833368,1.826087,1.0
76820,1.626020,0.289676,0.086957,0.0
860,2.406650,0.716000,0.521739,0.0


In [88]:
tun_dt.fit(X_train_new, y_train)

In [89]:
y_pred_new = tun_dt.predict(X_val_new)

accuracy_score_tun_dt = accuracy_score(y_val, y_pred_new)
print("accuracy:",accuracy_score_tun_dt)

accuracy: 0.8636481644945221


In [94]:
tun_dt_feature_sel = tun_dt.predict(test_last[feat_importances_sort[:4].index])
tun_dt_feature_sel

array(['B', 'B', 'A', ..., 'D', 'C', 'A'], dtype=object)

In [95]:
sub['대출등급'] = tun_dt_feature_sel
sub.to_csv('feature_selection_tun_dt.csv', index=False)

tuning rf model

In [59]:
tuning_rf = RandomForestClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=9,n_estimators=200)

tuning_rf.fit(X_train_last, y_train)
y_val_pred_rf = tuning_rf.predict(X_val_last)
pred_accuracy = accuracy_score(y_val, y_val_pred_rf)
print("검증 데이터 정확도:", pred_accuracy)

검증 데이터 정확도: 0.7442234799314607


In [96]:
tuning_rf.fit(X_train_new, y_train)

In [98]:
y_val_pred_tun_rf = tuning_rf.predict(X_val_new)
pred_accuracy = accuracy_score(y_val, y_val_pred_tun_rf)
print("검증 데이터 정확도:", pred_accuracy)

검증 데이터 정확도: 0.8858715405784309


In [100]:
tuning_rf_feature_sel = tuning_rf.predict(test_last[feat_importances_sort[:4].index])

In [102]:
sub['대출등급'] = tuning_rf_feature_sel
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [104]:
#피쳐 셀렉션을 적용한 튜닝 랜덤 포레스트 모델
sub.to_csv('tuning_rf_feature_sel.csv',index=False)

피쳐 셀렉션을 적용한 튜닝 랜덤 포레스트 모델 결과: 측정 결과 0.7954003으로 현재까지 생성한 모델 중 가장 높은 점수