In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

plt.rc("font", family = "Malgun Gothic")
sns.set(font="Malgun Gothic", 
rc={"axes.unicode_minus":False}, style='white')


In [2]:
X_train = pd.read_csv('preprocessing_train.csv')
X_val = pd.read_csv('preprocessing_validation.csv')
y_train = pd.read_csv('y_train.csv')
y_val = pd.read_csv('y_val.csv')

test = pd.read_csv('test.csv')

In [3]:
cate_column = [var for var in X_train if X_train[var].dtype == 'O']
num_column = [var for var in X_train if X_train[var].dtype != 'O'] 

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)
X_train_encoded = encoder.fit_transform(X_train[cate_column])
X_val_encoded = encoder.fit_transform(X_val[cate_column])



In [5]:
X_train_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(cate_column),index=X_train.index)
X_val_df = pd.DataFrame(X_val_encoded, columns=encoder.get_feature_names_out(cate_column), index=X_val.index)

In [6]:
X_train_df

Unnamed: 0,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
77030,0.0,0.0,1.0,0.0,0.0,0.0
77031,0.0,0.0,1.0,0.0,0.0,0.0
77032,0.0,1.0,0.0,0.0,1.0,0.0
77033,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
X_val_df

Unnamed: 0,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
19254,0.0,1.0,1.0,0.0,0.0,0.0
19255,0.0,1.0,0.0,1.0,0.0,0.0
19256,0.0,1.0,0.0,0.0,1.0,0.0
19257,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
X_train[num_column]

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,28800000.0,36.0,11.0,120000000.0,10.10,20.0,0.0,2914824.0,625428.0,0.0,0.0
1,2880000.0,36.0,2.0,16200000.0,33.78,16.0,0.0,65172.0,28428.0,0.0,0.0
2,14400000.0,36.0,5.0,240000000.0,3.76,37.0,0.0,338304.0,125664.0,0.0,0.0
3,7800000.0,36.0,11.0,40149600.0,22.92,28.0,0.0,385116.0,97944.0,0.0,0.0
4,17760000.0,60.0,1.0,54000000.0,13.41,29.0,0.0,442560.0,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
77030,15360000.0,36.0,3.0,87000000.0,22.35,47.0,0.0,389088.0,74544.0,0.0,0.0
77031,42000000.0,60.0,11.0,168000000.0,23.90,36.0,0.0,1089564.0,648168.0,0.0,0.0
77032,18000000.0,36.0,11.0,48000000.0,18.06,26.0,0.0,1812528.0,411972.0,0.0,0.0
77033,24000000.0,36.0,11.0,123600000.0,21.88,73.0,0.0,2395752.0,597180.0,0.0,0.0


In [10]:
X_val[num_column]

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,14400000.0,60.0,8.0,72000000.0,33.74,18.0,0.0,530268.0,427224.0,0.0,0.0
1,34830000.0,36.0,11.0,79200000.0,9.67,41.0,3.0,741732.0,475968.0,0.0,0.0
2,14400000.0,36.0,1.0,91200000.0,9.81,12.0,1.0,309456.0,176700.0,0.0,0.0
3,21900000.0,36.0,0.0,132000000.0,17.72,38.0,0.0,982080.0,510948.0,0.0,0.0
4,12000000.0,36.0,11.0,102000000.0,13.64,30.0,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
19254,29040000.0,60.0,3.0,66000000.0,19.18,12.0,0.0,1268280.0,1549476.0,0.0,0.0
19255,5760000.0,36.0,11.0,53697600.0,16.06,25.0,0.0,527880.0,250512.0,0.0,0.0
19256,15570000.0,60.0,0.0,134400000.0,4.82,15.0,0.0,357120.0,365328.0,0.0,0.0
19257,28800000.0,36.0,1.0,74400000.0,14.36,49.0,2.0,2110164.0,614856.0,0.0,0.0


In [11]:
X_train_last = pd.concat([X_train[num_column], X_train_df], axis=1)
X_train_last

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,28800000.0,36.0,11.0,120000000.0,10.10,20.0,0.0,2914824.0,625428.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2880000.0,36.0,2.0,16200000.0,33.78,16.0,0.0,65172.0,28428.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,14400000.0,36.0,5.0,240000000.0,3.76,37.0,0.0,338304.0,125664.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,7800000.0,36.0,11.0,40149600.0,22.92,28.0,0.0,385116.0,97944.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,17760000.0,60.0,1.0,54000000.0,13.41,29.0,0.0,442560.0,407424.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77030,15360000.0,36.0,3.0,87000000.0,22.35,47.0,0.0,389088.0,74544.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
77031,42000000.0,60.0,11.0,168000000.0,23.90,36.0,0.0,1089564.0,648168.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
77032,18000000.0,36.0,11.0,48000000.0,18.06,26.0,0.0,1812528.0,411972.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
77033,24000000.0,36.0,11.0,123600000.0,21.88,73.0,0.0,2395752.0,597180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
X_val_last = pd.concat([X_val[num_column], X_val_df], axis=1)
X_val_last

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,14400000.0,60.0,8.0,72000000.0,33.74,18.0,0.0,530268.0,427224.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,34830000.0,36.0,11.0,79200000.0,9.67,41.0,3.0,741732.0,475968.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,14400000.0,36.0,1.0,91200000.0,9.81,12.0,1.0,309456.0,176700.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,21900000.0,36.0,0.0,132000000.0,17.72,38.0,0.0,982080.0,510948.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,12000000.0,36.0,11.0,102000000.0,13.64,30.0,0.0,559536.0,226152.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19254,29040000.0,60.0,3.0,66000000.0,19.18,12.0,0.0,1268280.0,1549476.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19255,5760000.0,36.0,11.0,53697600.0,16.06,25.0,0.0,527880.0,250512.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
19256,15570000.0,60.0,0.0,134400000.0,4.82,15.0,0.0,357120.0,365328.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
19257,28800000.0,36.0,1.0,74400000.0,14.36,49.0,2.0,2110164.0,614856.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
from sklearn.preprocessing import RobustScaler, StandardScaler

scaler = RobustScaler()
X_train_last[num_column] = scaler.fit_transform(X_train_last[num_column])
X_val_last[num_column] = scaler.transform(X_val_last[num_column])


In [14]:
X_val_last

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,-0.173913,1.0,0.222222,-0.108696,1.161640,-0.400000,0.0,-0.090251,0.324784,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.306522,0.0,0.555556,0.021739,-0.699923,1.133333,3.0,0.192788,0.436986,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.173913,0.0,-0.555556,0.239130,-0.689095,-0.800000,1.0,-0.385801,-0.251889,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.369565,0.0,-0.666667,0.978261,-0.077340,0.933333,0.0,0.514488,0.517506,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.347826,0.0,0.555556,0.434783,-0.392885,0.400000,0.0,-0.051076,-0.138057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19254,0.886957,1.0,-0.333333,-0.217391,0.035576,-0.800000,0.0,0.897559,2.908059,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
19255,-0.800000,0.0,0.555556,-0.440261,-0.205723,0.066667,0.0,-0.093447,-0.081983,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
19256,-0.089130,1.0,-0.666667,1.021739,-1.075019,-0.600000,0.0,-0.322004,0.182308,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
19257,0.869565,0.0,-0.555556,-0.065217,-0.337200,1.666667,2.0,2.024398,0.756688,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# 초기 랜덤포레스트 모델 생성 후 정확도 측정

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_last, y_train)


# 검증 데이터 정확도 계산
y_val_pred = rf.predict(X_val_last)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 데이터 정확도:", val_accuracy)

  return fit_method(estimator, *args, **kwargs)


검증 데이터 정확도: 0.7972376551222805


In [16]:
# 초기 DT 모델 생성 후 정확도 측정

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_last, y_train)

y_val_pred = dt.predict(X_val_last)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 데이터 정확도:", val_accuracy)

검증 데이터 정확도: 0.8263668934004881


In [17]:
# 초기 xgboost 모델 생성 후 정확도 측정
# y_train, y_val 데이터를 수치형으로 변환 

import xgboost as xgb
label_encoder = LabelEncoder()

y_train_xg = label_encoder.fit_transform(y_train)
y_val_xg = label_encoder.fit_transform(y_val)

xg = xgb.XGBClassifier(objective="multi:softmax", eval_metric="mlogloss", random_state=42, num_class=len(label_encoder.classes_))
xg.fit(X_train_last, y_train_xg)

xg_pred = xg.predict(X_val_last)
accuracy_score(xg_pred, y_val_xg)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.8495248974505426

In [18]:
test = pd.read_csv('preprocessing_test.csv')
test

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,16800000.0,36.0,8.0,MORTGAGE,132000000.0,19.64,12.0,주택,0.0,394692.0,146604.0,0.0,0.0
1,8400000.0,36.0,5.0,RENT,89971200.0,15.84,25.0,부채 통합,0.0,0.0,0.0,0.0,0.0
2,17280000.0,36.0,6.0,RENT,150000000.0,8.41,20.0,신용 카드,0.0,1786980.0,281820.0,0.0,0.0
3,14400000.0,36.0,5.0,MORTGAGE,66000000.0,13.72,30.0,신용 카드,1.0,669024.0,281724.0,0.0,0.0
4,27600000.0,36.0,5.0,RENT,55200000.0,30.50,12.0,신용 카드,0.0,1250052.0,614844.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,30000000.0,36.0,3.0,MORTGAGE,78000000.0,22.08,27.0,부채 통합,2.0,1307532.0,763380.0,0.0,0.0
64193,30000000.0,60.0,11.0,MORTGAGE,109200000.0,12.06,26.0,부채 통합,0.0,960612.0,1245252.0,0.0,0.0
64194,6120000.0,36.0,11.0,RENT,39600000.0,28.80,33.0,부채 통합,0.0,131520.0,80880.0,0.0,0.0
64195,11520000.0,36.0,11.0,MORTGAGE,66000000.0,25.44,41.0,부채 통합,1.0,1339536.0,601872.0,0.0,0.0


In [19]:
test_encoded = encoder.fit_transform(test[cate_column])
test_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(cate_column), index=test.index)
test_df



Unnamed: 0,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
64192,0.0,0.0,1.0,0.0,0.0,0.0
64193,0.0,0.0,1.0,0.0,0.0,0.0
64194,0.0,1.0,1.0,0.0,0.0,0.0
64195,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
test_last = pd.concat([test[num_column], test_df], axis=1)
test_last

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,16800000.0,36.0,8.0,132000000.0,19.64,12.0,0.0,394692.0,146604.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8400000.0,36.0,5.0,89971200.0,15.84,25.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,17280000.0,36.0,6.0,150000000.0,8.41,20.0,0.0,1786980.0,281820.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,14400000.0,36.0,5.0,66000000.0,13.72,30.0,1.0,669024.0,281724.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,27600000.0,36.0,5.0,55200000.0,30.50,12.0,0.0,1250052.0,614844.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,30000000.0,36.0,3.0,78000000.0,22.08,27.0,2.0,1307532.0,763380.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64193,30000000.0,60.0,11.0,109200000.0,12.06,26.0,0.0,960612.0,1245252.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64194,6120000.0,36.0,11.0,39600000.0,28.80,33.0,0.0,131520.0,80880.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
64195,11520000.0,36.0,11.0,66000000.0,25.44,41.0,1.0,1339536.0,601872.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [21]:
test_last[num_column] = scaler.fit_transform(test_last[num_column])
test_last

Unnamed: 0,대출금액,대출기간,근로기간,연간소득,부채_대비_소득_비율,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,주택소유상태_OWN,주택소유상태_RENT,대출목적_부채 통합,대출목적_생활비,대출목적_신용 카드,대출목적_주택
0,0.000000,0.0,0.222222,0.978240,0.067285,-0.800000,0.0,-0.270784,-0.318157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.603448,0.0,-0.111111,0.216865,-0.226605,0.066667,0.0,-0.794718,-0.649044,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0.034483,0.0,0.000000,1.304319,-0.801237,-0.266667,0.0,1.577409,-0.012973,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.172414,0.0,-0.111111,-0.217387,-0.390565,0.400000,1.0,0.093378,-0.013190,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.775862,0.0,-0.111111,-0.413034,0.907193,-0.800000,0.0,0.864664,0.738665,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,0.948276,0.0,-0.333333,0.000000,0.255994,0.200000,2.0,0.940966,1.073913,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64193,0.948276,1.0,0.555556,0.565205,-0.518948,0.133333,0.0,0.480447,2.161503,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64194,-0.767241,0.0,0.555556,-0.695637,0.775715,0.600000,0.0,-0.620132,-0.466497,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
64195,-0.379310,0.0,0.555556,-0.217387,0.515855,1.133333,1.0,0.983449,0.709387,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [22]:
sub = pd.read_csv('sample_submission.csv')
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,G
1,TEST_00001,G
2,TEST_00002,G
3,TEST_00003,G
4,TEST_00004,G
...,...,...
64192,TEST_64192,G
64193,TEST_64193,G
64194,TEST_64194,G
64195,TEST_64195,G


In [23]:
test_pred = dt.predict(test_last)
test_pred

array(['B', 'D', 'A', ..., 'D', 'C', 'A'], dtype=object)

In [24]:
sub['대출등급'] = test_pred
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,D
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [25]:
sub.to_csv('0128_DT.csv', index=False)

초기 dt 모델을 통해 test데이터에 대한 대출등급을 예측한 결과 0.7149점 나옴

In [26]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# 탐색할 하이퍼파라미터 분포를 설정
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': randint(1, 30),  
    'min_samples_split': randint(2, 11), 
    'min_samples_leaf': randint(1, 5)  
}

dt = DecisionTreeClassifier(random_state=42)

# RandomizedSearchCV를 사용
random_search = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train_last, y_train)

print("최적의 하이퍼파라미터:", random_search.best_params_)

best_dt = random_search.best_estimator_
y_val_pred = best_dt.predict(X_val_last)

val_accuracy = accuracy_score(y_val, y_val_pred)
print("최적 모델의 검증 데이터 정확도:", val_accuracy)

최적의 하이퍼파라미터: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'splitter': 'best'}
최적 모델의 검증 데이터 정확도: 0.845163300275196


In [27]:
# 수정한 하이퍼 파라미터 적용한 DT모델

tun_dt = DecisionTreeClassifier(criterion='gini', max_depth=20, min_samples_leaf = 4, min_samples_split=10, splitter='best', random_state=42)

tun_dt.fit(X_train_last, y_train)
y_val_pred_dt = tun_dt.predict(X_val_last)
pred_accuracy = accuracy_score(y_val, y_val_pred)
print("검증 데이터 정확도:", pred_accuracy)

검증 데이터 정확도: 0.845163300275196


In [28]:
tun_dt_pred = tun_dt.predict(test_last)
tun_dt_pred

array(['B', 'B', 'A', ..., 'D', 'C', 'A'], dtype=object)

In [29]:
sub['대출등급'] = tun_dt_pred
sub.to_csv('tun_dt_model.csv', index=False)

tun_dt model: 0.7470으로 test셋 예측 정확도 상승. 이전대비 0.03점 UP 

Feature Selection

In [30]:
# confusion metrics 생성

from sklearn.metrics import confusion_matrix
cf=confusion_matrix(y_val,y_val_pred_dt)
cf

array([[2896,  349,   24,    1,    1,    0,    0],
       [ 285, 5121,  372,   22,    4,    3,    0],
       [  46,  473, 4736,  215,   22,    2,    0],
       [  12,   94,  305, 2135,  189,    6,    1],
       [   3,   19,   53,  267, 1049,   50,    7],
       [   0,   10,   13,   18,   62,  293,   21],
       [   0,    0,    3,    2,    6,   22,   47]], dtype=int64)

In [31]:
print(np.round(tun_dt.feature_importances_, 3))

[0.04  0.04  0.005 0.019 0.009 0.008 0.002 0.46  0.413 0.    0.    0.001
 0.001 0.001 0.    0.001 0.   ]


In [32]:
feat_importances = pd.Series(tun_dt.feature_importances_, index=X_train_last.columns)
feat_importances_sort = feat_importances.sort_values(ascending=False)
feat_importances_sort

총상환원금           0.460257
총상환이자           0.413453
대출금액            0.039994
대출기간            0.039687
연간소득            0.019148
부채_대비_소득_비율     0.009082
총계좌수            0.007866
근로기간            0.004529
최근_2년간_연체_횟수    0.001646
주택소유상태_RENT     0.001395
대출목적_신용 카드      0.001377
주택소유상태_OWN      0.000575
대출목적_부채 통합      0.000568
대출목적_생활비        0.000255
대출목적_주택         0.000110
총연체금액           0.000056
연체계좌수           0.000000
dtype: float64

In [33]:
feat_importances_sort[:10].index

Index(['총상환원금', '총상환이자', '대출금액', '대출기간', '연간소득', '부채_대비_소득_비율', '총계좌수', '근로기간',
       '최근_2년간_연체_횟수', '주택소유상태_RENT'],
      dtype='object')

In [40]:
X_train_new = X_train_last[feat_importances_sort[:4].index]
X_val_new = X_val_last[feat_importances_sort[:4].index]

X_train_new

Unnamed: 0,총상환원금,총상환이자,대출금액,대출기간
0,3.101413,0.781023,0.869565,0.0
1,-0.712769,-0.593191,-1.008696,0.0
2,-0.347189,-0.369367,-0.173913,0.0
3,-0.284533,-0.433175,-0.652174,0.0
4,-0.207645,0.279207,0.069565,1.0
...,...,...,...,...
77030,-0.279216,-0.487038,-0.104348,0.0
77031,0.658352,0.833368,1.826087,1.0
77032,1.626020,0.289676,0.086957,0.0
77033,2.406650,0.716000,0.521739,0.0


In [41]:
tun_dt.fit(X_train_new, y_train)

In [42]:
y_pred_new = tun_dt.predict(X_val_new)

accuracy_score_tun_dt = accuracy_score(y_val, y_pred_new)
print("accuracy:",accuracy_score_tun_dt)

accuracy: 0.8636481644945221


In [43]:
tun_dt_feature_sel = tun_dt.predict(test_last[feat_importances_sort[:4].index])
tun_dt_feature_sel

array(['B', 'B', 'A', ..., 'D', 'C', 'A'], dtype=object)

In [44]:
sub['대출등급'] = tun_dt_feature_sel
sub.to_csv('feature_selection_tun_dt.csv', index=False)

tuning rf model

In [45]:
tuning_rf = RandomForestClassifier(max_depth=20, min_samples_leaf=3, min_samples_split=9,n_estimators=200, random_state=42)

tuning_rf.fit(X_train_last, y_train)
y_val_pred_rf = tuning_rf.predict(X_val_last)
pred_accuracy = accuracy_score(y_val, y_val_pred_rf)
print("검증 데이터 정확도:", pred_accuracy)

  return fit_method(estimator, *args, **kwargs)


검증 데이터 정확도: 0.7828547691988161


In [46]:
tuning_rf.fit(X_train_new, y_train)

  return fit_method(estimator, *args, **kwargs)


In [47]:
y_val_pred_tun_rf = tuning_rf.predict(X_val_new)
pred_accuracy = accuracy_score(y_val, y_val_pred_tun_rf)
print("검증 데이터 정확도:", pred_accuracy)

검증 데이터 정확도: 0.8866503972168857


In [48]:
tuning_rf_feature_sel = tuning_rf.predict(test_last[feat_importances_sort[:4].index])

In [49]:
sub['대출등급'] = tuning_rf_feature_sel
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [50]:
#피쳐 셀렉션을 적용한 튜닝 랜덤 포레스트 모델
sub.to_csv('tuning_rf_feature_sel3.csv',index=False)

피쳐 셀렉션을 적용한 튜닝 랜덤 포레스트 모델 결과: 측정 결과 0.7954003으로 현재까지 생성한 모델 중 가장 높은 점수

1월 31일: 근로기간의 데이터들을 수치형으로 모두 변환 후 동일한 작업 진행. 이후 피쳐 셀렉션을 적용한 튜닝 랜덤 포레스트 모델 결과 0.7972801로 측정되었고 이전보다 0.002 %p 상승

In [40]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

# 탐색할 하이퍼파라미터 공간 정의
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5],
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9, 11],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4]
}

# 랜덤 서치 객체 생성
random_search = RandomizedSearchCV(
    xg,
    param_distributions=param_dist,
    n_iter=5,  # 총 반복 횟수
    scoring='neg_mean_squared_error',  # 평가 지표
    cv=3,  # 교차 검증 횟수
    verbose=1,
    n_jobs=-1  # 모든 CPU 코어를 사용하여 병렬 실행
)

# 랜덤 서치 수행
random_search.fit(X_train_last, y_train_xg)

# 최적의 하이퍼파라미터 출력
print("Best Hyperparameters:", random_search.best_params_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'subsample': 1.0, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.5, 'gamma': 0.3, 'colsample_bytree': 0.8}


In [50]:
# XGboost model에 feature selection 적용한 x_train_new를 학습 

tun_xg = xgb.XGBClassifier(max_depth = 5, subsample = 1.0, n_estimators = 300, min_child_weight = 3,
                           learning_rate = 0.5, gamma = 0.3, colsample_bytree = 0.8,
                           objective="multi:softmax", eval_metric="mlogloss", random_state=42, num_class=len(label_encoder.classes_))

tun_xg.fit(X_train_new, y_train_xg)

In [51]:
y_val_pred_tun_xg = tun_xg.predict(X_val_new)
pred_accuracy = accuracy_score(y_val_xg, y_val_pred_tun_xg)
print("검증 데이터 정확도:", pred_accuracy)

검증 데이터 정확도: 0.8544057323848591


In [53]:
tuning_xg_test = tun_xg.predict(test_last[feat_importances_sort[:4].index])
tuning_xg_test

array([1, 1, 0, ..., 3, 2, 0])

In [64]:
xg_sub =pd.read_csv('sample_submission.csv')

In [65]:
xg_sub['대출등급'] = tuning_xg_test
xg_sub

Unnamed: 0,ID,대출등급
0,TEST_00000,1
1,TEST_00001,1
2,TEST_00002,0
3,TEST_00003,2
4,TEST_00004,2
...,...,...
64192,TEST_64192,3
64193,TEST_64193,3
64194,TEST_64194,3
64195,TEST_64195,2


In [69]:
replacement = {
    0: 'A',
    1: 'B',
    2: 'C',
    3: 'D',
    4: 'E',
    5: 'F',
    6: 'G'
}

xg_sub['대출등급'].replace(replacement, inplace=True)

In [72]:
xg_sub

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C


In [71]:
xg_sub.to_csv('tuning_xgboost.csv', index=False)