1. 작성 함수

type_transform(): int 타입 변수를 float형태로 변환

replace_work_period(): 근로기간에서 같은 의미를 담고 있지만 다르게 작성된 값을 통일하고, 'Unknown'은 결측치(NaN)로, 주택소유상태의 'ANY'는 'MORTGAGE'로 바꾸는 함수

simple_imputation(): 단순 확률 대치법으로 nan값을 대체하는 함수 

loan_purpose(): 대출 목적 피쳐를 변환하고 통일하는 함수

2. 주요 전처리 사항

근로기간에서 같은 의미를 담고 있지만 다르게 작성된 부분을 통일

대출 목적의 세부 클래스들을 '주택', '생활비', '기타'의 상위 범주로 통합 ('부채 통합', '신용 카드'는 그대로 유지)

In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

plt.rc("font", family = "Malgun Gothic")
sns.set(font="Malgun Gothic", 
rc={"axes.unicode_minus":False}, style='white')




In [2]:
# Load the raw train/test CSV files and discard the 'ID' column from each.
train = pd.read_csv('train.csv').drop(columns=['ID'])
test = pd.read_csv('test.csv').drop(columns=['ID'])

In [3]:
train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,12480000,36 months,6 years,RENT,72000000,18.90,15,부채 통합,0,0,0.0,0.0,0.0,C
1,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,12000000,36 months,5 years,MORTGAGE,96000000,8.60,14,부채 통합,0,928644,151944.0,0.0,0.0,A
3,14400000,36 months,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C
4,18000000,60 months,Unknown,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,14400000,36 months,10+ years,MORTGAGE,210000000,9.33,33,신용 카드,0,974580,492168.0,0.0,0.0,C
96290,28800000,60 months,10+ years,MORTGAGE,132000000,5.16,25,주택 개선,0,583728,855084.0,0.0,0.0,E
96291,14400000,36 months,1 year,MORTGAGE,84000000,11.24,22,신용 카드,0,1489128,241236.0,0.0,0.0,A
96292,15600000,36 months,5 years,MORTGAGE,66330000,17.30,21,부채 통합,2,1378368,818076.0,0.0,0.0,D


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96294 entries, 0 to 96293
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   대출금액          96294 non-null  int64  
 1   대출기간          96294 non-null  object 
 2   근로기간          96294 non-null  object 
 3   주택소유상태        96294 non-null  object 
 4   연간소득          96294 non-null  int64  
 5   부채_대비_소득_비율   96294 non-null  float64
 6   총계좌수          96294 non-null  int64  
 7   대출목적          96294 non-null  object 
 8   최근_2년간_연체_횟수  96294 non-null  int64  
 9   총상환원금         96294 non-null  int64  
 10  총상환이자         96294 non-null  float64
 11  총연체금액         96294 non-null  float64
 12  연체계좌수         96294 non-null  float64
 13  대출등급          96294 non-null  object 
dtypes: float64(4), int64(5), object(5)
memory usage: 10.3+ MB


In [5]:
# Separate the '대출등급' column (y) from the remaining columns (X).
y = train['대출등급']
X = train.drop(columns=['대출등급'])

In [6]:
# Hold out 20% of the rows as a validation split; random_state=42 is passed as the seed.
# NOTE(review): no `stratify=` argument is passed, so class proportions in the
# two splits are not explicitly controlled — confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
y_train.value_counts()

B    23010
C    22129
A    13501
D    10612
E     5906
F     1537
G      340
Name: 대출등급, dtype: int64

In [8]:
y_val.value_counts()

B    5807
C    5494
A    3271
D    2742
E    1448
F     417
G      80
Name: 대출등급, dtype: int64

In [9]:
X_train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000,36 months,10+ years,MORTGAGE,120000000,10.10,20,신용 카드,0,2914824,625428.0,0.0,0.0
20086,2880000,36 months,2 years,RENT,16200000,33.78,16,신용 카드,0,65172,28428.0,0.0,0.0
43318,14400000,36 months,5 years,RENT,240000000,3.76,37,부채 통합,0,338304,125664.0,0.0,0.0
46440,7800000,36 months,10+ years,RENT,40149600,22.92,28,신용 카드,0,385116,97944.0,0.0,0.0
9774,17760000,60 months,1 year,RENT,54000000,13.41,29,부채 통합,0,442560,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000,36 months,3 years,MORTGAGE,87000000,22.35,47,부채 통합,0,389088,74544.0,0.0,0.0
54886,42000000,60 months,10+ years,MORTGAGE,168000000,23.90,36,부채 통합,0,1089564,648168.0,0.0,0.0
76820,18000000,36 months,Unknown,RENT,48000000,18.06,26,신용 카드,0,1812528,411972.0,0.0,0.0
860,24000000,36 months,10+ years,MORTGAGE,123600000,21.88,73,주택 개선,0,2395752,597180.0,0.0,0.0


In [10]:
X_val

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000,60 months,8 years,MORTGAGE,72000000,33.74,18,신용 카드,0,530268,427224.0,0.0,0.0
61621,34830000,36 months,10+ years,OWN,79200000,9.67,41,기타,3,741732,475968.0,0.0,0.0
127,14400000,36 months,1 year,RENT,91200000,9.81,12,신용 카드,1,309456,176700.0,0.0,0.0
65562,21900000,36 months,< 1 year,MORTGAGE,132000000,17.72,38,부채 통합,0,982080,510948.0,0.0,0.0
26735,12000000,36 months,10+ years,MORTGAGE,102000000,13.64,30,소규모 사업,0,559536,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000,60 months,3 years,RENT,66000000,19.18,12,부채 통합,0,1268280,1549476.0,0.0,0.0
43079,5760000,36 months,10+ years,RENT,53697600,16.06,25,휴가,0,527880,250512.0,0.0,0.0
54846,15570000,60 months,< 1 year,RENT,134400000,4.82,15,신용 카드,0,357120,365328.0,0.0,0.0
88333,28800000,36 months,1 year,MORTGAGE,74400000,14.36,49,신용 카드,2,2110164,614856.0,0.0,0.0


In [11]:
# Partition the column names of X_train by dtype: object-dtype columns go to
# cate_column, every other column goes to num_column.
cate_column = [name for name, kind in X_train.dtypes.items() if kind == 'O']
num_column = [name for name, kind in X_train.dtypes.items() if kind != 'O']

In [12]:
# Convert every int64 column of a DataFrame to float.
def type_transform(data_set):
    """Cast all ``int64`` columns of *data_set* to ``float`` in place.

    The columns to convert are discovered from *data_set* itself rather
    than from a module-level column list, so the function does not depend
    on which other cells were executed beforehand.

    Args:
        data_set: pandas DataFrame to modify.

    Returns:
        None. *data_set* is mutated in place.
    """
    # Collect the names first so the frame is not modified while its dtypes
    # are being iterated.
    int_columns = [
        col for col, dtype in data_set.dtypes.items() if dtype == 'int64'
    ]
    for col in int_columns:
        data_set[col] = data_set[col].astype(float)

type_transform(X_train)
type_transform(X_val)

In [13]:
X_train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000.0,36 months,10+ years,MORTGAGE,120000000.0,10.10,20.0,신용 카드,0.0,2914824.0,625428.0,0.0,0.0
20086,2880000.0,36 months,2 years,RENT,16200000.0,33.78,16.0,신용 카드,0.0,65172.0,28428.0,0.0,0.0
43318,14400000.0,36 months,5 years,RENT,240000000.0,3.76,37.0,부채 통합,0.0,338304.0,125664.0,0.0,0.0
46440,7800000.0,36 months,10+ years,RENT,40149600.0,22.92,28.0,신용 카드,0.0,385116.0,97944.0,0.0,0.0
9774,17760000.0,60 months,1 year,RENT,54000000.0,13.41,29.0,부채 통합,0.0,442560.0,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000.0,36 months,3 years,MORTGAGE,87000000.0,22.35,47.0,부채 통합,0.0,389088.0,74544.0,0.0,0.0
54886,42000000.0,60 months,10+ years,MORTGAGE,168000000.0,23.90,36.0,부채 통합,0.0,1089564.0,648168.0,0.0,0.0
76820,18000000.0,36 months,Unknown,RENT,48000000.0,18.06,26.0,신용 카드,0.0,1812528.0,411972.0,0.0,0.0
860,24000000.0,36 months,10+ years,MORTGAGE,123600000.0,21.88,73.0,주택 개선,0.0,2395752.0,597180.0,0.0,0.0


In [14]:
X_val

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,60 months,8 years,MORTGAGE,72000000.0,33.74,18.0,신용 카드,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,36 months,10+ years,OWN,79200000.0,9.67,41.0,기타,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,36 months,1 year,RENT,91200000.0,9.81,12.0,신용 카드,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,36 months,< 1 year,MORTGAGE,132000000.0,17.72,38.0,부채 통합,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,36 months,10+ years,MORTGAGE,102000000.0,13.64,30.0,소규모 사업,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,60 months,3 years,RENT,66000000.0,19.18,12.0,부채 통합,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,36 months,10+ years,RENT,53697600.0,16.06,25.0,휴가,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,60 months,< 1 year,RENT,134400000.0,4.82,15.0,신용 카드,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,36 months,1 year,MORTGAGE,74400000.0,14.36,49.0,신용 카드,2.0,2110164.0,614856.0,0.0,0.0


In [15]:
def replace_work_period(data_set):
    """Normalise inconsistent category spellings in *data_set*, in place.

    In column '근로기간' the variants '<1 year', '10+years', '1 years' and
    '3' are rewritten to '< 1 year', '10+ years', '1 year' and '3 years',
    and 'Unknown' becomes NaN. In column '주택소유상태' the value 'ANY' is
    rewritten to 'MORTGAGE'.

    Args:
        data_set: pandas DataFrame to modify.

    Returns:
        None. *data_set* is mutated in place.
    """
    # np.nan is used instead of the np.NaN alias, which no longer exists in
    # NumPy 2.0. A single nested-dict replace handles all substitutions in
    # one call instead of six separate passes.
    data_set.replace(
        {
            '근로기간': {
                '<1 year': '< 1 year',
                '10+years': '10+ years',
                '1 years': '1 year',
                '3': '3 years',
                'Unknown': np.nan,
            },
            '주택소유상태': {'ANY': 'MORTGAGE'},
        },
        inplace=True,
    )

replace_work_period(X_train)
replace_work_period(X_val)

In [16]:
for var in X_train[cate_column]:
    print(var, X_train[var].unique())

대출기간 [' 36 months' ' 60 months']
근로기간 ['10+ years' '2 years' '5 years' '1 year' '4 years' '7 years' '< 1 year'
 '3 years' '8 years' '9 years' '6 years' nan]
주택소유상태 ['MORTGAGE' 'RENT' 'OWN']
대출목적 ['신용 카드' '부채 통합' '주요 구매' '기타' '이사' '주택 개선' '휴가' '소규모 사업' '재생 에너지' '주택'
 '의료' '자동차']


In [17]:
for var in X_train[cate_column]:
    print(var, X_val[var].unique())

대출기간 [' 60 months' ' 36 months']
근로기간 ['8 years' '10+ years' '1 year' '< 1 year' '2 years' nan '7 years'
 '6 years' '4 years' '5 years' '3 years' '9 years']
주택소유상태 ['MORTGAGE' 'OWN' 'RENT']
대출목적 ['신용 카드' '기타' '부채 통합' '소규모 사업' '주요 구매' '주택 개선' '의료' '자동차' '휴가' '이사' '주택'
 '재생 에너지']


In [18]:
def simple_imputation(df, column_name, random_state=None):
    """Fill missing values of one column by sampling from its observed values.

    Each missing entry of ``df[column_name]`` is replaced by a value drawn
    at random, with probabilities equal to the relative frequencies of the
    non-missing values in that same column.

    Args:
        df: pandas DataFrame to modify in place.
        column_name: name of the column to impute.
        random_state: optional integer seed. When given, sampling uses a
            dedicated ``numpy.random.RandomState`` so the result is
            reproducible; when None, numpy's global random state is used.

    Returns:
        The same DataFrame object *df*, after in-place imputation.

    Raises:
        ValueError: if the column has missing entries but no observed
            values to sample from.
    """
    # Boolean mask of the rows that need a value.
    missing_mask = df[column_name].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to impute; skip sampling entirely.
        return df

    # Empirical distribution of the observed (non-missing) values.
    probabilities = df.loc[~missing_mask, column_name].value_counts(normalize=True)
    if probabilities.empty:
        raise ValueError(
            f"column {column_name!r} has no observed values to sample from"
        )

    # Draw one replacement per missing row.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    imputed_values = rng.choice(
        probabilities.index.to_numpy(),
        size=n_missing,
        p=probabilities.to_numpy(),
    )

    # Write the sampled values back into the missing positions.
    df.loc[missing_mask, column_name] = imputed_values

    return df


# 단순 확률 대치법 적용
column_to_impute = '근로기간'
simple_imputation(X_train, column_to_impute)
simple_imputation(X_val, column_to_impute)

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,60 months,8 years,MORTGAGE,72000000.0,33.74,18.0,신용 카드,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,36 months,10+ years,OWN,79200000.0,9.67,41.0,기타,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,36 months,1 year,RENT,91200000.0,9.81,12.0,신용 카드,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,36 months,< 1 year,MORTGAGE,132000000.0,17.72,38.0,부채 통합,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,36 months,10+ years,MORTGAGE,102000000.0,13.64,30.0,소규모 사업,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,60 months,3 years,RENT,66000000.0,19.18,12.0,부채 통합,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,36 months,10+ years,RENT,53697600.0,16.06,25.0,휴가,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,60 months,< 1 year,RENT,134400000.0,4.82,15.0,신용 카드,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,36 months,1 year,MORTGAGE,74400000.0,14.36,49.0,신용 카드,2.0,2110164.0,614856.0,0.0,0.0


In [19]:
X_train.isnull().sum()

대출금액            0
대출기간            0
근로기간            0
주택소유상태          0
연간소득            0
부채_대비_소득_비율     0
총계좌수            0
대출목적            0
최근_2년간_연체_횟수    0
총상환원금           0
총상환이자           0
총연체금액           0
연체계좌수           0
dtype: int64

In [20]:
X_val.isnull().sum()

대출금액            0
대출기간            0
근로기간            0
주택소유상태          0
연간소득            0
부채_대비_소득_비율     0
총계좌수            0
대출목적            0
최근_2년간_연체_횟수    0
총상환원금           0
총상환이자           0
총연체금액           0
연체계좌수           0
dtype: int64

In [21]:
def loan_purpose(data_set):
    """Collapse detailed '대출목적' labels into broader groups, in place.

    '주택 개선' and '이사' become '주택'; '주요 구매', '휴가', '의료' and
    '자동차' become '생활비'; '소규모 사업', '재생 에너지' and '결혼' become
    '기타'. Any other label is left unchanged.

    Args:
        data_set: pandas DataFrame to modify.

    Returns:
        None. *data_set* is mutated in place.
    """
    # Target label -> the original labels that are folded into it.
    purpose_groups = {
        '주택': ('주택 개선', '이사'),
        '생활비': ('주요 구매', '휴가', '의료', '자동차'),
        '기타': ('소규모 사업', '재생 에너지', '결혼'),
    }
    # Invert into an old-label -> new-label lookup for a single replace call.
    label_map = {
        old_label: new_label
        for new_label, old_labels in purpose_groups.items()
        for old_label in old_labels
    }
    data_set.replace({'대출목적': label_map}, inplace=True)
loan_purpose(X_train)
loan_purpose(X_val)

In [22]:
X_train['대출목적'].value_counts()

부채 통합    44132
신용 카드    19673
주택        5573
기타        4379
생활비       3278
Name: 대출목적, dtype: int64

In [23]:
X_val['대출목적'].value_counts()

부채 통합    11018
신용 카드     4827
주택        1394
기타        1193
생활비        827
Name: 대출목적, dtype: int64

In [24]:
X_train['근로기간'].unique()

array(['10+ years', '2 years', '5 years', '1 year', '4 years', '7 years',
       '< 1 year', '3 years', '8 years', '9 years', '6 years'],
      dtype=object)

In [25]:
X_val['근로기간'].unique()

array(['8 years', '10+ years', '1 year', '< 1 year', '2 years', '7 years',
       '6 years', '4 years', '5 years', '3 years', '9 years'],
      dtype=object)

In [26]:
data = X_train['근로기간']

df = pd.DataFrame(data)

def convert_years_to_number(set):
    """Map a work-period label such as '3 years' to a number.

    Labels containing '10+' map to 11, labels containing '<' map to 0, and
    any other label maps to its first whitespace-separated token converted
    to float.

    Note: the parameter name shadows the builtin ``set``; it is kept
    unchanged so existing callers are unaffected.
    """
    if "10+" not in set:
        if "<" in set:
            return 0
        leading_token = set.split()[0]
        return float(leading_token)
    return 11
    
X_train['근로기간'] = df['근로기간'].apply(convert_years_to_number)

In [27]:
data = X_val['근로기간']

df = pd.DataFrame(data)

def convert_years_to_number(set):
    """Map a work-period label such as '3 years' to a number.

    Labels containing '10+' map to 11, labels containing '<' map to 0, and
    any other label maps to its first whitespace-separated token converted
    to float.

    Note: the parameter name shadows the builtin ``set``; it is kept
    unchanged so existing callers are unaffected.
    """
    if "10+" not in set:
        if "<" in set:
            return 0
        leading_token = set.split()[0]
        return float(leading_token)
    return 11
    
X_val['근로기간'] = df['근로기간'].apply(convert_years_to_number)

In [28]:
# 다중공선성 확인 1

from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(data):
    """Return a DataFrame with the VIF of every column of *data*.

    Args:
        data: pandas DataFrame of numeric columns.

    Returns:
        DataFrame with a "Variable" column holding the column names of
        *data* and a "VIF" column holding the value returned by
        ``variance_inflation_factor`` for the matching column position.
    """
    # NOTE(review): the matrix is passed as-is, without an added constant
    # column — confirm against the statsmodels documentation whether an
    # intercept should be included before interpreting these values.
    vif_values = []
    for col_idx in range(data.shape[1]):
        vif_values.append(variance_inflation_factor(data.values, col_idx))
    return pd.DataFrame({"Variable": data.columns, "VIF": vif_values})

# 독립 변수만 선택
independent_vars = X_train[num_column]

# VIF 계산
vif_result = calculate_vif(independent_vars)
print(vif_result)

       Variable       VIF
0          대출금액  6.493396
1          연간소득  2.058218
2   부채_대비_소득_비율  1.244949
3          총계좌수  3.695351
4  최근_2년간_연체_횟수  1.176848
5         총상환원금  2.124199
6         총상환이자  3.379988
7         총연체금액  1.005756
8         연체계좌수  1.024213


In [29]:
X_train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
56034,28800000.0,36 months,11.0,MORTGAGE,120000000.0,10.10,20.0,신용 카드,0.0,2914824.0,625428.0,0.0,0.0
20086,2880000.0,36 months,2.0,RENT,16200000.0,33.78,16.0,신용 카드,0.0,65172.0,28428.0,0.0,0.0
43318,14400000.0,36 months,5.0,RENT,240000000.0,3.76,37.0,부채 통합,0.0,338304.0,125664.0,0.0,0.0
46440,7800000.0,36 months,11.0,RENT,40149600.0,22.92,28.0,신용 카드,0.0,385116.0,97944.0,0.0,0.0
9774,17760000.0,60 months,1.0,RENT,54000000.0,13.41,29.0,부채 통합,0.0,442560.0,407424.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,15360000.0,36 months,3.0,MORTGAGE,87000000.0,22.35,47.0,부채 통합,0.0,389088.0,74544.0,0.0,0.0
54886,42000000.0,60 months,11.0,MORTGAGE,168000000.0,23.90,36.0,부채 통합,0.0,1089564.0,648168.0,0.0,0.0
76820,18000000.0,36 months,5.0,RENT,48000000.0,18.06,26.0,신용 카드,0.0,1812528.0,411972.0,0.0,0.0
860,24000000.0,36 months,11.0,MORTGAGE,123600000.0,21.88,73.0,주택,0.0,2395752.0,597180.0,0.0,0.0


In [30]:
X_val

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
79749,14400000.0,60 months,8.0,MORTGAGE,72000000.0,33.74,18.0,신용 카드,0.0,530268.0,427224.0,0.0,0.0
61621,34830000.0,36 months,11.0,OWN,79200000.0,9.67,41.0,기타,3.0,741732.0,475968.0,0.0,0.0
127,14400000.0,36 months,1.0,RENT,91200000.0,9.81,12.0,신용 카드,1.0,309456.0,176700.0,0.0,0.0
65562,21900000.0,36 months,0.0,MORTGAGE,132000000.0,17.72,38.0,부채 통합,0.0,982080.0,510948.0,0.0,0.0
26735,12000000.0,36 months,11.0,MORTGAGE,102000000.0,13.64,30.0,기타,0.0,559536.0,226152.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50679,29040000.0,60 months,3.0,RENT,66000000.0,19.18,12.0,부채 통합,0.0,1268280.0,1549476.0,0.0,0.0
43079,5760000.0,36 months,11.0,RENT,53697600.0,16.06,25.0,생활비,0.0,527880.0,250512.0,0.0,0.0
54846,15570000.0,60 months,0.0,RENT,134400000.0,4.82,15.0,신용 카드,0.0,357120.0,365328.0,0.0,0.0
88333,28800000.0,36 months,1.0,MORTGAGE,74400000.0,14.36,49.0,신용 카드,2.0,2110164.0,614856.0,0.0,0.0


### Test 데이터 셋에 대해서도 동일하게 전처리 진행 

In [36]:
type_transform(test)
replace_work_period(test)
simple_imputation(test, column_to_impute)
loan_purpose(test)

In [38]:
data = test['근로기간']

df = pd.DataFrame(data)

def convert_years_to_number(set):
    """Map a work-period label such as '3 years' to a number.

    Labels containing '10+' map to 11, labels containing '<' map to 0, and
    any other label maps to its first whitespace-separated token converted
    to float.

    Note: the parameter name shadows the builtin ``set``; it is kept
    unchanged so existing callers are unaffected.
    """
    if "10+" not in set:
        if "<" in set:
            return 0
        leading_token = set.split()[0]
        return float(leading_token)
    return 11
    
test['근로기간'] = df['근로기간'].apply(convert_years_to_number)

In [39]:
test

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,16800000.0,36 months,8.0,MORTGAGE,132000000.0,19.64,12.0,주택,0.0,394692.0,146604.0,0.0,0.0
1,8400000.0,36 months,5.0,RENT,89971200.0,15.84,25.0,부채 통합,0.0,0.0,0.0,0.0,0.0
2,17280000.0,36 months,6.0,RENT,150000000.0,8.41,20.0,신용 카드,0.0,1786980.0,281820.0,0.0,0.0
3,14400000.0,36 months,5.0,MORTGAGE,66000000.0,13.72,30.0,신용 카드,1.0,669024.0,281724.0,0.0,0.0
4,27600000.0,36 months,5.0,RENT,55200000.0,30.50,12.0,신용 카드,0.0,1250052.0,614844.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64192,30000000.0,36 months,3.0,MORTGAGE,78000000.0,22.08,27.0,부채 통합,2.0,1307532.0,763380.0,0.0,0.0
64193,30000000.0,60 months,11.0,MORTGAGE,109200000.0,12.06,26.0,부채 통합,0.0,960612.0,1245252.0,0.0,0.0
64194,6120000.0,36 months,11.0,RENT,39600000.0,28.80,33.0,부채 통합,0.0,131520.0,80880.0,0.0,0.0
64195,11520000.0,36 months,11.0,MORTGAGE,66000000.0,25.44,41.0,부채 통합,1.0,1339536.0,601872.0,0.0,0.0


In [40]:
# Save the preprocessed data sets as CSV files, omitting the row index.
# NOTE(review): DataFrame.to_csv returns None when it is given a file path, so
# the names assigned below presumably all hold None — confirm nothing relies on them.

train_set = X_train.to_csv("preprocessing_train.csv", index=False)
val_set = X_val.to_csv("preprocessing_validation.csv", index=False)
y_train_set = y_train.to_csv('y_train.csv', index=False)
y_val_set = y_val.to_csv('y_val.csv', index=False)
test_set = test.to_csv('preprocessing_test.csv',index=False)