# 데이콘 제출용 작업
- test.csv에 대한 모델의 성능을 확인하기 위한 코드
- 제출 후 F1 점수 : 0.9463
  - 훈련 데이터만을 사용했을 때의 F1 점수와 큰 차이가 없음 -> 모델이 과적합되지 않았음을 시사

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rc
rc('font', family='AppleGothic')

In [20]:
# Load the training data and preview its first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,TRAIN_00000,12480000,36 months,6 years,RENT,72000000,18.9,15,부채 통합,0,0,0.0,0.0,0.0,C
1,TRAIN_00001,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,TRAIN_00002,12000000,36 months,5 years,MORTGAGE,96000000,8.6,14,부채 통합,0,928644,151944.0,0.0,0.0,A
3,TRAIN_00003,14400000,36 months,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C
4,TRAIN_00004,18000000,60 months,Unknown,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,B


In [21]:
# Add repayment-ratio features: total repaid principal and total repaid
# interest, each divided by the loan amount.
for ratio_col, amount_col in (('총상환원금비율', '총상환원금'), ('총상환이자비율', '총상환이자')):
    train[ratio_col] = train[amount_col] / train['대출금액']

In [22]:
# Normalize inconsistent spellings of the employment-length labels.
# All four fixes are applied in one pass; no replacement value is itself
# a key, so the result matches applying them one after another.
train['근로기간'] = train['근로기간'].replace({
    '1 years': '1 year',
    '<1 year': '< 1 year',
    '3': '3 years',
    '10+years': '10+ years',
})
# Show the remaining distinct labels to confirm the cleanup.
train['근로기간'].unique()

array(['6 years', '10+ years', '5 years', '8 years', 'Unknown', '9 years',
       '2 years', '1 year', '3 years', '7 years', '4 years', '< 1 year'],
      dtype=object)

In [23]:
# Convert the loan term from a text label (object) to int64,
# changing the unit from months to years (36 months -> 3, 60 months -> 5).
#
# The replaced Series is assigned back to the column instead of calling
# replace(inplace=True) on train['대출기간']: that chained in-place form
# emits a FutureWarning in pandas 2.2 and does not modify `train` at all
# under Copy-on-Write, which would make the astype below fail.
# NOTE(review): the keys keep the leading space from the original cell;
# presumably the raw CSV values carry it - confirm against train.csv.
train['대출기간'] = train['대출기간'].replace({' 36 months' : '3', ' 60 months' : '5'})
train['대출기간'] = train['대출기간'].astype('int64')

In [24]:
# Per the author's note there is only one 'ANY' row, so it is filled with
# the most frequent category (MORTGAGE).
any_rows = train['주택소유상태'] == 'ANY'
train.loc[any_rows, '주택소유상태'] = 'MORTGAGE'
# Display how many 'ANY' rows remain after the fill.
(train[train['주택소유상태'] == 'ANY']).value_counts().sum()

0

In [26]:
# Treat an unknown employment length as '0' (kept as a string label).
train['근로기간'] = train['근로기간'].replace({'Unknown': '0'})

In [28]:
# Log-transform (log1p) the columns the author identified as skewed:
# annual income, total repaid principal and total repaid interest.
for raw_col, log_col in (
    ('연간소득', '연간소득_log'),
    ('총상환원금', '총상환원금_log'),
    ('총상환이자', '총상환이자_log'),
):
    train[log_col] = np.log1p(train[raw_col])

In [29]:
# Preview the training frame after the ratio and log columns were added.
train.head()

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,총상환원금비율,총상환이자비율,연간소득_log,총상환원금_log,총상환이자_log
0,TRAIN_00000,12480000,3,6 years,RENT,72000000,18.9,15,부채 통합,0,0,0.0,0.0,0.0,C,0.0,0.0,18.092177,0.0,0.0
1,TRAIN_00001,14400000,5,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B,0.025943,0.016254,18.68918,12.830869,12.363337
2,TRAIN_00002,12000000,3,5 years,MORTGAGE,96000000,8.6,14,부채 통합,0,928644,151944.0,0.0,0.0,A,0.077387,0.012662,18.379859,13.741482,11.931274
3,TRAIN_00003,14400000,3,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C,0.022627,0.010632,18.698312,12.694116,11.938905
4,TRAIN_00004,18000000,5,0,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,B,0.012697,0.008275,18.088503,12.339471,11.911413


In [32]:
# Show (rows, columns) of the training frame.
train.shape

(96294, 20)

In [33]:
# Keep only the target (loan grade) and the four repayment-derived
# features used for modelling.
train_df = train[[
    '대출등급',
    '총상환원금비율',
    '총상환이자비율',
    '총상환원금_log',
    '총상환이자_log',
]]
train_df.head()

Unnamed: 0,대출등급,총상환원금비율,총상환이자비율,총상환원금_log,총상환이자_log
0,C,0.0,0.0,0.0,0.0
1,B,0.025943,0.016254,12.830869,12.363337
2,A,0.077387,0.012662,13.741482,11.931274
3,C,0.022627,0.010632,12.694116,11.938905
4,B,0.012697,0.008275,12.339471,11.911413


## test 데이터 전처리


In [9]:
# Load the test data from 'test_pre.csv' and preview its first rows.
# NOTE(review): the file name and the preview (integer loan term) suggest
# this file was already partly preprocessed elsewhere - confirm its origin.
test = pd.read_csv('test_pre.csv')
test.head()

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수
0,16800000,3,8 years,MORTGAGE,132000000,19.64,12,주택 개선,0,394692,146604.0,0.0,0.0
1,8400000,3,5 years,RENT,89971200,15.84,25,부채 통합,0,0,0.0,0.0,0.0
2,17280000,3,6 years,RENT,150000000,8.41,20,신용 카드,0,1786980,281820.0,0.0,0.0
3,14400000,3,5 years,MORTGAGE,66000000,13.72,30,신용 카드,1,669024,281724.0,0.0,0.0
4,27600000,3,5 years,RENT,55200000,30.5,12,신용 카드,0,1250052,614844.0,0.0,0.0


In [10]:
# List the distinct loan-purpose categories in the test data.
test['대출목적'].unique()

array(['주택 개선', '부채 통합', '신용 카드', '기타', '주요 구매', '이사', '휴가', '소규모 사업',
       '주택', '의료', '자동차', '재생 에너지', '결혼'], dtype=object)

In [11]:
# Fold the '결혼' loan-purpose category into '기타'.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on test['대출목적']: that chained in-place form
# emits a FutureWarning in pandas 2.2 and leaves `test` unchanged under
# Copy-on-Write.
test['대출목적'] = test['대출목적'].replace('결혼', '기타')

In [12]:
# Re-list the loan-purpose categories to check the result of the replacement.
test['대출목적'].unique()

array(['주택 개선', '부채 통합', '신용 카드', '기타', '주요 구매', '이사', '휴가', '소규모 사업',
       '주택', '의료', '자동차', '재생 에너지'], dtype=object)

In [13]:
# List the distinct employment-length labels in the test data.
test['근로기간'].unique()

array(['8 years', '5 years', '6 years', '0', '10+ years', '3 years',
       '< 1 year', '1 year', '7 years', '4 years', '2 years', '9 years'],
      dtype=object)

In [15]:
# Show (rows, columns) of the test frame before feature engineering.
test.shape

(64197, 13)

In [16]:
# Add the same repayment-ratio features as for the training data: total
# repaid principal and total repaid interest, each divided by the loan amount.
for ratio_col, amount_col in (('총상환원금비율', '총상환원금'), ('총상환이자비율', '총상환이자')):
    test[ratio_col] = test[amount_col] / test['대출금액']

In [17]:
# Apply the same log1p transform as for the training data to annual income,
# total repaid principal and total repaid interest.
for raw_col, log_col in (
    ('연간소득', '연간소득_log'),
    ('총상환원금', '총상환원금_log'),
    ('총상환이자', '총상환이자_log'),
):
    test[log_col] = np.log1p(test[raw_col])

In [25]:
# Replace the 'ANY' home-ownership category with 'MORTGAGE', mirroring the
# handling of the training data.
any_rows = test['주택소유상태'] == 'ANY'
test.loc[any_rows, '주택소유상태'] = 'MORTGAGE'
# Display how many 'ANY' rows remain after the fill.
(test[test['주택소유상태'] == 'ANY']).value_counts().sum()

0

In [27]:
# Treat an unknown employment length as '0' (kept as a string label).
test['근로기간'] = test['근로기간'].replace({'Unknown': '0'})

In [30]:
# Preview the test frame after the ratio and log columns were added.
test.head()

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,총상환원금비율,총상환이자비율,연간소득_log,총상환원금_log,총상환이자_log
0,16800000,3,8 years,MORTGAGE,132000000,19.64,12,주택 개선,0,394692,146604.0,0.0,0.0,0.023494,0.008726,18.698312,12.885864,11.895497
1,8400000,3,5 years,RENT,89971200,15.84,25,부채 통합,0,0,0.0,0.0,0.0,0.0,0.0,18.315,0.0,0.0
2,17280000,3,6 years,RENT,150000000,8.41,20,신용 카드,0,1786980,281820.0,0.0,0.0,0.103413,0.016309,18.826146,14.396038,12.549027
3,14400000,3,5 years,MORTGAGE,66000000,13.72,30,신용 카드,1,669024,281724.0,0.0,0.0,0.04646,0.019564,18.005165,13.413577,12.548687
4,27600000,3,5 years,RENT,55200000,30.5,12,신용 카드,0,1250052,614844.0,0.0,0.0,0.045292,0.022277,17.826474,14.038697,13.329125


In [31]:
# Show (rows, columns) of the test frame after feature engineering.
test.shape

(64197, 18)

In [34]:
# Keep only the four repayment-derived features used for modelling
# (same columns, same order as the training feature frame).
test_df = test[[
    '총상환원금비율',
    '총상환이자비율',
    '총상환원금_log',
    '총상환이자_log',
]]
test_df.head()

Unnamed: 0,총상환원금비율,총상환이자비율,총상환원금_log,총상환이자_log
0,0.023494,0.008726,12.885864,11.895497
1,0.0,0.0,0.0,0.0
2,0.103413,0.016309,14.396038,12.549027
3,0.04646,0.019564,13.413577,12.548687
4,0.045292,0.022277,14.038697,13.329125


## 훈련

In [36]:
# Feature columns that are passed through the scaler.
numeric_cols = [
    '총상환원금_log',
    '총상환이자_log',
    '총상환원금비율',
    '총상환이자비율',
]

In [38]:
# Work on copies so scaling does not modify train_df / test_df.
train_scaled, test_scaled = train_df.copy(), test_df.copy()

In [39]:
# Scaling.
# Author's rationale: the features have many outliers and none follows a
# normal distribution, so RobustScaler was chosen.
from sklearn.preprocessing import RobustScaler

# Fit on the training features only, then apply the same fitted scaler to
# both the training and the test features.
scaler = RobustScaler().fit(train_scaled[numeric_cols])
train_scaled[numeric_cols] = scaler.transform(train_scaled[numeric_cols])
test_scaled[numeric_cols] = scaler.transform(test_scaled[numeric_cols])

In [40]:
# Separate the target (loan grade) from the feature matrix.
y = train_scaled['대출등급']
X = train_scaled.drop(columns=['대출등급'])

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with default hyperparameters on the full training
# set; random_state is fixed so repeated runs give the same model.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

In [43]:
# Predict the loan grade for every row of the scaled test features.
# test_scaled holds the same four columns, in the same order, as X.
pred = rf.predict(test_scaled)

In [44]:
# Fill the submission template with the predicted loan grades.
# NOTE(review): assumes sample_submission.csv has the same number of rows,
# in the same order, as test_pre.csv (which has no ID column) - verify.
sub = pd.read_csv('sample_submission.csv')
sub['대출등급'] = pred
sub.head()

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C


In [45]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission1.csv', index=False)