In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Load the training data from a path relative to the notebook's working directory.
train_df = pd.read_csv('./open/train.csv')
# Preview the first rows to check which columns were read.
train_df.head()

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,TRAIN_00000,12480000,36 months,6 years,RENT,72000000,18.9,15,부채 통합,0,0,0.0,0.0,0.0,C
1,TRAIN_00001,14400000,60 months,10+ years,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,TRAIN_00002,12000000,36 months,5 years,MORTGAGE,96000000,8.6,14,부채 통합,0,928644,151944.0,0.0,0.0,A
3,TRAIN_00003,14400000,36 months,8 years,MORTGAGE,132000000,15.09,15,부채 통합,0,325824,153108.0,0.0,0.0,C
4,TRAIN_00004,18000000,60 months,Unknown,RENT,71736000,25.39,19,주요 구매,0,228540,148956.0,0.0,0.0,B


In [3]:
# Drop the rows whose home-ownership status ('주택소유상태') is 'ANY'. Per the
# original analysis note, this value does not occur in the test set and appears
# only once in the train set, so removing it was judged preferable.
#
# .copy() makes the filtered frame an independent object, so the column
# assignments performed in later cells do not operate on a slice of the
# original frame (which would raise SettingWithCopyWarning).

train_df = train_df[train_df['주택소유상태'] != 'ANY'].copy()

In [4]:
# Merge the duplicated spellings found in the employment-length ('근로기간') column
# Function definition

def unit_standardization(x):
    """Map a raw employment-length value to its standardized label.

    Several spellings of the same bucket (e.g. '10+ years' / '10+years',
    '3 years' / '3') are collapsed into one label.

    Args:
        x: Raw value from the '근로기간' column.

    Returns:
        '10년이상' for the ten-plus bucket, '1'..'9' for whole years,
        '1년이하' for the under-one-year bucket, and 'Unknown' for any value
        that matches none of the known spellings.
    """
    # (accepted spellings, standardized label), checked from top to bottom.
    spelling_groups = (
        (('10+ years', '10+years'), '10년이상'),
        (('9 years',), '9'),
        (('8 years',), '8'),
        (('7 years',), '7'),
        (('6 years',), '6'),
        (('5 years',), '5'),
        (('4 years',), '4'),
        (('3 years', '3'), '3'),
        (('2 years',), '2'),
        (('1 year', '1 years'), '1'),
        (('< 1 year', '<1 year'), '1년이하'),
    )
    for spellings, label in spelling_groups:
        for spelling in spellings:
            if x == spelling:
                return label
    # Anything unrecognized falls into the 'Unknown' bucket.
    return 'Unknown'

In [5]:
# Apply to the train set: replace every raw '근로기간' value with its standardized label.

train_df['근로기간'] = train_df['근로기간'].map(unit_standardization)

In [6]:
# Convert the loan term ('대출기간') from a categorical label to a number
# Function definition

def conv_loan_period(x):
    """Convert a loan-term label such as ' 36 months' to a number of months.

    Args:
        x: Raw value from the '대출기간' column.

    Returns:
        36 when the label reads '36 months' (ignoring leading, trailing, or
        repeated whitespace), otherwise 60.
    """
    # Normalize whitespace before comparing. An exact match against
    # ' 36 months' (one leading space) would silently classify '36 months'
    # or a differently padded label as a 60-month loan.
    label = ' '.join(x.split()) if isinstance(x, str) else x
    if label == '36 months':
        return 36
    # Every other value is treated as the 60-month term.
    return 60

In [7]:
# Apply to the train set: replace every '대출기간' label with its numeric term in months.

train_df['대출기간'] = train_df['대출기간'].map(conv_loan_period)

In [8]:
display(train_df.head(3))

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급
0,TRAIN_00000,12480000,36,6,RENT,72000000,18.9,15,부채 통합,0,0,0.0,0.0,0.0,C
1,TRAIN_00001,14400000,60,10년이상,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B
2,TRAIN_00002,12000000,36,5,MORTGAGE,96000000,8.6,14,부채 통합,0,928644,151944.0,0.0,0.0,A


In [9]:
# Create derived features
# Total repaid amount ('총상환액') = repaid principal ('총상환원금') + repaid interest ('총상환이자').

train_df['총상환액'] = train_df['총상환원금'].add(train_df['총상환이자'])

In [10]:
# Repaid-to-loan ratio: total repaid amount divided by the loan amount.
train_df['총상환대출비율'] = train_df['총상환액'].div(train_df['대출금액'])
# Loan amount per month of the loan term.
train_df['월별대출금액'] = train_df['대출금액'].div(train_df['대출기간'])

In [11]:
train_df.head(3)

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,총상환액,총상환대출비율,월별대출금액
0,TRAIN_00000,12480000,36,6,RENT,72000000,18.9,15,부채 통합,0,0,0.0,0.0,0.0,C,0.0,0.0,346666.666667
1,TRAIN_00001,14400000,60,10년이상,MORTGAGE,130800000,22.33,21,주택 개선,0,373572,234060.0,0.0,0.0,B,607632.0,0.042197,240000.0
2,TRAIN_00002,12000000,36,5,MORTGAGE,96000000,8.6,14,부채 통합,0,928644,151944.0,0.0,0.0,A,1080588.0,0.090049,333333.333333


In [12]:
# Apply log scaling (log1p) to the numeric columns

# Collect the names of all columns whose dtype is numeric.
numeric_columns = train_df.select_dtypes(include=[np.number]).columns

# Overwrite those columns in place with log(1 + x).
# NOTE(review): this cell is not idempotent — running it again applies log1p a
# second time to the already-transformed columns. Re-run from the data-loading
# cell instead of re-running this cell on its own.
train_df[numeric_columns] = np.log1p(train_df[numeric_columns])

In [13]:
numeric_columns

Index(['대출금액', '대출기간', '연간소득', '부채_대비_소득_비율', '총계좌수', '최근_2년간_연체_횟수', '총상환원금',
       '총상환이자', '총연체금액', '연체계좌수', '총상환액', '총상환대출비율', '월별대출금액'],
      dtype='object')

In [14]:
train_df.head(3)

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,총상환액,총상환대출비율,월별대출금액
0,TRAIN_00000,16.339638,3.610918,6,RENT,18.092177,2.99072,2.772589,부채 통합,0.0,0.0,0.0,0.0,0.0,C,0.0,0.0,12.756122
1,TRAIN_00001,16.482739,4.110874,10년이상,MORTGAGE,18.68918,3.14974,3.091042,주택 개선,0.0,12.830869,12.363337,0.0,0.0,B,13.317326,0.041331,12.388398
2,TRAIN_00002,16.300417,3.610918,5,MORTGAGE,18.379859,2.261763,2.70805,부채 통합,0.0,13.741482,11.931274,0.0,0.0,A,13.893017,0.086223,12.716901


In [15]:
# MinMax Scaling
from sklearn.preprocessing import MinMaxScaler

In [16]:
# Create the MinMaxScaler instance (constructed with its default settings)
mm_scaler = MinMaxScaler()

In [17]:
# Fit the scaler on the selected numeric columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on the whole frame, before the later
# train/test split, so held-out rows contribute to the fitted min/max. Consider
# fitting on the training partition only and transforming the test partition.
train_df[numeric_columns] = mm_scaler.fit_transform(train_df[numeric_columns])

In [18]:
# 범주형데이터 인코딩

from sklearn.preprocessing import LabelEncoder

In [19]:
# Create the LabelEncoder instance used to encode the categorical columns
label_encoder = LabelEncoder()

In [20]:
# Categorical columns to label-encode; the list includes the target column '대출등급'.
encoding_list = ['근로기간', '주택소유상태', '대출목적', '대출등급']

In [21]:
# Label-encode each categorical column with its own encoder.
# Refitting one shared LabelEncoder for every column discards all but the last
# fitted mapping, so the integer codes of the other columns could no longer be
# translated back to their labels. A dedicated encoder per column is stored in
# `label_encoders` (keyed by column name) for later inverse_transform calls.
label_encoders = {}
for column in encoding_list:
    # Rebinding `label_encoder` keeps that name pointing at the encoder fitted
    # last once the loop ends.
    label_encoder = LabelEncoder()
    train_df[column] = label_encoder.fit_transform(train_df[column])
    label_encoders[column] = label_encoder

In [22]:
train_df.head(3)

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,총상환액,총상환대출비율,월별대출금액
0,TRAIN_00000,0.658671,0.0,7,2,0.783116,0.324713,0.329845,1,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.658669
1,TRAIN_00001,0.698921,1.0,1,0,0.808957,0.341979,0.420151,10,0.0,0.731015,0.795184,0.0,0.0,1,0.758338,0.057717,0.55524
2,TRAIN_00002,0.64764,0.0,6,0,0.795568,0.245568,0.311543,1,0.0,0.782895,0.767395,0.0,0.0,0,0.79112,0.120407,0.647638


In [23]:
train_df.head(5)

Unnamed: 0,ID,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,총상환액,총상환대출비율,월별대출금액
0,TRAIN_00000,0.658671,0.0,7,2,0.783116,0.324713,0.329845,1,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.658669
1,TRAIN_00001,0.698921,1.0,1,0,0.808957,0.341979,0.420151,10,0.0,0.731015,0.795184,0.0,0.0,1,0.758338,0.057717,0.55524
2,TRAIN_00002,0.64764,0.0,6,0,0.795568,0.245568,0.311543,1,0.0,0.782895,0.767395,0.0,0.0,0,0.79112,0.120407,0.647638
3,TRAIN_00003,0.698921,0.0,9,0,0.809352,0.301639,0.329845,1,0.0,0.723224,0.767885,0.0,0.0,2,0.744785,0.04569,0.698919
4,TRAIN_00004,0.761684,1.0,11,2,0.782957,0.35536,0.393123,8,0.0,0.703018,0.766117,0.0,0.0,1,0.731233,0.028984,0.618003


In [24]:
# shap import

import shap

In [25]:
# train set을 train과 test로 나누기

from sklearn.model_selection import train_test_split

In [26]:
train_df.columns

Index(['ID', '대출금액', '대출기간', '근로기간', '주택소유상태', '연간소득', '부채_대비_소득_비율', '총계좌수',
       '대출목적', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '총연체금액', '연체계좌수', '대출등급',
       '총상환액', '총상환대출비율', '월별대출금액'],
      dtype='object')

In [27]:
# Feature columns used as model input: every column of train_df except the
# 'ID' identifier and the '대출등급' target.
x_list = ['대출금액', '대출기간', '근로기간', '주택소유상태', '연간소득', '부채_대비_소득_비율', '총계좌수',
       '대출목적', '최근_2년간_연체_횟수', '총상환원금', '총상환이자', '총연체금액', '연체계좌수',
       '총상환액', '총상환대출비율', '월별대출금액']

In [28]:
# Separate the feature matrix from the target column ('대출등급').
X, y = train_df[x_list], train_df['대출등급']

# Split the data into a train set and a test set.
# test_size=0.25 spells out the library default; stratify=y keeps the class
# proportions of the target equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [29]:
# Print the shape of each partition, one per line: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(72219, 16)
(24074, 16)
(72219,)
(24074,)


In [30]:
X_train.head(3)

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,부채_대비_소득_비율,총계좌수,대출목적,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,총상환액,총상환대출비율,월별대출금액
67899,0.669286,0.0,1,2,0.753113,0.362408,0.508104,1,0.201849,0.77805,0.84561,0.0,0.0,0.80445,0.139897,0.669284
1176,0.839417,0.0,10,2,0.768696,0.324385,0.329845,1,0.0,0.831909,0.887483,0.0,0.0,0.852552,0.175524,0.839416
24038,0.98345,0.0,1,0,0.805227,0.331071,0.444826,1,0.0,0.845239,0.895925,0.0,0.0,0.864129,0.131004,0.98345


In [31]:
# 머신러닝모델 라이브러리 

from sklearn.tree import DecisionTreeClassifier #할당받은거
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier #할당받은거
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier #할당받은거

# 평가 라이브러리

from sklearn.metrics import accuracy_score, f1_score

In [32]:
# GradientBoostingClassifier
# Retry with MinMax-scaled features; parameters: n_estimators=100, max_depth=5, learning_rate=0.1
# Other parameters noted but not passed: learning_rate=0.1, min_samples_split=3,
# min_samples_leaf=5, max_features=10
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
)
gb_model.fit(X_train, y_train)

# Predict on the training partition first, then on the held-out partition.
y_pred_train, y_pred_test = (gb_model.predict(features) for features in (X_train, X_test))

# Accuracy and class-weighted F1 for both partitions.
train_accuracy, train_f1_score = (
    accuracy_score(y_train, y_pred_train),
    f1_score(y_train, y_pred_train, average='weighted'),
)
test_accuracy, test_f1_score = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)

# One row per partition, one column per metric.
result = pd.DataFrame(
    [[train_accuracy, train_f1_score], [test_accuracy, test_f1_score]],
    index=['train', 'test'],
    columns=['accuracy', 'F1'],
)

display(result)

Unnamed: 0,accuracy,F1
train,0.888492,0.888586
test,0.859226,0.858998


In [32]:
# XGBClassifier
# Retry with MinMax-scaled features; parameters: n_estimators=100, max_depth=5, learning_rate=0.1
xgb_model = XGBClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
xgb_model.fit(X_train, y_train)

# Predict on the training partition first, then on the held-out partition.
y_pred_train, y_pred_test = (xgb_model.predict(features) for features in (X_train, X_test))

# Accuracy and class-weighted F1 for both partitions.
train_accuracy, train_f1_score = (
    accuracy_score(y_train, y_pred_train),
    f1_score(y_train, y_pred_train, average='weighted'),
)
test_accuracy, test_f1_score = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)

# One row per partition, one column per metric.
result = pd.DataFrame(
    [[train_accuracy, train_f1_score], [test_accuracy, test_f1_score]],
    index=['train', 'test'],
    columns=['accuracy', 'F1'],
)

display(result)

Unnamed: 0,accuracy,F1
train,0.860895,0.86086
test,0.842361,0.841887


In [33]:
# LGBMClassifier
# Retry with MinMax-scaled features; parameters: n_estimators=100, max_depth=5, learning_rate=0.1
lgb_model = LGBMClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
lgb_model.fit(X_train, y_train)

# Predict on the training partition first, then on the held-out partition.
y_pred_train, y_pred_test = (lgb_model.predict(features) for features in (X_train, X_test))

# Accuracy and class-weighted F1 for both partitions.
train_accuracy, train_f1_score = (
    accuracy_score(y_train, y_pred_train),
    f1_score(y_train, y_pred_train, average='weighted'),
)
test_accuracy, test_f1_score = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)

# One row per partition, one column per metric.
result = pd.DataFrame(
    [[train_accuracy, train_f1_score], [test_accuracy, test_f1_score]],
    index=['train', 'test'],
    columns=['accuracy', 'F1'],
)

display(result)

Unnamed: 0,accuracy,F1
train,0.851646,0.851627
test,0.818186,0.817672


In [34]:
# RandomForestClassifier
# Retry with MinMax-scaled features; parameters: n_estimators=100, max_depth=5
rf_model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Predict on the training partition first, then on the held-out partition.
y_pred_train, y_pred_test = (rf_model.predict(features) for features in (X_train, X_test))

# Accuracy and class-weighted F1 for both partitions.
train_accuracy, train_f1_score = (
    accuracy_score(y_train, y_pred_train),
    f1_score(y_train, y_pred_train, average='weighted'),
)
test_accuracy, test_f1_score = (
    accuracy_score(y_test, y_pred_test),
    f1_score(y_test, y_pred_test, average='weighted'),
)

# One row per partition, one column per metric.
result = pd.DataFrame(
    [[train_accuracy, train_f1_score], [test_accuracy, test_f1_score]],
    index=['train', 'test'],
    columns=['accuracy', 'F1'],
)

display(result)

Unnamed: 0,accuracy,F1
train,0.508592,0.481374
test,0.50702,0.479992


In [33]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [None]:
# Create the estimator and the RFECV (recursive feature elimination with
# cross-validation) selector.
rfecv_gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=5, learning_rate=0.1)
# StratifiedKFold is used so the multi-class target is stratified across folds.
# The smallest number of features to keep can be set with min_features_to_select.
rfecv = RFECV(estimator=rfecv_gb_model, step=1, cv=StratifiedKFold(5), scoring='f1_weighted')

# Run RFECV
selector = rfecv.fit(X, y)

# Mean cross-validation score for each candidate number of features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# `cv_results_['mean_test_score']` holds these values; the older attribute is
# used only when `cv_results_` is unavailable.
if hasattr(selector, 'cv_results_'):
    mean_cv_scores = selector.cv_results_['mean_test_score']
else:
    mean_cv_scores = selector.grid_scores_

# Plot the cross-validation score against the number of selected features.
# The y-axis label names the metric actually computed (scoring='f1_weighted').
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation score (weighted F1)")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

# Boolean mask over the columns of X marking the selected features
selected_features = selector.support_
print("Selected Features:", selected_features)


# Integer positions of the selected features within X's columns
selected_feature_indices = selector.get_support(indices=True)
print('selected_feature: ', selected_feature_indices)

In [None]:
# Translate the selected column positions into column names.
selected_feature_names = X.columns[selected_feature_indices].tolist()
print(selected_feature_names)

In [None]:
# Read the feature importances from the fitted gradient boosting model
feature_importances = gb_model.feature_importances_

# Pair each feature name with its importance and sort in ascending order of
# importance, so the horizontal bar chart lists the most important feature last
# (drawn at the top).
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=True)
)

# Visualize the feature importances
plt.figure(figsize=(8, 8))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.title('GradientBoostingClassifier Feature Importance')
plt.xlabel('Feature Importance')
plt.show()