In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import warnings

from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier

%matplotlib inline
pd.options.display.max_columns = 50
# 한글 폰트 설정
plt.rc('font', family='Malgun Gothic')
# 경고창 무시
warnings.filterwarnings(action="ignore")

In [2]:
# 원본 데이터 불러오기
credit_df = pd.read_csv("credit_data.csv")

# 데이터 전처리

In [3]:
# ID 열 삭제
credit_df = credit_df.drop(columns=['OBS'])

In [31]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 992 entries, 0 to 999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DURATION           992 non-null    int64  
 1   EDUCATION_PURPOSE  992 non-null    int64  
 2   SAV_ACCT           992 non-null    int64  
 3   EMPLOYMENT         992 non-null    float64
 4   INSTALL_RATE       992 non-null    int64  
 5   REAL_ESTATE        992 non-null    int64  
 6   PROP_UNKN_NONE     992 non-null    int64  
 7   OTHER_INSTALL      992 non-null    int64  
 8   RENT               992 non-null    int64  
 9   FOREIGN            992 non-null    int64  
 10  IS_CHK_ACCT        992 non-null    int32  
 11  NO_CREDIT_HISTORY  992 non-null    float64
 12  DULY_PAY_HISTORY   992 non-null    float64
 13  CRITICAL_ACCT      992 non-null    float64
 14  IS_SAV_ACCT        992 non-null    int32  
 15  IS_MALE            992 non-null    int32  
 16  RESPONSE           992 non

In [4]:
# 당좌예금계좌 : 범주형 변수 처리
# 3: 당좌계좌 존재 X
# 당좌계좌 유무를 새로운 변수로 만듦
# 당좌계좌가 존재하지 않는 사람들은 계좌 잔액이 0이므로 0으로 처리 

credit_df['IS_CHK_ACCT'] = np.where(credit_df['CHK_ACCT'] == 3, 1, 0)
credit_df.loc[credit_df['CHK_ACCT'] == 3, 'CHK_ACCT'] = 0

In [5]:
# HISTORY: one_hot_encoding
one_hot = OneHotEncoder(sparse=False)
history_df = pd.DataFrame(one_hot.fit_transform(credit_df[['HISTORY']]))
history_df.columns = ['NO_CREDIT_HISTORY', 'DULY_PAY_HISTORY', 'NOW_PAY_HISTORY', 'DELAY_PAY_HISTORY', 'CRITICAL_ACCT']

# concat df
credit_df = pd.concat([credit_df, history_df], axis=1)
credit_df = credit_df.drop(columns=['HISTORY'])

#credit_df['IS_CRITICAL'] = np.where(credit_df['HISTORY'] == 4, 1, 0)

In [6]:
# PURPOSE: '자산 목적', '교육 목적'
credit_df['ASSET_PURPOSE'] = np.where(credit_df['NEW_CAR'] == 1, 1,
                             np.where(credit_df['USED_CAR'] == 1, 1,
                                 np.where(credit_df['FURNITURE'] == 1, 1,
                                     np.where(credit_df['RADIO/TV'] == 1, 1, 0))))

credit_df.loc[credit_df['RETRAINING'] == 1,'EDUCATION'] = 1
credit_df = credit_df.rename(columns={'EDUCATION' : 'EDUCATION_PURPOSE'})
credit_df = credit_df.drop(columns=['NEW_CAR', 'USED_CAR', 'FURNITURE', 'RADIO/TV', 'RETRAINING'])

In [7]:
# 당좌예금(CHK_ACCT)과 똑같이 처리

credit_df['IS_SAV_ACCT'] = np.where(credit_df['SAV_ACCT'] == 4, 1, 0)
credit_df.loc[credit_df['SAV_ACCT'] == 4, 'SAV_ACCT'] = 0

In [8]:
# Employment, PRESENT_RESIDENT 범주형 범위 다른데 어떻게?

In [9]:
# 성별: 남자와 여자로 나누기
credit_df['IS_MALE'] = np.where(credit_df['MALE_DIV'] == 1, 1,
                             np.where(credit_df['MALE_SINGLE'] == 1, 1,
                                 np.where(credit_df['MALE_MAR_or_WID'] == 1, 1, 0)))

credit_df = credit_df.drop(columns=['MALE_DIV', 'MALE_SINGLE', 'MALE_MAR_or_WID'])

In [10]:
# 직업 유무로 나누기
credit_df['IS_JOB'] = np.where(credit_df['JOB'] == 0, 0, 1)

credit_df = credit_df.drop(columns=['JOB'])

#### 결측치 처리

In [13]:
# 결측치 안에서 특별한 패턴을 찾을 수 없었으며
# 전체 데이터의 0.8% 밖에 되지 않아, 삭제하더라도 모델의 결과에 영향을 끼치지 않을 것이라고 판단
credit_df = credit_df.dropna()

In [14]:
# 열 순서 변경
temp = credit_df['RESPONSE']
credit_df = credit_df.drop(columns=['RESPONSE'])
credit_df['RESPONSE'] = temp

# 변수 선정

In [15]:
# levenue: 정규성 유무없이 등분산 검정 가능 
# t-test : 정규분포일 때 t 값의 표본분포를 t분포라고 함 t = (표본평균 - 모평균) / 표준오차

group1 = credit_df[credit_df['RESPONSE'] == 0]
group2 = credit_df[credit_df['RESPONSE'] == 1]
drop_col = []
ttest_p = 0

for col in credit_df.columns[:-1]:
    equal_var = stats.levene(group1[col], group2[col])
    if equal_var.pvalue > .05:
        ttest_p = stats.ttest_ind(group1[col], group2[col], equal_var=True).pvalue
    else:
        ttest_p = stats.ttest_ind(group1[col], group2[col], equal_var=False).pvalue
    
    if ttest_p > 0.05:
        drop_col.append(col)
print("Drop by t-test: ", drop_col)

credit_df = credit_df.drop(columns=drop_col)

Drop by t-test:  ['CHK_ACCT', 'AMOUNT', 'CO-APPLICANT', 'GUARANTOR', 'PRESENT_RESIDENT', 'NUM_CREDITS', 'NUM_DEPENDENTS', 'TELEPHONE', 'NOW_PAY_HISTORY', 'DELAY_PAY_HISTORY', 'IS_JOB']


In [16]:
# 다중공선성

drop_col = []
for i, col in enumerate(credit_df.columns):
    # 다중공선성 10 이상 삭제
    if variance_inflation_factor(credit_df.values, i) > 10:
        drop_col.append(col)

        
print("Drop by Multicollinearity: ", drop_col)
credit_df = credit_df.drop(columns=drop_col)

Drop by Multicollinearity:  ['AGE', 'OWN_RES', 'ASSET_PURPOSE']


In [17]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 992 entries, 0 to 999
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DURATION           992 non-null    int64  
 1   EDUCATION_PURPOSE  992 non-null    int64  
 2   SAV_ACCT           992 non-null    int64  
 3   EMPLOYMENT         992 non-null    float64
 4   INSTALL_RATE       992 non-null    int64  
 5   REAL_ESTATE        992 non-null    int64  
 6   PROP_UNKN_NONE     992 non-null    int64  
 7   OTHER_INSTALL      992 non-null    int64  
 8   RENT               992 non-null    int64  
 9   FOREIGN            992 non-null    int64  
 10  IS_CHK_ACCT        992 non-null    int32  
 11  NO_CREDIT_HISTORY  992 non-null    float64
 12  DULY_PAY_HISTORY   992 non-null    float64
 13  CRITICAL_ACCT      992 non-null    float64
 14  IS_SAV_ACCT        992 non-null    int32  
 15  IS_MALE            992 non-null    int32  
 16  RESPONSE           992 non

# 학습, 평가 데이터 분리

In [18]:
# 독립변수와 종속변수 분리
X = credit_df.drop(columns=['RESPONSE'])
y = credit_df['RESPONSE']

In [19]:
# 학습 데이터 0.8 // 평가 데이터 0.2 로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# SMOTE 

In [20]:
# 종속변수의 데이터 불균형 해결 위해 SMOTE 사용
smote = SMOTE(random_state=1)
X_smote_train, y_smote_train = smote.fit_resample(X_train, y_train)

# 스케일링

In [21]:
# sclaer 쓴 이유: 
scaler = MinMaxScaler()
X_scaler_train = scaler.fit_transform(X_train)
X_scaler_test = scaler.transform(X_test)

# 모델링

In [32]:
def get_score(pred):
    print("정확도: ", accuracy_score(y_test, pred))
    print("정밀도: ", precision_score(y_test, pred))
    print("재현율: ", recall_score(y_test, pred))
    print("ROC score: ", roc_auc_score(y_test, pred))

In [23]:
# 회귀 모델
logit_model = sm.Logit(y, X)
logit_result = logit_model.fit()
logit_result.summary()

Optimization terminated successfully.
         Current function value: 0.486596
         Iterations 7


0,1,2,3
Dep. Variable:,RESPONSE,No. Observations:,992.0
Model:,Logit,Df Residuals:,976.0
Method:,MLE,Df Model:,15.0
Date:,"Wed, 01 Dec 2021",Pseudo R-squ.:,0.2005
Time:,19:33:07,Log-Likelihood:,-482.7
converged:,True,LL-Null:,-603.76
Covariance Type:,nonrobust,LLR p-value:,5.22e-43

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
DURATION,-0.0233,0.006,-3.906,0.000,-0.035,-0.012
EDUCATION_PURPOSE,-0.1847,0.223,-0.828,0.408,-0.622,0.253
SAV_ACCT,0.4056,0.122,3.337,0.001,0.167,0.644
EMPLOYMENT,0.2202,0.064,3.443,0.001,0.095,0.346
INSTALL_RATE,-0.0692,0.057,-1.210,0.226,-0.181,0.043
REAL_ESTATE,0.4665,0.187,2.490,0.013,0.099,0.834
PROP_UNKN_NONE,-0.3615,0.224,-1.615,0.106,-0.800,0.077
OTHER_INSTALL,-0.3392,0.201,-1.684,0.092,-0.734,0.055
RENT,-0.3720,0.201,-1.855,0.064,-0.765,0.021


In [24]:
# 로지스틱 회귀
logit = LogisticRegression()
logit.fit(X_scaler_train, y_train)
logit_pred = logit.predict(X_scaler_test)

get_score(logit_pred)

정확도:  0.6934673366834171
정밀도:  0.7532467532467533
재현율:  0.8345323741007195
ROC score:  0.6005995203836931


In [25]:
# 랜덤 포레스트
rf = RandomForestClassifier()
rf.fit(X_scaler_train, y_train)
rf_pred = rf.predict(X_scaler_test)

get_score(rf_pred)

정확도:  0.6934673366834171
정밀도:  0.7671232876712328
재현율:  0.8057553956834532
ROC score:  0.6195443645083932


In [26]:
# xgboost
xg = XGBClassifier()
xg.fit(X_scaler_train, y_train)
xg_pred = xg.predict(X_scaler_test)

get_score(xg_pred)

정확도:  0.6683417085427136
정밀도:  0.762589928057554
재현율:  0.762589928057554
ROC score:  0.6062949640287769


In [27]:
# LGBM
lgbm = LGBMClassifier()
lgbm.fit(X_scaler_train, y_train)
lgbm_pred = lgbm.predict(X_scaler_test)

get_score(lgbm_pred)

정확도:  0.7085427135678392
정밀도:  0.7832167832167832
재현율:  0.8057553956834532
ROC score:  0.6445443645083933


In [28]:
# GridSearch 
param = {'learning_rate': [0.001, 0.01, 0.1],
        'n_estimators': [100, 200, 500],
        'max_depth': [-1, 10, 20, 50]
        }

lgbm_grid = GridSearchCV(lgbm, param_grid=param, cv=5, n_jobs=-1,scoring='accuracy')

lgbm_grid.fit(X_scaler_train, y_train)
lgbm_grid_pred = lgbm_grid.predict(X_scaler_test)

get_score(lgbm_grid_pred)

정확도:  0.7085427135678392
정밀도:  0.7579617834394905
재현율:  0.8561151079136691
ROC score:  0.6113908872901679


In [30]:
# svm
# dnn