<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/JuhongPark/snu-fintech-ai/blob/main/Lending_Club_SSAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
</table>

# 데이터 가져오기 및 전처리

In [621]:
# Brnach 지정
branch = 'develop'

# 라이브러리 설치 (2분 가량 소요됨)
!pip install shap -q
!pip install pytorch_tabnet -q

# 필요 파일 다운로드
!wget https://raw.githubusercontent.com/JuhongPark/snu-fintech-ai/{branch}/LC_Data_Cleaned_0829.csv

zsh:1: command not found: wget


In [622]:
seed = 42  # seed 값 설정
is_test = True  # 테스트런 설정

# 입력 변수 파라미터
target = 'loan_status_encoded'
drop_list = ['id', 'int_rate', 'installment', 'sub_grade', 'grade', 'tbond_int', 'year', 'term']
feature_list = ['loan_amnt', 'emp_length', 'revol_util', 'pub_rec', 'fico_range_high', 'fico_range_low', 'percent_bc_gt_75']

# 대출 이후 변수
post_list = ['Funded_amnt', 'funded_amnt_inv', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'last_credit_pull_d', 'last_pymnt_amnt', 'last_pymnt_d',
 'mths_since_last_major_derog', 'next_pymnt_d', 'out_prncp', 'out_prncp_inv', 'recoveries', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'total_rec_late_fee',
 'total_rec_prncp', 'hardship_flag', 'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date',
 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardship_loan_status', 'orig_projected_additional_accrued_interest', 'hardship_payoff_balance_amount',
 'hardship_last_payment_amount', 'debt_settlement_flag', 'last_fico_range_high', 'last_fico_range_low']

# 모델 파라미터 설정
# For model
learning_rate = 0.01

# For training
n_epochs = 10000

# For CV
cv = 10

# For SSAE
n_epochs_ssae = 10000

# 테스트 런일 경우, 에포크 수를 줄임
if is_test:
    n_epochs = 10

In [623]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import shap
from sklearn.ensemble import RandomForestClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgbm
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.base import BaseEstimator, TransformerMixin

In [624]:
# Random Seed 설정
def set_random_seed(seed):
    random.seed(seed) # 파이썬 표준 라이브러리
    np.random.seed(seed) # numpy의 random 모듈에서 사용하는 seed
    torch.manual_seed(seed) # pytorch에서 사용하는 seed
    if torch.cuda.is_available(): # GPU에서 실행되는 PyTorch 연산의 무작위성을 제어
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

# seed 값 설정
set_random_seed(seed)

## 데이터 다운로드하기

In [625]:
# 모든 행이 화면에 표시되도록 설정합니다.
pd.set_option('display.max_rows', None)

# 파일 로드
file_path = 'LC_Data_Cleaned_0829.csv'
df = pd.read_csv(file_path)

# 테스트 런일 경우, 데이터 크기 줄이기
if is_test:
    df = df.sample(frac=0.01, random_state=seed)

## 데이터 구조 훑어 보기

In [626]:
df.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,loan_status_encoded,year,tbond_int
178221,22250.0,22250.0,0,0.2557,891.38,F,F5,0.0,1,110000.0,...,0.0,1.0,0.0,114548.0,66485.0,38500.0,74848.0,0,2013,0.001312
109102,4200.0,4200.0,0,0.1398,143.51,C,C1,10.0,0,80000.0,...,25.0,0.0,0.0,407056.0,64902.0,43100.0,51956.0,0,2013,0.001312
90322,9000.0,9000.0,0,0.079,281.62,A,A4,2.0,1,67000.0,...,20.0,0.0,0.0,36656.0,25908.0,13700.0,22456.0,0,2014,0.001211
210783,5350.0,5350.0,0,0.1099,175.13,B,B3,3.0,1,41683.0,...,0.0,0.0,0.0,185699.0,35220.0,3200.0,30000.0,0,2014,0.001211
208963,16000.0,16000.0,0,0.1099,523.75,B,B2,10.0,0,89000.0,...,50.0,0.0,0.0,334807.0,92785.0,12500.0,70998.0,0,2013,0.001312


In [627]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2220 entries, 178221 to 132080
Data columns (total 71 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   loan_amnt                       2220 non-null   float64
 1   funded_amnt                     2220 non-null   float64
 2   term                            2220 non-null   int64  
 3   int_rate                        2220 non-null   float64
 4   installment                     2220 non-null   float64
 5   grade                           2220 non-null   object 
 6   sub_grade                       2220 non-null   object 
 7   emp_length                      2220 non-null   float64
 8   home_ownership                  2220 non-null   int64  
 9   annual_inc                      2220 non-null   float64
 10  verification_status             2220 non-null   int64  
 11  loan_status                     2220 non-null   object 
 12  purpose                         

In [628]:
df.shape

(2220, 71)

In [629]:
df.dtypes

loan_amnt                         float64
funded_amnt                       float64
term                                int64
int_rate                          float64
installment                       float64
grade                              object
sub_grade                          object
emp_length                        float64
home_ownership                      int64
annual_inc                        float64
verification_status                 int64
loan_status                        object
purpose                            object
zip_code                           object
addr_state                         object
dti                               float64
delinq_2yrs                       float64
earliest_cr_line                  float64
fico_range_low                    float64
fico_range_high                   float64
inq_last_6mths                    float64
mths_since_last_delinq            float64
mths_since_last_record            float64
open_acc                          

## 데이터 전처리

In [630]:
# 기존 df 에 있는 칼럼 중에서, drop_list 및 post_list의 칼럼 제거
drop_list = set(df.columns) & set(drop_list + post_list)

# 불필요한 변수 Drop
df = df.drop(columns = drop_list)

# 범주형변수 처리
df = df.select_dtypes(include = [np.number])

# 결측치 처리
df = df.fillna(0)
df.shape

(2220, 56)

In [631]:
X_feature = df[feature_list]
X_remain = df.drop(columns=feature_list + [target])
y = df[target]

# 모델 선택과 훈련

## 테스트 세트 만들기

In [632]:
# Torch 설정

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if not is_test else "cpu" 

In [634]:
# 데이터를 훈련 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test, X_feature_train, X_feature_test = train_test_split(X_remain, y, X_feature, test_size=0.2, random_state=42, stratify=y)

# 표준화 (훈련 데이터에 fit_transform, 테스트 데이터에 transform)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# X_feature에 대해서도 동일하게 적용
X_feature_train_scaled = scaler.fit_transform(X_feature_train)
X_feature_test_scaled = scaler.transform(X_feature_test)

In [638]:
print(X_train_scaled.shape)
print(X_feature_train_scaled.shape)
print(y_train.shape)

(1776, 48)
(1776, 7)
(1776,)


## SSAE 모델 정의

In [636]:
# SSAE 모델 정의
class DenoisingSSAE(nn.Module):
    def __init__(self, input_dim):
        super(DenoisingSSAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),  # input_dim이 실제 데이터의 feature 수와 일치해야 함
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 8),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(8, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)  # output_dim도 input_dim과 일치해야 함
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
# Noise 추가 함수
def add_noise(data, noise_factor=0.2):
    noise = noise_factor * np.random.randn(*data.shape)
    noisy_data = data + noise
    noisy_data = np.clip(noisy_data, 0., 1.)
    return noisy_data

In [637]:
input_dim = X_train.shape[1]  # X_train의 feature 수
print(input_dim)

48


In [640]:
# 모델 생성 및 학습 설정
model = DenoisingSSAE(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 노이즈가 추가된 학습 데이터 생성
X_train_noisy = add_noise(X_train_scaled)

# 모델 학습
epochs = 10000

for epoch in range(epochs):
    model.train()
    inputs = torch.FloatTensor(X_train_noisy).to(device)  # NumPy 배열 -> PyTorch 텐서
    targets = torch.FloatTensor(X_train_scaled).to(device)  # NumPy 배열 -> PyTorch 텐서

    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, targets)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 1000 == 0 or epoch == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [1/10000], Loss: 1.0103
Epoch [1000/10000], Loss: 0.3384
Epoch [2000/10000], Loss: 0.3178
Epoch [3000/10000], Loss: 0.3090
Epoch [4000/10000], Loss: 0.3019
Epoch [5000/10000], Loss: 0.2934
Epoch [6000/10000], Loss: 0.2902
Epoch [7000/10000], Loss: 0.2911
Epoch [8000/10000], Loss: 0.2855
Epoch [9000/10000], Loss: 0.2824
Epoch [10000/10000], Loss: 0.2812


## 모델 평가함수 및 실행

In [641]:
# 평가함수 정의
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()

In [642]:
# Shapley Value 계산, 시각화 함수 정의
def evaluate_models_shap1(model, X_train_encoded_ssae, X_test_encoded_ssae):
    masker = shap.maskers.Independent(X_train_encoded_ssae)
    explainer = shap.LinearExplainer(model, masker=masker)
    shap_values = explainer(X_test_encoded_ssae)

    # summary_plot을 사용한 Shapley Value 시각화
    shap.summary_plot(shap_values, X_test_encoded_ssae, feature_names=[f"Encoded_{i}" for i in range(X_test_encoded_ssae.shape[1])])

def evaluate_models_shap2(model, X_test_encoded_ssae):
    # TreeExplainer를 사용한 Shapley Value 계산
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_encoded_ssae)

    # summary_plot을 사용한 Shapley Value 시각화
    shap.summary_plot(shap_values, X_test_encoded_ssae, feature_names=[f"Encoded_{i}" for i in range(X_test_encoded_ssae.shape[1])])


In [643]:
# ShapleyValue 계산, 시각화함수 정의 (TabNet 전용)
def evaluate_models_shap_tabnet(model, X_train_encoded_ssae, X_test_encoded_ssae):
    # DeepExplainer를 사용한 Shapley Value 계산 (TabNet 전용)
    explainer = DeepExplainer(model, X_train_encoded_ssae)
    shap_values = explainer.shap_values(X_test_encoded_ssae)

    # summary_plot을 사용한 Shapley Value 시각화
    shap.summary_plot(shap_values, X_test_encoded_ssae, feature_names=[f"Encoded_{i}" for i in range(X_test_encoded_ssae.shape[1])])

In [644]:
# 교차 검증(CV)을 위한 함수 정의
def cross_validate_model(model, X, y, cv=cv):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"Cross-validation scores: {scores}")

In [645]:
# 원래 피처와 SSAE로 변환된 피처들을 결합하는 새로운 파이프라인 정의
combined_transformer = CombinedSSAEAndOriginalFeatures(ssae_transformer=SSAETransformer(input_dim=input_dim), 
                                                       original_features=X_feature)

In [646]:
# 모델 정의 및 파이프라인 연결
linear_models = {
    "Logistic Regression": Pipeline([
        ('combine', combined_transformer),    # SSAE와 원래 피처 결합
        ('model', LogisticRegression(max_iter=1000))  # 최종 모델
    ]),
}

ensemble_models = {
    "Random Forest Classifier": Pipeline([
        ('combine', combined_transformer),    # SSAE와 원래 피처 결합
        ('model', RandomForestClassifier())  # 최종 모델
    ]),
    "XGBoost Classifier": Pipeline([
        ('combine', combined_transformer),    # SSAE와 원래 피처 결합
        ('model', xgb.XGBClassifier())  # 최종 모델
    ]),
    "LightGBM Classifier": Pipeline([
        ('combine', combined_transformer),    # SSAE와 원래 피처 결합
        ('model', lgbm.LGBMClassifier())  # 최종 모델
    ]),
}

tabnet_model = {
    "TabNet Classifier": Pipeline([
        ('combine', combined_transformer),    # SSAE와 원래 피처 결합
        ('model', TabNetClassifier())  # 최종 모델로 TabNet 추가
    ]), 
}



## 테스트세트로 평가하기

In [None]:
for model_name, pipeline in linear_models.items():
    print(f"** Evaluating {model_name}:")
    pipeline.fit(X_train_encoded_ssae, y_train)
    evaluate_model(pipeline, X_train_encoded_ssae, X_test_encoded_ssae, y_train, y_test)

    print(f"** Evaluating {model_name}'s Shapley Value (LinearExplainer): ")
    evaluate_models_shap1(pipeline.named_steps['model'], X_train_encoded_ssae, X_test_encoded_ssae)


** Evaluating Logistic Regression:
X_remaining.shape: (1776, 8)
inputs.shape: torch.Size([1776, 8])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1776x8 and 48x64)

In [None]:
for model_name, pipeline in ensemble_models.items():
    print(f"** Evaluating {model_name} on SSAE Encoded Data: ")
    pipeline.fit(X_train_encoded_ssae, y_train)
    evaluate_model(pipeline, X_train_encoded_ssae, X_test_encoded_ssae, y_train, y_test)

    print(f"** Evaluating {model_name}'s Shapley Value (TreeExplainer): ")
    evaluate_models_shap2(pipeline.named_steps['model'], X_test_encoded_ssae)