In [71]:
import torch
import torch.nn as nn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree, ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('always')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data_path = './data/Loan_Default.csv'

In [36]:
eda_df = pd.read_csv(data_path)
eda_df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [37]:
print('# of data: {}'.format(eda_df.shape[0]))
print('# of features: {}\n'.format(eda_df.shape[1]))

print('feature name/dtype')
numerical_count = 0
categorical_count = 0

for feature_name, feature_type in zip(eda_df.columns, eda_df.dtypes):
  if np.issubdtype(feature_type, np.number):
    numerical_count += 1
  else:
    categorical_count += 1
  print('{}: {}'.format(feature_name, feature_type))

print('\n # of numerical item: {}/# of categorical item: {}'.format(numerical_count, categorical_count))

print('\nstatistcal infos')
eda_df.describe()

# of data: 148670
# of features: 34

feature name/dtype
ID: int64
year: int64
loan_limit: object
Gender: object
approv_in_adv: object
loan_type: object
loan_purpose: object
Credit_Worthiness: object
open_credit: object
business_or_commercial: object
loan_amount: int64
rate_of_interest: float64
Interest_rate_spread: float64
Upfront_charges: float64
term: float64
Neg_ammortization: object
interest_only: object
lump_sum_payment: object
property_value: float64
construction_type: object
occupancy_type: object
Secured_by: object
total_units: object
income: float64
credit_type: object
Credit_Score: int64
co-applicant_credit_type: object
age: object
submission_of_application: object
LTV: float64
Region: object
Security_Type: object
Status: int64
dtir1: float64

 # of numerical item: 13/# of categorical item: 21

statistcal infos


Unnamed: 0,ID,year,loan_amount,rate_of_interest,Interest_rate_spread,Upfront_charges,term,property_value,income,Credit_Score,LTV,Status,dtir1
count,148670.0,148670.0,148670.0,112231.0,112031.0,109028.0,148629.0,133572.0,139520.0,148670.0,133572.0,148670.0,124549.0
mean,99224.5,2019.0,331117.7,4.045476,0.441656,3224.996127,335.136582,497893.5,6957.338876,699.789103,72.746457,0.246445,37.732932
std,42917.476598,0.0,183909.3,0.561391,0.513043,3251.12151,58.409084,359935.3,6496.586382,115.875857,39.967603,0.430942,10.545435
min,24890.0,2019.0,16500.0,0.0,-3.638,0.0,96.0,8000.0,0.0,500.0,0.967478,0.0,5.0
25%,62057.25,2019.0,196500.0,3.625,0.076,581.49,360.0,268000.0,3720.0,599.0,60.47486,0.0,31.0
50%,99224.5,2019.0,296500.0,3.99,0.3904,2596.45,360.0,418000.0,5760.0,699.0,75.13587,0.0,39.0
75%,136391.75,2019.0,436500.0,4.375,0.7754,4812.5,360.0,628000.0,8520.0,800.0,86.184211,0.0,45.0
max,173559.0,2019.0,3576500.0,8.0,3.357,60000.0,360.0,16508000.0,578580.0,900.0,7831.25,1.0,61.0


In [38]:
label_df = eda_df['Status']
eda_df = eda_df.drop(columns=['ID', 'year', 'Status'])
status_counts = label_df.value_counts()
print(status_counts)

class_counts = label_df.value_counts()

#majority, minority 클래스 분류 --> 추후 언더샘플, 오버샘플링을 위해
majority_class_label = class_counts.idxmax()
minority_class_label = class_counts.idxmin()

majority_class_count = class_counts.max()
minority_class_count = class_counts.min()
print(f"Majority Class Label: {majority_class_label} with {majority_class_count} instances")
print(f"Minority Class Label: {minority_class_label} with {minority_class_count} instances")

#불균형 확인

Status
0    112031
1     36639
Name: count, dtype: int64
Majority Class Label: 0 with 112031 instances
Minority Class Label: 1 with 36639 instances


In [78]:
missing_value_list = ['mean', 'mode', 'median'] #remove 로 하면 데이터가 많이 없어짐...
scaling_list = ['min-max', 'standardization']
#model_list = ['LogisticRegression', 'DecisionTree', 'RandomForest', 'GBDT', 'XGBoost']
imbalance_dataset_list = ['original', 'weight balance', 'undersample', 'oversample']
hparams_dict = dict(
    LogisticRegression={'lr':[1e-5, 1e-4, 5e-4, 1e-3],
                        'epochs':[100]},
    DecisionTree={'max_depth':[3, 4, 5],
                  'min_sample_leaf':[3, 4]},
    RandomForest={'max_depth':[3, 4, 5],
                  'n_estimators':[100, 150]},
    GBDT={'max_depth':[3, 4, 5],
                  'n_estimators':[100, 150]}
    
)
num_class = 2
random_state = 42

Preprocess

In [54]:
# 전처리 자동화
def preprocess(missing_value, scaling, imbalance, test_data_ratio=0.2, random_state=42, verbose=False):
    if verbose: print(f'Preprocess with {missing_value}, {scaling}, {imbalance}.')
    eda_df = pd.read_csv(data_path)
    label_df = eda_df['Status'] #label
    eda_df = eda_df.drop(columns=['ID', 'year', 'Status']) #필요없는 컬럼 제거
    
    #=============================================================================================
    #클리닝
    duplicates = eda_df.duplicated()
    eda_df = eda_df[~duplicates]
    label_df = label_df[~duplicates]

    if missing_value == 'remove':
        non_missing = ~eda_df.isnull().any(axis=1)
        eda_df = eda_df[non_missing]
        label_df = label_df[non_missing]

    else:
        for feature_name in eda_df.columns:
            
            
            for feature_name in eda_df.columns:
                if eda_df[feature_name].dtype in ['int32', 'int64', 'float32', 'float64']:
                    if eda_df[feature_name].isnull().sum() > 0:
                        #mean, mode, median 으로 치환
                        match missing_value:
                            case 'mean':
                                fill_value = eda_df[feature_name].mean()
                            case 'mode':
                                fill_value = eda_df[feature_name].mode()[0]
                            case 'median':
                                fill_value = eda_df[feature_name].median()
                            case _:
                                raise NotImplementedError
                        eda_df.fillna({feature_name: fill_value}, inplace=True) #future warning
                else:
                    if eda_df[feature_name].isnull().sum() > 0:
                        mode_value = eda_df[feature_name].mode()[0]
                        eda_df.fillna({feature_name: mode_value}, inplace=True)
    
    train_data, test_data, train_label, test_label = train_test_split(eda_df, label_df, test_size=test_data_ratio, random_state=random_state)

    #=============================================================================================
    #데이터 전처리
    train_numerical_features = train_data.select_dtypes(include=[np.number])
    train_numerical_array = train_numerical_features.values
    test_numerical_features = test_data.select_dtypes(include=[np.number])
    test_numerical_array = test_numerical_features.values

    if scaling == 'standardization':
        train_feature_mean = np.mean(train_numerical_array, axis=0)
        train_feature_std = np.std(train_numerical_array, axis=0)

        train_feature_std = np.where(train_feature_std == 0, 1e-5, train_feature_std)

        train_scaled_data = (train_numerical_array - train_feature_mean) / train_feature_std
        test_scaled_data = (test_numerical_array - train_feature_mean) / train_feature_std

    elif scaling == 'min-max':
        train_data_min = np.min(train_numerical_array, axis=0)
        train_data_max = np.max(train_numerical_array, axis=0)

        train_data_max = np.where(train_data_max == train_data_min, train_data_min + 1e-5, train_data_max)

        train_scaled_data = (train_numerical_array - train_data_min) / (train_data_max - train_data_min)
        test_scaled_data = (test_numerical_array - train_data_min) / (train_data_max - train_data_min)

    train_categorical_features = train_data.select_dtypes(include=[object])
    test_categorical_features = test_data.select_dtypes(include=[object])

    train_one_hot_encoded_list = []
    test_one_hot_encoded_list = []

    train_encoded_feature_count_list = []

    #원 핫 인코딩
    for feature_name in train_categorical_features.columns:
        unique_values = np.unique(train_categorical_features[feature_name])
        train_encoded_feature_count_list.append(len(unique_values)+1)

        train_encoded_array = np.zeros((train_categorical_features.shape[0], len(unique_values) + 1))
        for index, value in enumerate(train_categorical_features[feature_name]):
            train_encoded_array[index, np.where(unique_values == value)[0]] = 1
        train_one_hot_encoded_list.append(train_encoded_array)

        test_encoded_array = np.zeros((test_categorical_features.shape[0], len(unique_values) + 1))
        for index, value in enumerate(test_categorical_features[feature_name]):
            if value in unique_values:
                test_encoded_array[index, np.where(unique_values == value)[0]] = 1
            else:
                test_encoded_array[index, -1] = 1
        test_one_hot_encoded_list.append(test_encoded_array)

    train_encoded_array = np.hstack(train_one_hot_encoded_list)
    test_encoded_array = np.hstack(test_one_hot_encoded_list)

    train_processed_array = np.hstack((train_scaled_data, train_encoded_array))
    train_label_array = train_label.to_numpy()

    test_processed_array = np.hstack((test_scaled_data, test_encoded_array))
    test_label_array = test_label.to_numpy()

    #=============================================================================================
    # 샘플링
    if imbalance == 'undersample' or imbalance == 'oversample':
        unique, counts = np.unique(train_label_array, return_counts=True)
        class_counts = dict(zip(unique, counts))
        minority_class = min(class_counts, key=class_counts.get)
        majority_class = max(class_counts, key=class_counts.get)

        # minor major index 찾기
        minority_indices = np.where(train_label_array == minority_class)[0]
        majority_indices = np.where(train_label_array == majority_class)[0]

        if imbalance == 'undersample':
            if verbose: print("undersampling")
            # 언더: majority를 minority 에 맞추어 선택
            num_minority = len(minority_indices)
            random_majority_indices = np.random.choice(majority_indices, num_minority, replace=False)
            selected_indices = np.concatenate([minority_indices, random_majority_indices])
        
        elif imbalance == 'oversample':
            if verbose: print('oversampling')
            # 오버: minority를 majority에 맞추어 복사
            num_majority = len(majority_indices)
            num_minority = len(minority_indices)
            repeat_count = (num_majority // num_minority) + 1
            oversampled_minority_indices = np.tile(minority_indices, repeat_count)[:num_majority]
            selected_indices = np.concatenate([oversampled_minority_indices, majority_indices])
        np.random.shuffle(selected_indices)

        train_processed_array = train_processed_array[selected_indices]
        train_label_array = train_label_array[selected_indices]

    return train_processed_array, train_label_array, test_processed_array, test_label_array

전처리 테스트

In [47]:
train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess('mean', "standardization", 'original', test_data_ratio=0.2, random_state=42)
unique, counts = np.unique(train_label_array, return_counts=True)
class_counts = dict(zip(unique, counts))
print(class_counts)

Preprocess with mean, standardization, original.
{0: 89537, 1: 29399}


In [50]:
train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess('mean', "standardization", 'oversample', test_data_ratio=0.2, random_state=42)
unique, counts = np.unique(train_label_array, return_counts=True)
class_counts = dict(zip(unique, counts))
print(class_counts)

Preprocess with mean, standardization, oversample.
oversampling
{0: 89537, 1: 89537}


In [51]:
train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess('mean', "standardization", 'undersample', test_data_ratio=0.2, random_state=42)
unique, counts = np.unique(train_label_array, return_counts=True)
class_counts = dict(zip(unique, counts))
print(class_counts)

Preprocess with mean, standardization, undersample.
undersampling
{0: 29399, 1: 29399}


Logistic Regression

In [60]:
class TorchLogisticRegression(nn.Module):
  def __init__(self, data, num_class):
    super(TorchLogisticRegression, self).__init__()
    self.logistic_regressor = nn.Linear(data.shape[1], num_class)

  def forward(self, data):
    logit = self.logistic_regressor(data)

    return logit

  def get_params(self):
    torch_weight, torch_bias = self.logistic_regressor.weight.detach().cpu().numpy(), self.logistic_regressor.bias.detach().cpu().numpy()

    return torch_weight, torch_bias
  
def train_torch_model(data, label, model, criterion, optimizer, epochs, device):
  model.train()

  data, label = torch.tensor(data, dtype=torch.float32).to(device), torch.tensor(label, dtype=torch.long).to(device)

  for epoch in range(epochs):
    optimizer.zero_grad()

    result = model(data)

    loss = criterion(result, label)

    loss.backward()
    optimizer.step()


def test_torch_model(data, label, model, device):
  model.eval()

  data = torch.tensor(data, dtype=torch.float32).to(device)
  with torch.no_grad():
    result = model(data)
    _, result = torch.max(result, 1)

    result = result.cpu()
    accuracy = accuracy_score(label, result)
    recall = recall_score(label, result)
    precision = precision_score(label, result)
    f1_measure = f1_score(label, result)

    print('acc: {}'.format(accuracy))
    print('recall: {}'.format(recall))
    print('precision: {}'.format(precision))
    print('f1 score: {}'.format(f1_measure))

def get_inverse_class_frequency_weights(label):
    num_data = label.shape[0]

    num_negative_data = np.where(label == 0)[0].shape[0]
    num_positive_data = np.where(label == 1)[0].shape[0]

    negative_weight = num_data / (num_negative_data * 2)
    positive_weight = num_data / (num_positive_data * 2)

    class_weights = torch.tensor([negative_weight, positive_weight], dtype=torch.float32)
    return class_weights

In [65]:
LR_hparams = hparams_dict.get('LogisticRegression')
LR_lr_list = LR_hparams.get('lr')
LR_epochs_list = LR_hparams.get('epochs')

for missing_value in missing_value_list:
  for scaling in scaling_list:
    for imbalance in imbalance_dataset_list:
      #데이터 불러오기
      train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess(missing_value, scaling, imbalance, test_data_ratio=0.2, random_state=42, verbose=False)

      #logistic regression
      for lr in LR_lr_list:
        for epochs in LR_epochs_list:
          
          logistic_regressor = TorchLogisticRegression(train_processed_array, num_class).to(device)
          model_optimizer = torch.optim.Adam(logistic_regressor.parameters(), lr=lr)
          
          #weight balance 처리
          class_weights = get_inverse_class_frequency_weights(train_label_array).to(device) if imbalance == 'weight balance' else None
          criterion = nn.CrossEntropyLoss(weight=class_weights)
          train_torch_model(train_processed_array, train_label_array, logistic_regressor, criterion, model_optimizer, epochs, device)

          #print(f'Testing Logistic Regression with\nMissing Value: {missing_value}\nScaling: {scaling}\nImbalance: {imbalance}\nLearning Rate: {lr}\nEpochs: {epochs}')
          print(f'{missing_value}, {scaling}, {imbalance}, {lr}, {epochs}')
          print("="*10)
          test_torch_model(test_processed_array, test_label_array, logistic_regressor, device)
          print()

mean, min-max, original, 1e-05, 100
acc: 0.6170713661128674
recall: 0.5270718232044199
precision: 0.3239938869077942
f1 score: 0.40130402776317176

mean, min-max, original, 0.0001, 100
acc: 0.7560368601600861
recall: 0.0015193370165745856
precision: 0.3055555555555556
f1 score: 0.0030236393622869707

mean, min-max, original, 0.0005, 100
acc: 0.7565749646868904
recall: 0.00027624309392265195
precision: 1.0
f1 score: 0.0005523336095001381

mean, min-max, original, 0.001, 100
acc: 0.7565413331539652
recall: 0.00013812154696132598
precision: 1.0
f1 score: 0.00027620494406849883

mean, min-max, weight balance, 1e-05, 100
acc: 0.3045671621712518
recall: 0.8794198895027624
precision: 0.2432752559987773
f1 score: 0.38112055548904583

mean, min-max, weight balance, 0.0001, 100
acc: 0.5520279814353938
recall: 0.6320441988950276
precision: 0.3004201680672269
f1 score: 0.4072623709505162

mean, min-max, weight balance, 0.0005, 100
acc: 0.6581690993475483
recall: 0.6024861878453038
precision: 0.374

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


acc: 0.7565077016210399
recall: 0.0
precision: 0.0
f1 score: 0.0

mode, min-max, original, 0.001, 100
acc: 0.7566085962198157
recall: 0.0004143646408839779
precision: 1.0
f1 score: 0.0008283860278889963

mode, min-max, weight balance, 1e-05, 100
acc: 0.5615120737203202
recall: 0.292817679558011
precision: 0.21119744969117354
f1 score: 0.24539877300613497

mode, min-max, weight balance, 0.0001, 100
acc: 0.7013856191565212
recall: 0.20207182320441988
precision: 0.3204819277108434
f1 score: 0.24786107581533248

mode, min-max, weight balance, 0.0005, 100
acc: 0.6211744131297504
recall: 0.6476519337016574
precision: 0.34987315326070734
f1 score: 0.45431644220521267

mode, min-max, weight balance, 0.001, 100
acc: 0.7060940337660591
recall: 0.6596685082872928
precision: 0.43217808343136366
f1 score: 0.5222240446121044

mode, min-max, undersample, 1e-05, 100
acc: 0.2702629985874756
recall: 0.9716850828729282
precision: 0.24659983174425126
f1 score: 0.3933683739655558

mode, min-max, undersampl

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


acc: 0.7565077016210399
recall: 0.0
precision: 0.0
f1 score: 0.0

median, min-max, original, 0.001, 100
acc: 0.7613842738952041
recall: 0.02016574585635359
precision: 0.9931972789115646
f1 score: 0.039528902125355356

median, min-max, weight balance, 1e-05, 100
acc: 0.5106948274702361
recall: 0.6567679558011049
precision: 0.28271597597954695
f1 score: 0.39527827424248724

median, min-max, weight balance, 0.0001, 100
acc: 0.5631936503665838
recall: 0.5582872928176795
precision: 0.29222093695777906
f1 score: 0.3836370539104024

median, min-max, weight balance, 0.0005, 100
acc: 0.6480460079370418
recall: 0.6131215469613259
precision: 0.36676856977608857
f1 score: 0.4589774078478002

median, min-max, weight balance, 0.001, 100
acc: 0.7248604291383601
recall: 0.6366022099447514
precision: 0.4536863864553598
f1 score: 0.5298005632507615

median, min-max, undersample, 1e-05, 100
acc: 0.6688975583507096
recall: 0.10732044198895027
precision: 0.18682375571050733
f1 score: 0.1363277480480744

me

최고 F1 Score:

모델: median, standardization, weight balance, 0.001, 100
F1 Score: 0.5553280896455124
Accuracy: 0.7224053272348153
Recall: 0.711878453038674
Precision: 0.4552199258081611

두 번째로 좋은 F1 Score:

모델: median, min-max, undersample, 0.001, 100
F1 Score: 0.5522920985947938
기타 지표:
Accuracy: 0.7385484630389453
Recall: 0.662292817679558
Precision: 0.4736270248913473


F1 Score는 precision과 recall 간의 균형을 잘 나타내는 지표이므로, F1 Score가 가장 높은 모델인 'median, standardization, weight balance, 0.001, 100'이 가장 우수한 성능을 보인다.

Decision Tree

In [69]:
class SklearnDecisionTreeClassifier:
  def __init__(self, max_depth, min_sample_leaf, random_state):
    self.max_depth = max_depth
    self.min_sample_leaf = min_sample_leaf
    self.random_state = random_state

    self.classifier = tree.DecisionTreeClassifier(
        max_depth=self.max_depth,
        min_samples_leaf=self.min_sample_leaf,
        random_state=self.random_state
    )

  def __call__(self, data):
    return self.classifier.predict(data)

  def train_model(self, train_data, train_label):
      self.classifier.fit(train_data, train_label)

  def test_model(self, test_data, test_label):

    pred = self.classifier.predict(test_data)
    accuracy = accuracy_score(test_label, pred)
    precision = precision_score(test_label, pred)
    recall = recall_score(test_label, pred)
    f1_measure = f1_score(test_label, pred)

    print('acc: {}/precision: {}/recall: {}/f1-measure: {}'.format(accuracy, precision, recall, f1_measure))

In [73]:
DT_hparams = hparams_dict.get('DecisionTree')
DT_max_depth_list = DT_hparams.get('max_depth')
DT_min_sample_leaf_list = DT_hparams.get('min_sample_leaf')

for missing_value in missing_value_list:
  for scaling in scaling_list:
    for imbalance in imbalance_dataset_list:
      #데이터 불러오기
      train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess(missing_value, scaling, imbalance, test_data_ratio=0.2, random_state=42, verbose=False)

      #Decision Tree
      for max_depth in DT_max_depth_list:
        for min_sample_leaf in DT_min_sample_leaf_list:
          
            sklearn_dt_classifier = SklearnDecisionTreeClassifier(max_depth, min_sample_leaf, random_state)
            sklearn_dt_classifier.train_model(train_processed_array, train_label_array)

            print(f'{missing_value}, {scaling}, {imbalance}, {max_depth}, {min_sample_leaf}')
            print("="*10)
            sklearn_dt_classifier.test_model(test_processed_array, test_label_array)
            print()

mean, min-max, original, 3, 3
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, original, 3, 4
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, original, 4, 3
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, original, 4, 4
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, original, 5, 3
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, original, 5, 4
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, weight balance, 3, 3
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, weight balance, 3, 4
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, weight balance, 4, 3
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, weight balance, 4, 4
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, weight balance, 5, 3
acc: 1.0/precision: 1.0/recall: 1.0/f1-measure: 1.0

mean, min-max, weight balance, 5, 4
acc: 1.0/precision: 1

대부분의 케이스가 모두 1.0 이라는 값이 나왔지만, 해당 모델들을 제외하고 본다면,

Median, Min-Max, Original, 4(depth), 3(leaf)

Accuracy: 0.9999663684670748

Precision: 0.9998618975279657

Recall: 1.0

F1: 0.9999309439955805

가 가장 좋은 결과를 보여준다

Random Forest

In [74]:
class SklearnRandomForestClassifier:
  def __init__(self, n_estimators, max_depth, random_state):
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.random_state = random_state

    self.classifier = ensemble.RandomForestClassifier(
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        random_state=self.random_state
    )

  def __call__(self, data):
    return self.classifier.predict(data)

  def train_model(self, train_data, train_label):
      self.classifier.fit(train_data, train_label)

  def test_model(self, test_data, test_label):

    pred = self.classifier.predict(test_data)
    accuracy = accuracy_score(test_label, pred)
    precision = precision_score(test_label, pred)
    recall = recall_score(test_label, pred)
    f1_measure = f1_score(test_label, pred)

    print('acc: {}/precision: {}/recall: {}/f1-measure: {}'.format(accuracy, precision, recall, f1_measure))

In [75]:
RF_hparams = hparams_dict.get('RandomForest')
RF_max_depth_list = RF_hparams.get('max_depth')
RF_n_estimators_list = RF_hparams.get('n_estimators')

for missing_value in missing_value_list:
  for scaling in scaling_list:
    for imbalance in imbalance_dataset_list:
      #데이터 불러오기
      train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess(missing_value, scaling, imbalance, test_data_ratio=0.2, random_state=42, verbose=False)

      #Random Forest
      for max_depth in RF_max_depth_list:
        for n_estimators in RF_n_estimators_list:
          
            sklearn_rf_classifier = SklearnRandomForestClassifier(n_estimators, max_depth, random_state)
            sklearn_rf_classifier.train_model(train_processed_array, train_label_array)

            print(f'{missing_value}, {scaling}, {imbalance}, {max_depth}, {n_estimators}')
            print("="*10)
            sklearn_rf_classifier.test_model(test_processed_array, test_label_array)
            print()

mean, min-max, original, 3, 100
acc: 0.8613035582161835/precision: 1.0/recall: 0.4303867403314917/f1-measure: 0.6017767477790653

mean, min-max, original, 3, 150
acc: 0.8608327167552297/precision: 1.0/recall: 0.42845303867403317/f1-measure: 0.5998839682846645

mean, min-max, original, 4, 100
acc: 0.9096657025627228/precision: 1.0/recall: 0.6290055248618784/f1-measure: 0.7722570798711209

mean, min-max, original, 4, 150
acc: 0.9080177574493845/precision: 1.0/recall: 0.6222375690607734/f1-measure: 0.767134951042997

mean, min-max, original, 5, 100
acc: 0.9736328781865877/precision: 1.0/recall: 0.8917127071823204/f1-measure: 0.9427570093457944

mean, min-max, original, 5, 150
acc: 0.9557072711374185/precision: 1.0/recall: 0.8180939226519337/f1-measure: 0.8999468206335942

mean, min-max, weight balance, 3, 100
acc: 0.8613035582161835/precision: 1.0/recall: 0.4303867403314917/f1-measure: 0.6017767477790653

mean, min-max, weight balance, 3, 150
acc: 0.8608327167552297/precision: 1.0/recall:

1위 모델: mode, min-max, oversample, 5(max_depth), 150(n_estimators)

Accuracy: 0.9999
Precision: 0.9999
Recall: 1.0
F1-Measure: 0.9999
Configuration: 

2위 모델: mode, min-max, undersample, 5(max_depth), 150(n_estimators)

Accuracy: 0.9969
Precision: 0.9877
Recall: 1.0
F1-Measure: 0.9938

3위 모델: mode, standardization, undersample, 5(max_depth), 150(n_estimators)

Accuracy: 0.9973
Precision: 0.9891
Recall: 1.0
F1-Measure: 0.9945

Gradient Boosted Decision Trees

In [76]:
class SklearnGradientBoostingClassifier:
  def __init__(self, n_estimators, max_depth, random_state):
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.random_state = random_state

    self.classifier = ensemble.GradientBoostingClassifier(
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        random_state=self.random_state
    )

  def __call__(self, data):
    return self.classifier.predict(data)

  def train_model(self, train_data, train_label):
      self.classifier.fit(train_data, train_label)

  def test_model(self, test_data, test_label):
    pred = self.classifier.predict(test_data)

    accuracy = accuracy_score(test_label, pred)
    recall = recall_score(test_label, pred)
    precision = precision_score(test_label, pred)
    f1 = f1_score(test_label, pred)

    print('acc: {}'.format(accuracy))
    print('recall: {}'.format(recall))
    print('precision: {}'.format(precision))
    print('f1: {}'.format(f1))

In [79]:
RGBDT_hparams = hparams_dict.get('GBDT')
RGBDT_max_depth_list = RGBDT_hparams.get('max_depth')
RGBDT_n_estimators_list = RGBDT_hparams.get('n_estimators')

for missing_value in missing_value_list:
  for scaling in scaling_list:
    for imbalance in imbalance_dataset_list:
      #데이터 불러오기
      train_processed_array, train_label_array, test_processed_array, test_label_array = preprocess(missing_value, scaling, imbalance, test_data_ratio=0.2, random_state=42, verbose=False)

      #Random Forest
      for max_depth in RGBDT_max_depth_list:
        for n_estimators in RGBDT_n_estimators_list:
          
            sklearn_gbdt_classifier= SklearnGradientBoostingClassifier(n_estimators, max_depth, random_state)
            sklearn_gbdt_classifier.train_model(train_processed_array, train_label_array)

            print(f'{missing_value}, {scaling}, {imbalance}, {max_depth}, {n_estimators}')
            print("="*10)
            sklearn_gbdt_classifier.test_model(test_processed_array, test_label_array)
            print()

mean, min-max, original, 3, 100
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, original, 3, 150
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, original, 4, 100
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, original, 4, 150
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, original, 5, 100
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, original, 5, 150
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, weight balance, 3, 100
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, weight balance, 3, 150
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, weight balance, 4, 100
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, weight balance, 4, 150
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, weight balance, 5, 100
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, weight balance, 5, 150
acc: 1.0
recall: 1.0
precision: 1.0
f1: 1.0

mean, min-max, undersample, 3, 100
acc: 

KeyboardInterrupt: 

In [None]:
#모등 결과가 동일하게 1이 나왔다...