In [1]:
#!pip install optuna

**필요한 모듈 불러오기**

In [2]:
import numpy as np
import pandas as pd
import optuna

from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

**데이터 확인 및 타겟 데이터 분류**

In [3]:
df = pd.read_csv('loan_train.csv') # 엑셀 데이터 불러오기

In [4]:
df.info() # 데이터 확인 결과 Feature들의 Dtype은 int형이며 결측값이 없음.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   ID                  4000 non-null   int64
 1   Age                 4000 non-null   int64
 2   Experience          4000 non-null   int64
 3   ZIP Code            4000 non-null   int64
 4   Family              4000 non-null   int64
 5   Education           4000 non-null   int64
 6   Mortgage            4000 non-null   int64
 7   Securities Account  4000 non-null   int64
 8   CD Account          4000 non-null   int64
 9   Online              4000 non-null   int64
 10  CreditCard          4000 non-null   int64
 11  Personal Loan       4000 non-null   int64
dtypes: int64(12)
memory usage: 375.1 KB


In [5]:
df.head()

Unnamed: 0,ID,Age,Experience,ZIP Code,Family,Education,Mortgage,Securities Account,CD Account,Online,CreditCard,Personal Loan
0,3510,38,12,91330,3,3,0,0,0,0,0,0
1,1129,30,5,94025,2,2,0,0,0,0,0,1
2,1637,65,39,92122,4,3,0,0,0,0,1,0
3,3165,28,4,95136,4,1,0,0,0,1,1,0
4,3563,32,8,94596,1,3,272,1,1,1,0,1


In [6]:
X = df.drop(['ID','ZIP Code','Personal Loan',], axis = 1).values
y = df['Personal Loan'].values # y는 타겟데이터

###**Random Forest**

랜덤 포레스트는 여러 개의 결정트리(Decision Tree)를 활용한 배깅 방식의 대표적인 알고리즘

**하이퍼 파라미터 종류 및 설명**  

**n_estimators**  
- 결정트리의 갯수를 지정
- Default = 10
- 무작정 트리 갯수를 늘리면 성능 좋아지는 것 대비 시간이 걸릴 수 있음  

**min_samples_split**  
- 노드를 분할하기 위한 최소한의 샘플 데이터수
→ 과적합을 제어하는데 사용
- Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가  

**min_samples_leaf**
- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수
- min_samples_split과 함께 과적합 제어 용도
- 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요  

**max_features**  
- 최적의 분할을 위해 고려할 최대 feature 개수
- Default = 'auto' (결정트리에서는 default가 none이었음)
- int형으로 지정 →피처 갯수 / float형으로 지정 →비중
- sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정
- log : 전체 피처 중 log2(전체 피처 개수) 만큼 선정  

**max_depth**  
- 트리의 최대 깊이
- default = None
→ 완벽하게 클래스 값이 결정될 때 까지 분할
또는 데이터 개수가 min_samples_split보다 작아질 때까지 분할
- 깊이가 깊어지면 과적합될 수 있으므로 적절히 제어 필요  

**max_leaf_nodes**  
- 리프노드의 최대 개수  


In [7]:
def rf_objective(trial):
       
    rf_n_estimators = trial.suggest_int("n_estimators", 500, 1500, step=50)
    rf_max_depth = trial.suggest_int("max_depth", 1, 32, log=True)
    rf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32, log = True)
    rf_criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])

    classifier_obj = ensemble.RandomForestClassifier(n_estimators=rf_n_estimators,
                                                     max_depth=rf_max_depth,
                                                     min_samples_leaf=rf_min_samples_leaf,
                                                     criterion=rf_criterion,
                                                     n_jobs=-1,
                                                     random_state=0)

    score = model_selection.cross_val_score(classifier_obj, X, y, cv=5, n_jobs=-1, scoring = 'roc_auc')
    roc_auc = score.mean()
    return roc_auc


rf_study = optuna.create_study(direction="maximize")
rf_study.optimize(rf_objective, n_trials=70)
print(rf_study.best_trial)

[32m[I 2022-04-06 18:15:36,705][0m A new study created in memory with name: no-name-f8a04075-47e8-4574-be45-dd4fcac9c40b[0m
[32m[I 2022-04-06 18:15:50,025][0m Trial 0 finished with value: 0.7964868206143005 and parameters: {'n_estimators': 1200, 'max_depth': 7, 'min_samples_leaf': 27, 'criterion': 'gini'}. Best is trial 0 with value: 0.7964868206143005.[0m
[32m[I 2022-04-06 18:15:56,440][0m Trial 1 finished with value: 0.8026702613415964 and parameters: {'n_estimators': 1000, 'max_depth': 5, 'min_samples_leaf': 3, 'criterion': 'gini'}. Best is trial 1 with value: 0.8026702613415964.[0m
[32m[I 2022-04-06 18:15:59,660][0m Trial 2 finished with value: 0.7781323354167161 and parameters: {'n_estimators': 550, 'max_depth': 3, 'min_samples_leaf': 7, 'criterion': 'gini'}. Best is trial 1 with value: 0.8026702613415964.[0m
[32m[I 2022-04-06 18:16:04,697][0m Trial 3 finished with value: 0.7431559052036015 and parameters: {'n_estimators': 950, 'max_depth': 1, 'min_samples_leaf': 1, 

[32m[I 2022-04-06 18:20:20,311][0m Trial 34 finished with value: 0.8034264659371063 and parameters: {'n_estimators': 850, 'max_depth': 5, 'min_samples_leaf': 3, 'criterion': 'gini'}. Best is trial 31 with value: 0.8178250543937595.[0m
[32m[I 2022-04-06 18:20:29,164][0m Trial 35 finished with value: 0.816151741511758 and parameters: {'n_estimators': 1200, 'max_depth': 8, 'min_samples_leaf': 5, 'criterion': 'entropy'}. Best is trial 31 with value: 0.8178250543937595.[0m
[32m[I 2022-04-06 18:20:35,462][0m Trial 36 finished with value: 0.7942578476512895 and parameters: {'n_estimators': 900, 'max_depth': 24, 'min_samples_leaf': 31, 'criterion': 'gini'}. Best is trial 31 with value: 0.8178250543937595.[0m
[32m[I 2022-04-06 18:20:40,023][0m Trial 37 finished with value: 0.8147211504315202 and parameters: {'n_estimators': 650, 'max_depth': 6, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 31 with value: 0.8178250543937595.[0m
[32m[I 2022-04-06 18:20:49,731][0m Tri

[32m[I 2022-04-06 18:25:50,553][0m Trial 68 finished with value: 0.8162504353428133 and parameters: {'n_estimators': 1150, 'max_depth': 10, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 31 with value: 0.8178250543937595.[0m
[32m[I 2022-04-06 18:25:57,772][0m Trial 69 finished with value: 0.7953693973087924 and parameters: {'n_estimators': 1050, 'max_depth': 4, 'min_samples_leaf': 17, 'criterion': 'entropy'}. Best is trial 31 with value: 0.8178250543937595.[0m


FrozenTrial(number=31, values=[0.8178250543937595], datetime_start=datetime.datetime(2022, 4, 6, 18, 19, 47, 387791), datetime_complete=datetime.datetime(2022, 4, 6, 18, 19, 57, 173714), params={'n_estimators': 1200, 'max_depth': 8, 'min_samples_leaf': 2, 'criterion': 'entropy'}, distributions={'n_estimators': IntUniformDistribution(high=1500, low=500, step=50), 'max_depth': IntLogUniformDistribution(high=32, low=1, step=1), 'min_samples_leaf': IntLogUniformDistribution(high=32, low=1, step=1), 'criterion': CategoricalDistribution(choices=('gini', 'entropy'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=31, state=TrialState.COMPLETE, value=None)


###**Logistic Regression**

- 0 또는 1로 분류하는 분류추정기 0.5 이상은 1, 미만은 0
- 시그모이드 함수 형태로 이진분류함. 

**하이퍼 파라미터 종류 및 설명**

- solver : 알고리즘
- penalty : 규제의 유형 설정:과적합을 줄이기 위함

       - 'newton-cg'   -   ['l2', 'none']
       - 'lbfgs'       -   ['l2', 'none']
       - 'liblinear'   -   ['l1', 'l2']
       - 'sag'         -   ['l2', 'none']
       - 'saga'        -   ['elasticnet', 'l1', 'l2', 'none']
       
    > solver 방식에 따라 정해짐
- C : 규제 강도 조절하는 alpha값의 역수
    > 1(기본값), 적을 수록 규제가 강하다
- max_iter : 경사하강법 반복횟수 
    > 기본값 100
- penalty : 규제에 사용 된 기준을 지정 (l1, l2, elasticnet, none) – default : l2
- dual : 이중 또는 초기 공식
- tol : 정밀도
- C : 규제 강도
- fit_intercept : 모형에 상수항 (절편)이 있는가 없는가를 결정하는 인수 (default : True)
- intercept_scaling : 정규화 효과 정도
- class_weight : 클래스의 가중치
- random_state : 난수 seed 설정
- solver : 최적화 문제에 사용하는 알고리즘
- max_iter : 계산에 사용할 작업 수
- multi_class : 다중 분류 시에 (ovr, multinomial, auto)로 설정
- verbose : 동작 과정에 대한 출력 메시지
- warm_start : 이전 모델을 초기화로 적합하게 사용할 것인지 여부
- n_jobs : 병렬 처리 할 때 사용되는 CPU 코어 수
- l1_ratio : L1 규제의 비율(Elastic-Net 믹싱 파라미터 경우에만 사용)

In [8]:
def lr_objective(trial):
    
    lr_solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs','saga','sag'])
    lr_penalty = trial.suggest_categorical('penalty', ['l2', 'none'])
    lr_C = trial.suggest_float("C", 0.01, 15)
    lr_max_iter = trial.suggest_int('max_iter', 50, 2000)
    lr_tol = trial.suggest_float('tol', 1e-4, 1)
    
    regression_obj = LogisticRegression(solver = lr_solver,
                                        penalty = lr_penalty,
                                        C = lr_C,
                                        n_jobs = -1,
                                        random_state = 0,
                                        max_iter = lr_max_iter,
                                        tol = lr_tol)
    
    score = model_selection.cross_val_score(regression_obj, X, y, cv = 5, n_jobs = -1, scoring = 'roc_auc')
    roc_auc = score.mean()
    return roc_auc

lr_study = optuna.create_study(direction = 'maximize')
lr_study.optimize(lr_objective, n_trials = 100)
print(lr_study.best_trial)

[32m[I 2022-04-06 18:25:57,797][0m A new study created in memory with name: no-name-0e817fea-2d85-4bba-94e1-d6950ecaf34f[0m
[32m[I 2022-04-06 18:25:57,876][0m Trial 0 finished with value: 0.5462210417962582 and parameters: {'solver': 'saga', 'penalty': 'l2', 'C': 6.0533870377979175, 'max_iter': 259, 'tol': 0.37733999058455314}. Best is trial 0 with value: 0.5462210417962582.[0m
[32m[I 2022-04-06 18:25:57,921][0m Trial 1 finished with value: 0.5100628192340945 and parameters: {'solver': 'sag', 'penalty': 'l2', 'C': 4.372347004848705, 'max_iter': 1829, 'tol': 0.6353264549487759}. Best is trial 0 with value: 0.5462210417962582.[0m
[32m[I 2022-04-06 18:25:58,187][0m Trial 2 finished with value: 0.7681191748267949 and parameters: {'solver': 'lbfgs', 'penalty': 'l2', 'C': 2.046241135911315, 'max_iter': 1887, 'tol': 0.9319013541170434}. Best is trial 2 with value: 0.7681191748267949.[0m
[32m[I 2022-04-06 18:25:58,232][0m Trial 3 finished with value: 0.545356919283491 and paramet

[32m[I 2022-04-06 18:26:05,393][0m Trial 34 finished with value: 0.7670694769117985 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 12.291077665819614, 'max_iter': 1373, 'tol': 0.2939289081923317}. Best is trial 16 with value: 0.7681297834595785.[0m
[32m[I 2022-04-06 18:26:05,578][0m Trial 35 finished with value: 0.7678084003994222 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.2912349157996577, 'max_iter': 721, 'tol': 0.9248374421737604}. Best is trial 16 with value: 0.7681297834595785.[0m
[32m[I 2022-04-06 18:26:05,754][0m Trial 36 finished with value: 0.767960554142525 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1.166468273612872, 'max_iter': 780, 'tol': 0.8551063188353494}. Best is trial 16 with value: 0.7681297834595785.[0m
[32m[I 2022-04-06 18:26:05,926][0m Trial 37 finished with value: 0.7677721352161642 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 2.7302643938070323, 'max_iter': 1088, 'tol': 0.86196886

[32m[I 2022-04-06 18:26:13,476][0m Trial 67 finished with value: 0.7682632967502775 and parameters: {'solver': 'lbfgs', 'penalty': 'none', 'C': 1.5817055462056797, 'max_iter': 1473, 'tol': 0.8892875061242701}. Best is trial 65 with value: 0.7682632967502775.[0m
[32m[I 2022-04-06 18:26:13,822][0m Trial 68 finished with value: 0.7682632967502775 and parameters: {'solver': 'lbfgs', 'penalty': 'none', 'C': 1.7671772554820309, 'max_iter': 1481, 'tol': 0.9033947383767297}. Best is trial 65 with value: 0.7682632967502775.[0m
[32m[I 2022-04-06 18:26:14,180][0m Trial 69 finished with value: 0.7682632967502775 and parameters: {'solver': 'lbfgs', 'penalty': 'none', 'C': 1.6025519369415917, 'max_iter': 1527, 'tol': 0.9083207267118852}. Best is trial 65 with value: 0.7682632967502775.[0m
[32m[I 2022-04-06 18:26:14,519][0m Trial 70 finished with value: 0.7682632967502775 and parameters: {'solver': 'lbfgs', 'penalty': 'none', 'C': 1.631947927770026, 'max_iter': 1453, 'tol': 0.9042528191666

[32m[I 2022-04-06 18:26:27,154][0m Trial 98 finished with value: 0.7683926279999407 and parameters: {'solver': 'lbfgs', 'penalty': 'none', 'C': 0.7665866924782854, 'max_iter': 1619, 'tol': 0.8461280512838271}. Best is trial 73 with value: 0.7683926279999407.[0m
[32m[I 2022-04-06 18:26:27,633][0m Trial 99 finished with value: 0.7683926279999407 and parameters: {'solver': 'lbfgs', 'penalty': 'none', 'C': 0.273459307848602, 'max_iter': 1710, 'tol': 0.8338333420556034}. Best is trial 73 with value: 0.7683926279999407.[0m


FrozenTrial(number=73, values=[0.7683926279999407], datetime_start=datetime.datetime(2022, 4, 6, 18, 26, 15, 350114), datetime_complete=datetime.datetime(2022, 4, 6, 18, 26, 15, 817669), params={'solver': 'lbfgs', 'penalty': 'none', 'C': 0.3595544920908258, 'max_iter': 1656, 'tol': 0.8303852448559972}, distributions={'solver': CategoricalDistribution(choices=('newton-cg', 'lbfgs', 'saga', 'sag')), 'penalty': CategoricalDistribution(choices=('l2', 'none')), 'C': UniformDistribution(high=15.0, low=0.01), 'max_iter': IntUniformDistribution(high=2000, low=50, step=1), 'tol': UniformDistribution(high=1.0, low=0.0001)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=73, state=TrialState.COMPLETE, value=None)


###**k-NN**

- 거리기반 분류분석 모델
- 데이터로부터 거리가 가까운 'k'개의 다른 데이터의 레이블을 참조하여 분류하는 알고리즘

**하이퍼 파라미터 종류 및 설명**

- Distance (거리) : 유클리디안 거리, 맨하탄 거리 두가지를 일반적으로 많이씀
- 이웃수 : n_neighbors = K  
K가 작을 수록 모델이 복잡해져 과적합이 일어나고 너무 크면 단순해져 성능이 나빠진다.  
n_neighbors는 Feature수의 제곱근 정도를 지정할 때 성능이 좋은 것으로 알려져 있다.

In [9]:
def kNN_objective(trial):
    
    kNN_n_neighbors = trial.suggest_int("n_neighbors", 50, 150)
    kNN_weights = trial.suggest_categorical("weights", ['uniform', 'distance'])
    kNN_metric = trial.suggest_categorical("metric", ['euclidean', 'manhattan', 'minkowski'])
    
    kNN_obj = KNeighborsClassifier(n_neighbors=kNN_n_neighbors,
                                   weights=kNN_weights,
                                   metric=kNN_metric)
        

    score = model_selection.cross_val_score(kNN_obj, X, y, cv = 5, n_jobs = -1, scoring = 'roc_auc')
    roc_auc = score.mean()
    return roc_auc

kNN_study = optuna.create_study(direction="maximize")
kNN_study.optimize(kNN_objective, n_trials=100)
print(kNN_study.best_trial)

[32m[I 2022-04-06 18:26:27,658][0m A new study created in memory with name: no-name-2654aefd-333f-4517-8e42-7aab37d5e2b0[0m
[32m[I 2022-04-06 18:26:27,792][0m Trial 0 finished with value: 0.660302765152501 and parameters: {'n_neighbors': 79, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.660302765152501.[0m
[32m[I 2022-04-06 18:26:27,916][0m Trial 1 finished with value: 0.6663259793147513 and parameters: {'n_neighbors': 76, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 1 with value: 0.6663259793147513.[0m
[32m[I 2022-04-06 18:26:28,047][0m Trial 2 finished with value: 0.6661684969999018 and parameters: {'n_neighbors': 110, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 1 with value: 0.6663259793147513.[0m
[32m[I 2022-04-06 18:26:28,150][0m Trial 3 finished with value: 0.6463236313049945 and parameters: {'n_neighbors': 64, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.6663259793147513.[

[32m[I 2022-04-06 18:26:32,667][0m Trial 37 finished with value: 0.6392069445800959 and parameters: {'n_neighbors': 135, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 20 with value: 0.6830800139654956.[0m
[32m[I 2022-04-06 18:26:32,815][0m Trial 38 finished with value: 0.6699965321763324 and parameters: {'n_neighbors': 144, 'weights': 'distance', 'metric': 'minkowski'}. Best is trial 20 with value: 0.6830800139654956.[0m
[32m[I 2022-04-06 18:26:32,963][0m Trial 39 finished with value: 0.6811820554298751 and parameters: {'n_neighbors': 127, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 20 with value: 0.6830800139654956.[0m
[32m[I 2022-04-06 18:26:33,108][0m Trial 40 finished with value: 0.666938593515295 and parameters: {'n_neighbors': 120, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 20 with value: 0.6830800139654956.[0m
[32m[I 2022-04-06 18:26:33,223][0m Trial 41 finished with value: 0.6810310037874185 and parameters: {'n_neigh

[32m[I 2022-04-06 18:26:37,605][0m Trial 74 finished with value: 0.6801773332433058 and parameters: {'n_neighbors': 111, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 52 with value: 0.6835759029107099.[0m
[32m[I 2022-04-06 18:26:37,738][0m Trial 75 finished with value: 0.6810346751275509 and parameters: {'n_neighbors': 128, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 52 with value: 0.6835759029107099.[0m
[32m[I 2022-04-06 18:26:37,871][0m Trial 76 finished with value: 0.6816202620725867 and parameters: {'n_neighbors': 118, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 52 with value: 0.6835759029107099.[0m
[32m[I 2022-04-06 18:26:38,010][0m Trial 77 finished with value: 0.6781389124700123 and parameters: {'n_neighbors': 99, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 52 with value: 0.6835759029107099.[0m
[32m[I 2022-04-06 18:26:38,146][0m Trial 78 finished with value: 0.6384680441073111 and parameters: {'n_nei

FrozenTrial(number=52, values=[0.6835759029107099], datetime_start=datetime.datetime(2022, 4, 6, 18, 26, 34, 486621), datetime_complete=datetime.datetime(2022, 4, 6, 18, 26, 34, 606299), params={'n_neighbors': 123, 'weights': 'distance', 'metric': 'manhattan'}, distributions={'n_neighbors': IntUniformDistribution(high=150, low=50, step=1), 'weights': CategoricalDistribution(choices=('uniform', 'distance')), 'metric': CategoricalDistribution(choices=('euclidean', 'manhattan', 'minkowski'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=52, state=TrialState.COMPLETE, value=None)


###**Tuning한 각 모델의 Best score**

In [10]:
print('Random Forest Best score :', rf_study.best_value)
print('Logistic Regression Best score :', lr_study.best_value)
print('k-NN Best score :', kNN_study.best_value)

Random Forest Best score : 0.8178250543937595
Logistic Regression Best score : 0.7683926279999407
k-NN Best score : 0.6835759029107099
