In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import optuna

# randomforest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# adaboost & decisiontree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# catboost
from catboost import CatBoostClassifier
# MLP
from sklearn.neural_network import MLPClassifier
# stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [9]:
# 랜덤시드 고정
import random
import os

def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(seed=42) # Seed 고정

In [10]:
df=pd.read_csv("fire_occasion1.csv")

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36259 entries, 0 to 36258
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   화재시각      36259 non-null  object 
 1   시도        36259 non-null  object 
 2   시군구       36259 non-null  object 
 3   화재유형      36259 non-null  object 
 4   발화열원(대)   36259 non-null  object 
 5   발화열원(소)   36259 non-null  object 
 6   발화요인(대)   36259 non-null  object 
 7   발화요인(소)   36259 non-null  object 
 8   최초착화물(대)  36259 non-null  object 
 9   최초착화물(소)  36259 non-null  object 
 10  인명피해      36259 non-null  int64  
 11  사망        36259 non-null  int64  
 12  부상        36259 non-null  int64  
 13  재산피해      36259 non-null  int64  
 14  장소(대)     36259 non-null  object 
 15  장소(중)     36259 non-null  object 
 16  장소(소)     36259 non-null  object 
 17  부동산       36259 non-null  float64
 18  동산        36259 non-null  float64
 19  month     36259 non-null  int64  
 20  day       36259 non-null  in

In [12]:
df.loc[df['재산피해']<=1000,'재산피해']=0
df.loc[(df['재산피해']>1000)&
       (df['재산피해']<=50000),'재산피해']=1
df.loc[df['재산피해']>50000,'재산피해']=2

In [13]:
df['재산피해'].value_counts()/len(df['재산피해'])

0    0.575885
1    0.388455
2    0.035660
Name: 재산피해, dtype: float64

### 1. 데이터 전처리

In [7]:
# 재산피해 = 부동산 + 동산
X=df.drop(columns=['화재시각','재산피해', '부동산', '동산'])
y=df['재산피해']

In [8]:
category_columns=['시도', '시군구', '화재유형', '발화열원(대)', '발화열원(소)', '발화요인(대)', '발화요인(소)',
                  '최초착화물(대)', '최초착화물(소)', '장소(대)', '장소(중)', '장소(소)']

for col in category_columns:
    label_encoder = LabelEncoder()
    X[col]=label_encoder.fit_transform(X[col])

In [9]:
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,시도,시군구,화재유형,발화열원(대),발화열원(소),발화요인(대),발화요인(소),최초착화물(대),최초착화물(소),인명피해,사망,부상,장소(대),장소(중),장소(소),month,day,hour,minute
0,0.5000,0.140969,0.0,0.500,0.703704,0.636364,0.687500,0.250000,0.078652,0.000000,0.00,0.000000,0.285714,0.122449,0.117834,0.0,0.0,0.000000,0.000000
1,0.2500,0.114537,0.0,0.750,0.037037,0.636364,0.729167,0.416667,0.865169,0.000000,0.00,0.000000,0.285714,0.122449,0.770701,0.0,0.0,0.000000,0.084746
2,0.2500,0.114537,0.0,0.375,0.481481,0.363636,0.395833,0.333333,0.292135,0.030303,0.25,0.000000,1.000000,0.693878,0.659236,0.0,0.0,0.000000,0.101695
3,0.0625,0.568282,0.0,0.750,0.740741,0.818182,0.833333,0.750000,0.629213,0.000000,0.00,0.000000,0.357143,0.653061,0.955414,0.0,0.0,0.000000,0.118644
4,0.0625,0.559471,0.0,0.750,0.740741,0.818182,0.416667,0.750000,0.629213,0.000000,0.00,0.000000,0.285714,0.346939,0.028662,0.0,0.0,0.000000,0.203390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36254,0.0625,0.925110,0.0,0.750,0.740741,0.818182,0.812500,0.750000,0.629213,0.000000,0.00,0.000000,0.857143,0.306122,0.312102,1.0,1.0,0.956522,0.915254
36255,0.8125,0.748899,0.0,0.500,0.185185,0.636364,0.500000,0.916667,0.516854,0.000000,0.00,0.000000,0.857143,0.306122,0.321656,1.0,1.0,1.000000,0.118644
36256,0.7500,0.088106,0.0,0.500,0.000000,0.636364,0.500000,0.833333,0.280899,0.000000,0.00,0.000000,0.857143,0.244898,0.248408,1.0,1.0,1.000000,0.152542
36257,0.1875,0.933921,1.0,0.750,0.037037,0.181818,0.062500,0.666667,0.382022,0.000000,0.00,0.000000,0.785714,0.734694,0.429936,1.0,1.0,1.000000,0.423729


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2. 결과 요약

#### 파라미터 조정 : optuna 사용  
  
* 1. random forest Accuracy : 약 0.708  (0.7079426365140651)
* 2. ada boost Accuracy     : 약 0.704  (0.7042195256480971)
* 3. decision tree Accuracy : 약 0.685  (0.6854660783232212)
* 4. catboost Accuracy      : 약 0.714  (0.7140099282956426)
* 5. MLP Accuracy           : 약 0.693  (0.6934638720353006)  
  
  
* 6.1. stacking Accuracy : random forest + ada boost + MLP, final_estimator = decision tree  
     => 약 0.617  (6169332597904027)
* 6.2. stacking Accuracy : random forest + ada boost + MLP, final_estimator = logistic regression  
     => 약 0.706  (0.706150027578599)

### 3. modeling

#### 3.1 Random Forest

##### 파라미터 조정

In [17]:
def objective(trial):
    # 하이퍼파라미터 탐색 공간 정의
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)

    # 랜덤 포레스트 회귀 모델 생성
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=1,
        random_state=42
    )
    
    # 모델 학습
    model.fit(X_train, y_train)

    # 검증 데이터에 대한 평가 지표 계산
    y_pred = model.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    return accuracy

In [18]:
# Optuna로 하이퍼파라미터 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-11-24 01:35:33,865] A new study created in memory with name: no-name-85c5ff92-f8d7-4f63-b251-cd3de8b03aba
[I 2023-11-24 01:35:38,820] Trial 0 finished with value: 0.6959459459459459 and parameters: {'n_estimators': 242, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 0 with value: 0.6959459459459459.
[I 2023-11-24 01:35:40,614] Trial 1 finished with value: 0.6752619966905681 and parameters: {'n_estimators': 126, 'max_depth': 5, 'min_samples_split': 20}. Best is trial 0 with value: 0.6959459459459459.
[I 2023-11-24 01:35:42,823] Trial 2 finished with value: 0.7000827357970215 and parameters: {'n_estimators': 62, 'max_depth': 16, 'min_samples_split': 6}. Best is trial 2 with value: 0.7000827357970215.
[I 2023-11-24 01:35:45,730] Trial 3 finished with value: 0.6937396580253723 and parameters: {'n_estimators': 140, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 2 with value: 0.7000827357970215.
[I 2023-11-24 01:35:51,631] Trial 4 finished with value: 0.70159955874241

[I 2023-11-24 01:40:11,429] Trial 40 finished with value: 0.7013237727523441 and parameters: {'n_estimators': 69, 'max_depth': 10, 'min_samples_split': 18}. Best is trial 13 with value: 0.7062879205736349.
[I 2023-11-24 01:40:18,778] Trial 41 finished with value: 0.7049089906232764 and parameters: {'n_estimators': 231, 'max_depth': 17, 'min_samples_split': 19}. Best is trial 13 with value: 0.7062879205736349.
[I 2023-11-24 01:40:25,260] Trial 42 finished with value: 0.7011858797573083 and parameters: {'n_estimators': 189, 'max_depth': 19, 'min_samples_split': 16}. Best is trial 13 with value: 0.7062879205736349.
[I 2023-11-24 01:40:30,827] Trial 43 finished with value: 0.7017374517374517 and parameters: {'n_estimators': 158, 'max_depth': 18, 'min_samples_split': 13}. Best is trial 13 with value: 0.7062879205736349.
[I 2023-11-24 01:40:35,129] Trial 44 finished with value: 0.7054605626034197 and parameters: {'n_estimators': 126, 'max_depth': 20, 'min_samples_split': 15}. Best is trial 1

[I 2023-11-24 01:43:31,185] Trial 80 finished with value: 0.7072531715388858 and parameters: {'n_estimators': 252, 'max_depth': 19, 'min_samples_split': 13}. Best is trial 65 with value: 0.7079426365140651.
[I 2023-11-24 01:43:39,892] Trial 81 finished with value: 0.7073910645339216 and parameters: {'n_estimators': 253, 'max_depth': 19, 'min_samples_split': 13}. Best is trial 65 with value: 0.7079426365140651.
[I 2023-11-24 01:43:48,512] Trial 82 finished with value: 0.7073910645339216 and parameters: {'n_estimators': 253, 'max_depth': 19, 'min_samples_split': 13}. Best is trial 65 with value: 0.7079426365140651.
[I 2023-11-24 01:43:57,297] Trial 83 finished with value: 0.7049089906232764 and parameters: {'n_estimators': 252, 'max_depth': 20, 'min_samples_split': 12}. Best is trial 65 with value: 0.7079426365140651.
[I 2023-11-24 01:44:05,396] Trial 84 finished with value: 0.7042195256480971 and parameters: {'n_estimators': 234, 'max_depth': 19, 'min_samples_split': 11}. Best is trial 

In [19]:
# 최적의 하이퍼파라미터 및 결과 출력
best_params = study.best_params
best_accuracy = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_accuracy}")

Best Hyperparameters: {'n_estimators': 133, 'max_depth': 20, 'min_samples_split': 17}
Best Accuracy: 0.7079426365140651


#### 3.2 AdaBoost

##### 파라미터 조정

In [24]:
def objective(trial):
    # 하이퍼파라미터 탐색 공간 정의
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1.0)

    # 랜덤 포레스트 회귀 모델 생성
    base_classifier = DecisionTreeClassifier(max_depth=max_depth)
    model = AdaBoostClassifier(
        base_classifier,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=42
    )
    
    # 모델 학습
    model.fit(X_train, y_train)

    # 검증 데이터에 대한 평가 지표 계산
    y_pred = model.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    return accuracy

In [25]:
# Optuna로 하이퍼파라미터 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-11-24 01:48:54,637] A new study created in memory with name: no-name-ccd7340f-9d2b-4fd8-a3e2-22fbbe143a48
[I 2023-11-24 01:49:12,523] Trial 0 finished with value: 0.676365140650855 and parameters: {'n_estimators': 72, 'max_depth': 20, 'learning_rate': 0.8270315968295711}. Best is trial 0 with value: 0.676365140650855.
[I 2023-11-24 01:49:41,254] Trial 1 finished with value: 0.673469387755102 and parameters: {'n_estimators': 155, 'max_depth': 12, 'learning_rate': 0.8581983092701817}. Best is trial 0 with value: 0.676365140650855.
[I 2023-11-24 01:50:04,687] Trial 2 finished with value: 0.6755377826806398 and parameters: {'n_estimators': 111, 'max_depth': 15, 'learning_rate': 0.7033785024234952}. Best is trial 0 with value: 0.676365140650855.
[I 2023-11-24 01:50:38,983] Trial 3 finished with value: 0.6773303916161059 and parameters: {'n_estimators': 180, 'max_depth': 13, 'learning_rate': 0.39405569268043683}. Best is trial 3 with value: 0.6773303916161059.
[I 2023-11-24 01:51:07,

[I 2023-11-24 01:57:22,369] Trial 38 finished with value: 0.666023166023166 and parameters: {'n_estimators': 149, 'max_depth': 12, 'learning_rate': 0.3316753014845071}. Best is trial 12 with value: 0.7010479867622724.
[I 2023-11-24 01:57:48,638] Trial 39 finished with value: 0.6766409266409267 and parameters: {'n_estimators': 113, 'max_depth': 19, 'learning_rate': 0.28932077856207744}. Best is trial 12 with value: 0.7010479867622724.
[I 2023-11-24 01:57:51,810] Trial 40 finished with value: 0.6959459459459459 and parameters: {'n_estimators': 64, 'max_depth': 3, 'learning_rate': 0.4497791377698136}. Best is trial 12 with value: 0.7010479867622724.
[I 2023-11-24 01:57:58,101] Trial 41 finished with value: 0.6973248758963044 and parameters: {'n_estimators': 101, 'max_depth': 4, 'learning_rate': 0.3773442031104808}. Best is trial 12 with value: 0.7010479867622724.
[I 2023-11-24 01:58:04,075] Trial 42 finished with value: 0.686569222283508 and parameters: {'n_estimators': 82, 'max_depth': 5

[I 2023-11-24 02:04:08,634] Trial 76 finished with value: 0.7022890237175952 and parameters: {'n_estimators': 155, 'max_depth': 3, 'learning_rate': 0.22447309598302348}. Best is trial 43 with value: 0.7032542746828461.
[I 2023-11-24 02:04:19,988] Trial 77 finished with value: 0.6897407611693326 and parameters: {'n_estimators': 157, 'max_depth': 5, 'learning_rate': 0.22661764823084513}. Best is trial 43 with value: 0.7032542746828461.
[I 2023-11-24 02:04:28,406] Trial 78 finished with value: 0.7015995587424159 and parameters: {'n_estimators': 162, 'max_depth': 3, 'learning_rate': 0.19188766937493432}. Best is trial 43 with value: 0.7032542746828461.
[I 2023-11-24 02:04:35,131] Trial 79 finished with value: 0.7032542746828461 and parameters: {'n_estimators': 141, 'max_depth': 3, 'learning_rate': 0.1269964273715558}. Best is trial 43 with value: 0.7032542746828461.
[I 2023-11-24 02:04:42,777] Trial 80 finished with value: 0.7009100937672367 and parameters: {'n_estimators': 152, 'max_depth

In [26]:
# 최적의 하이퍼파라미터 및 결과 출력
best_params = study.best_params
best_accuracy = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_accuracy}")

Best Hyperparameters: {'n_estimators': 175, 'max_depth': 4, 'learning_rate': 0.15900320437580656}
Best Accuracy: 0.7042195256480971


#### 3.3 decision tree

##### 파라미터 조정

In [31]:
def objective(trial):
    # 하이퍼파라미터 탐색 공간 정의
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)

    # 랜덤 포레스트 회귀 모델 생성
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=1,
        random_state=42
    )
    
    # 모델 학습
    model.fit(X_train, y_train)

    # 검증 데이터에 대한 평가 지표 계산
    y_pred = model.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    return accuracy

In [32]:
# Optuna로 하이퍼파라미터 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-11-24 02:08:42,626] A new study created in memory with name: no-name-a0d74645-c255-4c8c-80c6-9ea729c00868
[I 2023-11-24 02:08:42,745] Trial 0 finished with value: 0.680088251516823 and parameters: {'max_depth': 8, 'min_samples_split': 5}. Best is trial 0 with value: 0.680088251516823.
[I 2023-11-24 02:08:42,898] Trial 1 finished with value: 0.6851902923331494 and parameters: {'max_depth': 9, 'min_samples_split': 13}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:42,997] Trial 2 finished with value: 0.6832597904026475 and parameters: {'max_depth': 7, 'min_samples_split': 3}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:43,094] Trial 3 finished with value: 0.6851902923331494 and parameters: {'max_depth': 9, 'min_samples_split': 17}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:43,331] Trial 4 finished with value: 0.6490623276337562 and parameters: {'max_depth': 15, 'min_samples_split': 8}. Best is trial 1 with va

[I 2023-11-24 02:08:49,139] Trial 44 finished with value: 0.6784335355763927 and parameters: {'max_depth': 11, 'min_samples_split': 14}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:49,404] Trial 45 finished with value: 0.6374793160507446 and parameters: {'max_depth': 20, 'min_samples_split': 16}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:49,577] Trial 46 finished with value: 0.6675399889685604 and parameters: {'max_depth': 13, 'min_samples_split': 11}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:49,725] Trial 47 finished with value: 0.6829840044125759 and parameters: {'max_depth': 10, 'min_samples_split': 19}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:49,891] Trial 48 finished with value: 0.6696083838940982 and parameters: {'max_depth': 12, 'min_samples_split': 17}. Best is trial 1 with value: 0.6851902923331494.
[I 2023-11-24 02:08:50,031] Trial 49 finished with value: 0.683397683397683

[I 2023-11-24 02:08:55,704] Trial 89 finished with value: 0.680088251516823 and parameters: {'max_depth': 8, 'min_samples_split': 4}. Best is trial 72 with value: 0.6854660783232212.
[I 2023-11-24 02:08:55,871] Trial 90 finished with value: 0.681880860452289 and parameters: {'max_depth': 10, 'min_samples_split': 5}. Best is trial 72 with value: 0.6854660783232212.
[I 2023-11-24 02:08:55,993] Trial 91 finished with value: 0.6853281853281853 and parameters: {'max_depth': 9, 'min_samples_split': 3}. Best is trial 72 with value: 0.6854660783232212.
[I 2023-11-24 02:08:56,117] Trial 92 finished with value: 0.6853281853281853 and parameters: {'max_depth': 9, 'min_samples_split': 3}. Best is trial 72 with value: 0.6854660783232212.
[I 2023-11-24 02:08:56,234] Trial 93 finished with value: 0.6853281853281853 and parameters: {'max_depth': 9, 'min_samples_split': 3}. Best is trial 72 with value: 0.6854660783232212.
[I 2023-11-24 02:08:56,370] Trial 94 finished with value: 0.6849145063430778 and 

In [33]:
# 최적의 하이퍼파라미터 및 결과 출력
best_params = study.best_params
best_accuracy = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_accuracy}")

Best Hyperparameters: {'max_depth': 9, 'min_samples_split': 4}
Best Accuracy: 0.6854660783232212


#### 3.4 CatBoostClassifier

In [14]:
X1=df.drop(columns=['화재시각','재산피해', '부동산', '동산'])
y1=df['재산피해']

In [15]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

In [16]:
category_columns=['시도', '시군구', '화재유형', '발화열원(대)', '발화열원(소)', '발화요인(대)', '발화요인(소)',
                  '최초착화물(대)', '최초착화물(소)', '장소(대)', '장소(중)', '장소(소)']

In [17]:
# CatBoostClassifier 생성
model = CatBoostClassifier(iterations=500,
                           depth=10,
                           learning_rate=0.05,
                           loss_function='MultiClass',
                           cat_features=category_columns,
                           random_state=42)

# 모델 훈련
model.fit(X_train1, y_train1)

0:	learn: 1.0619164	total: 368ms	remaining: 3m 3s
1:	learn: 1.0285197	total: 814ms	remaining: 3m 22s
2:	learn: 0.9988969	total: 1.2s	remaining: 3m 19s
3:	learn: 0.9719522	total: 1.66s	remaining: 3m 25s
4:	learn: 0.9466674	total: 2.09s	remaining: 3m 26s
5:	learn: 0.9242239	total: 2.54s	remaining: 3m 29s
6:	learn: 0.9033660	total: 2.99s	remaining: 3m 30s
7:	learn: 0.8842106	total: 3.37s	remaining: 3m 27s
8:	learn: 0.8666160	total: 3.81s	remaining: 3m 28s
9:	learn: 0.8508885	total: 4.18s	remaining: 3m 24s
10:	learn: 0.8363781	total: 4.65s	remaining: 3m 26s
11:	learn: 0.8221994	total: 5.1s	remaining: 3m 27s
12:	learn: 0.8089941	total: 5.58s	remaining: 3m 29s
13:	learn: 0.7970486	total: 6.03s	remaining: 3m 29s
14:	learn: 0.7866376	total: 6.15s	remaining: 3m 18s
15:	learn: 0.7762632	total: 6.53s	remaining: 3m 17s
16:	learn: 0.7661241	total: 6.98s	remaining: 3m 18s
17:	learn: 0.7567558	total: 7.42s	remaining: 3m 18s
18:	learn: 0.7483219	total: 7.82s	remaining: 3m 18s
19:	learn: 0.7401244	tota

158:	learn: 0.5520709	total: 1m 11s	remaining: 2m 32s
159:	learn: 0.5517631	total: 1m 11s	remaining: 2m 31s
160:	learn: 0.5510857	total: 1m 11s	remaining: 2m 31s
161:	learn: 0.5501309	total: 1m 12s	remaining: 2m 31s
162:	learn: 0.5496520	total: 1m 12s	remaining: 2m 30s
163:	learn: 0.5494719	total: 1m 13s	remaining: 2m 30s
164:	learn: 0.5490134	total: 1m 13s	remaining: 2m 29s
165:	learn: 0.5487899	total: 1m 14s	remaining: 2m 29s
166:	learn: 0.5484439	total: 1m 14s	remaining: 2m 29s
167:	learn: 0.5482071	total: 1m 15s	remaining: 2m 28s
168:	learn: 0.5479298	total: 1m 15s	remaining: 2m 28s
169:	learn: 0.5476291	total: 1m 16s	remaining: 2m 27s
170:	learn: 0.5473862	total: 1m 16s	remaining: 2m 27s
171:	learn: 0.5472697	total: 1m 17s	remaining: 2m 26s
172:	learn: 0.5471113	total: 1m 17s	remaining: 2m 26s
173:	learn: 0.5465982	total: 1m 17s	remaining: 2m 25s
174:	learn: 0.5460628	total: 1m 18s	remaining: 2m 25s
175:	learn: 0.5453516	total: 1m 18s	remaining: 2m 25s
176:	learn: 0.5452237	total:

311:	learn: 0.4849735	total: 2m 25s	remaining: 1m 27s
312:	learn: 0.4843918	total: 2m 26s	remaining: 1m 27s
313:	learn: 0.4839304	total: 2m 26s	remaining: 1m 26s
314:	learn: 0.4834927	total: 2m 27s	remaining: 1m 26s
315:	learn: 0.4832176	total: 2m 27s	remaining: 1m 26s
316:	learn: 0.4827986	total: 2m 28s	remaining: 1m 25s
317:	learn: 0.4823716	total: 2m 29s	remaining: 1m 25s
318:	learn: 0.4816442	total: 2m 29s	remaining: 1m 24s
319:	learn: 0.4811130	total: 2m 30s	remaining: 1m 24s
320:	learn: 0.4806089	total: 2m 30s	remaining: 1m 24s
321:	learn: 0.4801877	total: 2m 31s	remaining: 1m 23s
322:	learn: 0.4798363	total: 2m 31s	remaining: 1m 23s
323:	learn: 0.4797082	total: 2m 32s	remaining: 1m 22s
324:	learn: 0.4793159	total: 2m 32s	remaining: 1m 22s
325:	learn: 0.4790728	total: 2m 33s	remaining: 1m 21s
326:	learn: 0.4785490	total: 2m 33s	remaining: 1m 21s
327:	learn: 0.4783539	total: 2m 34s	remaining: 1m 20s
328:	learn: 0.4778748	total: 2m 34s	remaining: 1m 20s
329:	learn: 0.4772932	total:

466:	learn: 0.4226454	total: 3m 47s	remaining: 16.1s
467:	learn: 0.4222525	total: 3m 48s	remaining: 15.6s
468:	learn: 0.4219332	total: 3m 49s	remaining: 15.1s
469:	learn: 0.4214404	total: 3m 49s	remaining: 14.7s
470:	learn: 0.4210953	total: 3m 50s	remaining: 14.2s
471:	learn: 0.4207631	total: 3m 50s	remaining: 13.7s
472:	learn: 0.4202719	total: 3m 51s	remaining: 13.2s
473:	learn: 0.4200551	total: 3m 51s	remaining: 12.7s
474:	learn: 0.4196316	total: 3m 52s	remaining: 12.2s
475:	learn: 0.4193843	total: 3m 52s	remaining: 11.7s
476:	learn: 0.4189667	total: 3m 53s	remaining: 11.3s
477:	learn: 0.4185810	total: 3m 53s	remaining: 10.8s
478:	learn: 0.4181833	total: 3m 54s	remaining: 10.3s
479:	learn: 0.4177480	total: 3m 54s	remaining: 9.79s
480:	learn: 0.4172525	total: 3m 55s	remaining: 9.3s
481:	learn: 0.4165552	total: 3m 55s	remaining: 8.81s
482:	learn: 0.4159659	total: 3m 56s	remaining: 8.32s
483:	learn: 0.4156252	total: 3m 57s	remaining: 7.84s
484:	learn: 0.4154230	total: 3m 57s	remaining: 

<catboost.core.CatBoostClassifier at 0x281ae940eb0>

In [18]:
# 모델 평가
accuracy = model.score(X_test1, y_test1)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7140099282956426


In [19]:
guesses = model.predict(X_test1)

In [22]:
guesses

array([[1],
       [0],
       [1],
       ...,
       [1],
       [0],
       [0]], dtype=int64)

In [21]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test1, guesses))

[[3303  864    0]
 [ 966 1845   23]
 [  26  195   30]]


#### 3.5 MLP

In [46]:
def objective(trial):
    # 하이퍼파라미터 탐색 공간 정의
    activation = trial.suggest_categorical('activation', ['relu','logistic'])
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)

    # MLP 모델 생성
    model = MLPClassifier(hidden_layer_sizes=(32,16,8),
                          max_iter=500,
                          activation=activation,
                          learning_rate_init=learning_rate,
                          solver='adam',
                          random_state=42)

    
    # 모델 학습
    model.fit(X_train, y_train)

    # 검증 데이터에 대한 평가 지표 계산
    y_pred = model.predict(X_test)
    accuracy=accuracy_score(y_test, y_pred)
    return accuracy

In [47]:
# Optuna로 하이퍼파라미터 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-11-24 02:15:39,027] A new study created in memory with name: no-name-859a430b-0511-42e7-b6d3-7d14831d4b5e
[I 2023-11-24 02:16:00,961] Trial 0 finished with value: 0.6861555432984004 and parameters: {'activation': 'relu', 'learning_rate': 0.00302511008404565}. Best is trial 0 with value: 0.6861555432984004.
[I 2023-11-24 02:16:19,494] Trial 1 finished with value: 0.6858797573083287 and parameters: {'activation': 'relu', 'learning_rate': 0.0018641766752840966}. Best is trial 0 with value: 0.6861555432984004.
[I 2023-11-24 02:16:32,611] Trial 2 finished with value: 0.6817429674572532 and parameters: {'activation': 'logistic', 'learning_rate': 0.030921723848931857}. Best is trial 0 with value: 0.6861555432984004.
[I 2023-11-24 02:16:46,321] Trial 3 finished with value: 0.6829840044125759 and parameters: {'activation': 'logistic', 'learning_rate': 0.019084550713975714}. Best is trial 0 with value: 0.6861555432984004.
[I 2023-11-24 02:17:10,160] Trial 4 finished with value: 0.6897407

[I 2023-11-24 02:32:42,682] Trial 39 finished with value: 0.6898786541643684 and parameters: {'activation': 'logistic', 'learning_rate': 0.0029632572044093887}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:33:10,839] Trial 40 finished with value: 0.6875344732487589 and parameters: {'activation': 'relu', 'learning_rate': 0.0022611544095083036}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:34:00,628] Trial 41 finished with value: 0.6920849420849421 and parameters: {'activation': 'logistic', 'learning_rate': 0.0036855149295595196}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:34:35,690] Trial 42 finished with value: 0.6883618312189741 and parameters: {'activation': 'logistic', 'learning_rate': 0.003464420183252045}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:35:07,656] Trial 43 finished with value: 0.6913954771097628 and parameters: {'activation': 'logistic', 'learning_rate': 0.004274518548967719}. Best 

[I 2023-11-24 02:54:19,659] Trial 78 finished with value: 0.6908439051296195 and parameters: {'activation': 'logistic', 'learning_rate': 0.0037474768793824744}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:54:52,526] Trial 79 finished with value: 0.6879481522338665 and parameters: {'activation': 'logistic', 'learning_rate': 0.0031500837820273232}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:55:24,027] Trial 80 finished with value: 0.6879481522338665 and parameters: {'activation': 'logistic', 'learning_rate': 0.00233612485235511}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:55:55,709] Trial 81 finished with value: 0.686569222283508 and parameters: {'activation': 'logistic', 'learning_rate': 0.0027952433785647206}. Best is trial 20 with value: 0.6934638720353006.
[I 2023-11-24 02:56:28,272] Trial 82 finished with value: 0.6889134031991175 and parameters: {'activation': 'logistic', 'learning_rate': 0.0037804129503424106}. Be

In [48]:
# 최적의 하이퍼파라미터 및 결과 출력
best_params = study.best_params
best_accuracy = study.best_value
print(f"Best Hyperparameters: {best_params}")
print(f"Best Accuracy: {best_accuracy}")

Best Hyperparameters: {'activation': 'logistic', 'learning_rate': 0.004903017211572883}
Best Accuracy: 0.6934638720353006


#### 3.6 Stacking( 조정된 최적의 파라미터를 사용)

decision tree

In [51]:
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators=133,
        max_depth=20,
        min_samples_split=17,
        min_samples_leaf=1,
        random_state=42)),
    ('ada', AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=4),
        n_estimators=175,
        learning_rate=0.15900320437580656,
        random_state=42)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(32,16,8),
                          max_iter=500,
                          activation='logistic',
                          learning_rate_init=0.004903017211572883,
                          solver='adam',
                          random_state=42))
]
classifier = StackingClassifier(estimators=base_models, 
                                final_estimator=DecisionTreeClassifier())
classifier.fit(X_train, y_train)
stack_guesses = classifier.predict(X_test)

In [52]:
print("Accuracy:", accuracy_score(y_test, stack_guesses))

Accuracy: 0.6169332597904027


logistic regression

In [17]:
base_models = [
    ('rf', RandomForestClassifier(
        n_estimators=133,
        max_depth=20,
        min_samples_split=17,
        min_samples_leaf=1,
        random_state=42)),
    ('ada', AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=4),
        n_estimators=175,
        learning_rate=0.15900320437580656,
        random_state=42)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(32,16,8),
                          max_iter=500,
                          activation='logistic',
                          learning_rate_init=0.004903017211572883,
                          solver='adam',
                          random_state=42))
]
classifier = StackingClassifier(estimators=base_models, 
                                final_estimator=LogisticRegression(max_iter=1000))
classifier.fit(X_train, y_train)
stack_guesses = classifier.predict(X_test)

In [18]:
print("Accuracy:", accuracy_score(y_test, stack_guesses))

Accuracy: 0.706150027578599
