In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [11]:
train = pd.read_csv('../datasets/unlisted_resampled_train_data.csv')
test = pd.read_csv('../datasets/unlisted_test_data.csv')

In [12]:
# train_life_cycle = train[['도입기', '성장기', '성숙기', '쇠퇴기']]
# test_life_cycle = test[['도입기', '성장기', '성숙기', '쇠퇴기']]

In [13]:
selected_features = ['CASH FLOW 대 부채비율', 'CASH FLOW 대 총자본비율', 'CASH FLOW 대 매출액비율', '차입금의존도', '순운전자본비율',
                     '자기자본구성비율', '경영자본순이익률', '총자본사업이익률', '총자본영업이익률', '금융비용부담률', 
                     '매출액증가율', '이윤분배율', '총자본회전률', '영업년수', 
                     '도입기', '성장기', '성숙기', '쇠퇴기']

In [14]:
x_train = train[selected_features]
x_test = test[selected_features]

y_train = train['부실판단']
y_test = test['부실판단']

In [15]:
x_test = x_test.fillna(test['영업년수'].median())

In [16]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

labels = x_train.columns

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

x_train = pd.DataFrame(data=x_train, columns = labels)
x_test = pd.DataFrame(data=x_test, columns = labels)

In [17]:
# 모델 생성
logit_model = LogisticRegression()

# Cross Validation
cv_accuracy = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(logit_model, x_train, y_train, cv=5, scoring='roc_auc')

print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 결과=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

# 모델 학습 및 평가
logit_model.fit(x_train, y_train)
y_pred = logit_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 스코어: {f1:.3f}')
print(f'ROC AUC 스코어: {roc_auc:.3f}')


CV_Accuracy_Scores: [0.70366066 0.75479372 0.79418605 0.74651163 0.6627907 ]
CV_Precision_Scores: [0.71498771 0.76224612 0.76409186 0.74036281 0.67632242]
CV_Recall_Scores: [0.67674419 0.74099884 0.85116279 0.75930233 0.6244186 ]
CV_F1_Scores: [0.6953405  0.75147232 0.80528053 0.74971297 0.64933495]
CV_ROC/AUC: [0.76260163 0.83252843 0.87766901 0.81450784 0.74245538]

CV_Accuracy_mean: 0.732
CV_Precision_mean: 0.732
CV_Recall_mean: 0.731
CV_F1_스코어_mean: 0.730
CV_ROC_AUC+스코어_mean: 0.806

Accuracy: 0.730
Precision: 0.615
Recall: 0.708
F1 스코어: 0.658
ROC AUC 스코어: 0.725


In [18]:
# 하이퍼파라미터 범위 지정
param_grid = {
    'C': [0.1, 0.5, 1.0, 2.0, 5.0],  # 규제 강도
    'penalty': ['l1', 'l2'],  # 규제 유형
    'solver': ['liblinear', 'saga']  # 최적화 알고리즘
}

model = LogisticRegression()
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, 
                                   scoring='accuracy', cv=5, verbose=1, random_state=42)

random_search.fit(x_train, y_train)

best_model = random_search.best_estimator_

y_pred = best_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Logistic 하이퍼파라미터 조정 평가 지표")
print("Best Parameters:", random_search.best_params_)
print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'ROC AUC Score: {roc_auc:.3f}')


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Logistic 하이퍼파라미터 조정 평가 지표
Best Parameters: {'solver': 'saga', 'penalty': 'l1', 'C': 0.5}
Accuracy: 0.730
Precision: 0.616
Recall: 0.707
F1 Score: 0.658
ROC AUC Score: 0.725


In [19]:
# Random Forest 모델 생성 및 학습
rf_model = RandomForestClassifier()

# Cross Validation
cv_accuracy = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(rf_model, x_train, y_train, cv=5, scoring='roc_auc')

print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 결과=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')


rf_model.fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_rf:.3f}')
print(f'Precision: {precision_rf:.3f}')
print(f'Recall: {recall_rf:.3f}')
print(f'F1 스코어: {f1_rf:.3f}')
print(f'ROC AUC 스코어: {roc_auc_rf:.3f}')

CV_Accuracy_Scores: [0.72167345 0.80650784 0.90348837 0.79302326 0.72034884]
CV_Precision_Scores: [0.71028037 0.77296727 0.88616071 0.78236607 0.69514768]
CV_Recall_Scores: [0.79069767 0.87224158 0.91976744 0.82674419 0.76162791]
CV_F1_Scores: [0.75013669 0.81391496 0.9061967  0.80539933 0.72615728]
CV_ROC/AUC: [0.81144694 0.89248845 0.9496924  0.88524202 0.79229786]

CV_Accuracy_mean: 0.789
CV_Precision_mean: 0.769
CV_Recall_mean: 0.834
CV_F1_스코어_mean: 0.800
CV_ROC_AUC+스코어_mean: 0.866

Accuracy: 0.830
Precision: 0.728
Recall: 0.858
F1 스코어: 0.788
ROC AUC 스코어: 0.836


In [20]:
random_search = {'n_estimators': [50, 100, 150],
                 'max_depth': [None, 5, 10, 15],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}

clf = RandomForestClassifier()
random = RandomizedSearchCV(estimator = clf, param_distributions = random_search, n_iter = 10, 
                               cv = 4, verbose= 1, random_state= 101, n_jobs = -1)
random.fit(x_train,y_train)
random_pf = random.best_estimator_.predict(x_test)

accuracy_rdrf = accuracy_score(y_test, random_pf)
precision_rdrf = precision_score(y_test, random_pf)
recall_rdrf = recall_score(y_test, random_pf)
f1_rdrf = f1_score(y_test, random_pf)
roc_auc_rdrf = roc_auc_score(y_test, random_pf)

print("Random Forest 모델 하이퍼파라미터 조정 평가 지표")
print(random.best_params_)
print(f'Accuracy: {accuracy_rdrf:.3f}')
print(f'Precision: {precision_rdrf:.3f}')
print(f'Recall: {recall_rdrf:.3f}')
print(f'F1 스코어: {f1_rdrf:.2f}')
print(f'ROC AUC 스코어: {roc_auc_rdrf:.2f}')

Fitting 4 folds for each of 10 candidates, totalling 40 fits
Random Forest 모델 하이퍼파라미터 조정 평가 지표
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None}
Accuracy: 0.828
Precision: 0.725
Recall: 0.861
F1 스코어: 0.79
ROC AUC 스코어: 0.84


In [21]:
# AdaBoost 모델 생성 및 학습
adaboost_model = AdaBoostClassifier()

# Cross Validation
cv_accuracy = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(adaboost_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

adaboost_model.fit(x_train, y_train)
y_pred_adaboost = adaboost_model.predict(x_test)

accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
precision_adaboost = precision_score(y_test, y_pred_adaboost)
recall_adaboost = recall_score(y_test, y_pred_adaboost)
f1_adaboost = f1_score(y_test, y_pred_adaboost)
roc_auc_adaboost = roc_auc_score(y_test, y_pred_adaboost)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_adaboost:.3f}')
print(f'Precision: {precision_adaboost:.3f}')
print(f'Recall: {recall_adaboost:.3f}')
print(f'F1 스코어: {f1_adaboost:.2f}')
print(f'ROC AUC 스코어: {roc_auc_adaboost:.2f}')

CV_Accuracy_Scores: [0.72051133 0.78094131 0.83255814 0.78081395 0.71976744]
CV_Precision_Scores: [0.71607754 0.79876543 0.85929648 0.77288136 0.7090708 ]
CV_Recall_Scores: [0.73023256 0.7514518  0.79534884 0.79534884 0.74534884]
CV_F1_Scores: [0.72308578 0.77438659 0.82608696 0.78395415 0.72675737]
CV_ROC/AUC: [0.80820368 0.87892864 0.91517645 0.87009194 0.80805368]

CV_Accuracy_mean: 0.767
CV_Precision_mean: 0.771
CV_Recall_mean: 0.764
CV_F1_스코어_mean: 0.767
CV_ROC_AUC+스코어_mean: 0.856

Accuracy: 0.775
Precision: 0.677
Recall: 0.740
F1 스코어: 0.71
ROC AUC 스코어: 0.77


In [22]:
adaboost_model = AdaBoostClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],  # 트리 개수
    'learning_rate': [0.01, 0.1, 1.0]  # 학습률
}

random_search = RandomizedSearchCV(estimator=adaboost_model, param_distributions=param_grid, n_iter=10, 
                                   scoring='accuracy', cv=5, verbose=1, random_state=42)

random_search.fit(x_train, y_train)

best_model = random_search.best_estimator_

y_pred_adaboost = best_model.predict(x_test)

# 평가 메트릭 계산
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
precision_adaboost = precision_score(y_test, y_pred_adaboost)
recall_adaboost = recall_score(y_test, y_pred_adaboost)
f1_adaboost = f1_score(y_test, y_pred_adaboost)
roc_auc_adaboost = roc_auc_score(y_test, y_pred_adaboost)

print("Best Parameters:", random_search.best_params_)
print("AdaBoost 모델 하이퍼파라미터 조정 평가 지표:")
print(f'Accuracy: {accuracy_adaboost:.3f}')
print(f'Precision: {precision_adaboost:.3f}')
print(f'Recall: {recall_adaboost:.3f}')
print(f'F1 Score: {f1_adaboost:.3f}')
print(f'ROC AUC Score: {roc_auc_adaboost:.3f}')

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'n_estimators': 200, 'learning_rate': 1.0}
AdaBoost 모델 하이퍼파라미터 조정 평가 지표:
Accuracy: 0.782
Precision: 0.683
Recall: 0.760
F1 Score: 0.719
ROC AUC Score: 0.777


In [23]:
# Bagging 모델 생성 및 학습
bagging_model = BaggingClassifier()

cv_accuracy = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(bagging_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

bagging_model.fit(x_train, y_train)
y_pred_bagging = bagging_model.predict(x_test)

accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging)
recall_bagging = recall_score(y_test, y_pred_bagging)
f1_bagging = f1_score(y_test, y_pred_bagging)
roc_auc_bagging = roc_auc_score(y_test, y_pred_bagging)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_bagging:.3f}')
print(f'Precision: {precision_bagging:.3f}')
print(f'Recall: {recall_bagging:.3f}')
print(f'F1 스코어: {f1_bagging:.3f}')
print(f'ROC AUC 스코어: {roc_auc_bagging:.3f}')

CV_Accuracy_Scores: [0.70366066 0.77396862 0.875      0.77093023 0.67906977]
CV_Precision_Scores: [0.72627737 0.77882353 0.87544484 0.78871549 0.706399  ]
CV_Recall_Scores: [0.6627907  0.7514518  0.87790698 0.7372093  0.64767442]
CV_F1_Scores: [0.69862193 0.76823529 0.87477954 0.75505351 0.65036675]
CV_ROC/AUC: [0.77934662 0.86537153 0.9317212  0.85897715 0.76421579]

CV_Accuracy_mean: 0.761
CV_Precision_mean: 0.775
CV_Recall_mean: 0.735
CV_F1_스코어_mean: 0.749
CV_ROC_AUC+스코어_mean: 0.840

Accuracy: 0.810
Precision: 0.724
Recall: 0.782
F1 스코어: 0.752
ROC AUC 스코어: 0.804


In [24]:
bagging_model = BaggingClassifier()

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0]
}

random_search = RandomizedSearchCV(estimator=bagging_model, param_distributions=param_grid, n_iter=10, 
                                   scoring='accuracy', cv=5, verbose=1, random_state=42)

random_search.fit(x_train, y_train)

# 최적의 모델 저장
best_model = random_search.best_estimator_

# 최적의 모델로 예측 수행
y_pred_bagging = best_model.predict(x_test)

# 평가 메트릭 계산
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging)
recall_bagging = recall_score(y_test, y_pred_bagging)
f1_bagging = f1_score(y_test, y_pred_bagging)
roc_auc_bagging = roc_auc_score(y_test, y_pred_bagging)

print("Best Parameters:", random_search.best_params_)
print("Bagging 모델 하이퍼파라미터 평가 지표:")
print(f'Accuracy: {accuracy_bagging:.3f}')
print(f'Precision: {precision_bagging:.3f}')
print(f'Recall: {recall_bagging:.3f}')
print(f'F1 Score: {f1_bagging:.3f}')
print(f'ROC AUC Score: {roc_auc_bagging:.3f}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'n_estimators': 100, 'max_samples': 1.0, 'max_features': 0.5}
Bagging 모델 하이퍼파라미터 평가 지표:
Accuracy: 0.829
Precision: 0.725
Recall: 0.861
F1 Score: 0.787
ROC AUC Score: 0.836


In [25]:
from sklearn.svm import SVC

# SVM 모델 생성 및 학습
svm_model = SVC(kernel='linear')

cv_accuracy = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

svm_model.fit(x_train, y_train)
y_pred_svm = svm_model.predict(x_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_svm:.3f}')
print(f'Precision: {precision_svm:.3f}')
print(f'Recall: {recall_svm:.3f}')
print(f'F1 스코어: {f1_svm:.3f}')
print(f'ROC AUC 스코어: {roc_auc_svm:.3f}')

CV_Accuracy_Scores: [0.69726903 0.75363161 0.79302326 0.75232558 0.66511628]
CV_Precision_Scores: [0.71374527 0.75318656 0.75766871 0.73898678 0.67617866]
CV_Recall_Scores: [0.65813953 0.75493612 0.86162791 0.78023256 0.63372093]
CV_F1_Scores: [0.68481549 0.75406032 0.80631121 0.75904977 0.6542617 ]
CV_ROC/AUC: [0.76011398 0.83340221 0.87891969 0.81416306 0.74007301]

CV_Accuracy_mean: 0.732
CV_Precision_mean: 0.728
CV_Recall_mean: 0.738
CV_F1_스코어_mean: 0.732
CV_ROC_AUC+스코어_mean: 0.805

Accuracy: 0.728
Precision: 0.613
Recall: 0.713
F1 스코어: 0.659
ROC AUC 스코어: 0.725


In [26]:
from sklearn.svm import SVC

# SVM 모델 생성 및 학습
svm_model = SVC(kernel='rbf')

cv_accuracy = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(svm_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

svm_model.fit(x_train, y_train)
y_pred_svm = svm_model.predict(x_test)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_svm:.3f}')
print(f'Precision: {precision_svm:.3f}')
print(f'Recall: {recall_svm:.3f}')
print(f'F1 스코어: {f1_svm:.3f}')
print(f'ROC AUC 스코어: {roc_auc_svm:.3f}')


CV_Accuracy_Scores: [0.72515979 0.78152237 0.86453488 0.77674419 0.70523256]
CV_Precision_Scores: [0.71334068 0.77588168 0.829653   0.76211454 0.70079636]
CV_Recall_Scores: [0.75232558 0.79210221 0.91744186 0.80465116 0.71627907]
CV_F1_Scores: [0.73231466 0.78390805 0.8713418  0.78280543 0.70845313]
CV_ROC/AUC: [0.80495638 0.86945953 0.93806517 0.85980936 0.78237966]

CV_Accuracy_mean: 0.771
CV_Precision_mean: 0.756
CV_Recall_mean: 0.797
CV_F1_스코어_mean: 0.776
CV_ROC_AUC+스코어_mean: 0.851

Accuracy: 0.774
Precision: 0.664
Recall: 0.784
F1 스코어: 0.719
ROC AUC 스코어: 0.776


In [27]:
svm_model = SVC()

param_grid = {
    'C': [0.1, 1, 10, 100],  # 규제 매개변수
    'gamma': [0.1, 0.01, 0.001, 0.0001],  # 커널 계수
    'kernel': ['linear', 'rbf', 'poly']  # 커널 타입
}

random_search = RandomizedSearchCV(estimator=svm_model, param_distributions=param_grid, n_iter=10, 
                                   scoring='accuracy', cv=5, verbose=1, random_state=42)

random_search.fit(x_train, y_train)

print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

y_pred_svm = best_model.predict(x_test)

# 평가 메트릭 계산
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
roc_auc_svm = roc_auc_score(y_test, y_pred_svm)

# 평가 메트릭 출력
print("SVM 모델 평가 지표:")
print(f'Accuracy: {accuracy_svm:.3f}')
print(f'Precision: {precision_svm:.3f}')
print(f'Recall: {recall_svm:.3f}')
print(f'F1 Score: {f1_svm:.3f}')
print(f'ROC AUC Score: {roc_auc_svm:.3f}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 0.01, 'C': 100}
SVM 모델 평가 지표:
Accuracy: 0.784
Precision: 0.680
Recall: 0.780
F1 Score: 0.726
ROC AUC Score: 0.783


In [28]:
from lightgbm import LGBMClassifier

# LGBM 모델 생성 및 학습
lgbm_model = LGBMClassifier()

cv_accuracy = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='accuracy')
cv_precision = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='precision')
cv_recall = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='recall')
cv_f1 = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='f1')
cv_roc_auc = cross_val_score(lgbm_model, x_train, y_train, cv=5, scoring='roc_auc')

print('=======교차검증 결과=======')
print("CV_Accuracy_Scores:", cv_accuracy)
print("CV_Precision_Scores:", cv_precision)
print("CV_Recall_Scores:", cv_recall)
print("CV_F1_Scores:", cv_f1)
print("CV_ROC/AUC:", cv_roc_auc)

print('\n=======교차검증 평균값=======')
print(f'CV_Accuracy_mean: {cv_accuracy.mean():.3f}')
print(f'CV_Precision_mean: {cv_precision.mean():.3f}')
print(f'CV_Recall_mean: {cv_recall.mean():.3f}')
print(f'CV_F1_스코어_mean: {cv_f1.mean():.3f}')
print(f'CV_ROC_AUC+스코어_mean: {cv_roc_auc.mean():.3f}')

lgbm_model.fit(x_train, y_train)
y_pred_lgbm = lgbm_model.predict(x_test)

accuracy_lgbm = accuracy_score(y_test, y_pred_lgbm)
precision_lgbm = precision_score(y_test, y_pred_lgbm)
recall_lgbm = recall_score(y_test, y_pred_lgbm)
f1_lgbm = f1_score(y_test, y_pred_lgbm)
roc_auc_lgbm = roc_auc_score(y_test, y_pred_lgbm)

print(f'\n=======Test 데이터 평가======')
print(f'Accuracy: {accuracy_lgbm:.3f}')
print(f'Precision: {precision_lgbm:.3f}')
print(f'Recall: {recall_lgbm:.3f}')
print(f'F1 스코어: {f1_lgbm:.3f}')
print(f'ROC AUC 스코어: {roc_auc_lgbm:.3f}')


[LightGBM] [Info] Number of positive: 3441, number of negative: 3440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3385
[LightGBM] [Info] Number of data points in the train set: 6881, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500073 -> initscore=0.000291
[LightGBM] [Info] Start training from score 0.000291
[LightGBM] [Info] Number of positive: 3440, number of negative: 3441
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3384
[LightGBM] [Info] Number of data points in the train set: 6881, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499927 -> initscore=-0.000291
[LightGBM]

In [29]:
param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.001],
        'n_estimators': [50, 100, 150, 300],
        'max_depth': [3, 5, 7],
        'num_leaves': [15, 31, 63, 127],
        'min_child_samples': [5, 10, 20]
    }

lgbm = LGBMClassifier()
random = RandomizedSearchCV(lgbm, param_grid, cv = 5, n_jobs = -1)
random.fit(x_train,y_train)
random_pf = random.best_estimator_.predict(x_test)

accuracy_rdrf = accuracy_score(y_test, random_pf)
precision_rdrf = precision_score(y_test, random_pf)
recall_rdrf = recall_score(y_test, random_pf)
f1_rdrf = f1_score(y_test, random_pf)
roc_auc_rdrf = roc_auc_score(y_test, random_pf)

print("LGBM 모델 하이퍼파라미터 조정 평가 지표")
print(random.best_params_)
print(f'Accuracy: {accuracy_rdrf:.3f}')
print(f'Precision: {precision_rdrf:.3f}')
print(f'Recall: {recall_rdrf:.3f}')
print(f'F1 스코어: {f1_rdrf:.2f}')
print(f'ROC AUC 스코어: {roc_auc_rdrf:.2f}')

[LightGBM] [Info] Number of positive: 3441, number of negative: 3441
[LightGBM] [Info] Number of positive: 3440, number of negative: 3441
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003664 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3384
[LightGBM] [Info] Number of data points in the train set: 6882, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007985 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3384
[LightGBM] [Info] Number of data points in the train set: 6881, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499927 -> initscore=-0.000291
[LightGBM] [Info] Start training from score -0.000291
[LightGBM] [Info] Number of positive: 3441, number of negative: 3440
[Lig

#### 딥러닝

In [30]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# import tensorflow as tf

In [31]:
# model = Sequential()
# model.add(Dense(256, input_dim = 12, activation = 'relu'))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation = 'relu'))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation = 'sigmoid'))

In [32]:
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# history = model.fit(x_train, y_train, epochs=200, batch_size=5)

# print("\n Accuracy: %.4f" % (model.evaluate(x_test, y_test)[1]))

In [33]:
# # 모델을 사용하여 테스트 데이터에 대한 예측 수행
# y_pred = model.predict(x_test)
# y_pred = binarize(y_pred, threshold=0.5)  # 예측값을 0.5 임계값을 기준으로 이진 분류로 변환

# # 정확도(accuracy) 계산
# accuracy = accuracy_score(y_test, y_pred)
# print(f"accuracy: {accuracy:.4f}")

# # F1 점수(f1 score) 계산
# f1 = f1_score(y_test, y_pred)
# print(f"F1-Score: {f1:.4f}")

# # 재현율(recall) 계산
# recall = recall_score(y_test, y_pred)
# print(f"recall: {recall:.4f}")

# # 정밀도(precision) 계산
# precision = precision_score(y_test, y_pred)
# print(f"precision: {precision:.4f}")