In [None]:
from contents_DB import connect_mongodb, read_from_mongodb

# 데이터를 불러오기 위해 MongoDB 연결
collection = connect_mongodb('post','all_post')

In [None]:
# 저장된 데이터 불러오기
ad_df = read_from_mongodb(collection)

In [None]:
import re

# 댓글 수 세기
# 리스트로써 댓글 수를 셀 때, 문장에 ',' 가 있는 경우 여러개로 인식하는 것을 방지하는 함수
def count_comments(comment):
    # 공백을 모두 제거한 후, 댓글이 없는 경우 0 반환
    if comment.strip() == '':
        return 0

    # 쉼표(,)를 기준으로 나눌 경우 처리 (쉼표 사이에 공백 제거)
    return len([c for c in re.split(r',', comment) if c.strip() != ''])

In [39]:
from scipy.stats import ttest_ind

ad_df['content_length'] = ad_df['content'].apply(len)
ad_df['hashtag_count'] = ad_df['content'].apply(lambda x: x.count('#'))
ad_df['comment_count'] = ad_df['comments'].apply(count_comments)

# t-test

features = ['content_length', 'hashtag_count', 'comment_count', 'likes']
t_test_results = {}

for feature in features:
    group_ad = ad_df[ad_df['is_ad']==1][feature]
    group_non_ad = ad_df[ad_df['is_ad']==0][feature]
    t_stat, p_value = ttest_ind(group_ad, group_non_ad, equal_var=False)
    t_test_results[feature] = {'t-statistic': t_stat, 'p-value': p_value}

for feature, result in t_test_results.items():
    print(f"{feature}: t-statistic = {result['t-statistic']:.3f}, p-value = {result['p-value']:.3f}")


content_length: t-statistic = 14.541, p-value = 0.000
hashtag_count: t-statistic = 8.149, p-value = 0.000
comment_count: t-statistic = 4.417, p-value = 0.000
likes: t-statistic = -1.300, p-value = 0.195


In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

X = ad_df[['content_length', 'hashtag_count', 'comment_count']]
y = ad_df['is_ad']

# 데이터 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 모델 평가
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89        58
           1       0.85      0.61      0.71        28

    accuracy                           0.84        86
   macro avg       0.84      0.78      0.80        86
weighted avg       0.84      0.84      0.83        86



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


X = ad_df[['content_length', 'hashtag_count', 'comment_count']]
y = ad_df['is_ad']

# 데이터 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 비광고글과 광고글의 개수 간 불균형 해소를 위해 SMOTE 사용용
# SMOTE 설정 - 소수 클래스의 샘플을 다수 클래스 샘플의 50%로 설정
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# 증강 확인
print(f"Before SMOTE: {y_train.value_counts()}\n")
print(f"After SMOTE: {y_train_res.value_counts()}\n")

# 모델 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train_res, y_train_res)

# 모델 평가
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Before SMOTE: is_ad
0    274
1     70
Name: count, dtype: int64

After SMOTE: is_ad
0    274
1    137
Name: count, dtype: int64

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        58
           1       0.83      0.71      0.77        28

    accuracy                           0.86        86
   macro avg       0.85      0.82      0.83        86
weighted avg       0.86      0.86      0.86        86



In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

In [73]:
# XGBoost Model

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# 기본 XGBoost 모델 정의
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# 하이퍼파라미터 후보 설정
xgb_params = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1, 5]
}

xgb_grid_search = GridSearchCV(xgb_model, xgb_params, cv=3, scoring='accuracy', n_jobs=-1)
xgb_grid_search.fit(X_train_res, y_train_res)

# 최적 하이퍼파라미터 적용하여 새로운 XGBoost 모델 학습
best_xgb_model = xgb.XGBClassifier(**xgb_grid_search.best_params_, random_state=42,
                                   use_label_encoder=False, eval_metric='logloss')

best_xgb_model.fit(X_train_res, y_train_res)

# 최적 모델로 테스트 데이터 예측
xgb_pred = best_xgb_model.predict(X_test)

# 모델 평가
print("XGBoost Best Parameters:", xgb_grid_search.best_params_)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("XGBoost Precision:", precision_score(y_test, xgb_pred))
print("XGBoost Recall:", recall_score(y_test, xgb_pred))
print("XGBoost F1-score:", f1_score(y_test, xgb_pred))
print("XGBoost AUC:", roc_auc_score(y_test, xgb_pred))

XGBoost Best Parameters: {'colsample_bytree': 0.8, 'gamma': 5, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 1.0}
XGBoost Accuracy: 0.8255813953488372
XGBoost Precision: 0.7407407407407407
XGBoost Recall: 0.7142857142857143
XGBoost F1-score: 0.7272727272727273
XGBoost AUC: 0.7967980295566504


In [108]:
# LIGHTGBM Model

import lightgbm as lgb
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# 데이터 전처리 (StandardScaler 적용)
scaler = StandardScaler()
X_train_res = scaler.fit_transform(X_train_res)
X_test = scaler.transform(X_test)

# LightGBM 기본 모델 설정
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced', verbosity=-1)

# 하이퍼파라미터 후보 설정
lgb_params = {
    'num_leaves': [31, 51],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'max_depth': [6, 10],
    'min_child_samples': [5, 10, 20],
    'min_gain_to_split': [0.0],
    'min_data_in_leaf': [5, 10, 20]
}

# GridSearchCV 수행 (3-Fold CV 사용)
grid_search = GridSearchCV(lgb_model, lgb_params, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_res, y_train_res)

# 최적의 하이퍼파라미터를 사용하여 최종 모델 생성
best_lgb_model = lgb.LGBMClassifier(**grid_search.best_params_, random_state=42,
                                    class_weight='balanced', verbosity=-1)

best_lgb_model.fit(X_train_res, y_train_res)

lightgbm_pred = best_lgb_model.predict(X_test)

# 모델 평가
print("LIGHTGBM Best Parameters:", xgb_grid_search.best_params_)
print("LIGHTGBM Accuracy:", accuracy_score(y_test, lightgbm_pred))
print("LIGHTGBM Precision:", precision_score(y_test, lightgbm_pred))
print("LIGHTGBM Recall:", recall_score(y_test, lightgbm_pred))
print("LIGHTGBM F1-score:", f1_score(y_test, lightgbm_pred))
print("LIGHTGBM AUC:", roc_auc_score(y_test, lightgbm_pred))


Fitting 3 folds for each of 576 candidates, totalling 1728 fits
LIGHTGBM Best Parameters: {'colsample_bytree': 0.8, 'gamma': 5, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 1.0}
LIGHTGBM Accuracy: 0.7906976744186046
LIGHTGBM Precision: 0.6785714285714286
LIGHTGBM Recall: 0.6785714285714286
LIGHTGBM F1-score: 0.6785714285714286
LIGHTGBM AUC: 0.7616995073891626


In [102]:
import optuna
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score

# Optuna objective function
def objective(trial):
    # 하이퍼파라미터 후보 설정
    n_d = trial.suggest_categorical('n_d', [8, 16, 24])
    n_a = trial.suggest_categorical('n_a', [8, 16, 24])
    n_steps = trial.suggest_categorical('n_steps', [3, 5])
    gamma = trial.suggest_uniform('gamma', 1.0, 1.5)
    lambda_sparse = trial.suggest_loguniform('lambda_sparse', 0.001, 0.01)
    momentum = trial.suggest_uniform('momentum', 0.02, 0.1)

    # TabNet 모델 초기화
    model = TabNetClassifier(
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        momentum=momentum, 
        seed=42
    )

    # 모델 학습
    model.fit(
        X_train_res, y_train_res,
        eval_set=[(X_test, y_test)],
        eval_metric=['accuracy'],
        patience=20,  # 20번 연속 개선되지 않으면 학습 중단
        max_epochs=100,  # 최대 100 Epoch까지 학습
        batch_size=256,
        virtual_batch_size=128,
    )

    # 테스트 데이터 예측
    tabnet_pred = model.predict(X_test)

    # 성능 평가
    accuracy = accuracy_score(y_test, tabnet_pred)
    
    return accuracy

# Optuna study 객체 생성
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # 10번 시도

[I 2025-02-12 16:38:27,825] A new study created in memory with name: no-name-8e0af56d-439f-42c2-97b5-82d58dbf9eda


epoch 0  | loss: 0.91152 | val_0_accuracy: 0.37209 |  0:00:00s
epoch 1  | loss: 0.56589 | val_0_accuracy: 0.44186 |  0:00:00s
epoch 2  | loss: 0.61093 | val_0_accuracy: 0.44186 |  0:00:00s
epoch 3  | loss: 0.49532 | val_0_accuracy: 0.59302 |  0:00:00s
epoch 4  | loss: 0.40416 | val_0_accuracy: 0.66279 |  0:00:00s
epoch 5  | loss: 0.48587 | val_0_accuracy: 0.67442 |  0:00:00s
epoch 6  | loss: 0.3888  | val_0_accuracy: 0.66279 |  0:00:00s
epoch 7  | loss: 0.40036 | val_0_accuracy: 0.65116 |  0:00:01s
epoch 8  | loss: 0.32514 | val_0_accuracy: 0.67442 |  0:00:01s
epoch 9  | loss: 0.35387 | val_0_accuracy: 0.67442 |  0:00:01s
epoch 10 | loss: 0.30334 | val_0_accuracy: 0.65116 |  0:00:01s
epoch 11 | loss: 0.38095 | val_0_accuracy: 0.72093 |  0:00:01s
epoch 12 | loss: 0.3797  | val_0_accuracy: 0.73256 |  0:00:01s
epoch 13 | loss: 0.36818 | val_0_accuracy: 0.77907 |  0:00:01s
epoch 14 | loss: 0.37022 | val_0_accuracy: 0.75581 |  0:00:01s
epoch 15 | loss: 0.34929 | val_0_accuracy: 0.75581 |  0

[I 2025-02-12 16:38:31,237] Trial 0 finished with value: 0.7790697674418605 and parameters: {'n_d': 16, 'n_a': 16, 'n_steps': 5, 'gamma': 1.3514771870009774, 'lambda_sparse': 0.0018491142859658508, 'momentum': 0.05951441147969204}. Best is trial 0 with value: 0.7790697674418605.


epoch 32 | loss: 0.28811 | val_0_accuracy: 0.51163 |  0:00:03s
epoch 33 | loss: 0.25554 | val_0_accuracy: 0.56977 |  0:00:03s

Early stopping occurred at epoch 33 with best_epoch = 13 and best_val_0_accuracy = 0.77907
epoch 0  | loss: 1.21019 | val_0_accuracy: 0.72093 |  0:00:00s
epoch 1  | loss: 0.53413 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 2  | loss: 0.57036 | val_0_accuracy: 0.80233 |  0:00:00s
epoch 3  | loss: 0.3982  | val_0_accuracy: 0.80233 |  0:00:00s
epoch 4  | loss: 0.38065 | val_0_accuracy: 0.82558 |  0:00:00s
epoch 5  | loss: 0.37419 | val_0_accuracy: 0.83721 |  0:00:01s
epoch 6  | loss: 0.40084 | val_0_accuracy: 0.83721 |  0:00:01s
epoch 7  | loss: 0.37376 | val_0_accuracy: 0.77907 |  0:00:01s
epoch 8  | loss: 0.35038 | val_0_accuracy: 0.76744 |  0:00:01s
epoch 9  | loss: 0.36186 | val_0_accuracy: 0.73256 |  0:00:01s
epoch 10 | loss: 0.32323 | val_0_accuracy: 0.69767 |  0:00:01s
epoch 11 | loss: 0.30264 | val_0_accuracy: 0.68605 |  0:00:01s
epoch 12 | loss: 0.31676 |

[I 2025-02-12 16:38:34,187] Trial 1 finished with value: 0.8372093023255814 and parameters: {'n_d': 24, 'n_a': 24, 'n_steps': 5, 'gamma': 1.1712299634224932, 'lambda_sparse': 0.0038941122943742935, 'momentum': 0.042161166573174126}. Best is trial 1 with value: 0.8372093023255814.


epoch 25 | loss: 0.26685 | val_0_accuracy: 0.4186  |  0:00:02s

Early stopping occurred at epoch 25 with best_epoch = 5 and best_val_0_accuracy = 0.83721
epoch 0  | loss: 2.03203 | val_0_accuracy: 0.31395 |  0:00:00s
epoch 1  | loss: 0.95811 | val_0_accuracy: 0.32558 |  0:00:00s
epoch 2  | loss: 0.47937 | val_0_accuracy: 0.5     |  0:00:00s
epoch 3  | loss: 0.43622 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 4  | loss: 0.434   | val_0_accuracy: 0.81395 |  0:00:00s
epoch 5  | loss: 0.39594 | val_0_accuracy: 0.82558 |  0:00:00s
epoch 6  | loss: 0.37741 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 7  | loss: 0.38596 | val_0_accuracy: 0.76744 |  0:00:00s
epoch 8  | loss: 0.33961 | val_0_accuracy: 0.62791 |  0:00:00s
epoch 9  | loss: 0.37613 | val_0_accuracy: 0.37209 |  0:00:00s
epoch 10 | loss: 0.35996 | val_0_accuracy: 0.31395 |  0:00:00s
epoch 11 | loss: 0.36529 | val_0_accuracy: 0.31395 |  0:00:01s
epoch 12 | loss: 0.35949 | val_0_accuracy: 0.31395 |  0:00:01s
epoch 13 | loss: 0.35701 | 

[I 2025-02-12 16:38:36,547] Trial 2 finished with value: 0.8255813953488372 and parameters: {'n_d': 8, 'n_a': 16, 'n_steps': 5, 'gamma': 1.135190351291742, 'lambda_sparse': 0.005639492964391199, 'momentum': 0.09070832062218574}. Best is trial 1 with value: 0.8372093023255814.


epoch 0  | loss: 1.15609 | val_0_accuracy: 0.77907 |  0:00:00s
epoch 1  | loss: 0.67495 | val_0_accuracy: 0.76744 |  0:00:00s
epoch 2  | loss: 0.60054 | val_0_accuracy: 0.82558 |  0:00:00s
epoch 3  | loss: 0.47714 | val_0_accuracy: 0.84884 |  0:00:00s
epoch 4  | loss: 0.41856 | val_0_accuracy: 0.83721 |  0:00:00s
epoch 5  | loss: 0.31412 | val_0_accuracy: 0.82558 |  0:00:00s
epoch 6  | loss: 0.35113 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 7  | loss: 0.33389 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 8  | loss: 0.37383 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 9  | loss: 0.36402 | val_0_accuracy: 0.72093 |  0:00:00s
epoch 10 | loss: 0.34533 | val_0_accuracy: 0.54651 |  0:00:00s
epoch 11 | loss: 0.29875 | val_0_accuracy: 0.38372 |  0:00:00s
epoch 12 | loss: 0.29619 | val_0_accuracy: 0.37209 |  0:00:00s
epoch 13 | loss: 0.32742 | val_0_accuracy: 0.33721 |  0:00:00s
epoch 14 | loss: 0.30943 | val_0_accuracy: 0.37209 |  0:00:00s
epoch 15 | loss: 0.34746 | val_0_accuracy: 0.34884 |  0

[I 2025-02-12 16:38:38,042] Trial 3 finished with value: 0.8488372093023255 and parameters: {'n_d': 24, 'n_a': 16, 'n_steps': 3, 'gamma': 1.1831014346989828, 'lambda_sparse': 0.001959645313797297, 'momentum': 0.0873740845269653}. Best is trial 3 with value: 0.8488372093023255.


epoch 20 | loss: 0.30373 | val_0_accuracy: 0.34884 |  0:00:01s
epoch 21 | loss: 0.29665 | val_0_accuracy: 0.38372 |  0:00:01s
epoch 22 | loss: 0.30795 | val_0_accuracy: 0.4186  |  0:00:01s
epoch 23 | loss: 0.31945 | val_0_accuracy: 0.45349 |  0:00:01s

Early stopping occurred at epoch 23 with best_epoch = 3 and best_val_0_accuracy = 0.84884
epoch 0  | loss: 1.21293 | val_0_accuracy: 0.47674 |  0:00:00s
epoch 1  | loss: 0.59155 | val_0_accuracy: 0.65116 |  0:00:00s
epoch 2  | loss: 0.53276 | val_0_accuracy: 0.38372 |  0:00:00s
epoch 3  | loss: 0.48937 | val_0_accuracy: 0.36047 |  0:00:00s
epoch 4  | loss: 0.50712 | val_0_accuracy: 0.4186  |  0:00:00s
epoch 5  | loss: 0.38061 | val_0_accuracy: 0.44186 |  0:00:00s
epoch 6  | loss: 0.4165  | val_0_accuracy: 0.73256 |  0:00:00s
epoch 7  | loss: 0.47672 | val_0_accuracy: 0.69767 |  0:00:00s
epoch 8  | loss: 0.37639 | val_0_accuracy: 0.59302 |  0:00:00s
epoch 9  | loss: 0.33004 | val_0_accuracy: 0.44186 |  0:00:00s
epoch 10 | loss: 0.3341  | 

[I 2025-02-12 16:38:40,485] Trial 4 finished with value: 0.7325581395348837 and parameters: {'n_d': 24, 'n_a': 24, 'n_steps': 5, 'gamma': 1.156701555368292, 'lambda_sparse': 0.0054672273381407175, 'momentum': 0.07352787479372926}. Best is trial 3 with value: 0.8488372093023255.


epoch 26 | loss: 0.35191 | val_0_accuracy: 0.48837 |  0:00:02s

Early stopping occurred at epoch 26 with best_epoch = 6 and best_val_0_accuracy = 0.73256
epoch 0  | loss: 0.76831 | val_0_accuracy: 0.76744 |  0:00:00s
epoch 1  | loss: 0.5039  | val_0_accuracy: 0.76744 |  0:00:00s
epoch 2  | loss: 0.45577 | val_0_accuracy: 0.7907  |  0:00:00s
epoch 3  | loss: 0.50781 | val_0_accuracy: 0.80233 |  0:00:00s
epoch 4  | loss: 0.49031 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 5  | loss: 0.36654 | val_0_accuracy: 0.80233 |  0:00:00s
epoch 6  | loss: 0.4114  | val_0_accuracy: 0.7907  |  0:00:00s
epoch 7  | loss: 0.44451 | val_0_accuracy: 0.77907 |  0:00:00s
epoch 8  | loss: 0.40861 | val_0_accuracy: 0.80233 |  0:00:00s
epoch 9  | loss: 0.40455 | val_0_accuracy: 0.7907  |  0:00:00s
epoch 10 | loss: 0.37477 | val_0_accuracy: 0.77907 |  0:00:00s
epoch 11 | loss: 0.34177 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 12 | loss: 0.39579 | val_0_accuracy: 0.72093 |  0:00:00s
epoch 13 | loss: 0.35933 | 

[I 2025-02-12 16:38:42,057] Trial 5 finished with value: 0.813953488372093 and parameters: {'n_d': 24, 'n_a': 8, 'n_steps': 3, 'gamma': 1.340932147575804, 'lambda_sparse': 0.008098675486849031, 'momentum': 0.04311104460524679}. Best is trial 3 with value: 0.8488372093023255.


epoch 21 | loss: 0.29963 | val_0_accuracy: 0.62791 |  0:00:01s
epoch 22 | loss: 0.33229 | val_0_accuracy: 0.60465 |  0:00:01s
epoch 23 | loss: 0.31547 | val_0_accuracy: 0.60465 |  0:00:01s
epoch 24 | loss: 0.30637 | val_0_accuracy: 0.60465 |  0:00:01s

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_accuracy = 0.81395
epoch 0  | loss: 1.18892 | val_0_accuracy: 0.77907 |  0:00:00s
epoch 1  | loss: 0.67979 | val_0_accuracy: 0.51163 |  0:00:00s
epoch 2  | loss: 0.62359 | val_0_accuracy: 0.72093 |  0:00:00s
epoch 3  | loss: 0.48036 | val_0_accuracy: 0.7907  |  0:00:00s
epoch 4  | loss: 0.41781 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 5  | loss: 0.34401 | val_0_accuracy: 0.82558 |  0:00:00s
epoch 6  | loss: 0.34654 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 7  | loss: 0.33209 | val_0_accuracy: 0.76744 |  0:00:00s
epoch 8  | loss: 0.37695 | val_0_accuracy: 0.76744 |  0:00:00s
epoch 9  | loss: 0.39011 | val_0_accuracy: 0.80233 |  0:00:00s
epoch 10 | loss: 0.35234 | 

[I 2025-02-12 16:38:44,375] Trial 6 finished with value: 0.8372093023255814 and parameters: {'n_d': 24, 'n_a': 16, 'n_steps': 3, 'gamma': 1.2476518498402414, 'lambda_sparse': 0.005330463301655585, 'momentum': 0.03249130581809876}. Best is trial 3 with value: 0.8488372093023255.


epoch 33 | loss: 0.27274 | val_0_accuracy: 0.4186  |  0:00:02s
epoch 34 | loss: 0.32271 | val_0_accuracy: 0.44186 |  0:00:02s

Early stopping occurred at epoch 34 with best_epoch = 14 and best_val_0_accuracy = 0.83721
epoch 0  | loss: 0.93487 | val_0_accuracy: 0.66279 |  0:00:00s
epoch 1  | loss: 0.57196 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 2  | loss: 0.45044 | val_0_accuracy: 0.72093 |  0:00:00s
epoch 3  | loss: 0.44883 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 4  | loss: 0.45112 | val_0_accuracy: 0.73256 |  0:00:00s
epoch 5  | loss: 0.41936 | val_0_accuracy: 0.73256 |  0:00:00s
epoch 6  | loss: 0.40385 | val_0_accuracy: 0.77907 |  0:00:00s
epoch 7  | loss: 0.40315 | val_0_accuracy: 0.7907  |  0:00:00s
epoch 8  | loss: 0.33721 | val_0_accuracy: 0.7907  |  0:00:00s
epoch 9  | loss: 0.37846 | val_0_accuracy: 0.77907 |  0:00:00s
epoch 10 | loss: 0.40111 | val_0_accuracy: 0.75581 |  0:00:00s
epoch 11 | loss: 0.36314 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 12 | loss: 0.33153 |

[I 2025-02-12 16:38:46,688] Trial 7 finished with value: 0.813953488372093 and parameters: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.4688211986940485, 'lambda_sparse': 0.0036000872278773486, 'momentum': 0.07736826343500075}. Best is trial 3 with value: 0.8488372093023255.


epoch 38 | loss: 0.33649 | val_0_accuracy: 0.7907  |  0:00:02s

Early stopping occurred at epoch 38 with best_epoch = 18 and best_val_0_accuracy = 0.81395
epoch 0  | loss: 2.51865 | val_0_accuracy: 0.24419 |  0:00:00s
epoch 1  | loss: 0.79266 | val_0_accuracy: 0.32558 |  0:00:00s
epoch 2  | loss: 0.50008 | val_0_accuracy: 0.38372 |  0:00:00s
epoch 3  | loss: 0.54326 | val_0_accuracy: 0.68605 |  0:00:00s
epoch 4  | loss: 0.29576 | val_0_accuracy: 0.75581 |  0:00:00s
epoch 5  | loss: 0.42155 | val_0_accuracy: 0.75581 |  0:00:00s
epoch 6  | loss: 0.34538 | val_0_accuracy: 0.63953 |  0:00:00s
epoch 7  | loss: 0.34154 | val_0_accuracy: 0.44186 |  0:00:00s
epoch 8  | loss: 0.34578 | val_0_accuracy: 0.4186  |  0:00:00s
epoch 9  | loss: 0.30904 | val_0_accuracy: 0.4186  |  0:00:00s
epoch 10 | loss: 0.30101 | val_0_accuracy: 0.44186 |  0:00:01s
epoch 11 | loss: 0.31561 | val_0_accuracy: 0.46512 |  0:00:01s
epoch 12 | loss: 0.30935 | val_0_accuracy: 0.47674 |  0:00:01s
epoch 13 | loss: 0.33496 |

[I 2025-02-12 16:38:49,234] Trial 8 finished with value: 0.7558139534883721 and parameters: {'n_d': 16, 'n_a': 24, 'n_steps': 5, 'gamma': 1.064882191100539, 'lambda_sparse': 0.0019724357823765496, 'momentum': 0.07632162633799544}. Best is trial 3 with value: 0.8488372093023255.


epoch 22 | loss: 0.31674 | val_0_accuracy: 0.5     |  0:00:02s
epoch 23 | loss: 0.29061 | val_0_accuracy: 0.53488 |  0:00:02s
epoch 24 | loss: 0.29864 | val_0_accuracy: 0.53488 |  0:00:02s

Early stopping occurred at epoch 24 with best_epoch = 4 and best_val_0_accuracy = 0.75581
epoch 0  | loss: 1.32595 | val_0_accuracy: 0.32558 |  0:00:00s
epoch 1  | loss: 0.53668 | val_0_accuracy: 0.32558 |  0:00:00s
epoch 2  | loss: 0.5403  | val_0_accuracy: 0.32558 |  0:00:00s
epoch 3  | loss: 0.44371 | val_0_accuracy: 0.32558 |  0:00:00s
epoch 4  | loss: 0.46017 | val_0_accuracy: 0.4186  |  0:00:00s
epoch 5  | loss: 0.43173 | val_0_accuracy: 0.46512 |  0:00:00s
epoch 6  | loss: 0.38093 | val_0_accuracy: 0.56977 |  0:00:00s
epoch 7  | loss: 0.3672  | val_0_accuracy: 0.56977 |  0:00:00s
epoch 8  | loss: 0.40009 | val_0_accuracy: 0.5     |  0:00:00s
epoch 9  | loss: 0.38376 | val_0_accuracy: 0.4186  |  0:00:00s
epoch 10 | loss: 0.34546 | val_0_accuracy: 0.4186  |  0:00:00s
epoch 11 | loss: 0.34966 | 

[I 2025-02-12 16:38:53,406] Trial 9 finished with value: 0.7674418604651163 and parameters: {'n_d': 24, 'n_a': 16, 'n_steps': 5, 'gamma': 1.443152035276836, 'lambda_sparse': 0.005202952015391424, 'momentum': 0.029432828005191}. Best is trial 3 with value: 0.8488372093023255.


epoch 44 | loss: 0.28644 | val_0_accuracy: 0.72093 |  0:00:03s
epoch 45 | loss: 0.27177 | val_0_accuracy: 0.72093 |  0:00:04s

Early stopping occurred at epoch 45 with best_epoch = 25 and best_val_0_accuracy = 0.76744


In [107]:
# 최적의 하이퍼파라미터 출력
print("Best hyperparameters: ", study.best_params)

# 최적 파라미터로 모델 학습
best_tabnet_model = TabNetClassifier(**study.best_params)

best_tabnet_model.fit(
        X_train_res, y_train_res,
        eval_set=[(X_test, y_test)],
        eval_metric=['accuracy'],
        patience=20,  # 20번 연속 개선되지 않으면 학습 중단
        max_epochs=100,  # 최대 100 Epoch까지 학습
        batch_size=256,
        virtual_batch_size=128,
    )

# TabNet 모델 평가
print("TabNet Best Parameters:", grid_search.best_params_)
print("TabNet Accuracy:", accuracy_score(y_test, tabnet_pred))
print("TabNet Precision:", precision_score(y_test, tabnet_pred))
print("TabNet Recall:", recall_score(y_test, tabnet_pred))
print("TabNet F1-score:", f1_score(y_test, tabnet_pred))
print("TabNet AUC:", roc_auc_score(y_test, tabnet_pred))


Best hyperparameters:  {'n_d': 24, 'n_a': 16, 'n_steps': 3, 'gamma': 1.1831014346989828, 'lambda_sparse': 0.001959645313797297, 'momentum': 0.0873740845269653}
epoch 0  | loss: 0.66012 | val_0_accuracy: 0.67442 |  0:00:00s
epoch 1  | loss: 0.40473 | val_0_accuracy: 0.73256 |  0:00:00s
epoch 2  | loss: 0.4482  | val_0_accuracy: 0.77907 |  0:00:00s
epoch 3  | loss: 0.4636  | val_0_accuracy: 0.82558 |  0:00:00s
epoch 4  | loss: 0.42285 | val_0_accuracy: 0.7907  |  0:00:00s
epoch 5  | loss: 0.4207  | val_0_accuracy: 0.81395 |  0:00:00s
epoch 6  | loss: 0.37125 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 7  | loss: 0.30216 | val_0_accuracy: 0.81395 |  0:00:00s
epoch 8  | loss: 0.33672 | val_0_accuracy: 0.80233 |  0:00:00s
epoch 9  | loss: 0.32346 | val_0_accuracy: 0.68605 |  0:00:00s
epoch 10 | loss: 0.32149 | val_0_accuracy: 0.69767 |  0:00:00s
epoch 11 | loss: 0.30962 | val_0_accuracy: 0.74419 |  0:00:00s
epoch 12 | loss: 0.3046  | val_0_accuracy: 0.75581 |  0:00:01s
epoch 13 | loss: 0.28

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))