# Data Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from skopt import BayesSearchCV
import time
import joblib

# Data Setting

In [2]:
# Log current status
def log_status(message):
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {message}")

In [3]:
# ✅ 1. 기존 Fake/True News 데이터 로드
log_status("Loading Fake and Real News Dataset...")
df_fake = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
df_true = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
df_fake["class"] = 0
df_true["class"] = 1
df_fake["text"] = df_fake["text"]
df_true["text"] = df_true["text"]

# ✅ 2. FakeNewsNet (BuzzFeed) 데이터 로드
log_status("Loading FakeNewsNet BuzzFeed dataset...")
df_buzz_fake = pd.read_csv("/kaggle/input/fakenewsnet/BuzzFeed_fake_news_content.csv")
df_buzz_real = pd.read_csv("/kaggle/input/fakenewsnet/BuzzFeed_real_news_content.csv")

# ✅ 3. 텍스트 구성: title + text
df_buzz_fake["text"] = df_buzz_fake["title"].fillna('') + " " + df_buzz_fake["text"].fillna('')
df_buzz_real["text"] = df_buzz_real["title"].fillna('') + " " + df_buzz_real["text"].fillna('')
df_buzz_fake["class"] = 0
df_buzz_real["class"] = 1

# ✅ 4. 전체 병합
log_status("Merging all datasets...")
df_combined = pd.concat([
    df_fake[["text", "class"]],
    df_true[["text", "class"]],
    df_buzz_fake[["text", "class"]],
    df_buzz_real[["text", "class"]]
], axis=0).sample(frac=1).reset_index(drop=True)

[2025-05-21 07:57:04] Loading Fake and Real News Dataset...
[2025-05-21 07:57:07] Loading FakeNewsNet BuzzFeed dataset...
[2025-05-21 07:57:07] Merging all datasets...


In [4]:
# Text preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

log_status("Preprocessing text data...")
df_combined["text"] = df_combined["text"].apply(clean_text)

[2025-05-21 07:57:07] Preprocessing text data...


In [5]:
# Split the data
x = df_combined["text"]
y = df_combined["class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [6]:
# Vectorize the text
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

In [7]:
# Save vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

# Logistic Regression

In [8]:
# Bayesian Optimization for Logistic Regression
start_time_total = time.time()
start_time_opt = time.time()
log_status("Optimizing Logistic Regression with Bayesian Optimization...")
lr_search = BayesSearchCV(
    LogisticRegression(max_iter=1000),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'solver': ['liblinear', 'lbfgs']
    },
    n_iter=32, random_state=42, cv=3
)
lr_search.fit(xv_train, y_train)
print("Best hyperparameters for Logistic Regression:")
print(lr_search.best_params_)
opt_time = time.time() - start_time_opt
start_time_train = time.time()
lr = lr_search.best_estimator_
lr.fit(xv_train, y_train)
train_time = time.time() - start_time_train
start_time_pred = time.time()
pred_lr = lr.predict(xv_test)
pred_time = time.time() - start_time_pred
total_time = time.time() - start_time_total
joblib.dump(lr, 'logistic_regression.pkl')

[2025-05-21 07:57:38] Optimizing Logistic Regression with Bayesian Optimization...
Best hyperparameters for Logistic Regression:
OrderedDict([('C', 554139.5475342941), ('solver', 'liblinear')])


['logistic_regression.pkl']

In [9]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print("Logistic Regression Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time))
print("Logistic Regression Training Time: {:.4f} seconds".format(train_time))
print("Logistic Regression Prediction Time: {:.4f} seconds".format(pred_time))
print("Total Execution Time: {:.4f} seconds".format(total_time))
print(classification_report(y_test, pred_lr))
log_status("Logistic Regression optimization completed.")

Logistic Regression Accuracy: 0.9939662821650399
Logistic Regression Hyperparameter Optimization Time: 310.5482 seconds
Logistic Regression Training Time: 5.7301 seconds
Logistic Regression Prediction Time: 0.0068 seconds
Total Execution Time: 316.2853 seconds
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5790
           1       0.99      0.99      0.99      5480

    accuracy                           0.99     11270
   macro avg       0.99      0.99      0.99     11270
weighted avg       0.99      0.99      0.99     11270

[2025-05-21 08:02:55] Logistic Regression optimization completed.


# Random Forest

In [10]:
# Bayesian Optimization for Random Forest
start_time_total = time.time()
start_time_opt = time.time()
log_status("Optimizing Random Forest with Bayesian Optimization...")
rfc_search = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    {
        'n_estimators': (10, 200),
        'max_depth': (1, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    },
    n_iter=10, random_state=42, cv=3
)
rfc_search.fit(xv_train, y_train)
opt_time = time.time() - start_time_opt
start_time_train = time.time()
rfc = rfc_search.best_estimator_
print("Best hyperparameters for Random Forest:")
print(rfc)
rfc.fit(xv_train, y_train)
train_time = time.time() - start_time_train
start_time_pred = time.time()
pred_rfc = rfc.predict(xv_test)
pred_time = time.time() - start_time_pred
total_time = time.time() - start_time_total
joblib.dump(rfc, 'random_forest.pkl')

[2025-05-21 08:02:55] Optimizing Random Forest with Bayesian Optimization...
Best hyperparameters for Random Forest:
RandomForestClassifier(max_depth=19, min_samples_leaf=7, min_samples_split=18,
                       n_estimators=89, random_state=42)


['random_forest.pkl']

In [11]:
print("Random Forest Accuracy:", accuracy_score(y_test, pred_rfc))
print("Random Forest Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time))
print("Random Forest Training Time: {:.4f} seconds".format(train_time))
print("Random Forest Prediction Time: {:.4f} seconds".format(pred_time))
print("Total Execution Time: {:.4f} seconds".format(total_time))
print(classification_report(y_test, pred_rfc))
log_status("Random Forest optimization completed.")

Random Forest Accuracy: 0.9784383318544809
Random Forest Hyperparameter Optimization Time: 332.4439 seconds
Random Forest Training Time: 11.1793 seconds
Random Forest Prediction Time: 0.9767 seconds
Total Execution Time: 344.6001 seconds
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5790
           1       0.98      0.98      0.98      5480

    accuracy                           0.98     11270
   macro avg       0.98      0.98      0.98     11270
weighted avg       0.98      0.98      0.98     11270

[2025-05-21 08:08:39] Random Forest optimization completed.


# Naive Bayes

In [12]:
# Bayesian Optimization for Naive Bayes
start_time_total = time.time()
start_time_opt = time.time()
log_status("Optimizing Naive Bayes with Bayesian Optimization...")
nb_search = BayesSearchCV(
    MultinomialNB(),
    {
        'alpha': (1e-6, 1.0, 'log-uniform')
    },
    n_iter=10, random_state=42, cv=3
)
nb_search.fit(xv_train, y_train)
opt_time = time.time() - start_time_opt
start_time_train = time.time()
nb = nb_search.best_estimator_
print("Best hyperparameters for Naive Bayes:")
print(nb)
nb.fit(xv_train, y_train)
train_time = time.time() - start_time_train
start_time_pred = time.time()
pred_nb = nb.predict(xv_test)
pred_time = time.time() - start_time_pred
total_time = time.time() - start_time_total
joblib.dump(nb, 'naive_bayes.pkl')

[2025-05-21 08:08:40] Optimizing Naive Bayes with Bayesian Optimization...
Best hyperparameters for Naive Bayes:
MultinomialNB(alpha=0.0018214548318355843)


['naive_bayes.pkl']

In [13]:
print("Naive Bayes Accuracy:", accuracy_score(y_test, pred_nb))
print("Naive Bayes Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time))
print("Naive Bayes Training Time: {:.4f} seconds".format(train_time))
print("Naive Bayes Prediction Time: {:.4f} seconds".format(pred_time))
print("Total Execution Time: {:.4f} seconds".format(total_time))
print(classification_report(y_test, pred_nb))
log_status("Naive Bayes optimization completed.")

Naive Bayes Accuracy: 0.9469387755102041
Naive Bayes Hyperparameter Optimization Time: 3.1523 seconds
Naive Bayes Training Time: 0.0643 seconds
Naive Bayes Prediction Time: 0.0186 seconds
Total Execution Time: 3.2354 seconds
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      5790
           1       0.96      0.93      0.94      5480

    accuracy                           0.95     11270
   macro avg       0.95      0.95      0.95     11270
weighted avg       0.95      0.95      0.95     11270

[2025-05-21 08:08:43] Naive Bayes optimization completed.


# SVM 

In [14]:
# Bayesian Optimization for SVM
log_status("Optimizing SVM with Bayesian Optimization...")
svm_search = BayesSearchCV(
    SVC(random_state=42),
    {
        'C': (1e-2, 1e+2, 'log-uniform'),  
        'kernel': ['linear', 'rbf'],        
        'gamma': (1e-3, 1e-1, 'log-uniform') 
    },
    n_iter=5,  
    random_state=42, 
    cv=3,
    n_jobs=-1  
)
xv_train_sample = xv_train[:5000]
y_train_sample = y_train[:5000]
start_time_opt_svm = time.time()
svm_search.fit(xv_train_sample, y_train_sample)
opt_time_svm = time.time() - start_time_opt_svm
start_time_train_svm = time.time()
SVM = svm_search.best_estimator_
print("Best hyperparameters for SVM:")
print(SVM)
SVM.fit(xv_train, y_train)
train_time_svm = time.time() - start_time_train_svm
start_time_pred_svm = time.time()
pred_svm = SVM.predict(xv_test)
pred_time_svm = time.time() - start_time_pred_svm

total_time_svm = time.time() - start_time_opt_svm
joblib.dump(SVM, 'svm.pkl')

[2025-05-21 08:08:43] Optimizing SVM with Bayesian Optimization...
Best hyperparameters for SVM:
SVC(C=0.6016307829589929, gamma=0.06877728743793542, kernel='linear',
    random_state=42)


['svm.pkl']

In [15]:
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))
print("SVM Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time_svm))
print("SVM Training Time: {:.4f} seconds".format(train_time_svm))
print("SVM Prediction Time: {:.4f} seconds".format(pred_time_svm))
print("Total Execution Time: {:.4f} seconds".format(total_time_svm))
print(classification_report(y_test, pred_svm))

log_status("SVM optimization completed.")

SVM Accuracy: 0.9912156166814552
SVM Hyperparameter Optimization Time: 161.5686 seconds
SVM Training Time: 807.5440 seconds
SVM Prediction Time: 114.0114 seconds
Total Execution Time: 1083.1241 seconds
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5790
           1       0.99      0.99      0.99      5480

    accuracy                           0.99     11270
   macro avg       0.99      0.99      0.99     11270
weighted avg       0.99      0.99      0.99     11270

[2025-05-21 08:26:46] SVM optimization completed.
