# Data Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from skopt import BayesSearchCV
import time
import joblib

# Data Setting

In [2]:
# Log current status
def log_status(message):
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] {message}")

In [3]:
# Load the datasets
log_status("Loading datasets...")
df_fake = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
df_true = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")

[2025-05-27 00:30:18] Loading datasets...


In [4]:
# Assign class labels
df_fake["class"] = 0
df_true["class"] = 1

# Combine the datasets
df = pd.concat([df_fake, df_true], axis=0)
df = df.sample(frac=1).reset_index(drop=True)

In [5]:
# Text preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W", " ", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

log_status("Preprocessing text data...")
df["text"] = df["text"].apply(clean_text)

[2025-05-27 00:30:21] Preprocessing text data...


In [6]:
# Split the data
x = df["text"]
y = df["class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [7]:
# Vectorize the text
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

In [8]:
# Save vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

# Logistic Regression

In [9]:
# Bayesian Optimization for Logistic Regression
start_time_total = time.time()
start_time_opt = time.time()
log_status("Optimizing Logistic Regression with Bayesian Optimization...")
lr_search = BayesSearchCV(
    LogisticRegression(max_iter=1000),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'solver': ['liblinear', 'lbfgs']
    },
    n_iter=32, random_state=42, cv=3
)
lr_search.fit(xv_train, y_train)
print("Best hyperparameters for Logistic Regression:")
print(lr_search.best_params_)
opt_time = time.time() - start_time_opt
start_time_train = time.time()
lr = lr_search.best_estimator_
lr.fit(xv_train, y_train)
train_time = time.time() - start_time_train
start_time_pred = time.time()
pred_lr = lr.predict(xv_test)
pred_time = time.time() - start_time_pred
total_time = time.time() - start_time_total
joblib.dump(lr, 'logistic_regression.pkl')

[2025-05-27 00:30:54] Optimizing Logistic Regression with Bayesian Optimization...
Best hyperparameters for Logistic Regression:
OrderedDict([('C', 235769.01450652358), ('solver', 'liblinear')])


['logistic_regression.pkl']

In [10]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, pred_lr))
print("Logistic Regression Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time))
print("Logistic Regression Training Time: {:.4f} seconds".format(train_time))
print("Logistic Regression Prediction Time: {:.4f} seconds".format(pred_time))
print("Total Execution Time: {:.4f} seconds".format(total_time))
print(classification_report(y_test, pred_lr))
log_status("Logistic Regression optimization completed.")

Logistic Regression Accuracy: 0.9949220489977728
Logistic Regression Hyperparameter Optimization Time: 321.6311 seconds
Logistic Regression Training Time: 5.3304 seconds
Logistic Regression Prediction Time: 0.0075 seconds
Total Execution Time: 326.9692 seconds
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      5883
           1       1.00      0.99      0.99      5342

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

[2025-05-27 00:36:21] Logistic Regression optimization completed.


# Random Forest

In [11]:
# Bayesian Optimization for Random Forest
start_time_total = time.time()
start_time_opt = time.time()
log_status("Optimizing Random Forest with Bayesian Optimization...")
rfc_search = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    {
        'n_estimators': (10, 200),
        'max_depth': (1, 20),
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    },
    n_iter=10, random_state=42, cv=3
)
rfc_search.fit(xv_train, y_train)
opt_time = time.time() - start_time_opt
start_time_train = time.time()
rfc = rfc_search.best_estimator_
print("Best hyperparameters for Random Forest:")
print(rfc)
rfc.fit(xv_train, y_train)
train_time = time.time() - start_time_train
start_time_pred = time.time()
pred_rfc = rfc.predict(xv_test)
pred_time = time.time() - start_time_pred
total_time = time.time() - start_time_total
joblib.dump(rfc, 'random_forest.pkl')

[2025-05-27 00:36:21] Optimizing Random Forest with Bayesian Optimization...
Best hyperparameters for Random Forest:
RandomForestClassifier(max_depth=16, min_samples_leaf=3, min_samples_split=13,
                       n_estimators=163, random_state=42)


['random_forest.pkl']

In [12]:
print("Random Forest Accuracy:", accuracy_score(y_test, pred_rfc))
print("Random Forest Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time))
print("Random Forest Training Time: {:.4f} seconds".format(train_time))
print("Random Forest Prediction Time: {:.4f} seconds".format(pred_time))
print("Total Execution Time: {:.4f} seconds".format(total_time))
print(classification_report(y_test, pred_rfc))
log_status("Random Forest optimization completed.")

Random Forest Accuracy: 0.9797772828507795
Random Forest Hyperparameter Optimization Time: 327.9080 seconds
Random Forest Training Time: 12.3189 seconds
Random Forest Prediction Time: 1.3810 seconds
Total Execution Time: 341.6080 seconds
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5883
           1       0.98      0.98      0.98      5342

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225

[2025-05-27 00:42:02] Random Forest optimization completed.


# Naive Bayes

In [13]:
# Bayesian Optimization for Naive Bayes
start_time_total = time.time()
start_time_opt = time.time()
log_status("Optimizing Naive Bayes with Bayesian Optimization...")
nb_search = BayesSearchCV(
    MultinomialNB(),
    {
        'alpha': (1e-6, 1.0, 'log-uniform')
    },
    n_iter=10, random_state=42, cv=3
)
nb_search.fit(xv_train, y_train)
opt_time = time.time() - start_time_opt
start_time_train = time.time()
nb = nb_search.best_estimator_
print("Best hyperparameters for Naive Bayes:")
print(nb)
nb.fit(xv_train, y_train)
train_time = time.time() - start_time_train
start_time_pred = time.time()
pred_nb = nb.predict(xv_test)
pred_time = time.time() - start_time_pred
total_time = time.time() - start_time_total
joblib.dump(nb, 'naive_bayes.pkl')

[2025-05-27 00:42:02] Optimizing Naive Bayes with Bayesian Optimization...
Best hyperparameters for Naive Bayes:
MultinomialNB(alpha=0.0018214548318355843)


['naive_bayes.pkl']

In [14]:
print("Naive Bayes Accuracy:", accuracy_score(y_test, pred_nb))
print("Naive Bayes Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time))
print("Naive Bayes Training Time: {:.4f} seconds".format(train_time))
print("Naive Bayes Prediction Time: {:.4f} seconds".format(pred_time))
print("Total Execution Time: {:.4f} seconds".format(total_time))
print(classification_report(y_test, pred_nb))
log_status("Naive Bayes optimization completed.")

Naive Bayes Accuracy: 0.9460133630289532
Naive Bayes Hyperparameter Optimization Time: 2.3197 seconds
Naive Bayes Training Time: 0.0502 seconds
Naive Bayes Prediction Time: 0.0142 seconds
Total Execution Time: 2.3843 seconds
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      5883
           1       0.96      0.93      0.94      5342

    accuracy                           0.95     11225
   macro avg       0.95      0.95      0.95     11225
weighted avg       0.95      0.95      0.95     11225

[2025-05-27 00:42:05] Naive Bayes optimization completed.


# SVM 

In [15]:
# Bayesian Optimization for SVM
log_status("Optimizing SVM with Bayesian Optimization...")
svm_search = BayesSearchCV(
    SVC(random_state=42),
    {
        'C': (1e-2, 1e+2, 'log-uniform'),  
        'kernel': ['linear', 'rbf'],        
        'gamma': (1e-3, 1e-1, 'log-uniform') 
    },
    n_iter=5,  
    random_state=42, 
    cv=3,
    n_jobs=-1  
)
xv_train_sample = xv_train[:5000]
y_train_sample = y_train[:5000]
start_time_opt_svm = time.time()
svm_search.fit(xv_train_sample, y_train_sample)
opt_time_svm = time.time() - start_time_opt_svm
start_time_train_svm = time.time()
SVM = svm_search.best_estimator_
print("Best hyperparameters for SVM:")
print(SVM)
SVM.fit(xv_train, y_train)
train_time_svm = time.time() - start_time_train_svm
start_time_pred_svm = time.time()
pred_svm = SVM.predict(xv_test)
pred_time_svm = time.time() - start_time_pred_svm

total_time_svm = time.time() - start_time_opt_svm
joblib.dump(SVM, 'svm.pkl')

[2025-05-27 00:42:05] Optimizing SVM with Bayesian Optimization...
Best hyperparameters for SVM:
SVC(C=22.364202820542705, gamma=0.058429282697611454, kernel='linear',
    random_state=42)


['svm.pkl']

In [16]:
print("SVM Accuracy:", accuracy_score(y_test, pred_svm))
print("SVM Hyperparameter Optimization Time: {:.4f} seconds".format(opt_time_svm))
print("SVM Training Time: {:.4f} seconds".format(train_time_svm))
print("SVM Prediction Time: {:.4f} seconds".format(pred_time_svm))
print("Total Execution Time: {:.4f} seconds".format(total_time_svm))
print(classification_report(y_test, pred_svm))

log_status("SVM optimization completed.")

SVM Accuracy: 0.9939420935412027
SVM Hyperparameter Optimization Time: 138.5997 seconds
SVM Training Time: 775.1553 seconds
SVM Prediction Time: 82.0453 seconds
Total Execution Time: 995.8005 seconds
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5883
           1       1.00      0.99      0.99      5342

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

[2025-05-27 00:58:41] SVM optimization completed.
