In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args
from sklearn.metrics import accuracy_score, classification_report
import time
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import joblib

In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load dataset
df_fake = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
df_true = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
df_fake["class"] = 0
df_true["class"] = 1
df = pd.concat([df_fake, df_true], axis=0).sample(frac=1).reset_index(drop=True)
df = df.drop(["title", "subject", "date"], axis=1)
df_buzz_fake = pd.read_csv("/kaggle/input/fakenewsnet/BuzzFeed_fake_news_content.csv")
df_buzz_real = pd.read_csv("/kaggle/input/fakenewsnet/BuzzFeed_real_news_content.csv")

df_buzz_fake["text"] = df_buzz_fake["title"].fillna('') + " " + df_buzz_fake["text"].fillna('')
df_buzz_real["text"] = df_buzz_real["title"].fillna('') + " " + df_buzz_real["text"].fillna('')

df_buzz_fake["class"] = 0
df_buzz_real["class"] = 1

df_combined = pd.concat([
    df_fake[["text", "class"]],
    df_true[["text", "class"]],
    df_buzz_fake[["text", "class"]],
    df_buzz_real[["text", "class"]]
], axis=0).sample(frac=1).reset_index(drop=True)

In [4]:
# Preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub("\[.*?\]", "", text)
    text = re.sub("\\W", " ", text)
    text = re.sub("https?://\\S+|www\\.\\S+", "", text)
    text = re.sub("<.*?>+", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub("\\n", "", text)
    text = re.sub("\\w*\\d\\w*", "", text)    
    return text

df_combined["text"] = df_combined["text"].apply(clean_text)

In [5]:
# Tokenization using BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df_sample = df.sample(frac=0.5, random_state=42)  # Reduce dataset size for faster training
inputs = tokenizer(list(df_sample["text"]), return_tensors="pt", padding=True, truncation=True, max_length=512)
attention_masks = inputs['attention_mask']
labels = torch.tensor(df_sample["class"].values, dtype=torch.long)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Save tokenizer
tokenizer.save_pretrained('bert_tokenizer')

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [7]:
# Train-test split
x_train, x_test, y_train, y_test, mask_train, mask_test = train_test_split(
    inputs["input_ids"], labels, attention_masks, test_size=0.25, random_state=42
)
train_dataset = TensorDataset(x_train, mask_train, y_train)
test_dataset = TensorDataset(x_test, mask_test, y_test)

In [8]:

# Bayesian Optimization Search Space for BERT
search_space_bert = [
    Real(1e-5, 5e-5, name='learning_rate'),
    Integer(16, 64, name='batch_size'),  # Increased batch size range
    Integer(1, 3, name='epochs')  # Adjusted epoch range
]

@use_named_args(search_space_bert)
def objective_bert(learning_rate, batch_size, epochs):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    train_loader_local = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    
    for epoch in range(epochs):
        model.train()
        for x_batch, mask_batch, y_batch in train_loader_local:
            x_batch, mask_batch, y_batch = x_batch.to(device), mask_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids=x_batch, attention_mask=mask_batch).logits
            loss = criterion(outputs, y_batch)
            if torch.isnan(loss) or torch.isinf(loss):
                return float('inf')
            loss.backward()
            optimizer.step()
    
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for x_batch, mask_batch, y_batch in DataLoader(test_dataset, batch_size=int(batch_size), shuffle=False):
            x_batch, mask_batch, y_batch = x_batch.to(device), mask_batch.to(device), y_batch.to(device)
            outputs = model(input_ids=x_batch, attention_mask=mask_batch).logits
            loss = criterion(outputs, y_batch)
            if torch.isnan(loss) or torch.isinf(loss):
                return float('inf')
            total_loss += loss.item()
    
    return float(total_loss) / len(test_dataset)

In [9]:
# Run Bayesian Optimization for BERT
start_time_opt_bert = time.time()
result_bert = gp_minimize(objective_bert, search_space_bert, n_calls=3, n_initial_points=3, random_state=42)
opt_time_bert = time.time() - start_time_opt_bert
print(f"Best BERT Parameters: learning_rate={result_bert.x[0]}, batch_size={result_bert.x[1]}, epochs={result_bert.x[2]}")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Best BERT Parameters: learning_rate=4.186171947440932e-05, batch_size=25, epochs=3


In [10]:
# Train and Evaluate BERT Model
start_time_train_bert = time.time()
best_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
optimizer = optim.AdamW(best_bert.parameters(), lr=result_bert.x[0])
criterion = nn.CrossEntropyLoss()
train_loader_best = DataLoader(train_dataset, batch_size=int(result_bert.x[1]), shuffle=True)

for epoch in range(result_bert.x[2]):
    best_bert.train()
    for x_batch, mask_batch, y_batch in train_loader_best:
        x_batch, mask_batch, y_batch = x_batch.to(device), mask_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = best_bert(input_ids=x_batch, attention_mask=mask_batch).logits
        loss = criterion(outputs, y_batch)
        if torch.isnan(loss) or torch.isinf(loss):
            continue
        loss.backward()
        optimizer.step()

train_time_bert = time.time() - start_time_train_bert

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Save trained BERT model
torch.save(best_bert.state_dict(), 'bert_model.pth')

In [12]:
# Model Testing
start_time_eval_bert = time.time()
y_true = []
y_pred = []

with torch.no_grad():
    best_bert.eval()
    for x_batch, mask_batch, y_batch in DataLoader(test_dataset, batch_size=int(result_bert.x[1]), shuffle=False):
        x_batch, mask_batch, y_batch = x_batch.to(device), mask_batch.to(device), y_batch.to(device)
        outputs = best_bert(input_ids=x_batch, attention_mask=mask_batch).logits
        predictions = torch.argmax(outputs, dim=1)
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())

eval_time_bert = time.time() - start_time_eval_bert

total_time_bert = opt_time_bert + train_time_bert + eval_time_bert

In [13]:
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)

In [14]:
# Print Results
print(f"BERT Model Accuracy: {accuracy}")
print(f"BERT Hyperparameter Optimization Time: {opt_time_bert:.2f} seconds")
print(f"BERT Training Time: {train_time_bert:.2f} seconds")
print(f"BERT Evaluation Time: {eval_time_bert:.2f} seconds")
print(f"Total Execution Time: {total_time_bert:.2f} seconds")
print(classification_report(y_true, y_pred))
print("BERT optimization completed.")

BERT Model Accuracy: 1.0
BERT Hyperparameter Optimization Time: 4714.32 seconds
BERT Training Time: 2700.86 seconds
BERT Evaluation Time: 86.67 seconds
Total Execution Time: 7501.85 seconds
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2920
           1       1.00      1.00      1.00      2693

    accuracy                           1.00      5613
   macro avg       1.00      1.00      1.00      5613
weighted avg       1.00      1.00      1.00      5613

BERT optimization completed.
