In [1]:
import pandas as pd
import numpy as np
import re
import string
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args
from sklearn.metrics import classification_report, accuracy_score
import time
import joblib

In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load dataset
df_fake = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")
df_true = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
df_fake["class"] = 0
df_true["class"] = 1
df = pd.concat([df_fake, df_true], axis=0).sample(frac=1).reset_index(drop=True)
df = df.drop(["title", "subject", "date"], axis=1)

In [4]:
# Preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub("\[.*?\]", "", text)
    text = re.sub("\\W", " ", text)
    text = re.sub("https?://\\S+|www\\.\\S+", "", text)
    text = re.sub("<.*?>+", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub("\\n", "", text)
    text = re.sub("\\w*\\d\\w*", "", text)    
    return text

df["text"] = df["text"].apply(clean_text)

In [5]:
# Train-test split
x_train_text, x_test_text, y_train, y_test = train_test_split(df["text"], df["class"].values, test_size=0.25, random_state=42)

In [6]:
# Tokenization
tokenizer = Tokenizer(num_words=5000, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(x_train_text)
x_train = tokenizer.texts_to_sequences(x_train_text)
x_test = tokenizer.texts_to_sequences(x_test_text)
x_train = pad_sequences(x_train, maxlen=200, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=200, padding='post', truncating='post')

In [7]:
# Save tokenizer
joblib.dump(tokenizer, 'tokenizer.pkl')

['tokenizer.pkl']

In [8]:
# Convert to PyTorch tensors
x_train_torch = torch.tensor(x_train, dtype=torch.long, device=device)
y_train_torch = torch.tensor(y_train, dtype=torch.float32, device=device)
x_test_torch = torch.tensor(x_test, dtype=torch.long, device=device)
y_test_torch = torch.tensor(y_test, dtype=torch.float32, device=device)

In [9]:
# PyTorch Dataset & DataLoader
class TextDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

train_dataset = TextDataset(x_train_torch, y_train_torch)
test_dataset = TextDataset(x_test_torch, y_test_torch)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# LSTM

In [10]:
# Define PyTorch LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=5000, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers=num_layers, 
            batch_first=True, dropout=dropout if num_layers > 1 else 0.0
        )
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        x = self.fc(hn[-1])
        return self.sigmoid(x).view(-1)

In [11]:
# Bayesian Optimization Search Space
search_space = [
    Integer(32, 256, name='embed_dim'),
    Integer(32, 256, name='hidden_dim'),
    Integer(1, 3, name='num_layers'),
    Real(0.1, 0.5, name='dropout')
]

criterion = nn.BCELoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@use_named_args(search_space)
def objective(embed_dim, hidden_dim, num_layers, dropout):
    model = LSTMClassifier(
        int(embed_dim),
        int(hidden_dim),
        int(num_layers),
        float(dropout)
    ).to(device)
    
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(3):
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch).view(-1)
            loss = criterion(y_pred, y_batch.float())
            loss.backward()
            optimizer.step()
    
    total_loss = 0
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch).view(-1)
            loss = criterion(y_pred, y_batch.float())
            total_loss += loss.item()
    
    return total_loss / len(test_loader)

start_time_opt = time.time()

In [12]:
# Run Bayesian Optimization for LSTM
result = gp_minimize(objective, search_space, n_calls=10, random_state=42)
print(f"Best LSTM Parameters: embed_dim={result.x[0]}, hidden_dim={result.x[1]}, num_layers={result.x[2]}, dropout={result.x[3]}")
opt_time = time.time() - start_time_opt

Best LSTM Parameters: embed_dim=52, hidden_dim=171, num_layers=2, dropout=0.49329235432271534


In [13]:
# Measure training time
start_time_train = time.time()

# Train final model with best hyperparameters
best_lstm = LSTMClassifier(
    int(result.x[0]),
    int(result.x[1]),
    int(result.x[2]),
    float(result.x[3])
).to(device)

optimizer = optim.Adam(best_lstm.parameters(), lr=0.001)

for epoch in range(5):
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = best_lstm(x_batch).view(-1)
        loss = criterion(y_pred, y_batch.float())
        loss.backward()
        optimizer.step()
train_time = time.time() - start_time_train

In [14]:
torch.save(best_lstm.state_dict(), 'lstm_model.pth')

In [15]:
# Measure evaluation time
start_time_eval = time.time()

# Model Testing and Evaluation
y_true = []
y_pred = []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        outputs = best_lstm(x_batch).view(-1)
        predictions = (outputs > 0.5).float()
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())
pred_time = time.time() - start_time_eval

# Compute accuracy and classification report
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)

# Total execution time
total_time = opt_time + train_time + pred_time

In [16]:
# Print Results
print(f"LSTM Model Accuracy: {accuracy}")
print(f"LSTM Hyperparameter Optimization Time: {opt_time:.2f} seconds")
print(f"LSTM Training Time: {train_time:.2f} seconds")
print(f"LSTM Prediction Time: {pred_time:.2f} seconds")
print(f"Total Execution Time: {total_time:.2f} seconds")
print(report)
print("LSTM optimization completed.")

LSTM Model Accuracy: 0.9983073496659243
LSTM Hyperparameter Optimization Time: 431.26 seconds
LSTM Training Time: 159.97 seconds
LSTM Prediction Time: 6.04 seconds
Total Execution Time: 597.26 seconds
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5856
         1.0       1.00      1.00      1.00      5369

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225

LSTM optimization completed.


# BiLSTM

In [17]:
# Define PyTorch BiLSTM Model
class BiLSTMClassifier(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_layers, dropout):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=5000, embedding_dim=embed_dim)
        self.bilstm = nn.LSTM(
            embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, 
            dropout=dropout if num_layers > 1 else 0.0, bidirectional=True
        )
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.bilstm(x)
        x = self.fc(torch.cat((hn[-2], hn[-1]), dim=1))
        return self.sigmoid(x).view(-1)

In [18]:
# Bayesian Optimization Search Space for BiLSTM
search_space_bilstm = [
    Integer(32, 256, name='embed_dim'),
    Integer(32, 256, name='hidden_dim'),
    Integer(1, 3, name='num_layers'),
    Real(0.1, 0.5, name='dropout')
]

@use_named_args(search_space_bilstm)
def objective_bilstm(embed_dim, hidden_dim, num_layers, dropout):
    model = BiLSTMClassifier(
        int(embed_dim),
        int(hidden_dim),
        int(num_layers),
        float(dropout)
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    
    for epoch in range(3):
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch).view(-1)
            loss = criterion(y_pred, y_batch.float())
            loss.backward()
            optimizer.step()
    
    total_loss = 0
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch).view(-1)
            loss = criterion(y_pred, y_batch.float())
            total_loss += loss.item()
    
    return total_loss / len(test_loader)

In [19]:
# Run Bayesian Optimization for BiLSTM
start_time_opt_bilstm = time.time()
result_bilstm = gp_minimize(objective_bilstm, search_space_bilstm, n_calls=30, random_state=42)
print(f"Best BiLSTM Parameters: embed_dim={result_bilstm.x[0]}, hidden_dim={result_bilstm.x[1]}, num_layers={result_bilstm.x[2]}, dropout={result_bilstm.x[3]}")
opt_time_bilstm = time.time() - start_time_opt_bilstm

Best BiLSTM Parameters: embed_dim=256, hidden_dim=126, num_layers=3, dropout=0.14442561928899436


In [20]:
# Train final BiLSTM model
start_time_train_bilstm = time.time()
best_bilstm = BiLSTMClassifier(
    int(result_bilstm.x[0]),
    int(result_bilstm.x[1]),
    int(result_bilstm.x[2]),
    float(result_bilstm.x[3])
).to(device)

optimizer = optim.Adam(best_bilstm.parameters(), lr=0.001)

for epoch in range(5):
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = best_bilstm(x_batch).view(-1)
        loss = criterion(y_pred, y_batch.float())
        loss.backward()
        optimizer.step()
train_time_bilstm = time.time() - start_time_train_bilstm

In [21]:
torch.save(best_bilstm.state_dict(), 'bilstm_model.pth')

In [22]:
# Model Testing
start_time_eval_bilstm = time.time()
y_true = []
y_pred = []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        outputs = best_bilstm(x_batch).view(-1)
        predictions = (outputs > 0.5).float()
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())

eval_time_bilstm = time.time() - start_time_eval_bilstm
total_time_bilstm = opt_time_bilstm + train_time_bilstm + eval_time_bilstm
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)
# Print Results
print(f"BiLSTM Model Accuracy: {accuracy}")
print(f"BiLSTM Hyperparameter Optimization Time: {opt_time_bilstm:.2f} seconds")
print(f"BiLSTM Training Time: {train_time_bilstm:.2f} seconds")
print(f"BiLSTM Evaluation Time: {eval_time_bilstm:.2f} seconds")
print(f"Total Execution Time: {total_time_bilstm:.2f} seconds")
print(classification_report(y_true, y_pred))
print("BiLSTM optimization completed.")

BiLSTM Model Accuracy: 0.998218262806236
BiLSTM Hyperparameter Optimization Time: 2893.31 seconds
BiLSTM Training Time: 119.15 seconds
BiLSTM Evaluation Time: 3.30 seconds
Total Execution Time: 3015.77 seconds
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      5856
         1.0       1.00      1.00      1.00      5369

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225

BiLSTM optimization completed.


# CNN

In [23]:
# Define PyTorch CNN Model
class CNNClassifier(nn.Module):
    def __init__(self, embed_dim, num_filters, kernel_size, dropout):
        super(CNNClassifier, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=5000, embedding_dim=embed_dim)
        self.conv = nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=kernel_size, padding=kernel_size // 2)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = torch.mean(x, dim=2)
        x = self.dropout(x)
        x = self.fc(x)
        return self.sigmoid(x).view(-1)

In [24]:
# Bayesian Optimization Search Space for CNN
search_space_cnn = [
    Integer(32, 256, name='embed_dim'),
    Integer(32, 256, name='num_filters'),
    Integer(2, 5, name='kernel_size'),
    Real(0.1, 0.5, name='dropout')
]

@use_named_args(search_space_cnn)
def objective_cnn(embed_dim, num_filters, kernel_size, dropout):
    model = CNNClassifier(
        int(embed_dim),
        int(num_filters),
        int(kernel_size),
        float(dropout)
    ).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    
    for epoch in range(3):
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch).view(-1)
            loss = criterion(y_pred, y_batch.float())
            loss.backward()
            optimizer.step()
    
    total_loss = 0
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch).view(-1)
            loss = criterion(y_pred, y_batch.float())
            total_loss += loss.item()
    
    return total_loss / len(test_loader)

In [25]:
# Run Bayesian Optimization for CNN
start_time_opt_cnn = time.time()
result_cnn = gp_minimize(objective_cnn, search_space_cnn, n_calls=10, random_state=42)
print(f"Best CNN Parameters: embed_dim={result_cnn.x[0]}, num_filters={result_cnn.x[1]}, kernel_size={result_cnn.x[2]}, dropout={result_cnn.x[3]}")
opt_time_cnn = time.time() - start_time_opt_cnn

Best CNN Parameters: embed_dim=35, num_filters=243, kernel_size=4, dropout=0.2541666010159665


In [26]:
# Measure training time
start_time_train_cnn = time.time()

# Train final CNN model
best_cnn = CNNClassifier(
    int(result_cnn.x[0]),
    int(result_cnn.x[1]),
    int(result_cnn.x[2]),
    float(result_cnn.x[3])
).to(device)

optimizer = optim.Adam(best_cnn.parameters(), lr=0.001)
criterion = nn.BCELoss()

for epoch in range(5):
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = best_cnn(x_batch).view(-1)
        loss = criterion(y_pred, y_batch.float())
        loss.backward()
        optimizer.step()
train_time_cnn = time.time() - start_time_train_cnn

In [27]:
torch.save(best_cnn.state_dict(), 'cnn_model.pth')

In [28]:
# Model Testing
start_time_eval_cnn = time.time()
y_true = []
y_pred = []

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        outputs = best_cnn(x_batch).view(-1)
        predictions = (outputs > 0.5).float()
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(predictions.cpu().numpy())
eval_time_cnn = time.time() - start_time_eval_cnn
total_time_cnn = opt_time_cnn + train_time_cnn + eval_time_cnn
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred, output_dict=True)
# Print Results
print(f"CNN Model Accuracy: {accuracy}")
print(f"CNN Hyperparameter Optimization Time: {opt_time_cnn:.2f} seconds")
print(f"CNN Training Time: {train_time_cnn:.2f} seconds")
print(f"CNN Evaluation Time: {eval_time_cnn:.2f} seconds")
print(f"Total Execution Time: {total_time_cnn:.2f} seconds")
print(classification_report(y_true, y_pred))
print("CNN optimization completed.")

CNN Model Accuracy: 0.9933184855233853
CNN Hyperparameter Optimization Time: 79.00 seconds
CNN Training Time: 11.66 seconds
CNN Evaluation Time: 0.31 seconds
Total Execution Time: 90.98 seconds
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      5856
         1.0       0.99      0.99      0.99      5369

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

CNN optimization completed.
