# 1. Imports

In [2]:
import os
os.environ["HF_HUB_DISABLE_TOKEN_WARNING"] = "1"

from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

import warnings
warnings.simplefilter("ignore")

import random  # Import random module for shuffling data
import torch  # Import PyTorch for tensor computations
import torch.nn as nn  # Import neural network modules
import torch.optim as optim  # Import optimization algorithms
from torch.utils.data import DataLoader, Dataset  # Import PyTorch dataset utilities
from transformers import AutoTokenizer  # Import tokenizer for text processing
from torchinfo import summary  # Import module for model summary
import pandas as pd

print("Import completed.")

Import completed.


# 2. Load Data and Preprocess Text

In [3]:
# Set the path to the file you'd like to load
file_path = "../datasets/WELFake_Dataset.csv"

df = pd.read_csv(file_path)
df = df.dropna(subset=["title", "text", "label"])
df["title"] = df["title"].astype(str)
df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(int)

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

MAX_SAMPLES = 10000
df = df[:MAX_SAMPLES] # Limit dataset for testing

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the texts in the dataframe
tokenized = tokenizer(
    list(df["title"]),
    truncation=True,
    padding="max_length",
    max_length=256,
    return_tensors="pt"
)

# Extract tensors
input_ids = tokenized["input_ids"]
labels = torch.tensor(df["label"].values, dtype=torch.long)

# Combine inputs and labels
data = list(zip(input_ids, labels))

# 3. Prepare Datasets and Dataloaders

In [None]:
# Train-test split
split_idx = int(0.8 * len(data))
train_data = data[:split_idx]
test_data = data[split_idx:]

# Define custom dataset class
class FakeNewsDataset(Dataset):
    def __init__(self, data):
        """
        Initializes the dataset with the provided data.
        """
        self.data = data

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Retrieves the sample at the specified index.
        """
        return self.data[idx]

# Create dataset objects for training and testing
train_dataset = FakeNewsDataset(train_data)  # Initialize training dataset
test_dataset = FakeNewsDataset(test_data)  # Initialize testing dataset

# Define batch size for training and testing
BATCH_SIZE = 32  # Set batch size

# Create DataLoaders for efficient data loading during training and evaluation
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)  # DataLoader for training
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)  # DataLoader for testing

print("Data preparation completed.")

Data preparation completed.


# 4. Define the TextCNN Model

In [22]:
# Define model hyperparameters
VOCAB_SIZE = tokenizer.vocab_size  # Get vocabulary size from tokenizer
EMBEDDING_DIM = 128  # Dimension of word embeddings
NUM_CLASSES = 2  # Number of output classes (real/fake news)
FILTER_SIZES = [3, 5, 7]  # Different filter sizes for convolution layers
NUM_FILTERS = 12  # Number of filters per convolutional layer
NUM_EPOCHS = 10  # Number of training epochs

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, filter_sizes, num_filters):
        """
        Initializes the TextCNN model with embedding, convolutional, and fully connected layers.
        It does not process input data but sets up the model structure.
        """
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding layer
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes  # Apply different filter sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, num_classes)  # Fully connected layer
        self.dropout = nn.Dropout(0.5)  # Dropout for regularization

    def forward(self, x):
        """
        Define how the input data flows through the network.
        It applies the layers defined in __init__() to the input and computes the output.
        This is where the actual computation (like embedding lookup, convolution, activation functions,
        and classification) happens when the model is used.
        """
        x = self.embedding(x).unsqueeze(1)  # Convert input into embeddings and add a channel dimension
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]  # Apply convolution layers
        x = [torch.max(pool, dim=2)[0] for pool in x]  # Apply max pooling
        x = torch.cat(x, dim=1)  # Concatenate feature maps
        x = self.dropout(x)  # Apply dropout
        return self.fc(x)  # Output layer

# Initialize the TextCNN model with predefined parameters
model = TextCNN(VOCAB_SIZE, EMBEDDING_DIM, NUM_CLASSES, FILTER_SIZES, NUM_FILTERS)

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check device availability
model.to(device)  # Move model to selected device

TextCNN(
  (embedding): Embedding(30522, 128)
  (convs): ModuleList(
    (0): Conv2d(1, 12, kernel_size=(3, 128), stride=(1, 1))
    (1): Conv2d(1, 12, kernel_size=(5, 128), stride=(1, 1))
    (2): Conv2d(1, 12, kernel_size=(7, 128), stride=(1, 1))
  )
  (fc): Linear(in_features=36, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

# 5. Train and Evaluate The Model

In [23]:
# Define loss function and optimizer for training
criterion = nn.CrossEntropyLoss()  # Cross-entropy loss for classification
optimizer = optim.Adam(model.parameters(), lr=1e-3)  # Adam optimizer with specified learning rate

def train_model():
    """
    Trains the model for a specified number of epochs.
    """
    model.train()  # Set model to training mode
    for epoch in range(NUM_EPOCHS):  # Loop through epochs
        total_loss = 0  # Initialize total loss
        for batch in train_loader:  # Iterate over training batches
            inputs, labels = batch  # Unpack input features and labels
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to the correct device
            optimizer.zero_grad()  # Reset gradients
            outputs = model(inputs)  # Forward pass through the model
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model weights
            total_loss += loss.item()  # Accumulate loss
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")  # Print average loss per epoch

def evaluate_model():
    """
    Evaluates the model on the test dataset.
    """
    model.eval()  # Set model to evaluation mode
    correct = 0  # Initialize correct predictions count
    total = 0  # Initialize total sample count
    with torch.no_grad():  # Disable gradient computation during evaluation
        for batch in test_loader:  # Iterate over test batches
            inputs, labels = batch  # Unpack inputs and labels
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to the correct device
            outputs = model(inputs)  # Forward pass
            predictions = torch.argmax(outputs, dim=1)  # Get predicted class labels
            correct += (predictions == labels).sum().item()  # Count correct predictions
            total += labels.size(0)  # Count total samples
    print(f"Test Accuracy: {correct / total:.4f}")  # Print test accuracy

# Train and evaluate the model
train_model()
evaluate_model()

Epoch 1/10, Loss: 0.4720
Epoch 2/10, Loss: 0.3281
Epoch 3/10, Loss: 0.2597
Epoch 4/10, Loss: 0.2074
Epoch 5/10, Loss: 0.1649
Epoch 6/10, Loss: 0.1323
Epoch 7/10, Loss: 0.1011
Epoch 8/10, Loss: 0.0887
Epoch 9/10, Loss: 0.0672
Epoch 10/10, Loss: 0.0547
Test Accuracy: 0.9015


In [None]:
import torch.nn.functional as F

def predict_sentiment(text, model, tokenizer, device):
    model.eval()  # Set model to eval mode
    with torch.no_grad():
        # Tokenize the input string
        tokens = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=256,
            return_tensors="pt"
        )
        input_ids = tokens["input_ids"].to(device)  # Move to same device as model

        # Get model prediction
        output = model(input_ids)
        
        return output

texts = [
    "TOP 5 MIND BLOWING ISSUES VOTING AMERICANS REALIZED THIS ELECTION",
    "How US election fraud claims changed as Trump won",
    "'Google AI presented my April Fools' story as real news'",
    "Cwmbran's roundabouts make the Guinness Book of World Records!",
    "A law enforcement sniper assigned to former President Donald Trump’s rally Saturday in Butler, Pennsylvania, says the head of the Secret Service ordered him not to shoot the suspect accused of attempting to assassinate Trump.",
    "A vaccination that is like 38 different vaccines and it looks like it’s meant for a horse” is being given to babies, making them start to change radically, former President and Republican presidential nominee Donald Trump in a phone call to independent presidential candidate Robert F. Kennedy Jr.",
    "A photo taken on Monday shows former President Donald Trump with no damage to his right ear, contrary to reports that it was injured in an attempted assassination on Saturday.",
    "Starbucks is sponsoring the Republican National Convention in Milwaukee.",
    
    "Trump ambushes S African leader with claim of Afrikaners being 'persecuted'",
    "How a joke about rice cost a Japan cabinet minister his job",
    "Ukrainian ex-top official shot dead outside Madrid school",
    "Kneecap member charged with terror offence",
    "Roof of historic Ming Dynasty tower collapses in China"
]

for text in texts:
    logits = predict_sentiment(
        text = text,
        model = model,
        tokenizer = tokenizer,
        device = device
    )
    probs = F.softmax(logits, dim=1)
    prediction = torch.argmax(logits, dim=1)
    print()
    print(text)
    print("This is fake news!" if prediction == 1 else "This is true!")
    print(f"Confidence scores: Real: {round(float(probs[0][0]), 4)}%, Fake: {round(float(probs[0][1]), 4)}%")


TOP 5 MIND BLOWING ISSUES VOTING AMERICANS REALIZED THIS ELECTION
This is fake news!
Confidence scores: Real: 0.0008%, Fake: 0.9992%

How US election fraud claims changed as Trump won
This is fake news!
Confidence scores: Real: 0.068%, Fake: 0.932%

'Google AI presented my April Fools' story as real news'
This is true!
Confidence scores: Real: 0.9985%, Fake: 0.0015%

Cwmbran's roundabouts make the Guinness Book of World Records!
This is fake news!
Confidence scores: Real: 0.0053%, Fake: 0.9947%

A law enforcement sniper assigned to former President Donald Trump’s rally Saturday in Butler, Pennsylvania, says the head of the Secret Service ordered him not to shoot the suspect accused of attempting to assassinate Trump.
This is fake news!
Confidence scores: Real: 0.0166%, Fake: 0.9834%

A vaccination that is like 38 different vaccines and it looks like it’s meant for a horse” is being given to babies, making them start to change radically, former President and Republican presidential nom

: 

# 6. Improving Model Performance

Some hints:
* Hyperparameter tuning, such as increasing or decreasing batch size
* Experiment with different filter sizes and number of filters
* Train for more epochs and observe if performance improves or overfits