In [1]:
# Install necessary libraries
!pip install transformers
!pip install tqdm

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tqdm.auto import tqdm

# Pick the compute device: CUDA when a GPU is visible to torch, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [2]:
# Read the news CSV from the Kaggle input directory into a DataFrame
news_csv_path = '/kaggle/input/bert-fake-news-detection/News_Data.csv'
df = pd.read_csv(news_csv_path)

# Show the first rows as a quick sanity check of the loaded columns
df.head()


Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,Donald Trump s White House is in chaos and the...,News,21-Jul-17,0.0
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,7-May-16,0.0
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe He supports exg...,News,3-Dec-16,0.0
3,California AG pledges to defend birth control ...,SAN FRANCISCO Reuters California Attorney Gen...,politicsNews,6-Oct-17,1.0
4,AZ RANCHERS Living On US-Mexico Border Destroy...,Twisted reasoning is all that comes from Pelos...,politics,25-Apr-17,0.0


In [3]:
# Tally how many rows carry each label value and show the full tally
label_counts = df['label'].value_counts()
print(label_counts)

# Label 1 is reported as "True" news and label 0 as "False" news;
# a label that never occurs is reported as 0.
n_true_articles = label_counts.get(1, 0)
n_false_articles = label_counts.get(0, 0)
print(f"Number of True news articles: {n_true_articles}")
print(f"Number of False news articles: {n_false_articles}")


label
0.0    23463
1.0    21417
Name: count, dtype: int64
Number of True news articles: 21417
Number of False news articles: 23463


In [4]:
# Build the model input by joining each article's title and body with a space.
# A missing title or text propagates as NaN into 'content'.
title_col = df['title']
body_col = df['text']
df['content'] = title_col + ' ' + body_col

# Discard rows lacking either the combined text or a label (modifies df in place)
required_columns = ['content', 'label']
df.dropna(subset=required_columns, inplace=True)

# Labels are read as floats (0.0 / 1.0); cast them to integers
df['label'] = df['label'].astype(int)


In [5]:
# Hold out 20% of the rows for validation, stratified on the label column
# so both splits keep the same label proportions; the seed fixes the split.
all_texts = df['content'].tolist()
all_labels = df['label'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


In [6]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

# Create a custom dataset class
class FakeNewsDataset(Dataset):
    """Map-style dataset of pre-tokenized texts paired with integer labels.

    Every text is tokenized once in ``__init__`` and padded/truncated to
    ``max_length`` tokens; ``__getitem__`` only slices the stored tensors.

    Args:
        texts: List of input strings.
        labels: Sequence of integer class labels, one per entry of ``texts``.
        max_length: Pad/truncate length handed to the tokenizer. Defaults to
            256, the value previously hard-coded here to reduce memory usage.
        text_tokenizer: Callable accepting the keyword arguments used in
            ``__init__`` and returning a mapping of tensors indexed by example.
            When ``None``, the module-level ``tokenizer`` is used.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, max_length=256, text_tokenizer=None):
        # Fail early: a length mismatch would otherwise pair texts with the
        # wrong labels or surface later as an IndexError inside a DataLoader.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        # Fall back to the notebook-level tokenizer when none is supplied
        tok = tokenizer if text_tokenizer is None else text_tokenizer

        self.encodings = tok(
            texts,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields for example ``idx`` plus a ``labels`` tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Long dtype is what the training loop passes to the model as class indices
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Tokenize both splits up front: training split first, then validation split
train_dataset, val_dataset = (
    FakeNewsDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Number of examples per batch; tune to fit the available GPU memory
batch_size = 32

# Training batches are reshuffled every epoch; validation keeps dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


In [12]:
# Instantiate BERT with a 2-way sequence-classification head from the
# 'bert-base-uncased' checkpoint and move it onto the selected device
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
model = model.to(device)

# Display the module structure as the cell output
model


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
# Define optimizer and scheduler.
# Uses PyTorch's AdamW: the AdamW class shipped with `transformers` is deprecated
# in favour of torch.optim.AdamW. `eps` and `weight_decay` are set explicitly to
# the defaults of the transformers implementation used previously, so only the
# bias-correction behaviour differs (torch.optim.AdamW always applies it; there
# is no `correct_bias` switch).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)
epochs = 5  # Adjust as necessary

# One scheduler step is taken per optimizer step, i.e. per training batch
total_steps = len(train_loader) * epochs

# Linear decay of the learning rate to zero over all training steps, no warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [14]:
# Loss function.
# NOTE: `criterion` is not referenced by the loop below; the loss used for
# backpropagation is `outputs.loss`, returned by the model because `labels`
# is passed to the forward call. The object is kept for any external use.
criterion = torch.nn.CrossEntropyLoss()

# Training loop: each epoch runs one pass over train_loader (with parameter
# updates) followed by one pass over val_loader (no gradients).
# Loss and accuracy are accumulated per example rather than per batch, so a
# smaller final batch does not skew the reported averages.
for epoch in range(epochs):
    print(f'\n======== Epoch {epoch + 1} / {epochs} ========')
    print('Training...')

    model.train()
    total_train_loss = 0.0     # sum of per-example losses
    total_train_correct = 0    # number of correctly classified examples
    total_train_examples = 0   # number of examples seen
    train_steps = 0            # number of optimizer steps taken this epoch

    for batch in tqdm(train_loader, desc='Training', leave=False):
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        n_examples = labels.size(0)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels  # Including labels here allows automatic loss computation
        )

        loss = outputs.loss
        logits = outputs.logits

        # Assumes outputs.loss is the mean over the batch; scale by the batch
        # size to obtain a per-example sum.
        total_train_loss += loss.item() * n_examples

        # Backward pass
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Update optimizer and scheduler
        optimizer.step()
        scheduler.step()

        # Count correct predictions for this batch
        preds = torch.argmax(logits, dim=1)
        total_train_correct += (preds == labels).sum().item()
        total_train_examples += n_examples
        train_steps += 1

    avg_train_loss = total_train_loss / total_train_examples
    avg_train_accuracy = total_train_correct / total_train_examples

    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Training Accuracy: {avg_train_accuracy:.4f}")

    # Validation
    print('\nRunning Validation...')
    model.eval()
    total_val_loss = 0.0      # sum of per-example losses
    total_val_correct = 0     # number of correctly classified examples
    total_val_examples = 0    # number of examples seen
    val_steps = 0             # number of validation batches processed

    for batch in tqdm(val_loader, desc='Validation', leave=False):
        with torch.no_grad():
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            n_examples = labels.size(0)

            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            logits = outputs.logits

        total_val_loss += loss.item() * n_examples

        # Count correct predictions for this batch
        preds = torch.argmax(logits, dim=1)
        total_val_correct += (preds == labels).sum().item()
        total_val_examples += n_examples
        val_steps += 1

    avg_val_loss = total_val_loss / total_val_examples
    avg_val_accuracy = total_val_correct / total_val_examples

    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {avg_val_accuracy:.4f}")



Training...


Training:   0%|          | 0/1120 [00:00<?, ?it/s]

Training Loss: 0.0160
Training Accuracy: 0.9950

Running Validation...


Validation:   0%|          | 0/280 [00:00<?, ?it/s]

Validation Loss: 0.0031
Validation Accuracy: 0.9996

Training...


Training:   0%|          | 0/1120 [00:00<?, ?it/s]

Training Loss: 0.0013
Training Accuracy: 0.9997

Running Validation...


Validation:   0%|          | 0/280 [00:00<?, ?it/s]

Validation Loss: 0.0029
Validation Accuracy: 0.9997

Training...


Training:   0%|          | 0/1120 [00:00<?, ?it/s]

Training Loss: 0.0005
Training Accuracy: 0.9999

Running Validation...


Validation:   0%|          | 0/280 [00:00<?, ?it/s]

Validation Loss: 0.0012
Validation Accuracy: 0.9999

Training...


Training:   0%|          | 0/1120 [00:00<?, ?it/s]

Training Loss: 0.0000
Training Accuracy: 1.0000

Running Validation...


Validation:   0%|          | 0/280 [00:00<?, ?it/s]

Validation Loss: 0.0014
Validation Accuracy: 0.9998

Training...


Training:   0%|          | 0/1120 [00:00<?, ?it/s]

Training Loss: 0.0000
Training Accuracy: 1.0000

Running Validation...


Validation:   0%|          | 0/280 [00:00<?, ?it/s]

Validation Loss: 0.0022
Validation Accuracy: 0.9998


In [15]:
print('\nEvaluating on Validation Set...')
model.eval()

# Predicted and reference class ids, collected across all validation batches
predictions = []
true_labels = []

# Inference only: no gradients are needed anywhere in this pass
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Evaluating', leave=False):
        # Transfer the tensors the model needs (and the labels) to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are not passed here, so only the logits are read from the output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Highest-scoring class per example
        preds = logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Per-class precision / recall / F1; class 0 is named 'Fake', class 1 'Real'
print('\nClassification Report:')
class_names = ['Fake', 'Real']
print(classification_report(true_labels, predictions, target_names=class_names))

# Rows are true classes, columns are predicted classes
print('\nConfusion Matrix:')
print(confusion_matrix(true_labels, predictions))



Evaluating on Validation Set...


Evaluating:   0%|          | 0/280 [00:00<?, ?it/s]


Classification Report:
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00      4676
        Real       1.00      1.00      1.00      4284

    accuracy                           1.00      8960
   macro avg       1.00      1.00      1.00      8960
weighted avg       1.00      1.00      1.00      8960


Confusion Matrix:
[[4675    1]
 [   1 4283]]


In [16]:
# Persist the fine-tuned model and its tokenizer to separate output directories
model_dir = '/kaggle/working/fake-news-bert-model'
tokenizer_dir = '/kaggle/working/fake-news-bert-tokenizer'

model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)


('/kaggle/working/fake-news-bert-tokenizer/tokenizer_config.json',
 '/kaggle/working/fake-news-bert-tokenizer/special_tokens_map.json',
 '/kaggle/working/fake-news-bert-tokenizer/vocab.txt',
 '/kaggle/working/fake-news-bert-tokenizer/added_tokens.json')

In [18]:
# Bundle the files in the saved tokenizer directory into Tokenizer.zip
# under /kaggle/working (recursing into any subdirectories).
!zip -r /kaggle/working/Tokenizer.zip /kaggle/working/fake-news-bert-tokenizer/*


updating: kaggle/working/fake-news-bert-tokenizer/special_tokens_map.json (deflated 42%)
updating: kaggle/working/fake-news-bert-tokenizer/vocab.txt (deflated 53%)
updating: kaggle/working/fake-news-bert-tokenizer/tokenizer_config.json (deflated 75%)


In [19]:
# Bundle the files in the saved model directory into model.zip
# under /kaggle/working (recursing into any subdirectories).
!zip -r /kaggle/working/model.zip /kaggle/working/fake-news-bert-model/*


  adding: kaggle/working/fake-news-bert-model/config.json (deflated 49%)
  adding: kaggle/working/fake-news-bert-model/model.safetensors (deflated 7%)
