<a href="https://colab.research.google.com/github/NavanjanaLAV/SE4050-deeplearning-2025/blob/it22609908-distil-bert/Models/bert-distil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
!pip install -U transformers

Collecting transformers
  Using cached transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Using cached transformers-4.57.0-py3-none-any.whl (12.0 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.56.2
    Uninstalling transformers-4.56.2:
      Successfully uninstalled transformers-4.56.2
Successfully installed transformers-4.57.0


In [3]:
# Location of the cleaned headline dataset (raw CSV hosted in the project's GitHub repo)
url = "https://raw.githubusercontent.com/NavanjanaLAV/SE4050-deeplearning-2025/main/Dataset/fake_or_real_news_cleaned.csv"
# Read the remote CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Preview the first five rows to confirm the expected columns loaded
print(df.head(5))

                                               title  label
0  ben stein calls th circuit court committed cou...      0
1  trump drops steve bannon national security cou...      1
2  puerto rico expects us lift jones act shipping...      1
3  oops trump accidentally confirmed leaked israe...      0
4     donald trump heads scotland reopen golf resort      1


## Step 1 - Split into Train and Validation

In [5]:
from sklearn.model_selection import train_test_split

Basic Data Checks

In [6]:
# Count the null entries in each column and report them under a heading
missing_per_column = df.isnull().sum()
print("Missing values:", missing_per_column, sep="\n")

Missing values:
title    0
label    0
dtype: int64


In [7]:
# Discard any row that lacks a headline or a label (both are required downstream)
df = df.dropna(axis=0, how='any', subset=['title', 'label'])

In [8]:
# Cast the label column to integers so every class id is stored as 0/1
label_as_int = df['label'].astype(int)
df['label'] = label_as_int

In [9]:
# Show how many headlines fall into each class to spot any imbalance
label_counts = df['label'].value_counts()
print("\nLabel distribution:", label_counts, sep="\n")


Label distribution:
label
0    23481
1    21417
Name: count, dtype: int64


Split into Train & Validation Sets

In [10]:
# Pull the two columns out as plain Python lists for splitting and tokenization
texts = df['title'].to_list()      # headline strings
labels = df['label'].to_list()     # matching 0/1 class ids

In [11]:
# Hold out 20% of the headlines for validation and train on the remaining 80%.
# Stratifying on the labels keeps the 0/1 ratio the same in both parts, and the
# fixed random_state makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

In [12]:
# Report how many headlines ended up in each split
print(
    f"Training samples: {len(train_texts)}",
    f"Validation samples: {len(val_texts)}",
    sep="\n",
)

Training samples: 35918
Validation samples: 8980


Save Splits to Disk

In [13]:
# Re-assemble each split into a two-column DataFrame (headline + label)
train_df = pd.DataFrame(data={
    'title': train_texts,
    'label': train_labels,
})
val_df = pd.DataFrame(data={
    'title': val_texts,
    'label': val_labels,
})

In [14]:
# Persist both splits as CSV files in the working directory (row index omitted)
for split_frame, csv_path in ((train_df, 'train.csv'), (val_df, 'val.csv')):
    split_frame.to_csv(csv_path, index=False)

In [15]:
# Confirm stratification worked: the class counts of both splits should show the same ratio
train_label_counts = pd.Series(train_labels).value_counts()
print("Train label distribution:", train_label_counts, sep="\n")

val_label_counts = pd.Series(val_labels).value_counts()
print("\nValidation label distribution:", val_label_counts, sep="\n")

Train label distribution:
0    18785
1    17133
Name: count, dtype: int64

Validation label distribution:
0    4696
1    4284
Name: count, dtype: int64


## Step 2 - Load DistilBERT Tokenizer




In [16]:
# Import the fast tokenizer for DistilBERT from Hugging Face Transformers
from transformers import DistilBertTokenizerFast

In [17]:
# Load the pre-trained tokenizer that matches the model checkpoint used later.
# Why 'distilbert-base-uncased':
#   - "uncased": the headlines are English and casing carries little signal
#   - "base": the standard-size checkpoint (not tiny or large)
#   - "distilbert": a lightweight BERT variant, quick to fine-tune on short headlines
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [18]:
# Smoke-test the tokenizer on a single headline and inspect what it returns
sample_headline = "Trump drops Steve Bannon from security council"
encoded = tokenizer(sample_headline, max_length=128, padding=True, truncation=True)

# Show the returned fields plus the leading token ids and attention-mask entries
print(
    "Tokenizer loaded successfully!",
    f"Sample input: '{sample_headline}'",
    f"Tokenized output keys: {list(encoded.keys())}",
    f"Input IDs (first 10): {encoded['input_ids'][:10]}",
    f"Attention Mask (first 10): {encoded['attention_mask'][:10]}",
    sep="\n",
)

Tokenizer loaded successfully!
Sample input: 'Trump drops Steve Bannon from security council'
Tokenized output keys: ['input_ids', 'attention_mask']
Input IDs (first 10): [101, 8398, 9010, 3889, 7221, 8540, 2013, 3036, 2473, 102]
Attention Mask (first 10): [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


## Tokenize the Full Train & Validation Sets

In [19]:
import torch
from torch.utils.data import Dataset

In [20]:
# Upper bound on tokens per headline; longer inputs are truncated to this length
MAX_LEN = 128

# Options shared by both tokenizer calls.
# padding=True pads to the longest sequence within each call, so the train and
# validation tensors may end up with different widths.
tokenizer_options = dict(
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors='pt',   # return PyTorch tensors rather than Python lists
)

# Encode each split in a single batched call
train_encodings = tokenizer(train_texts, **tokenizer_options)
val_encodings = tokenizer(val_texts, **tokenizer_options)

print(
    "Tokenization complete!",
    f"Train input shape: {train_encodings['input_ids'].shape}",
    f"Val input shape: {val_encodings['input_ids'].shape}",
    sep="\n",
)

Tokenization complete!
Train input shape: torch.Size([35918, 51])
Val input shape: torch.Size([8980, 39])


## Create PyTorch Datasets for Trainer

In [21]:
import torch
from torch.utils.data import Dataset

In [22]:
 # Custom Dataset for fake news classification.
 # Returns: {'input_ids', 'attention_mask', 'labels'} for each sample.
class FakeNewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Each item is a dict holding one row of every encoding field (for example
    ``input_ids`` and ``attention_mask``) plus a ``labels`` entry stored as a
    ``torch.long`` scalar tensor.

    Args:
        encodings: Mapping of field name -> indexable batch (tensor or list)
            with one row per sample.
        labels: Sequence of integer class ids, one per sample.

    Raises:
        ValueError: If any encoding field has a different number of rows than
            ``labels``.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs instead of hitting an index error
        # (or silently dropping rows) part-way through training.
        expected_rows = len(labels)
        for key, val in encodings.items():
            if len(val) != expected_rows:
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} rows "
                    f"but {expected_rows} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoding fields and the label of sample ``idx`` as a dict."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

In [23]:
# Wrap each split's encodings and labels in a FakeNewsDataset
train_dataset = FakeNewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = FakeNewsDataset(encodings=val_encodings, labels=val_labels)

In [24]:
# Sanity check: dataset lengths should match the split sizes printed earlier
print(
    f"Training dataset ready: {len(train_dataset)} samples",
    f"Validation dataset ready: {len(val_dataset)} samples",
    sep="\n",
)

Training dataset ready: 35918 samples
Validation dataset ready: 8980 samples


## Load DistilBERT Model for Binary Classification

In [25]:
from transformers import DistilBertForSequenceClassification

In [26]:
# Load the pre-trained DistilBERT encoder with a fresh sequence-classification head.
#   - 'distilbert-base-uncased': the same checkpoint the tokenizer was loaded from
#   - num_labels=2: two output classes, fake (0) vs real (1)
#   - problem_type: states explicitly that this is single-label (not multi-label) classification
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased',
    problem_type="single_label_classification",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Optional sanity check: report the model class and its total parameter count
total_parameters = sum(p.numel() for p in model.parameters())
print(
    "Model loaded successfully!",
    f"Model type: {type(model).__name__}",
    f"Number of parameters: {total_parameters:,}",
    sep="\n",
)

Model loaded successfully!
Model type: DistilBertForSequenceClassification
Number of parameters: 66,955,010


In [28]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Place the model's weights on the chosen device
model.to(device)
print(f"Model is running on device: {device}")

Model is running on device: cuda


In [29]:
# Peek at classifier head (last layer) — should output 2 logits
classifier_head = model.classifier
print(f"Classifier head: {classifier_head}")

Classifier head: Linear(in_features=768, out_features=2, bias=True)


## Setup Training

In [32]:
from sklearn.metrics import accuracy_score
import numpy as np
from transformers import TrainingArguments, Trainer

In [31]:
# Hyperparameters and runtime options for the fine-tuning run.
# NOTE(review): no evaluation or checkpoint strategy is set, so the library defaults
# apply. Recent transformers releases appear to name the evaluation option
# `eval_strategy` (formerly `evaluation_strategy`) — confirm against the installed
# version before enabling per-epoch evaluation or saving.
training_args = TrainingArguments(
    output_dir='./fake-news-results',
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- batch sizes (per device) ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- logging / reproducibility ---
    logging_steps=100,
    report_to='none',
    seed=42,
    # --- half precision only when a CUDA device is present ---
    fp16=torch.cuda.is_available(),
)

In [33]:
# METRIC FUNCTION
def compute_metrics(eval_pred):
    """Compute classification accuracy from an evaluation prediction pair.

    Args:
        eval_pred: Two-element object that unpacks into (logits, labels).

    Returns:
        dict: ``{'accuracy': <fraction of predictions equal to the labels>}``.
    """
    raw_scores, true_labels = eval_pred
    # The predicted class is the index of the highest score along the last axis
    predicted_classes = np.argmax(raw_scores, axis=-1)
    accuracy = accuracy_score(true_labels, predicted_classes)
    return {'accuracy': accuracy}

# Example output: {'eval_loss': 0.243, 'eval_accuracy': 0.9263}