<a href="https://colab.research.google.com/github/NavanjanaLAV/SE4050-deeplearning-2025/blob/IT22609908-bert-distil/bert_distil.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
pip install torch transformers datasets pandas scikit-learn



In [None]:
# Read the cleaned headline dataset straight from the course repository
url = (
    "https://raw.githubusercontent.com/NavanjanaLAV/SE4050-deeplearning-2025/"
    "main/Dataset/fake_or_real_news_cleaned.csv"
)
df = pd.read_csv(url)

In [None]:
# Preview the first five rows to confirm the columns loaded as expected
print(df.head(5))

                                               title  label
0  ben stein calls th circuit court committed cou...      0
1  trump drops steve bannon national security cou...      1
2  puerto rico expects us lift jones act shipping...      1
3  oops trump accidentally confirmed leaked israe...      0
4     donald trump heads scotland reopen golf resort      1


## Step 1 - Split into Train and Validation

In [None]:
from sklearn.model_selection import train_test_split

Basic Data Checks

In [None]:
# Report how many null entries each column contains
print("Missing values:")
null_counts = df.isna().sum()
print(null_counts)

Missing values:
title    0
label    0
dtype: int64


In [None]:
# Keep only rows that have both a headline and a label
df = df.dropna(how='any', subset=['title', 'label'])

In [None]:
# Coerce the label column to integers (0/1)
label_as_int = df['label'].astype(int)
df['label'] = label_as_int

In [None]:
# Show how many rows fall into each class
print("\nLabel distribution:")
label_counts = df['label'].value_counts()
print(label_counts)


Label distribution:
label
0    23481
1    21417
Name: count, dtype: int64


Split into Train & Validation Sets

In [None]:
# Pull the headlines and their labels out of the frame as plain Python lists
texts, labels = df['title'].to_list(), df['label'].to_list()

In [None]:
# Hold out 20% of the headlines for validation.
# Stratifying on the labels keeps the 0/1 ratio the same in both splits,
# and the fixed random_state makes the split reproducible.
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
train_texts, val_texts, train_labels, val_labels = split_parts

In [None]:
# Report how many headlines ended up in each split
n_train, n_val = len(train_texts), len(val_texts)
print(f"Training samples: {n_train}")
print(f"Validation samples: {n_val}")

Training samples: 35918
Validation samples: 8980


Save Splits to Disk

In [None]:
# Rebuild one two-column frame (title, label) per split
train_df = pd.DataFrame(dict(title=train_texts, label=train_labels))
val_df = pd.DataFrame(dict(title=val_texts, label=val_labels))

In [None]:
# Persist each split to its own CSV file, without the index column
for split_frame, csv_path in ((train_df, 'train.csv'), (val_df, 'val.csv')):
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Confirm that stratification kept the class ratio the same in both splits
print("Train label distribution:")
train_label_counts = pd.Series(train_labels).value_counts()
print(train_label_counts)

print("\nValidation label distribution:")
val_label_counts = pd.Series(val_labels).value_counts()
print(val_label_counts)

Train label distribution:
0    18785
1    17133
Name: count, dtype: int64

Validation label distribution:
0    4696
1    4284
Name: count, dtype: int64


## Step 2 - Tokenization for BERT

In [None]:
from transformers import BertTokenizer
import torch

In [None]:
# 1. Load the tokenizer for the 'bert-base-uncased' checkpoint
# (the uncased variant lowercases text before tokenizing)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# 2. Maximum sequence length, in tokens, passed to the tokenizer as `max_length`
# Most headlines are short. 128 is safe and memory-efficient.
# BERT supports up to 512, but no need for headlines.
MAX_LEN = 128

In [None]:
# 3. Tokenize TRAIN headlines
# truncation=True: cut headlines longer than MAX_LEN tokens
# padding='max_length': pad every headline to exactly MAX_LEN tokens.
#   (padding=True would only pad to the longest headline in this call, so the
#   tensor width would depend on the data rather than on MAX_LEN.)
# return_tensors='pt': return PyTorch tensors (not lists)
train_encodings = tokenizer(
    train_texts,             # list of headline strings
    truncation=True,         # truncate to MAX_LEN if longer
    padding='max_length',    # pad to MAX_LEN if shorter
    max_length=MAX_LEN,      # enforce fixed length
    return_tensors='pt'      # return PyTorch tensors
)

In [None]:
# 4. Tokenize VALIDATION headlines
# padding='max_length' pads every headline to exactly MAX_LEN tokens, so the
# tensor width is fixed by MAX_LEN instead of by the longest headline in this
# split (which is what padding=True would use).
val_encodings = tokenizer(
    val_texts,               # list of headline strings
    truncation=True,         # truncate to MAX_LEN if longer
    padding='max_length',    # pad to MAX_LEN if shorter
    max_length=MAX_LEN,      # enforce fixed length
    return_tensors='pt'      # return PyTorch tensors
)

In [None]:
# 5. Create PyTorch Dataset class
# Wraps the tokenized inputs and their labels so each sample comes back as one
# dict of tensors (the original author targets the Hugging Face Trainer).
class FakeNewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids', 'attention_mask')
            to a per-sample indexable container such as a 2-D tensor.
        labels: Sequence of integer class labels, one per sample.

    Raises:
        ValueError: If any encoding field does not have exactly one row per label.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs; otherwise the mismatch only shows up
        # later as an IndexError or as silently unused rows.
        expected = len(labels)
        for field, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field '{field}' has {len(values)} rows "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings  # dict-like with 'input_ids', 'attention_mask', ...
        self.labels = labels        # sequence of integer labels

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict of its encoding fields plus 'labels'."""
        # Slice row `idx` out of every encoding field
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Attach the label as a long tensor under the 'labels' key
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples, which is the number of labels."""
        return len(self.labels)

In [None]:
# 6. Wrap each split's encodings and labels in a FakeNewsDataset
train_dataset = FakeNewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = FakeNewsDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# 7. Quick check
print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

# Look at the first training sample's tensors
first_sample = train_dataset[0]
# The length is the padded width produced by the tokenizer call. It equals
# MAX_LEN only when padding='max_length' is used; padding=True pads to the
# longest headline instead.
print(f"Sample input shape: {first_sample['input_ids'].shape}")
# Summing the attention mask gives the real (non-padding) token count
print(f"Sample attention mask sum: {first_sample['attention_mask'].sum()}")

Train dataset size: 35918
Val dataset size: 8980
Sample input shape: torch.Size([51])
Sample attention mask sum: 8


## Step 3 - Load BERT Model

In [None]:
from transformers import BertForSequenceClassification
import torch

In [None]:
# 1. Load pre-trained BERT ('bert-base-uncased') with a classification head
# num_labels=2 because the label column holds two classes, 0 and 1.
# NOTE(review): this notebook never establishes which integer means "real" and
# which means "fake" - confirm against the dataset source before interpreting
# predictions.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 2. Use the GPU when one is available, otherwise the CPU, and move the model there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

print(f"Model loaded on device: {device}")
print(f"Model type: {type(model).__name__}")
# Total number of scalar parameters across every tensor in the model
param_count = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {param_count:,}")

Model loaded on device: cuda
Model type: BertForSequenceClassification
Number of parameters: 109,483,778
