# Comparing the perplexity of different language models - ALBERT and GPT-2

### Import libraries

In [1]:
#!pip install torch transformers datasets
#!pip install ipywidgets
#!pip install sentencepiece
#!pip install evaluate

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string

import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm import tqdm

import random
import warnings

# Suppress tokenizer warnings
warnings.filterwarnings("ignore")

In [3]:
# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [4]:
news_df = pd.read_csv('bbc_news.csv')
news_df.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [5]:
news_df['text'] = news_df['title'] + ' ' + news_df['description']
news_df = news_df[['pubDate','text']]
news_df['pubDate'] = pd.to_datetime(news_df['pubDate'])
news_df['text'] = news_df['text'].astype(str).str.lower()
news_df.head()

Unnamed: 0,pubDate,text
0,2022-03-07 08:01:56,ukraine: angry zelensky vows to punish russian...
1,2022-03-06 22:49:58,war in ukraine: taking cover in a town under a...
2,2022-03-07 00:14:42,ukraine war 'catastrophic for global food' one...
3,2022-03-07 00:05:40,manchester arena bombing: saffie roussos's par...
4,2022-03-07 08:15:53,ukraine conflict: oil price soars to highest l...


In [6]:
news_df.shape

(41780, 2)

In [7]:
# Split data into train and remaining (val+test)
train_size = 0.6
val_size = 0.2


train_data, remaining_data = train_test_split(news_df, test_size=(1 - train_size), random_state=42)
val_data, test_data = train_test_split(remaining_data, test_size=(val_size / (1 - train_size)), random_state=42)


print(f"Train size: {len(train_data)}")
print(f"Val size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 25068
Val size: 8356
Test size: 8356


In [8]:
# Preprocess Dataset
def preprocess_nsp(dataset, tokenizer, max_length=128):
    """
    Prepares the dataset for NSP with GPT-2.
    Positive examples: Sequential sentences.
    Negative examples: Random sentence pairs.
    """
    inputs, labels = [], []
    dataset = dataset.dropna(subset=["text"])

    for article in dataset["text"]:
        sentences = article.split(". ")  # Split into sentences.
        for i in range(len(sentences) - 1):
            # Positive example: Consecutive sentences.
            inputs.append((sentences[i], sentences[i + 1]))
            labels.append(1)

            # Negative example: Random pair.
            rand_index = np.random.randint(0, len(sentences))
            inputs.append((sentences[i], sentences[rand_index]))
            labels.append(0)

    encodings = tokenizer(
        [x[0] + " " + x[1] for x in inputs],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    encodings["labels"] = torch.tensor(labels)
    return encodings

In [9]:
class NSPDataset(Dataset):
    def __init__(self, tokenized_data):
        """
        Args:
            tokenized_data: Dictionary with tokenized inputs and labels.
        """
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = tokenized_data["labels"]

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

In [10]:
# Load ALBERT tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Ensure the tokenizer can handle sequences with special tokens
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Preprocess datasets
max_length = 128
tokenized_train = preprocess_nsp(train_data, tokenizer, max_length=max_length)
tokenized_val = preprocess_nsp(val_data, tokenizer, max_length=max_length)
tokenized_test = preprocess_nsp(test_data, tokenizer, max_length=max_length)

train_dataset = NSPDataset(tokenized_train)
val_dataset = NSPDataset(tokenized_val)
test_dataset = NSPDataset(tokenized_test)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=True)


Use Untrained ALBERT for NSP

In [12]:
untrained_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# Ensure the model recognizes the pad token
untrained_model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
untrained_model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [13]:
# Evaluation Function
def evaluate_model(model, dataloader):
    """
    Evaluate GPT-2 on NSP and calculate:
    - Accuracy
    - Perplexity
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    total_batches = 0

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = outputs.loss

            # Aggregate loss
            total_loss += loss.item()
            total_batches += 1

            # Get predictions
            preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    avg_loss = total_loss / total_batches
    perplexity = np.exp(avg_loss)

    return {
        "accuracy": accuracy,
        "perplexity": perplexity,
    }





In [14]:
# Compute Metrics
metrics = evaluate_model(untrained_model, test_dataloader)
print(f"Evaluation Results for Model: {metrics}")

# Clear any unused GPU memory
torch.cuda.empty_cache()

Evaluating: 100%|██████████| 22/22 [00:05<00:00,  3.94it/s]

Evaluation Results for Model: {'accuracy': 0.48567335243553006, 'perplexity': 395.218373570835}





Training GPT-2 for NSP

In [15]:
# Step 2: Fine-Tune ALBERT for NSP
trained_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# Ensure the model recognizes the pad token
trained_model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
trained_model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [16]:
optimizer = torch.optim.AdamW(trained_model.parameters(), lr=2e-5)

# Training Loop
def train_model(model, train_dataloader, val_dataloader, epochs=3):
    """
    Fine-tunes GPT-2 on the NSP task.
    """
    best_val_loss = float("inf")

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_dataloader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Training Loss: {avg_train_loss:.4f}")

        # Evaluate on validation set
        val_metrics = evaluate_model(model, val_dataloader)
        print(f"Validation Metrics: {val_metrics}")

        # Save the best model based on validation perplexity
        if val_metrics["perplexity"] < best_val_loss:
            best_val_loss = val_metrics["perplexity"]
            torch.save(model.state_dict(), "best_gpt2_nsp_model.pt")
            print("Saved best model!")

    print("Training Complete!")
    return model

In [17]:
# Train the model
epochs = 10
trained_model = train_model(trained_model, train_dataloader, val_dataloader, epochs=epochs)

Epoch 1/10


Training: 100%|██████████| 69/69 [07:55<00:00,  6.89s/it]


Training Loss: 0.6443


Evaluating: 100%|██████████| 23/23 [00:17<00:00,  1.28it/s]


Validation Metrics: {'accuracy': 0.7377049180327869, 'perplexity': 1.6557128558682153}
Saved best model!
Epoch 2/10


Training: 100%|██████████| 69/69 [07:38<00:00,  6.65s/it]


Training Loss: 0.5118


Evaluating: 100%|██████████| 23/23 [00:17<00:00,  1.34it/s]


Validation Metrics: {'accuracy': 0.7377049180327869, 'perplexity': 1.6485781744583876}
Saved best model!
Epoch 3/10


Training: 100%|██████████| 69/69 [07:10<00:00,  6.24s/it]


Training Loss: 0.4980


Evaluating: 100%|██████████| 23/23 [00:07<00:00,  3.18it/s]


Validation Metrics: {'accuracy': 0.7431693989071039, 'perplexity': 1.6429996741451445}
Saved best model!
Epoch 4/10


Training: 100%|██████████| 69/69 [06:32<00:00,  5.69s/it]


Training Loss: 0.4953


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.35it/s]


Validation Metrics: {'accuracy': 0.744535519125683, 'perplexity': 1.6398694188791594}
Saved best model!
Epoch 5/10


Training: 100%|██████████| 69/69 [06:29<00:00,  5.64s/it]


Training Loss: 0.4919


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.48it/s]


Validation Metrics: {'accuracy': 0.744535519125683, 'perplexity': 1.63336986144894}
Saved best model!
Epoch 6/10


Training: 100%|██████████| 69/69 [06:44<00:00,  5.86s/it]


Training Loss: 0.4828


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.50it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6316772916327267}
Saved best model!
Epoch 7/10


Training: 100%|██████████| 69/69 [06:32<00:00,  5.69s/it]


Training Loss: 0.4826


Evaluating: 100%|██████████| 23/23 [00:07<00:00,  3.05it/s]


Validation Metrics: {'accuracy': 0.7390710382513661, 'perplexity': 1.6231631442971692}
Saved best model!
Epoch 8/10


Training: 100%|██████████| 69/69 [06:33<00:00,  5.70s/it]


Training Loss: 0.4785


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.43it/s]


Validation Metrics: {'accuracy': 0.7431693989071039, 'perplexity': 1.6238882457509924}
Epoch 9/10


Training: 100%|██████████| 69/69 [06:32<00:00,  5.68s/it]


Training Loss: 0.4746


Evaluating: 100%|██████████| 23/23 [00:07<00:00,  3.19it/s]


Validation Metrics: {'accuracy': 0.7459016393442623, 'perplexity': 1.6304140212084093}
Epoch 10/10


Training: 100%|██████████| 69/69 [06:32<00:00,  5.69s/it]


Training Loss: 0.4701


Evaluating: 100%|██████████| 23/23 [00:07<00:00,  3.19it/s]

Validation Metrics: {'accuracy': 0.7431693989071039, 'perplexity': 1.6399153952484489}
Training Complete!





In [18]:
# Load the best model
trained_model.load_state_dict(torch.load("best_gpt2_nsp_model.pt"))

# Evaluate on the test set
test_metrics = evaluate_model(trained_model, test_dataloader)
print(f"Evaluation Results for Trained Model: {test_metrics}")

# Clear GPU memory
torch.cuda.empty_cache()

Evaluating: 100%|██████████| 22/22 [00:15<00:00,  1.43it/s]

Evaluation Results for Trained Model: {'accuracy': 0.7435530085959885, 'perplexity': 1.627335446527245}





Testing the model performance on different domain dataset

In [19]:
amzn_df = pd.read_csv("amazon_reviews.csv")
amzn_df["text"] = amzn_df["reviewText"].str.lower()
amzn_df = amzn_df.drop(["Unnamed: 0", "reviewText"], axis=1)
amzn_df.head()

Unnamed: 0,reviewerID,text
0,A30TL5EWN6DFXT,they look good and stick good! i just don't li...
1,ASY55RVNIL0UD,these stickers work like the review says they ...
2,A2TMXE2AFO7ONB,these are awesome and make my phone look so st...
3,AWJ0WZQYMYFQ4,item arrived in great time and was in perfect ...
4,ATX7CZYFXI1KW,"awesome! stays on, and looks great. can be use..."


In [None]:
amzn_df = amzn_df.sample(n = 1000, random_state= seed)
tokenized_amzndata = preprocess_nsp(amzn_df, tokenizer, max_length=max_length)
amzndata_dataset = NSPDataset(tokenized_amzndata)
amzndata_dataloader = DataLoader(amzndata_dataset, batch_size=32 , shuffle=True)

In [21]:
# Step 2: Fine-Tune ALBERT for NSP
trained_model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# Ensure the model recognizes the pad token
trained_model.config.pad_token_id = tokenizer.pad_token_id

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
trained_model.to(device)

trained_model.load_state_dict(torch.load("best_gpt2_nsp_model.pt"))

# Evaluate on the test set
test_metrics = evaluate_model(trained_model, amzndata_dataloader)
print(f"Evaluation Results for Trained GPT-2 Model: {test_metrics}")

# Clear GPU memory
torch.cuda.empty_cache()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


Evaluating: 100%|██████████| 241/241 [02:32<00:00,  1.58it/s]

Evaluation Results for Trained GPT-2 Model: {'accuracy': 0.5653696498054475, 'perplexity': 1.986491947801931}



