# Comparing the perplexity of different language models - ALBERT and GPT-2

### Import libraries

In [1]:
#!pip install torch transformers datasets
#!pip install ipywidgets
#!pip install sentencepiece
#!pip install evaluate

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string

import torch
from transformers import AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from tqdm import tqdm

import random
import warnings

# Suppress tokenizer warnings
warnings.filterwarnings("ignore")


In [3]:
# Set random seed for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [4]:
news_df = pd.read_csv('bbc_news.csv')
news_df.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [5]:
news_df['text'] = news_df['title'] + ' ' + news_df['description']
news_df = news_df[['pubDate','text']]
news_df['pubDate'] = pd.to_datetime(news_df['pubDate'])
news_df['text'] = news_df['text'].astype(str).str.lower()
news_df.head()

Unnamed: 0,pubDate,text
0,2022-03-07 08:01:56,ukraine: angry zelensky vows to punish russian...
1,2022-03-06 22:49:58,war in ukraine: taking cover in a town under a...
2,2022-03-07 00:14:42,ukraine war 'catastrophic for global food' one...
3,2022-03-07 00:05:40,manchester arena bombing: saffie roussos's par...
4,2022-03-07 08:15:53,ukraine conflict: oil price soars to highest l...


In [6]:
news_df.shape

(41780, 2)

In [7]:
# Split data into train and remaining (val+test)
train_size = 0.6
val_size = 0.2


train_data, remaining_data = train_test_split(news_df, test_size=(1 - train_size), random_state=42)
val_data, test_data = train_test_split(remaining_data, test_size=(val_size / (1 - train_size)), random_state=42)


print(f"Train size: {len(train_data)}")
print(f"Val size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 25068
Val size: 8356
Test size: 8356


In [8]:
# Step 1: Load and Preprocess Dataset
def preprocess_nsp(dataset, tokenizer, max_length=128):
    """
    Prepares the dataset for NSP (Next Sentence Prediction).
    Positive examples: Sequential sentences.
    Negative examples: Random sentence pairs.
    """
    inputs, labels = [], []
    dataset = dataset.dropna(subset=["text"])


    for article in dataset["text"]:
        sentences = article.split(". ")  # Split into sentences.
        for i in range(len(sentences) - 1):
            # Positive example: Consecutive sentences.
            inputs.append((sentences[i], sentences[i + 1]))
            labels.append(1)

            # Negative example: Random pair.
            rand_index = np.random.randint(0, len(sentences))
            inputs.append((sentences[i], sentences[rand_index]))
            labels.append(0)

    tokenized = tokenizer(
        [x[0] for x in inputs], [x[1] for x in inputs],
        truncation=True, padding="max_length", max_length=max_length, return_tensors="pt"
    )
    tokenized["labels"] = torch.tensor(labels)
    return tokenized

In [9]:
class NSPDataset(Dataset):
    def __init__(self, tokenized_data):
        """
        Args:
            tokenized_data: Dictionary with tokenized inputs and labels.
        """
        self.input_ids = tokenized_data["input_ids"]
        self.token_type_ids = tokenized_data["token_type_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = tokenized_data["labels"]

    def __len__(self):
        return self.input_ids.shape[0]

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "token_type_ids": self.token_type_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

In [10]:
# Load ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

In [11]:
 # Preprocess dataset for NSP
max_length = 128
tokenized_train = preprocess_nsp(train_data, tokenizer, max_length=max_length)
tokenized_val = preprocess_nsp(val_data, tokenizer, max_length=max_length)
tokenized_test = preprocess_nsp(test_data, tokenizer, max_length=max_length)

#Prepare the dataset
train_data = NSPDataset(tokenized_train)
val_data = NSPDataset(tokenized_val)
test_data = NSPDataset(tokenized_test)

batch_size = 32

train_dataloader = DataLoader(train_data, batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size, shuffle=True)

Use Untrained ALBERT for NSP

In [12]:
untrained_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
untrained_model.to(device)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=

In [13]:
# Evaluation Function
def evaluate_model(model, dataloader):
    """
    Evaluate a model on the given dataloader and calculate metrics including:
    - Accuracy
    - Precision, Recall, F1-Score
    - Perplexity
    """
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0.0
    total_batches = 0

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = outputs.loss

            # Aggregate loss
            total_loss += loss.item()
            total_batches += 1

            # Get predictions
            preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Calculate Metrics
    accuracy = accuracy_score(all_labels, all_preds)

    # Calculate Perplexity
    avg_loss = total_loss / total_batches
    perplexity = np.exp(avg_loss)

    return {
        "accuracy": accuracy,
        "perplexity": perplexity,
    }



In [14]:
# Compute Metrics
metrics = evaluate_model(untrained_model, test_dataloader)
print(f"Evaluation Results for Untrained Model: {metrics}")

# Clear any unused GPU memory
torch.cuda.empty_cache()

Evaluating: 100%|██████████| 22/22 [00:08<00:00,  2.57it/s]

Evaluation Results for Untrained Model: {'accuracy': 0.5759312320916905, 'perplexity': 1.9801576779164154}





Training ALBERT for NSP

In [15]:
# Step 2: Fine-Tune ALBERT for NSP
trained_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
trained_model.to(device)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=

In [16]:
# Optimizer and Loss Function
optimizer = torch.optim.AdamW(trained_model.parameters(), lr=2e-5)

# Training Loop
def train_model(model, train_dataloader, val_dataloader, epochs=3):
    """
    Train the ALBERT model for NSP task.

    Args:
        model: ALBERT model.
        train_dataloader: Dataloader for training data.
        val_dataloader: Dataloader for validation data.
        epochs: Number of epochs to train.

    Returns:
        Trained model.
    """
    model.train()
    best_val_loss = float("inf")

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0.0

        for batch in tqdm(train_dataloader, desc="Training"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Training Loss: {avg_train_loss:.4f}")

        # Evaluate on validation set
        val_metrics = evaluate_model(model, val_dataloader)
        print(f"Validation Metrics: {val_metrics}")

        # Save the best model based on validation loss
        if val_metrics["perplexity"] < best_val_loss:
            best_val_loss = val_metrics["perplexity"]
            torch.save(model.state_dict(), "best_albert_nsp_model.pt")
            print("Saved best model!")

    print("Training Complete!")
    return model


In [18]:
# Train the model
epochs = 10
trained_model = train_model(trained_model, train_dataloader, val_dataloader, epochs=epochs)

Epoch 1/10


Training: 100%|██████████| 69/69 [00:58<00:00,  1.17it/s]


Training Loss: 0.4834


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.44it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6192032825464104}
Saved best model!
Epoch 2/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.44it/s]


Training Loss: 0.4821


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.44it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6186747304465663}
Saved best model!
Epoch 3/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.44it/s]


Training Loss: 0.4828


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.42it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6176383035707265}
Saved best model!
Epoch 4/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.45it/s]


Training Loss: 0.4847


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.46it/s]


Validation Metrics: {'accuracy': 0.7459016393442623, 'perplexity': 1.630962674878983}
Epoch 5/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.45it/s]


Training Loss: 0.4843


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.49it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6300617639651773}
Epoch 6/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.46it/s]


Training Loss: 0.4816


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.50it/s]


Validation Metrics: {'accuracy': 0.744535519125683, 'perplexity': 1.627757123408249}
Epoch 7/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.44it/s]


Training Loss: 0.4786


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.52it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6273247311052184}
Epoch 8/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.45it/s]


Training Loss: 0.4866


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.80it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.640359303389189}
Epoch 9/10


Training: 100%|██████████| 69/69 [00:49<00:00,  1.40it/s]


Training Loss: 0.4832


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.67it/s]


Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.617623438382054}
Saved best model!
Epoch 10/10


Training: 100%|██████████| 69/69 [00:47<00:00,  1.46it/s]


Training Loss: 0.4804


Evaluating: 100%|██████████| 23/23 [00:06<00:00,  3.69it/s]

Validation Metrics: {'accuracy': 0.7472677595628415, 'perplexity': 1.6203362159870336}
Training Complete!





In [19]:

# Load the best model
trained_model.load_state_dict(torch.load("best_albert_nsp_model.pt"))

# Evaluate the trained model on the test set
test_metrics = evaluate_model(trained_model, test_dataloader)
print(f"Evaluation Results for Trained Model: {test_metrics}")

# Clear GPU memory
torch.cuda.empty_cache()

Evaluating: 100%|██████████| 22/22 [00:06<00:00,  3.42it/s]

Evaluation Results for Trained Model: {'accuracy': 0.7464183381088825, 'perplexity': 1.6200539837383197}





Testing the model performance on different domain dataset

In [20]:
amzn_df = pd.read_csv("amazon_reviews.csv")
amzn_df["text"] = amzn_df["reviewText"].str.lower()
amzn_df = amzn_df.drop(["Unnamed: 0", "reviewText"], axis=1)
amzn_df.head()

Unnamed: 0,reviewerID,text
0,A30TL5EWN6DFXT,they look good and stick good! i just don't li...
1,ASY55RVNIL0UD,these stickers work like the review says they ...
2,A2TMXE2AFO7ONB,these are awesome and make my phone look so st...
3,AWJ0WZQYMYFQ4,item arrived in great time and was in perfect ...
4,ATX7CZYFXI1KW,"awesome! stays on, and looks great. can be use..."


In [None]:
amzn_df = amzn_df.sample(n = 1000, random_state= seed)
tokenized_amzndata = preprocess_nsp(amzn_df, tokenizer, max_length=max_length)
amzndata_dataset = NSPDataset(tokenized_amzndata)
amzndata_dataloader = DataLoader(amzndata_dataset, batch_size, shuffle=True)


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [22]:
# Step 2: Fine-Tune ALBERT for NSP
trained_model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
trained_model.to(device)

# Load the best model
trained_model.load_state_dict(torch.load("best_albert_nsp_model.pt"))

# Evaluate the trained model on the test set
test_metrics = evaluate_model(trained_model, amzndata_dataloader)
print(f"Evaluation Results for Trained Model: {test_metrics}")

# Clear GPU memory
torch.cuda.empty_cache()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


Evaluating: 100%|██████████| 241/241 [02:10<00:00,  1.85it/s]

Evaluation Results for Trained Model: {'accuracy': 0.5760051880674448, 'perplexity': 1.9431653631815056}



