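### Perplexity Comparison

Compares the perplexity of the untuned GPT-2 model with that of the fine-tuned model on the held-out validation split.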


In [None]:
# If needed, uncomment the lines below to install the dependencies:
# !pip install torch
# !pip install transformers[torch]

In [None]:
# imports
import torch
import pandas as pd
import numpy as np
from torch.utils.data.dataloader import DataLoader
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from torch.utils.data import random_split, DataLoader
from sklearn.model_selection import train_test_split
import math
import time
import matplotlib.pyplot as plt


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fix the seed through the transformers helper.
set_seed(42)

In [None]:
# Mount Google Drive so the files below are reachable under /content/drive.
drive.mount("/content/drive")

# Project locations on Drive - CHANGE TO WHICHEVER DIRECTORY YOU WANT
path_folder = "/content/drive/MyDrive/Deep_Learning_project/splits"
path_model = "/content/drive/MyDrive/Deep_Learning_project/model"

# The tokenizer is stored in a sub-folder of the splits directory.
path_folder_tokenizer = f"{path_folder}/tokenizer"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Locations of the saved input-id and attention-mask arrays inside the splits folder.
input_ids_path = f"{path_folder}/all_input_ids.npy"
attn_masks_path = f"{path_folder}/all_attention_masks.npy"

In [None]:
# load untuned model: the stock "gpt2" checkpoint, used as the baseline
model_name = "gpt2"
model_untuned = AutoModelForCausalLM.from_pretrained(model_name)

# load tuned model, together with the tokenizer stored under the splits folder
tokenizer = AutoTokenizer.from_pretrained(path_folder_tokenizer)
model = AutoModelForCausalLM.from_pretrained(path_model)

# resize the token embeddings of both models to the tokenizer's vocabulary size,
# so that both accept every token id this tokenizer can produce
# NOTE(review): any embedding rows this adds to the untuned model have never been
# trained - presumably that contributes to its very large perplexity; confirm
model.resize_token_embeddings(len(tokenizer))
model_untuned.resize_token_embeddings(len(tokenizer))

# move both models to the selected device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when CUDA is available, otherwise CPU
model_untuned = model_untuned.to(device)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Dataset wrapper providing __init__, __len__ and __getitem__ so it can be fed to a DataLoader.
class songsDataset:
    """Pairs each stored input-id sequence with the attention mask at the same index."""

    def __init__(self, input_ids, attn_masks):
        """Keep references to the two indexable collections (no copy is made)."""
        self.attn_masks = attn_masks
        self.input_ids = input_ids

    def __len__(self):
        """Return the number of stored input-id sequences."""
        n_samples = len(self.input_ids)
        return n_samples

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for ``idx`` as tensors created on ``device``."""
        ids_tensor = torch.tensor(self.input_ids[idx], device=device)
        mask_tensor = torch.tensor(self.attn_masks[idx], device=device)
        return ids_tensor, mask_tensor

In [None]:
# load input ids and attention masks
input_ids = np.load(input_ids_path)
attn_masks = np.load(attn_masks_path)

# Split into training and validation again, using the same test_size and
# random_state that were used for training. Both arrays go through ONE call so
# that scikit-learn applies the same shuffled indices to the ids and to the
# masks, and raises a ValueError immediately if the two arrays do not contain
# the same number of samples (instead of silently producing misaligned pairs).
(
    input_ids_train,
    input_ids_val,
    attn_masks_train,
    attn_masks_val,
) = train_test_split(input_ids, attn_masks, test_size=0.0005, random_state=123)

# create instances of validation dataset
dataset_val = songsDataset(input_ids_val, attn_masks_val)

print("len of val dataset:", len(dataset_val))

len of val dataset: 139


In [None]:
# Validation loader: one sample per batch, served in dataset order (no shuffling).
validation_dataloader = DataLoader(dataset=dataset_val, batch_size=1, shuffle=False)

In [None]:
# use validation loop on the model to calculate perplexity
def perplexity_model(validation_loader, m, ignore_padding=False):
    """Compute the perplexity of a causal language model over a validation set.

    Perplexity is returned as ``exp(mean batch loss)``: the losses are averaged
    first and exponentiated once. Averaging per-batch ``exp(loss)`` values
    instead lets a single high-loss batch dominate the result.

    Args:
        validation_loader: iterable yielding ``(input_ids, attention_mask)``
            batches. It is the loader that is actually iterated; no global
            loader is consulted. It does not need to support ``len()``.
        m: callable model invoked as ``m(ids, attention_mask=..., labels=...)``
            whose output exposes the loss tensor at index 0.
        ignore_padding: when True, label positions whose attention mask is 0
            are replaced by -100 before the forward pass, so that a model
            following the Hugging Face convention skips them in the loss.
            Assumes the mask is a tensor of the same shape as the ids in which
            0 marks padding. Defaults to False (labels are the raw ids).

    Returns:
        float: ``math.exp`` of the mean per-batch loss.

    Raises:
        ValueError: if ``validation_loader`` yields no batches.
    """
    total_loss = 0.0
    val_steps = 0  # counted while iterating so loaders without __len__ work too

    with torch.no_grad():  # disable gradients for validation
        for ids, masks in validation_loader:
            if ignore_padding:
                # -100 is the label value ignored by the Hugging Face LM loss
                labels = ids.masked_fill(masks == 0, -100)
            else:
                labels = ids

            outputs = m(ids, attention_mask=masks, labels=labels)  # run model forward
            total_loss += outputs[0].item()
            val_steps += 1

    if val_steps == 0:
        raise ValueError("validation_loader yielded no batches; cannot compute perplexity")

    # exponent of the average loss
    return math.exp(total_loss / val_steps)

In [None]:
# Compare perplexity of the untuned baseline and the tuned model on the same validation loader
perplexity_untuned = perplexity_model(validation_dataloader, model_untuned)
perplexity_tuned = perplexity_model(validation_dataloader, model)
print(f"Average perplexity of untuned model over validation dataset: {perplexity_untuned}")
print(f"Average perplexity of tuned model over validation dataset: {perplexity_tuned}")

Average perplexity of untuned model over validation dataet: 5.676198831197441e+51
Average perplexity of tuned model over validation dataet: 8.476248169339895
