In [68]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextDataset

import torch
from tqdm import tqdm

# Perplexity

### Load dataset

In [69]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over a text file for the given tokenizer.

    Args:
        file_path: Path of the text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer, forwarded to ``TextDataset``.
        block_size: Block size, forwarded to ``TextDataset`` (default 128).

    Returns:
        The newly constructed ``TextDataset`` instance.
    """
    # NOTE(review): TextDataset is reportedly deprecated in recent
    # transformers releases -- confirm against the installed version.
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

### Perplexity

In [70]:

def perplexity(model, encodings):
    """Compute the perplexity of a causal language model over token blocks.

    Every block is scored on its own: it is passed to ``model`` as a batch
    of one with the block itself as ``labels``, and ``outputs.loss`` is
    taken as that block's loss. The block losses are then averaged,
    weighted by the number of predicted tokens in each block, and the
    result is exponentiated. For equal-length blocks this equals the plain
    mean of the block losses.

    Args:
        model: Callable invoked as ``model(input_ids, labels=...)`` that
            returns an object with a scalar tensor ``loss`` attribute.
            It is called as-is: this function does not switch it to eval
            mode. If it exposes a ``device`` attribute, each block is moved
            to that device before the call.
        encodings: Iterable of token-id tensors, one per block
            (assumed 1-D -- TODO confirm against the dataset used).

    Returns:
        A 0-dim tensor holding ``exp`` of the token-weighted mean loss.

    Raises:
        ValueError: If no block contains at least two tokens, so there is
            nothing to score.
    """
    # NOTE(review): the weighting assumes the model shifts labels internally
    # and averages the loss over N - 1 predictions for an N-token block (the
    # Hugging Face causal-LM convention) -- confirm for other model types.
    device = getattr(model, "device", None)
    block_nlls = []
    block_sizes = []
    with torch.no_grad():
        for input_ids in encodings:
            n_predicted = input_ids.numel() - 1
            if n_predicted < 1:
                # A one-token (or empty) block has no next-token target;
                # its loss would be NaN and poison the average.
                continue
            batch = input_ids.unsqueeze(0)
            if device is not None:
                batch = batch.to(device)
            outputs = model(batch, labels=batch.clone())
            block_nlls.append(outputs.loss)
            block_sizes.append(n_predicted)
    if not block_nlls:
        raise ValueError(
            "perplexity() needs at least one block with two or more tokens"
        )
    losses = torch.stack(block_nlls)
    weights = torch.tensor(block_sizes, dtype=losses.dtype, device=losses.device)
    mean_nll = (losses * weights).sum() / weights.sum()
    return torch.exp(mean_nll)

In [71]:
def perplexity_metric(base_model, model_id, train_file_path):
    """Load a checkpoint and return its perplexity on a text file.

    Args:
        base_model: Architecture family selecting the loader classes:
            ``'gpt2'`` uses ``GPT2LMHeadModel`` / ``GPT2Tokenizer``;
            ``"EleutherAI/pythia-410m"`` uses ``AutoModelForCausalLM`` /
            ``AutoTokenizer``.
        model_id: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        train_file_path: Path of the text file to evaluate on; it is
            tokenized through ``load_dataset``.

    Returns:
        Whatever ``perplexity`` returns for the loaded model and dataset.

    Raises:
        ValueError: If ``base_model`` is not one of the supported values
            (previously this case silently returned ``None``).
    """
    if base_model == 'gpt2':
        model_cls, tokenizer_cls = GPT2LMHeadModel, GPT2Tokenizer
    elif base_model == "EleutherAI/pythia-410m":
        model_cls, tokenizer_cls = AutoModelForCausalLM, AutoTokenizer
    else:
        raise ValueError(
            f"Unsupported base_model {base_model!r}; "
            "expected 'gpt2' or 'EleutherAI/pythia-410m'"
        )

    # Both families share the same load -> tokenize -> score sequence.
    tokenizer = tokenizer_cls.from_pretrained(model_id)
    model = model_cls.from_pretrained(model_id)
    encodings = load_dataset(train_file_path, tokenizer)
    return perplexity(model, encodings)

In [73]:
# GPT-2: perplexity on the pop-song lyrics file for the stock checkpoint and
# for the popsongs fine-tuned checkpoint.
model_id = "gpt2"
model_id_articles_fine_tunned = "gpt2_articles"
model_id_popsongs_fine_tunned = "gpt2_popsongs"
base_model = "gpt2"
train_file_path =  "processed_data/topSongsLyrics1950_2019.txt"

gpt2_perplexity_baseline = perplexity_metric(base_model, model_id, train_file_path)
# This run scores the *popsongs* checkpoint, so store it under a matching name.
gpt2_perplexity_popsongs = perplexity_metric(base_model, model_id_popsongs_fine_tunned, train_file_path)
# Alias kept so cells that still read the old, misleading name keep working.
# NOTE(review): if the articles checkpoint was intended here, evaluate
# model_id_articles_fine_tunned instead and drop this alias.
gpt2_perplexity_articles = gpt2_perplexity_popsongs

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [74]:
# Show both GPT-2 scores, one per line: baseline first, fine-tuned second.
# NOTE(review): despite its name, `gpt2_perplexity_articles` holds the score
# of the checkpoint named by `model_id_popsongs_fine_tunned`.
print(gpt2_perplexity_baseline, gpt2_perplexity_articles, sep="\n")

tensor(51.7010)
tensor(16.0382)


In [77]:
# Pythia-410m: evaluated on the same lyrics file as the GPT-2 cell.
train_file_path = "processed_data/topSongsLyrics1950_2019.txt"
base_model = model_id = "EleutherAI/pythia-410m"
model_id_popsongs_fine_tunned = "eleutherAI_popsongs"
model_id_articles_fine_tunned = "eleutherAI_articles"

# Only the articles fine-tuned checkpoint is scored in this run; the baseline
# and popsongs runs below are left disabled.
# eleuther_perplexity_baseline = perplexity_metric(base_model, model_id, train_file_path)
eleuther_perplexity_articles = perplexity_metric(
    base_model,
    model_id_articles_fine_tunned,
    train_file_path,
)
# eleuther_perplexity_popsongs = perplexity_metric(base_model, model_id_popsongs_fine_tunned, train_file_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [78]:
# Print the Pythia score computed in the previous cell (articles fine-tuned
# checkpoint). The baseline run is disabled there, so its print is disabled
# here as well.
#print(eleuther_perplexity_baseline)
#print(eleuther_perplexity_articles)
print(eleuther_perplexity_articles)



tensor(432.7373)
