In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset

# Tokenizer for the "gpt2" checkpoint (loaded first; independent of the model)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Pre-trained GPT-2 with its language-modelling head, from the same checkpoint
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define a custom dataset to process your Shakespearean text
class ShakespeareTextDataset(Dataset):
    """Token-id windows covering an entire text file.

    The file is read as UTF-8 and tokenized once; the resulting id sequence
    is then cut into consecutive, non-overlapping windows of at most
    ``max_length`` ids. Every token of the file therefore ends up in exactly
    one example, instead of only the first ``max_length`` tokens being kept.

    Args:
        file_path: Path of the text file to load.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a list of integer token ids.
        max_length: Maximum number of token ids per example. Must be >= 1.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.

    Note:
        The final window may be shorter than ``max_length``, so examples are
        not guaranteed to share one length. An empty file yields a dataset
        of length 0.
    """

    def __init__(self, file_path, tokenizer, max_length=1024):
        if max_length < 1:
            raise ValueError("max_length must be a positive integer")

        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        # Encode without a length cap: truncating here would silently drop
        # everything after the first window. Windowing is done below instead.
        token_ids = tokenizer.encode(text, add_special_tokens=True)

        self.examples = [
            token_ids[start:start + max_length]
            for start in range(0, len(token_ids), max_length)
        ]

    def __len__(self):
        """Return the number of token windows."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return window ``idx`` as a 1-D tensor of token ids."""
        return torch.tensor(self.examples[idx])

def compute_perplexity(file):
    """Print and return the perplexity of the module-level model on a file.

    The file is loaded through ``ShakespeareTextDataset`` and fed to the
    model one example at a time. Perplexity is ``exp`` of the average
    negative log-likelihood per predicted token, accumulated over all
    examples.

    Args:
        file: Path of the text file to evaluate.

    Returns:
        The perplexity as a Python float (it is also printed).

    Raises:
        ValueError: If the file yields no example with at least two tokens,
            so there is nothing to predict.
    """
    # Load your Shakespearean text data
    dataset = ShakespeareTextDataset(file, tokenizer)

    # Create a DataLoader for batching
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

    # Evaluation mode: dropout must be off for a meaningful likelihood.
    model.eval()

    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch.to(model.device)

            # With labels=input_ids the model predicts token t+1 from the
            # tokens up to t, so n input tokens give n - 1 predictions.
            num_predicted = input_ids.size(1) - 1
            if num_predicted < 1:
                # A single token has no next-token target; skip it.
                continue

            loss = model(input_ids, labels=input_ids).loss

            # `loss` is already the mean over the predictions of this
            # example. Turn it back into a sum so that examples of
            # different lengths are weighted correctly, and so the final
            # division by the token count happens exactly once.
            total_loss += loss.item() * num_predicted
            total_tokens += num_predicted

    if total_tokens == 0:
        raise ValueError(f"No tokens to evaluate in {file!r}")

    perplexity = torch.exp(torch.tensor(total_loss / total_tokens)).item()

    print(f"Perplexity: {perplexity}")
    return perplexity

# Score the training corpus first, then the generated sample
for text_path in (
    'data/shakespeare_char/input.txt',
    'data/shakespeare_char/generated.txt',
):
    compute_perplexity(text_path)

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

def compute_bleu_score(input_file, output_file):
    """Return the sentence-level BLEU of one text file against another.

    Both files are read as UTF-8 and split with ``word_tokenize``. The full
    token list of ``input_file`` serves as the single reference and the full
    token list of ``output_file`` as the candidate given to
    ``sentence_bleu``.

    Args:
        input_file: Path of the reference text.
        output_file: Path of the candidate text.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    # Open both files together, then pull their contents in one go
    with open(input_file, 'r', encoding='utf-8') as ref_handle, \
            open(output_file, 'r', encoding='utf-8') as cand_handle:
        ref_text, cand_text = ref_handle.read(), cand_handle.read()

    # One reference (wrapped in a list, as sentence_bleu takes a list of
    # references) against one candidate
    references = [word_tokenize(ref_text)]
    return sentence_bleu(references, word_tokenize(cand_text))

# Reference corpus and the generated sample that is scored against it
input_file, output_file = (
    'data/shakespeare_char/input.txt',
    'data/shakespeare_char/generated.txt',
)

# Evaluate BLEU for the pair and report it
score = compute_bleu_score(input_file, output_file)
print(f"BLEU Score: {score}")