In [None]:
# Necessary packages, in case Requirements.txt fails
# !pip install spacy transformers numpy pandas datasets scikit-learn matplotlib evaluate wandb torch

In [2]:
import os
import pickle
import spacy
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate
from tqdm.auto import tqdm
tqdm.pandas()
import wandb
wandb.init(mode="disabled") 
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Cleaning and Normalization Pipeline

import re

def clean_shakespeare(text: str) -> str:

    # remove any ALL CAPS words (Titles, character names, stage directions, etc.)
    text = re.sub(r'\b[A-Z]{2,}\.', '', text)
    text = re.sub(r'\b[A-Z]{2,}\b', '', text)

    # lowercase normalization
    text = text.lower()

    # remove ACT/SCENE markers
    text = re.sub(r'\bact\s+[ivx]+\b', '', text)
    text = re.sub(r'\bscene\s+[ivx]+\b', '', text)

    # remove bracketed text [ ... ]
    text = re.sub(r'\[.*?\]', '', text)

    # remove standalone numbers (often sonnet numbers or line counts)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # remove play headers and all-uppercase lines (likely metadata, not verse)
    text = re.sub(r'^[A-Z\s]{3,}$', '', text, flags=re.MULTILINE)

    # collapse multiple newlines
    text = re.sub(r'\n+', '\n', text)

    # remove '\n' as our model will not need to generate new lines in the style of Sonnets
    text = text.replace("\n", " ")

    # strip leading/trailing whitespace
    text = text.strip()

    return text

# Clean the file
with open("shakespeare.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = clean_shakespeare(raw_text)

with open("shakespeare_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)


In [4]:
# Load Text
with open("shakespeare_cleaned.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# Sentence segmentation (extract sentences from .txt)

nlp = spacy.load("en_core_web_sm")
nlp.max_length = len(corpus) + 1000  # handle long corpus

doc = nlp(corpus)
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

print(f"Number of sentences: {len(sentences)}")

Number of sentences: 70932


In [5]:
# Explore sentences

print("=== Example Sentences ===")
for i, s in enumerate(sentences[:10]):  # show first 10
    print(f"{i+1}: {s}")
print("\nTotal sentences:", len(sentences))

=== Example Sentences ===
1: from fairest creatures we desire increase, that thereby beauty’s rose might never die, but as the riper should by time decease, his tender heir might bear his memory: but thou contracted to thine own bright eyes, feed’st thy light’s flame with self-substantial fuel, making a famine where abundance lies, thyself thy foe, to thy sweet self too cruel: thou that art now the world’s fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and, tender churl, mak’st waste in niggarding:     pity the world, or else this glutton be,     to eat the world’s due, by the grave and thee.
2: when forty winters shall besiege thy brow, and dig deep trenches in thy beauty’s field, thy youth’s proud livery so gazed on now, will be a tattered weed of small worth held: then being asked, where all thy beauty lies, where all the treasure of thy lusty days; to say, within thine own deep sunken eyes, were an all-eating shame, and thriftless pra

In [6]:
# Hugging Face Dataset

dataset = Dataset.from_dict({"text": sentences})

# Explore Dataset

print("\n=== Example Dataset Entries ===")
for i in range(10):  # show first 10
    print(f"{i+1}: {dataset[i]}")
print("\nDataset length:", len(dataset))



=== Example Dataset Entries ===
1: {'text': 'from fairest creatures we desire increase, that thereby beauty’s rose might never die, but as the riper should by time decease, his tender heir might bear his memory: but thou contracted to thine own bright eyes, feed’st thy light’s flame with self-substantial fuel, making a famine where abundance lies, thyself thy foe, to thy sweet self too cruel: thou that art now the world’s fresh ornament, and only herald to the gaudy spring, within thine own bud buriest thy content, and, tender churl, mak’st waste in niggarding:     pity the world, or else this glutton be,     to eat the world’s due, by the grave and thee.'}
2: {'text': 'when forty winters shall besiege thy brow, and dig deep trenches in thy beauty’s field, thy youth’s proud livery so gazed on now, will be a tattered weed of small worth held: then being asked, where all thy beauty lies, where all the treasure of thy lusty days; to say, within thine own deep sunken eyes, were an all-eat

In [None]:
# Load tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load pre-trained model

model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 doesn’t have a padding token by default

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
        #return_tensors="pt"
    )

    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/70932 [00:00<?, ? examples/s]

In [None]:
# Save to use on WSL for training with GPU

#tokenized_dataset.save_to_disk("shakespeare_tokenized")


#Open on WSL 

#tokenized_dataset = load_from_disk("shakespeare_tokenized")

Saving the dataset (0/1 shards):   0%|          | 0/70932 [00:00<?, ? examples/s]

In [None]:
# Split dataset into train/eval
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# Metric: Perplexity (based on loss)
# Hugging Face's `evaluate` doesn't include perplexity directly,
# but we can derive it from cross-entropy loss.
def compute_metrics(eval_pred):
    loss = eval_pred.metrics["eval_loss"] if "eval_loss" in eval_pred.metrics else None
    if loss is None:
        return {}
    perplexity = np.exp(loss)
    return {"perplexity": perplexity, "loss": loss}

In [None]:
# Visualizations for Training NO IDEA IF IT WORKS

import transformers
from transformers import TrainerCallback
from IPython.display import clear_output
import matplotlib.pyplot as plt

train_losses = []
eval_losses = []
eval_steps = []

def plot_metrics():
    clear_output(wait=True)
    plt.plot(train_losses, label="Train Loss")
    plt.plot(eval_steps, eval_losses, label="Eval Loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

class PlotCallback(transformers.TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:
                train_losses.append(logs["loss"])
            if "eval_loss" in logs:
                eval_losses.append(logs["eval_loss"])
                eval_steps.append(state.global_step)
                plot_metrics()

In [15]:
# Training arguments

training_args = TrainingArguments(
    output_dir="C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2",
    #eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    disable_tqdm=False,
    logging_dir="C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_logs",
    logging_steps=50,
    logging_strategy="steps",
    eval_steps=200,
    logging_first_step=True,
)

#Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[PlotCallback],   # 👈 add live plotting
)

#Train the model

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,8.5146


KeyboardInterrupt: 

In [None]:
# Save Model + Tokenizer

save_dir = "C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2_final"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
#Evaluation

# 1. Perplexity Evaluation
print("\n--- Perplexity Evaluation ---")
eval_results = trainer.evaluate()
print(f"Evaluation Loss: {eval_results['eval_loss']}")
print(f"Perplexity: {math.exp(eval_results['eval_loss'])}")

# 3. BLEU/ROUGE Scores
print("\n--- BLEU/ROUGE Evaluation ---")
import evaluate
metric = evaluate.load("rouge")


In [None]:
# Inference (text generation)

generator = pipeline(
    "text-generation",
    model=save_dir,
    tokenizer=save_dir,
    pad_token_id=tokenizer.eos_token_id
)

print("\n=== Sample Shakespearean Text ===\n")
print(generator("to be, or not to be", max_length=100, num_return_sequences=1)[0]["generated_text"])