This notebook outlines the data preparation pipeline for the Latin poetry corpus. It covers the three main stages:



*   Automated web scraping of texts from The Latin Library
*   Preprocessing to clean and normalize the raw data
*   Final dataset formatting, where the text is chunked and tokenized to create training, validation, and test sets for fine-tuning

Additionally,

*   The document covers the generation of synthetic poetry using GPT-4, a step that was required specifically for the creation of Dataset V4

# Imports



In [None]:
import os
import random
import re
import unicodedata
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# 1. Scraping & Preprocessing

To build the dataset, the Latin poetry texts were programmatically collected from The Latin Library. The process involved creating a web scraping script in Python to automate the download and cleaning of the texts.

The script shown below was developed to handle this task. It is demonstrated here using Ovid's Metamorphoses as the primary example.

The script executes a series of steps for each book of a given work:

*   Requesting Content: It sends an HTTP request to the specific URL for each book, mimicking a web browser to ensure a successful response.
*   Parsing HTML: Upon receiving the webpage's content, it uses the BeautifulSoup library to parse the raw HTML and extract all the text.
*   Text Cleaning and Normalization: This is the most critical step. The raw text is processed to make it suitable for analysis:
*   Irrelevant Content Removal: Headers, footers, titles, and any lines written in all caps (which are typically headings) are filtered out.
*   Whitespace and Unicode Normalization: All forms of whitespace are standardized to single spaces, and Unicode characters are normalized to a consistent format.
*   Line Number Removal: The script uses regular expressions to identify and remove the line numbers that are often embedded at the end or even in the middle of lines in the source text.
*   Paragraph Detection: Paragraph breaks in the original text, indicated by indentation, are detected and preserved by adding a blank line.
*   Saving the Data: Once cleaned, the lines of poetry for each book are saved into a separate .txt file, organized into a directory named after the author and the work.

The script's parameters (such as the author name and URL structure) were adjusted for each of the following works:


*   Virgil, Aeneid
*   Statius, Thebaid
*   Lucan, Pharsalia
*   Silius Italicus, Punica
*   Valerius Flaccus, Argonautica
*   Juvenal, Satires

## Ovid - Metamorphoses

In [None]:
# Scrape the 15 books of Ovid's Metamorphoses from The Latin Library, clean
# every line and write one text file per book into `base_path`.

# Path
base_path = Path("C:/Users/nadir/Desktop/ClassicalLatinPoetryGeneration/datasets/HexameterPoetry/Ovid-Metamorphoses")
base_path.mkdir(parents=True, exist_ok=True)

# Browser-like User-Agent; it is the same for every request, so build it once.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Loop through all books
for book_num in range(1, 16):
    url = f"http://www.thelatinlibrary.com/ovid/ovid.met{book_num}.shtml"
    print(f"📥 Processing Book {book_num} from {url}")

    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on a 4xx/5xx answer rather than cleaning and saving an
    # error page as if it were poetry.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    raw_lines = soup.get_text().splitlines()

    poem_lines = []

    for raw_line in raw_lines:
        # Indentation (three or more leading whitespace characters) marks a
        # paragraph start; it must be checked before the line is stripped.
        is_paragraph_start = bool(re.match(r"^\s{3,}", raw_line))
        line = raw_line.strip()

        # Skip irrelevant lines: blanks, all-caps headings, dash-led lines
        # and lines starting with the author's name.
        if (
            not line
            or line.isupper()
            or line.startswith("—")
            or line.lower().startswith("ovid")
        ):
            continue

        # Unicode normalization
        line = unicodedata.normalize("NFKC", line)

        # Remove trailing line numbers (separated by at least two spaces)
        line = re.sub(r"\s{2,}\d+[a-zA-Z]?$", "", line)
        line = re.sub(r"\s{2,}\d{1,3}$", "", line)

        # Normalize whitespace and typographic apostrophes
        line = re.sub(r"[ ]+", " ", line)
        line = re.sub(r"\s{2,}", " ", line)
        line = line.replace("’", "'")

        # Filter by reasonable poetic line length
        if not (25 <= len(line) <= 120):
            continue

        # A paragraph break is recorded as an empty entry, but never as the
        # very first entry of a book.
        if is_paragraph_start and poem_lines:
            poem_lines.append("")

        # Mid-line numbering split: a line number embedded in the middle of
        # the text means two verses were fused; keep the text parts and drop
        # the pure-digit pieces.
        mid_line_split = re.split(r"(\s[.,;:!?]?\s*\d{1,3}\s+)", line)
        if len(mid_line_split) > 1:
            parts = [part.strip() for part in mid_line_split if part.strip() and not part.strip().isdigit()]
            poem_lines.extend(parts)
        else:
            poem_lines.append(line)

    print(f"   → {len(poem_lines)} cleaned lines from Book {book_num}")

    # Saving
    output_file = base_path / f"ovid_metamorphoses-book-{book_num}.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(poem_lines))

    print(f" Saved to {output_file}")

# 2. Merging All Books into One File

In [1]:
# Merge the per-book text files of every work into one corpus file.

# Path
base_dir = Path("C:/Users/nadir/Desktop/ClassicalLatinPoetryGeneration/datasets/HexameterPoetry")

# Output path
output_dir = base_dir / "Dataset V3"
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "latin_poetry_all.txt"

# Subfolders (one per work) to merge, in this order.
# NOTE(review): the scraping section also lists Valerius Flaccus' Argonautica,
# which has no entry here — confirm the omission is intentional.
folders_to_process = [
    "Ovid-Metamorphoses",
    "Virgil-Aeneid",
    "Statius-Thebaid",
    "Lucan-Pharsalia",
    "Silius-Punica",
    "Juvenal-Satires",
]

all_books_content = []

# Go through each folder
for folder_name in folders_to_process:
    folder_path = base_dir / folder_name
    # Skip a work whose folder does not exist instead of aborting the merge.
    if not folder_path.is_dir():
        print(f"Skipping missing folder: {folder_path}")
        continue

    # Sort numerically by book: the key splits 'author-work-book-12' at '-'
    # and takes the last part ('12'), so book 10 comes after book 9.
    book_files = sorted(folder_path.glob("*.txt"), key=lambda p: int(p.stem.split("-")[-1]))

    # Add the stripped text of every book to the master list.
    for file_path in book_files:
        all_books_content.append(file_path.read_text(encoding="utf-8").strip())

# Combine all the individual book texts into one large string.
# Books are separated by a blank line, the same separator used between paragraphs.
combined_text = "\n\n".join(all_books_content)

# Saving
with open(output_file, "w", encoding="utf-8") as out_f:
    out_f.write(combined_text)

# 3. Chunking Paragraphs with Tokenizer

The script:

*   Uses max token length of 512.
*   It defines `<unused0>` and `<unused1>` as special markers. These tell the model exactly where a piece of poetry begins and ends, which helps it learn the structure of the data.

It reads text and splits it into paragraphs. Then, for each paragraph, it checks its length in tokens:

*   If a paragraph is short enough to fit within the EFFECTIVE_MAX_LENGTH, it's kept as is.

*   If a paragraph is too long, the script intelligently breaks it down into smaller "chunks." It does this carefully, line by line, to ensure that none of the new chunks exceed the token limit.

After the chunks are created, they are shuffled randomly and divided into a training set (80%), a validation set (10%), and a test set (10%).

This specific script is designed for Gemma 3 models (they all use the same tokenizer); a very similar script is used for Llama models.


In [None]:
# Configuration for Gemma Dataset V3

# Fixed seed so that the shuffle, and therefore the train/val/test split,
# is reproducible between runs.
random.seed(42)

# Max length for the text is 512 - 4 (BOS, EOS, unused0, unused1) - 2 (safety margin)
# NOTE(review): the two newlines written around each chunk when the splits are
# saved presumably consume this margin — confirm against the tokenizer.
EFFECTIVE_MAX_LENGTH = 506

# Using the Gemma 3 tokenizer
TOKENIZER_PATH = r"C:\Users\nadir\Desktop\ClassicalLatinPoetryGeneration\models\gemma-3-27b-it-qat-q4_0-unquantized"

# Merged corpus to be chunked
base_dir = Path(r"C:\Users\nadir\Desktop\ClassicalLatinPoetryGeneration\datasets\HexameterPoetry\Dataset V3")
input_file = base_dir / "latin_poetry_all.txt"

# Destination of the train/val/test text files; parents=True matches the
# other cells so the folder is created even on a fresh checkout.
output_dir = Path(r"C:\Users\nadir\Desktop\ClassicalLatinPoetryGeneration\datasets\HexameterPoetry\Dataset V3 Gemma")
output_dir.mkdir(parents=True, exist_ok=True)

# Special Tokens for Gemma 3 models: markers for the start and end of a chunk
start_token = "<unused0>"
end_token = "<unused1>"

In [2]:
# Tokenizer Loading
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)


def _split_long_paragraph(para):
    """Split *para* at line boundaries into chunks within the token limit.

    Lines are added to the current chunk until the next line would push it
    past EFFECTIVE_MAX_LENGTH tokens; the chunk is then closed and the line
    starts a new one. Token counts are summed per line (each line after the
    first is counted together with its leading newline).

    A single line that exceeds the limit on its own is kept as its own chunk;
    no empty chunk is emitted in front of it.

    Returns the list of chunk strings, in original order.
    """
    chunks = []
    chunk_text = ""
    chunk_token_count = 0

    for line in para.split('\n'):
        separator = '\n' if chunk_text else ''
        line_token_count = len(tokenizer.encode(separator + line, add_special_tokens=False))

        # Only close a chunk that actually has content.
        if chunk_text and chunk_token_count + line_token_count > EFFECTIVE_MAX_LENGTH:
            chunks.append(chunk_text)
            chunk_text = line
            chunk_token_count = len(tokenizer.encode(line, add_special_tokens=False))
        else:
            chunk_text += separator + line
            chunk_token_count += line_token_count

    if chunk_text:
        chunks.append(chunk_text)
    return chunks


# Script
with open(input_file, "r", encoding="utf-8") as f:
    full_text = f.read().strip()

# Paragraphs are separated by blank lines in the merged corpus.
original_paragraphs = [p.strip() for p in full_text.split("\n\n") if p.strip()]
processed_paragraphs = []

for para in original_paragraphs:
    # Check length of the text only (special tokens are budgeted separately)
    para_tokens = tokenizer.encode(para, add_special_tokens=False)

    if len(para_tokens) <= EFFECTIVE_MAX_LENGTH:
        processed_paragraphs.append(para)
    else:
        # Split long paragraphs into smaller chunks
        processed_paragraphs.extend(_split_long_paragraph(para))

# Shuffle and Split (80% train, 10% validation, 10% test)
random.shuffle(processed_paragraphs)
n = len(processed_paragraphs)
train_end = int(n * 0.8)
val_end = int(n * 0.9)
train_split = processed_paragraphs[:train_end]
val_split = processed_paragraphs[train_end:val_end]
test_split = processed_paragraphs[val_end:]

# Formatting and Saving
def format_and_save(paragraphs, file_path):
    """Write *paragraphs* to *file_path*, each wrapped in the marker tokens.

    Every entry is written as the start token, the text and the end token on
    separate lines; consecutive entries are separated by one blank line.
    The file is UTF-8 encoded and overwritten if it already exists.
    """
    # Build the full payload first so nothing is written if formatting fails.
    payload = "\n\n".join(
        f"{start_token}\n{entry}\n{end_token}" for entry in paragraphs
    )
    with open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.write(payload)

# Write the three splits side by side into the output directory.
for split_paragraphs, split_filename in (
    (train_split, "train.txt"),
    (val_split, "val.txt"),
    (test_split, "test.txt"),
):
    format_and_save(split_paragraphs, output_dir / split_filename)

# 4. Tokenizing Datasets

A very similar script is used for Llama models.

In [None]:
# Folder with the train/val/test text files whose chunks are wrapped in <unused0> and <unused1>
input_dir = r"C:\Users\nadir\Desktop\ClassicalLatinPoetryGeneration\datasets\HexameterPoetry\Dataset V3 Gemma"
# Destination folder of the tokenized Gemma Dataset V3
output_dir = r"C:\Users\nadir\Desktop\ClassicalLatinPoetryGeneration\datasets\HexameterPoetry\Dataset V3 Gemma\tokenized"

# Full paths of the three split files
TRAIN_FILE_PATH, VAL_FILE_PATH, TEST_FILE_PATH = (
    os.path.join(input_dir, split_name)
    for split_name in ("train.txt", "val.txt", "test.txt")
)

# Pad at the end of each sequence ...
tokenizer.padding_side = "right"

# ... and reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Functions
def create_paragraph_list(file_path):
    """Return the blank-line-separated entries of a UTF-8 text file.

    The file is split on double newlines; each piece is stripped of
    surrounding whitespace and pieces that end up empty are dropped.
    The entries are returned as-is otherwise, so any marker tokens they
    contain are preserved.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_blocks = handle.read().split('\n\n')

    stripped_blocks = (block.strip() for block in raw_blocks)
    return [block for block in stripped_blocks if block]

def tokenize_function(examples):
    """Tokenize a batch of pre-formatted chunks for causal-LM fine-tuning.

    Args:
        examples: batch mapping with a "text" list; each text already
            carries the start/end marker tokens.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        "labels" entry: a copy of "input_ids" in which padding positions are
        replaced by -100 so they are ignored by the loss.
    """
    # The text already has our start/end tokens. We just add the final EOS token.
    # NOTE(review): the BOS token is expected to be added by the tokenizer
    # itself — confirm for the tokenizer in use.
    formatted_texts = [f"{text}{tokenizer.eos_token}" for text in examples["text"]]

    tokenized_outputs = tokenizer(
        formatted_texts,
        truncation=True,
        max_length=512,
        padding="max_length",
        return_attention_mask=True,
    )

    # Mask by attention mask rather than by token id: this notebook sets the
    # padding token to the EOS token, so masking by id would also hide the
    # real EOS that the model must learn to produce.
    tokenized_outputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(
            tokenized_outputs["input_ids"], tokenized_outputs["attention_mask"]
        )
    ]

    return tokenized_outputs


# Tokenizing

# Load the pre-formatted chunks of every split.
train_paragraphs, val_paragraphs, test_paragraphs = (
    create_paragraph_list(split_path)
    for split_path in (TRAIN_FILE_PATH, VAL_FILE_PATH, TEST_FILE_PATH)
)

# One single-column ("text") dataset per split.
split_texts = {
    'train': train_paragraphs,
    'validation': val_paragraphs,
    'test': test_paragraphs,
}
raw_datasets = DatasetDict(
    {split_name: Dataset.from_dict({'text': texts}) for split_name, texts in split_texts.items()}
)

# Tokenize in batches; the raw text column is dropped from the result.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=["text"])

# Saving
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
tokenized_datasets.save_to_disk(output_dir)

# 5. "Modern" Poems

This section outlines the specific and additional preprocessing steps that were undertaken to create Dataset V4.

Since Dataset V4 was created for an instruction-tuned model, the following templates were used for each chunk:


```
# Structure for Classical Poem Chunk:
<start_of_turn>user
Write a poem in Latin, dactylic hexameter style. Just type the poem. Nothing more.<end_of_turn>
<start_of_turn>model
[A classical Latin poetry paragraph]<end_of_turn>

# Structure for Modern Poem Chunk:
<start_of_turn>user
Write a poem in Latin, on a modern topic, in dactylic hexameter style. Just type the poem. Nothing more.<end_of_turn>
<start_of_turn>model
[A synthetically generated modern Latin poem]<end_of_turn>
```



## Creating Poems with GPT-4

In [None]:
from openai import OpenAI
from pathlib import Path
import time
import os

In [3]:
from google.colab import userdata

# Bind the secret to a name: a bare `userdata.get(...)` as the last
# expression of a cell is echoed in the cell output, which would expose the
# key in the saved notebook.
# NOTE(review): the generation cell reads the key from API_KEY_FILE instead;
# presumably only one of the two sources is needed — confirm.
GPT_API_KEY = userdata.get('GPT')

In [None]:
# Plain-text file from which the OpenAI API key is read.
API_KEY_FILE = "/content/drive/MyDrive/ClassicalLatinPoetryGeneration/openai_api_key.txt"

# Dataset V4 folder; all generated poems are collected in one text file inside it.
base_dir = Path("/content/drive/MyDrive/ClassicalLatinPoetryGeneration/DatasetV4-IT/DatasetV4/")
output_file = base_dir.joinpath("latin_poetry_modern_generated.txt")

In [None]:
# Topics
# One poem is requested per entry; the entries are grouped by theme below.
# The order matters: the generation loop walks the list by position.

modern_topics = [
    # Technology & Science
    "a smartphone", "a rocket launching to the moon", "the internet connecting the world",
    "a self-driving car", "a social media feed", "a video game world", "a quantum computer",
    "a drone flying over a landscape", "a virtual reality headset", "an artificial intelligence writing code",
    "a 3D printer creating an object", "a satellite orbiting Earth", "a cryptocurrency transaction",
    "a genetic engineering lab", "a particle accelerator", "an online encyclopedia", "a GPS navigation system",
    "a data center server rack", "a software update", "a Mars rover exploring a crater",
    "a neural network learning", "a satellite internet constellation", "an augmented reality map",
    "a smart city's sensor network", "a space telescope discovering a new galaxy", "a robotic surgeon",
    "a brain-computer interface",

    # Everyday Modern Life
    "a cup of coffee in the morning", "a remote work video call", "a bustling airport terminal",
    "a quiet electric car", "a traffic jam on a highway", "a quiet library with computers",
    "a home security camera", "an airplane taking off", "a skyscraper elevator",
    "a credit card payment", "a barcode scanner", "an online shopping cart",
    "a fitness tracker", "a smart home assistant", "a self-checkout lane",
    "a food delivery app", "a streaming movie service", "a high-speed train",
    "a ride-sharing service", "a coffee shop filled with people on laptops",
    "a crowded subway car during rush hour", "an online food recipe", "a home-delivered package",
    "a podcast playing on a commute", "a digital family photo album", "a video call with a grandparent",
    "a self-service gas station", "a modern gym with treadmills", "a quiet Sunday morning in a city apartment",
    "an online language learning app", "a smart thermostat adjusting the temperature", "a meal-kit delivery box",
    "a dashcam recording a journey", "a contactless payment at a store", "a pop-up advertisement on a screen",
    "a fast food restaurant's drive-thru", "an e-book reader at night",

    # Romantic Love & Existentialism (Expanded)
    "the beginning of a new romance", "a quiet, intimate moment with a loved one", "the heartbreak of a breakup",
    "unrequited love in the age of social media", "a modern wedding ceremony", "the search for purpose in a vast universe",
    "contemplating one's own mortality", "the feeling of absolute freedom and its resulting dread", "the absurdity of a daily office routine",
    "creating personal meaning in a meaningless world", "the quiet comfort of a long-term relationship",
    "the jealousy sparked by a social media post", "the decision to end a relationship", "a first date's awkwardness and hope",
    "remembering a lost love", "the feeling of being insignificant under the stars", "the weight of making a life-altering decision",
    "the conflict between free will and determinism", "the human desire to leave a legacy", "the quiet beauty of an ordinary day",
    "the fear of being forgotten after death", "the search for authenticity in a superficial world",
    "the feeling of being a stranger in one's own life", "the fleeting nature of happiness", "a love letter sent as a text message",
    "the comfortable silence between old lovers", "the pain of betrayal", "the challenge of forgiving someone",
    "wondering about the 'what ifs' of past choices", "the feeling of time passing too quickly", "finding beauty in imperfection",

    # Contemporary Experiences & Emotions
    "the feeling of urban loneliness", "a long-distance relationship", "the anxiety of a job interview",
    "the quiet of a city after a rainstorm", "a protest for social change", "the memory of a past love",
    "the hope for a better future", "the stress of modern work life", "a moment of unexpected kindness",
    "the feeling of nostalgia for childhood", "the joy of a small success", "the fear of climate change",
    "a difficult moral choice", "the comfort of a close friendship", "the bittersweet feeling of leaving home",
    "the excitement of traveling to a new country", "a moment of quiet reflection", "the struggle against injustice",
    "the feeling of being an outsider", "the feeling of 'information overload'", "the challenge of finding truth online",
    "the joy of reconnecting with an old friend", "the melancholy of a passing season in a city park", "the energy of a startup company",
    "the quiet satisfaction of a finished project", "the collective grief of a public tragedy", "the hope of a new beginning",
    "the struggle for work-life balance", "the feeling of being watched by algorithms", "the comfort of a pet waiting at home",
    "the anxiety of a global pandemic", "the excitement of a new scientific discovery", "the loneliness of a crowded room",
    "the peace of a solo hike in nature", "the feeling of 'deja vu'", "the impact of social media on society",
    "the nature of a digital footprint", "a modern election's tension", "the quiet of a house after a party",

    # Places & Scenes
    "a bustling modern city at night", "a wind turbine farm", "solar panels on a roof",
    "a modern suspension bridge", "a music festival", "a quiet suburban neighborhood",
    "a bustling stock market floor", "a scientist in a cleanroom", "a modern art gallery",
    "a sports stadium during a game", "a power grid", "a hydroelectric dam",
    "a modern hospital emergency room", "a video blogger's studio", "an international space station",
    "a modern university lecture hall", "a recycling plant", "an automated warehouse",
    "a city's subway system", "a modern concert hall", "a modern farmer's market",
    "a skyscraper's observation deck", "a modern political debate", "a modern art museum",
    "a bustling international food market", "a quiet co-working space", "a massive container ship in a port",
    "a wind-swept coastal highway", "a university research library", "a high-tech greenhouse",
    "a neon-lit street in Tokyo", "a silent data archive", "a modern architectural marvel",
    "a solar-powered desert community", "a high-security government building", "a film festival red carpet",
    "a street art mural on a brick wall", "a large-scale music recording studio", "a quiet electric bicycle on a path",
    "a skyscraper's rooftop garden", "a digital map showing traffic", "a modern concert hall's acoustics",
    "a bustling public square",

    # Abstract Concepts
    "a complex algorithm", "a global supply chain", "a digital news headline", "a streaming music playlist",
    "an online forum discussion", "the concept of a digital identity", "the nature of a computer virus",
    "the idea of a 'global village'", "the ethics of artificial intelligence", "the spread of a viral meme",
    "the complexity of the global financial system", "the privacy of personal data",
    "the theory of relativity explained simply", "the nature of open-source software",
    "the process of machine learning", "the idea of a multiverse"
]

# User message for one request; `{topic}` is filled with an entry of
# `modern_topics`. The writing instructions themselves are sent separately
# as the system message of the API call.
prompt_template = "Topic: {topic}"

In [None]:
# Generating Poems
# Requests one poem per topic from the OpenAI chat API and appends each poem,
# followed by a blank line, to `output_file`.

# Position in `modern_topics` at which generation starts.
# NOTE(review): the loop originally sliced `modern_topics[4:]`; presumably the
# first four poems were written in an earlier run (the output file is opened
# in append mode). Set to 0 for a full run — confirm.
START_INDEX = 4

# Load the API key; without it no client is created and generation is skipped.
client = None
try:
    with open(API_KEY_FILE, 'r') as key_file:
        api_key = key_file.read().strip()
except OSError as err:
    print(f"Could not read API key file {API_KEY_FILE}: {err}")
else:
    client = OpenAI(api_key=api_key)

if client:
    remaining_topics = modern_topics[START_INDEX:]

    with open(output_file, "a", encoding="utf-8") as f:
        print(f"\nStarting generation of {len(remaining_topics)} poems...")

        # Number the progress messages by the topic's position in the full
        # list so they stay consistent with the total that is printed.
        for position, topic in enumerate(remaining_topics, start=START_INDEX + 1):
            print(f"--- Generating poem {position}/{len(modern_topics)}: '{topic}' ---")

            # Fill in the topic for the current iteration
            prompt = prompt_template.format(topic=topic)

            try:
                # Make the API call to GPT-4o
                # NOTE(review): max_tokens=200 may cut off a 10-line poem — verify on the outputs.
                response = client.chat.completions.create(
                    model="gpt-4o",
                    messages=[
                        {"role": "system", "content": "You are a master of classical Latin poetry who writes in strict dactylic hexameter. Your task is to compose a 10-line Latin poem in dactylic hexameter about the topic provided by the user. You must only output the poem, with no introduction, translation, or commentary before and after."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=200,
                )
            except Exception as err:
                # Best effort at this top-level loop: report the failure and
                # move on so one bad request does not lose the whole run.
                print(f"Failed to generate poem for '{topic}': {err}")
            else:
                # Extract the poem from the response; the content may be missing.
                poem = (response.choices[0].message.content or "").strip()

                if poem:
                    # Append the new poem to the file, followed by a double newline,
                    # and flush so finished poems survive an interrupted run.
                    f.write(poem + "\n\n")
                    f.flush()
                else:
                    print(f"Empty response for '{topic}', nothing written")

            # Wait for a second to be respectful of the API rate limits
            time.sleep(1)

    print("Done")