In [16]:
from pathlib import Path
import re
from collections import Counter

# Folder holding the Project Gutenberg .txt files.
# NOTE: this is an absolute, machine-specific (macOS) path; point it at your own
# copy of the data before running the notebook on another machine.
folder = Path("/Users/ivan/Desktop/Fontys/AML1/AM1-1/data/texts")

books = []

# sorted() makes the book order deterministic: glob() yields files in an
# arbitrary, filesystem-dependent order, so books[0] could differ between runs.
for file in sorted(folder.glob("*.txt")):
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if present;
    # plain "utf-8" keeps it in the text as "\ufeff" (the "ï»¿" seen in previews).
    text = file.read_text(encoding="utf-8-sig")  # whole file as ONE string
    books.append(text)                           # one entry per book

# Fail with a clear message instead of an IndexError on books[0] below.
if not books:
    raise FileNotFoundError(f"No .txt files found in {folder}")

print(len(books))           # number of books
print(type(books[0]))       # should be <class 'str'>

94
<class 'str'>


In [4]:
# Inspect the raw text before cleaning: the first and the last 1000 characters
# of the first book (this shows the Gutenberg header and the tail of the file).
print(books[0][:1000], books[0][-1000:], sep="\n")

ï»¿The Project Gutenberg eBook of The Complete Works of William Shakespeare
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Complete Works of William Shakespeare

Author: William Shakespeare

Release date: January 1, 1994 [eBook #100]
                Most recently updated: August 24, 2025

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***




The Complete Works of William Shakespeare

by William Shakespeare




                    Contents

    THE SONNETS
    ALL’S WELL THAT ENDS WELL
    THE TRAGEDY OF ANTONY AND CLEOPAT

In [7]:
# Project Gutenberg wraps the book body between "*** START OF ... ***" and
# "*** END OF ... ***" marker lines. Accept both the "THE PROJECT GUTENBERG EBOOK"
# and the older "THIS PROJECT GUTENBERG EBOOK" wording (or neither article).
# With DOTALL, ".*?" may cross line breaks until the closing "***".
START_RE = re.compile(r"\*\*\*\s*START OF (?:THE |THIS )?PROJECT GUTENBERG EBOOK.*?\*\*\*", re.IGNORECASE | re.DOTALL)
END_RE   = re.compile(r"\*\*\*\s*END OF (?:THE |THIS )?PROJECT GUTENBERG EBOOK.*?\*\*\*", re.IGNORECASE | re.DOTALL)

def clean_gutenberg(text: str) -> str:
    """Strip Project Gutenberg boilerplate and chapter headings from one book.

    Args:
        text: Full contents of a Project Gutenberg text file as a single string.

    Returns:
        The text with, in order: a leading byte-order mark removed, everything
        up to and including the START marker removed (if the marker is found),
        everything from the END marker onwards removed (if found), lines that
        begin with "chapter <roman or arabic numeral>" removed, runs of three
        or more newlines collapsed to two, and surrounding whitespace stripped.
        Text without any markers is returned with only the later steps applied.
    """
    # A BOM left over from decoding is not whitespace, so .strip() would keep it
    # when no START marker is present to cut it off.
    text = text.lstrip("\ufeff")

    # 1) drop the header: keep only what follows the START marker
    m = START_RE.search(text)
    if m:
        text = text[m.end():]

    # 2) drop the footer (license etc.): keep only what precedes the END marker
    m = END_RE.search(text)
    if m:
        text = text[:m.start()]

    # 3a) remove chapter-heading lines (roman numerals or digits); case-insensitive,
    #     per line. "\s*" after "^" can also consume blank lines directly above.
    text = re.sub(r"(?im)^\s*chapter\s+([ivxlcdm]+|\d+)\b.*$", "", text)

    # 3b) collapse excessive blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


# Run the cleaner over every raw book, preserving the order of `books`.
cleaned_books = list(map(clean_gutenberg, books))

In [13]:
# Preview the first 1000 characters of the first cleaned book to check that
# the Gutenberg header is gone.
print(cleaned_books[0][:1000])

The Complete Works of William Shakespeare

by William Shakespeare

                    Contents

    THE SONNETS
    ALL’S WELL THAT ENDS WELL
    THE TRAGEDY OF ANTONY AND CLEOPATRA
    AS YOU LIKE IT
    THE COMEDY OF ERRORS
    THE TRAGEDY OF CORIOLANUS
    CYMBELINE
    THE TRAGEDY OF HAMLET, PRINCE OF DENMARK
    THE FIRST PART OF KING HENRY THE FOURTH
    THE SECOND PART OF KING HENRY THE FOURTH
    THE LIFE OF KING HENRY THE FIFTH
    THE FIRST PART OF HENRY THE SIXTH
    THE SECOND PART OF KING HENRY THE SIXTH
    THE THIRD PART OF KING HENRY THE SIXTH
    KING HENRY THE EIGHTH
    THE LIFE AND DEATH OF KING JOHN
    THE TRAGEDY OF JULIUS CAESAR
    THE TRAGEDY OF KING LEAR
    LOVE’S LABOUR’S LOST
    THE TRAGEDY OF MACBETH
    MEASURE FOR MEASURE
    THE MERCHANT OF VENICE
    THE MERRY WIVES OF WINDSOR
    A MIDSUMMER NIGHT’S DREAM
    MUCH ADO ABOUT NOTHING
    THE TRAGEDY OF OTHELLO, THE MOOR OF VENICE
    PERICLES, PRINCE OF TYRE
    KING RICHARD THE SECOND
    KI

In [14]:
# Flatten all cleaned books into one list of whitespace-delimited tokens
# (no lowercasing or punctuation stripping: these are the raw tokens).
raw_tokens = [token for book in cleaned_books for token in book.split()]

In [17]:
# Frequency table over the raw whitespace tokens.
counter = Counter(raw_tokens)

# Corpus size, vocabulary size and the 20 most frequent tokens.
print(f"Total tokens: {len(raw_tokens)}")
print(f"Unique tokens: {len(counter)}")
print(f"Top 20 tokens: {counter.most_common(20)}")

Total tokens: 13758303
Unique tokens: 472496
Top 20 tokens: [('the', 660536), ('and', 422476), ('of', 390691), ('to', 353059), ('a', 256963), ('in', 222829), ('I', 193344), ('was', 152970), ('that', 150253), ('he', 141079), ('his', 140785), ('with', 112474), ('for', 94901), ('as', 93690), ('had', 92086), ('is', 91305), ('it', 88995), ('not', 85752), ('at', 81835), ('you', 81206)]


In [18]:
#ex2 - BPE tokenizer
from tokenizers import Tokenizer # main object for tokenization
from tokenizers.models import BPE # model implementing Byte Pair Encoding.
from tokenizers.trainers import BpeTrainer # trains the BPE vocabulary
from tokenizers.pre_tokenizers import Whitespace # simple pre-tokenizer splitting on spaces

In [20]:
# Untrained BPE model. Note that no unk_token is passed to BPE() here.
# NOTE(review): without an unk_token, characters not seen in training are
# presumably dropped at encode time — confirm against the tokenizers docs.
tokenizer = Tokenizer(BPE())
# Pre-tokenize the text with the Whitespace pre-tokenizer before BPE is applied.
tokenizer.pre_tokenizer = Whitespace() 
# Trainer settings: target vocabulary of 20000 entries, min_frequency of 1,
# no special tokens, and "</w>" as the end-of-word suffix.
trainer = BpeTrainer(vocab_size=20000, min_frequency=1, special_tokens=[], end_of_word_suffix="</w>")
# Train on the cleaned books (cleaned_books holds one string per book).
tokenizer.train_from_iterator(cleaned_books, trainer=trainer)






In [21]:
# Save the tokenizer to bpe_tokenizer.json (relative path, i.e. resolved
# against the current working directory).
tokenizer.save("bpe_tokenizer.json")

In [None]:
# Encode the corpus one book at a time and stream the token IDs to disk.
#
# Why not " ".join(cleaned_books) + a single tokenizer.encode(...) call:
# that builds one Encoding object for the entire multi-million-token corpus,
# which is slow and memory-hungry (the original cell had to be interrupted).
# Encoding per book keeps peak memory bounded by the largest single book.
#
# The file format is unchanged: all IDs on one line, separated by single spaces.
# NOTE(review): the tokenizer is configured with a Whitespace pre-tokenizer, so
# no token should span the space that joining the books would have inserted and
# the ID sequence should match the single-call version — confirm if it matters.
with open("bpe_token_ids.txt", "w", encoding="utf-8") as f:
    wrote_any = False
    for book in cleaned_books:
        ids = tokenizer.encode(book).ids
        if not ids:
            # Nothing to write for this book; also avoids a doubled separator.
            continue
        if wrote_any:
            f.write(" ")  # separator between consecutive books' IDs
        f.write(" ".join(map(str, ids)))
        wrote_any = True

KeyboardInterrupt: 