In [None]:
!pip install -q transformers datasets


In [None]:
from datasets import load_dataset
import itertools, requests, re, json, os
import json
import os
from google.colab import drive

In [None]:
# Banner lines Project Gutenberg places around the actual book text.
START_RE = re.compile(
    r"\*\*\*\s*START OF (THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*",
    re.IGNORECASE | re.DOTALL,
)
END_RE = re.compile(
    r"\*\*\*\s*END OF (THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*",
    re.IGNORECASE | re.DOTALL,
)


def clean_gutenberg_text(text):
    """Return the book body with the Gutenberg header/footer removed.

    Everything up to and including the START banner is dropped when the
    banner is found; everything from the END banner onwards is dropped when
    that banner is found in what remains. Line endings are converted to
    "\\n", runs of three or more newlines are collapsed to two, and the
    result is stripped. Falsy input (None, "") yields "".
    """
    if not text:
        return ""

    header = START_RE.search(text)
    body = text[header.end():] if header else text

    # The footer is searched for only in what is left after the header cut.
    footer = END_RE.search(body)
    if footer:
        body = body[:footer.start()]

    # "\r\n" and lone "\r" both become "\n".
    body = re.sub(r"\r\n?", "\n", body)
    return re.sub(r"\n{3,}", "\n\n", body).strip()

def plain_text_url(formats):
    """Pick the download URL for a plain-text edition.

    `formats` maps a MIME-type string to a URL. Among the entries whose key
    starts with "text/plain", the first one whose key mentions "utf-8"
    (case-insensitive) wins; otherwise the first "text/plain" entry is used.
    Returns None when there is no "text/plain" entry at all.
    """
    plain_entries = [
        (mime, link) for mime, link in formats.items() if mime.startswith("text/plain")
    ]

    # Prefer an explicitly UTF-8 variant.
    for mime, link in plain_entries:
        if "utf-8" in mime.lower():
            return link

    # Otherwise settle for whichever plain-text variant came first.
    if plain_entries:
        return plain_entries[0][1]
    return None



In [None]:
#Filtering only the fairytales

def fetch_all_fairytale_books_metadata():
    """Collect the metadata of every matching book from the Gutendex API.

    The first request goes to https://gutendex.com/books with the language,
    topic and MIME-type filters below. Each response's "results" list is
    appended to the output and the "next" field is followed until it is
    missing or empty.

    Returns:
        list: the concatenated "results" entries of all pages.

    Raises:
        Whatever `raise_for_status()` / `requests.get` raise on a failed request.
    """
    page_url = "https://gutendex.com/books"
    query = {
        "languages": "en",
        "topic": "fairy tales",
        "mime_type": "text/plain",
    }

    collected = []
    while page_url:
        response = requests.get(page_url, params=query, timeout=60)
        response.raise_for_status()
        payload = response.json()

        collected.extend(payload.get("results", []))

        # Follow-up pages are requested with the "next" URL exactly as given,
        # so the filter parameters are only sent with the first request.
        page_url = payload.get("next", None)
        query = None

    return collected

# Fetch the catalogue metadata; later cells iterate over `books_meta`.
books_meta = fetch_all_fairytale_books_metadata()
print("Books found:", len(books_meta))
print("First title:", "None" if not books_meta else books_meta[0]["title"])


Books found: 501
First title: Grimms' Fairy Tales


In [None]:
#Create dictionary with fairytales

# Download every book listed in `books_meta`, strip the Project Gutenberg
# boilerplate and collect the result as {display title: cleaned text}.

# Anything shorter than this after cleaning is treated as an empty/failed download.
MIN_CLEAN_CHARS = 500

fairytales = {}

for book in books_meta:
    # `or` also covers an explicit null in the JSON, which .get()'s default does not.
    title = (book.get("title") or "Untitled").strip()
    book_id = book.get("id")

    formats = book.get("formats") or {}
    text_url = plain_text_url(formats)

    if not text_url:
        # skip if no plain text url
        continue

    try:
        resp = requests.get(text_url, timeout=60)
        resp.raise_for_status()

        if "charset" in resp.headers.get("Content-Type", "").lower():
            # The server declared an encoding; let requests honour it.
            raw_text = resp.text
        else:
            # Without a declared charset, requests decodes text/* responses as
            # ISO-8859-1, which turns UTF-8 books into mojibake. Try strict
            # UTF-8 first and only then fall back to requests' own decoding.
            try:
                raw_text = resp.content.decode("utf-8")
            except UnicodeDecodeError:
                raw_text = resp.text

        cleaned = clean_gutenberg_text(raw_text)

        if len(cleaned) < MIN_CLEAN_CHARS:
            continue

        # Keep keys unique: a repeated title gets the book id appended.
        key = title
        if key in fairytales:
            key = f"{title} (ID {book_id})"

        fairytales[key] = cleaned

    except Exception as e:
        # Best effort: report the failure and carry on with the next book.
        print("Failed:", title, "|", e)

print("Total loaded fairytales:", len(fairytales))


Total loaded fairytales: 501


In [None]:
#Remove junk in the text

def extra_clean(text: str) -> str:
    """Second cleaning pass: drop transcription noise and tidy whitespace.

    Processing is line based. Each line is stripped and its internal
    whitespace runs are collapsed to one space. A line is discarded when it

    * starts with "produced by", "transcriber's note" (straight or curly
      apostrophe), "proofreading team" or "[illustration" (case-insensitive),
    * is exactly "illustration", or
    * continues an "[Illustration ..." caption whose closing "]" has not been
      seen yet.

    Finally, runs of three or more newlines are collapsed to two and the
    result is stripped.

    Args:
        text: story text; falsy input (None, "") is accepted.

    Returns:
        The cleaned text, or "" for falsy input.
    """
    if not text:
        return ""

    noise_prefixes = (
        "produced by",
        "transcriber's note",
        "transcriber\u2019s note",  # same heading typed with a curly apostrophe
        "proofreading team",
        "[illustration",
    )

    # Normalise line endings BEFORE splitting; doing it afterwards has no
    # effect and lets CR-only line breaks be collapsed into spaces below.
    lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")

    cleaned_lines = []
    in_caption = False  # True while inside an unclosed "[Illustration ..." block
    for line in lines:
        s = re.sub(r"\s+", " ", line.strip())
        lowered = s.lower()

        if in_caption:
            if not s:
                # A blank line ends the caption even without "]", so a
                # malformed caption cannot swallow the rest of the story.
                in_caption = False
                cleaned_lines.append(s)
            elif "]" in s:
                in_caption = False
            continue

        if lowered == "illustration" or lowered.startswith(noise_prefixes):
            # A caption without "]" on its first line continues on the next lines.
            in_caption = lowered.startswith("[illustration") and "]" not in s
            continue

        cleaned_lines.append(s)

    text = "\n".join(cleaned_lines)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


# Run the second cleaning pass over every stored story, replacing each value in place.
for title, story in list(fairytales.items()):
    fairytales[title] = extra_clean(story)


In [None]:
# Sanity check: list a few titles and show the beginning of one story.
PREVIEW_CHARS = 500  # the previous `[:1]` slice printed a single character

titles = list(fairytales.keys())
print("Example titles:")
for t in titles[:4]:
    print("-", t)

if titles:
    # Preview the second book when there is one; with a single book,
    # titles[1] would raise IndexError, so fall back to the first.
    t0 = titles[min(1, len(titles) - 1)]
    print("\nPreview of:", t0)
    print(fairytales[t0][:PREVIEW_CHARS])


Example titles:
- Grimms' Fairy Tales
- The Thousand and One Nights, Vol. I.: Commonly Called the Arabian Nights' Entertainments
- Grimms' Fairy Tales (ID 20027)
- Fairy Tales of Hans Christian Andersen

Preview of: The Thousand and One Nights, Vol. I.: Commonly Called the Arabian Nights' Entertainments
D


In [None]:
# Filter by length: very short texts are likely junk.

def word_count(text: str) -> int:
    """Count the maximal runs of word characters (regex ``\\w+``) in `text`."""
    return sum(1 for _ in re.finditer(r"\w+", text))

MIN_WORDS = 300

# Keep only the stories whose word count reaches the threshold.
filtered_fairytales = {
    story_title: story_text
    for story_title, story_text in fairytales.items()
    if word_count(story_text) >= MIN_WORDS
}

print("Before:", len(fairytales))
print("After length filter:", len(filtered_fairytales))


Before: 501
After length filter: 500


In [None]:
# One {"title", "text"} record per remaining story, in dictionary order.
records = [
    dict(title=name, text=body)
    for name, body in filtered_fairytales.items()
]

print("Records:", len(records))

Records: 500


In [None]:
#Save dataset into file

# Write the records as pretty-printed UTF-8 JSON. The directory is derived
# from the output path so the two cannot drift apart.
OUTPUT_PATH = "data/fairytales_clean.json"

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(records, f, ensure_ascii=False, indent=2)

# Report what was written; the old message stopped after "Saved:".
print("Saved:", OUTPUT_PATH)


Saved:
