<a href="https://colab.research.google.com/github/Hearlvein/colab/blob/main/guten_tag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install commands
%pip install gutenbergpy beautifulsoup4 requests
%pip install datasets
%pip install transformers
%pip install accelerate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
from gutenbergpy.textget import get_text_by_id
from gutenbergpy.gutenbergcache import GutenbergCache
from bs4 import BeautifulSoup
import requests

# Step 1: Scrape the bookshelf for book IDs
def get_book_ids_from_bookshelf(url, limit=10, timeout=30):
    """
    Fetch a Project Gutenberg bookshelf page and collect the book IDs it links to.

    Args:
        url: Address of the bookshelf page to download.
        limit: Collection stops as soon as this many IDs have been gathered.
            A value that is never reached (e.g. 0 or a negative number)
            means every matching link on the page is returned.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        List of integer book IDs, in the order they appear on the page.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (instead of silently parsing an error page
            into an empty list).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    book_ids = []
    for link in soup.select('li.booklink a.link'):
        href = link.get('href')
        # link.get() returns None for an anchor without an href attribute;
        # skip those rather than crash on None.startswith().
        if not href or not href.startswith('/ebooks/'):
            continue
        # Book links look like "/ebooks/<id>"; anything non-numeric is ignored.
        book_id = href.split('/')[-1]
        if book_id.isdigit():
            book_ids.append(int(book_id))
            if len(book_ids) == limit:
                break
    return book_ids

# Step 2: Download and save books (skip if file exists)
def download_books(book_ids, output_folder):
    """
    Save each book in `book_ids` as `<output_folder>/<book_id>.txt`.

    A book whose target file already exists and is non-empty is skipped.
    Failures are reported per book and do not stop the remaining downloads.

    Args:
        book_ids: Iterable of Gutenberg book IDs to fetch.
        output_folder: Directory that receives the .txt files; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)
    print("Loading Gutenberg metadata cache...")
    # NOTE(review): the returned cache object is never read below; the call is
    # kept as-is in case loading the cache matters as a side effect — confirm.
    cache = GutenbergCache.get_cache()

    for book_id in book_ids:
        target = os.path.join(output_folder, f"{book_id}.txt")
        already_saved = os.path.exists(target) and os.path.getsize(target) > 0
        if already_saved:
            print(f"Book {book_id} already exists at {target}, skipping download.")
            continue

        print(f"Downloading book ID {book_id}...")
        try:
            # Undecodable bytes are dropped rather than raising.
            decoded = get_text_by_id(book_id).decode('utf-8', errors='ignore')
            with open(target, 'w', encoding='utf-8') as handle:
                handle.write(decoded)
            print(f"Saved book {book_id} to {target}")
        except Exception as err:
            # Best-effort: report the failure and move on to the next book.
            print(f"Error downloading book {book_id}: {err}")

# Utility: Download books by genre into a coherent folder structure
def download_books_to_dataset(bookshelf_url, genre, limit=10, base_folder="gutenberg_dataset"):
    """
    Download one bookshelf's books into `<base_folder>/<genre>/`.

    Args:
        bookshelf_url: Bookshelf page to scrape for book IDs.
        genre: Label used as the subfolder name.
        limit: Passed through to the bookshelf scraper.
        base_folder: Root directory of the dataset.
    """
    genre_folder = os.path.join(base_folder, genre)
    download_books(
        get_book_ids_from_bookshelf(bookshelf_url, limit=limit),
        output_folder=genre_folder,
    )

# Example genres and bookshelf URLs
# Maps a genre label to its Project Gutenberg bookshelf page; the label is
# passed as `genre` below and so becomes the subfolder name for that shelf.
bookshelves = {
    'fiction': 'https://www.gutenberg.org/ebooks/bookshelf/480',
    'poetry': 'https://www.gutenberg.org/ebooks/bookshelf/60',
    # Add more genres/bookshelves as needed
}

# Download for each genre into a clean structure
# (scraping stops at 10 book IDs per bookshelf).
for genre, url in bookshelves.items():
    download_books_to_dataset(url, genre=genre, limit=10)


Loading Gutenberg metadata cache...
Book 84 already exists at gutenberg_dataset/fiction/84.txt, skipping download.
Book 43 already exists at gutenberg_dataset/fiction/43.txt, skipping download.
Book 345 already exists at gutenberg_dataset/fiction/345.txt, skipping download.
Book 41445 already exists at gutenberg_dataset/fiction/41445.txt, skipping download.
Book 55 already exists at gutenberg_dataset/fiction/55.txt, skipping download.
Book 2148 already exists at gutenberg_dataset/fiction/2148.txt, skipping download.
Book 829 already exists at gutenberg_dataset/fiction/829.txt, skipping download.
Book 1251 already exists at gutenberg_dataset/fiction/1251.txt, skipping download.
Book 16 already exists at gutenberg_dataset/fiction/16.txt, skipping download.
Book 36 already exists at gutenberg_dataset/fiction/36.txt, skipping download.
Loading Gutenberg metadata cache...
Book 16328 already exists at gutenberg_dataset/poetry/16328.txt, skipping download.
Book 1322 already exists at gutenber

## Building a Structured Gutenberg Dataset

All books are now organized by genre in subfolders under `gutenberg_dataset/`.

- `gutenberg_dataset/fiction/` contains fiction books (bookshelf 480).
- `gutenberg_dataset/poetry/` contains poetry books (bookshelf 60).
- Each book is saved as a `.txt` file named by its Gutenberg ID.

This structure is suitable for LLM dataset preparation and can be extended with more genres.

In [None]:
import os
import re
import json
from pathlib import Path
from tqdm import tqdm

# Configuration
# Source label -> folder holding that genre's raw .txt books. The label is
# written into each JSONL record's "source" field.
INPUT_DIRS = {
    "fiction": Path("gutenberg_dataset/fiction"),
    "poetry": Path("gutenberg_dataset/poetry"),
}
# Single JSONL file that receives one record per cleaned book.
OUTPUT_FILE = Path("gutenberg_dataset.jsonl")

# Regex patterns to strip Gutenberg headers/footers.
# The downloaded files mark the body with
# "*** START OF THE PROJECT GUTENBERG EBOOK ... ***"; the "THIS" wording that
# the patterns originally targeted is still accepted.
# HEADER_PATTERN consumes everything from the start of the file through the
# START marker, so any preamble ahead of the marker is dropped as well.
HEADER_PATTERN = re.compile(
    r"\A.*?\*{3}\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*{3}",
    re.IGNORECASE | re.DOTALL,
)
# FOOTER_PATTERN consumes the END marker and everything after it.
FOOTER_PATTERN = re.compile(
    r"\*{3}\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*",
    re.IGNORECASE | re.DOTALL,
)


def clean_gutenberg_text(text: str) -> str:
    """
    Remove the Project Gutenberg header/footer and surrounding whitespace.

    Everything up to and including the "*** START OF THE/THIS PROJECT
    GUTENBERG EBOOK ... ***" marker is removed, as is everything from the
    matching "*** END OF ..." marker to the end of the text. Text without
    these markers is returned unchanged apart from the whitespace trim.

    Args:
        text: Raw book text.

    Returns:
        The book body with leading/trailing whitespace stripped; may be empty.
    """
    # Remove header (preamble + START marker)
    text = HEADER_PATTERN.sub("", text)
    # Remove footer (END marker + trailing license text)
    text = FOOTER_PATTERN.sub("", text)
    # Trim leading/trailing whitespace left behind by the removals
    return text.strip()


def process_and_write_jsonl(input_dirs: dict, output_path: Path):
    """
    Clean every .txt file found under `input_dirs` and append it to one JSONL file.

    Each line of the output is a JSON object with the fields
    `source` (the label from `input_dirs`), `filename`, and `text`.
    Nothing is done when `output_path` already exists and is non-empty.
    Files that clean down to an empty string are left out; a file that
    raises during processing is reported and skipped.

    Args:
        input_dirs: Mapping of source label -> folder searched recursively for .txt files.
        output_path: JSONL file to create.
    """
    already_built = output_path.exists() and output_path.stat().st_size > 0
    if already_built:
        print(f"{output_path} already exists and is non-empty, skipping cleaning and writing.")
        return

    with output_path.open("w", encoding="utf-8") as sink:
        for label, directory in input_dirs.items():
            # Materialised so the progress bar knows the total up front.
            book_paths = list(directory.rglob("*.txt"))
            for book_path in tqdm(book_paths, desc=f"Processing {label}"):
                try:
                    body = clean_gutenberg_text(
                        book_path.read_text(encoding="utf-8", errors="ignore")
                    )
                    if body:
                        line = json.dumps(
                            {
                                "source": label,
                                "filename": book_path.name,
                                "text": body,
                            },
                            ensure_ascii=False,
                        )
                        sink.write(line + "\n")
                except Exception as err:
                    # Best-effort: one bad file must not abort the whole export.
                    print(f"Error processing {book_path}: {err}")


if __name__ == "__main__":
    # Make sure the folder that will hold the JSONL file exists.
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    process_and_write_jsonl(INPUT_DIRS, OUTPUT_FILE)
    # NOTE: printed unconditionally, i.e. also when the step above found an
    # existing non-empty file and skipped the rewrite.
    print(f"Dataset written to {OUTPUT_FILE}")

gutenberg_dataset.jsonl already exists and is non-empty, skipping cleaning and writing.
Dataset written to gutenberg_dataset.jsonl


In [None]:
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import os

# Paths
DATASET_PATH = "gutenberg_dataset.jsonl"
# Base checkpoint to fine-tune; try "gpt2" / "tiiuae/falcon-rw-1b" if you want larger models
MODEL_NAME = "distilgpt2"

# Step 1: Load dataset
print("Loading dataset...")
# Quick sanity check before handing the file to `datasets`: presence, then size in bytes.
print(os.path.exists(DATASET_PATH))
print(os.path.getsize(DATASET_PATH))
dataset = load_dataset("json", data_files=DATASET_PATH, split="train")

# Optional: filter very short or very long texts
# dataset = dataset.filter(lambda x: 100 < len(x["text"]) < 5000)

# Report how many records were loaded
print(f"Dataset loaded with {len(dataset)} records.")

Loading dataset...
True
8114735
{"source": "fiction", "filename": "84.txt", "text": "*** START OF THE PROJECT GUTENBERG EBOOK 84 ***\n\nFrankenstein;\n\nor, the Modern Prometheus\n\nby Mary Wollstonecraft (Godwin) Shelley\n\n\n CONTENTS\n\n Letter 1\n Letter 2\n Letter 3\n Letter 4\n Chapter 1\n Chapter 2\n Chapter 3\n Chapter 4\n Chapter 5\n Chapter 6\n Chapter 7\n Chapter 8\n Chapter 9\n Chapter 10\n Chapter 11\n Chapter 12\n Chapter 13\n Chapter 14\n Chapter 15\n Chapter 16\n Chapter 17\n Chapter 18\n Chapter 19\n Chapter 20\n Chapter 21\n Chapter 22\n Chapter 23\n Chapter 24\n\n\n\n\nLetter 1\n\n_To Mrs. Saville, England._\n\n\nSt. Petersburgh, Dec. 11th, 17—.\n\n\nYou will rejoice to hear that no disaster has accompanied the\ncommencement of an enterprise which you have regarded with such evil\nforebodings. I arrived here yesterday, and my first task is to assure\nmy dear sister of my welfare and increasing confidence in the success\nof my undertaking.\n\nI am already far north of

In [None]:
# Step 2: Load tokenizer and model
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-style models don't have pad_token by default, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)


# Step 3: Tokenize dataset
def tokenize_function(examples, block_size=512):
    """
    Tokenize a batch of books and cut each one into `block_size`-token examples.

    Truncating every book to its first `block_size` tokens would discard
    almost all of the text, so each book is tokenized in full and then
    sliced into consecutive blocks. The last block of a book is right-padded
    with the tokenizer's pad token (attention mask 0 on the padding) so every
    example has the same length. A text no longer than `block_size` tokens
    still yields exactly one padded example.

    Because this returns more rows than it receives, it must be used with
    `dataset.map(..., batched=True)` and with the input columns removed.

    Args:
        examples: Batch dict with a "text" list, as supplied by `dataset.map`.
        block_size: Number of tokens per training example.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        `block_size`-long lists; "labels" is a copy of "input_ids".
    """
    pad_id = tokenizer.pad_token_id
    input_ids = []
    attention_mask = []

    # No truncation: the whole book is tokenized and sliced below.
    # NOTE(review): the tokenizer may warn that a sequence exceeds the model's
    # maximum length; presumably harmless since no block is longer than
    # block_size — confirm.
    for book_ids in tokenizer(examples["text"])["input_ids"]:
        # max(..., 1) keeps one fully padded example for an empty text.
        for start in range(0, max(len(book_ids), 1), block_size):
            chunk = book_ids[start:start + block_size]
            padding = block_size - len(chunk)
            input_ids.append(chunk + [pad_id] * padding)
            attention_mask.append([1] * len(chunk) + [0] * padding)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": [ids.copy() for ids in input_ids],
    }


# Needed only to detect whether a CUDA device is present (see fp16 below).
import torch

print("Tokenizing dataset...")
# All original columns are dropped so only the tokenizer's outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function, batched=True, remove_columns=["text", "filename", "source"]
)

# Step 4: Data collator
# mlm=False selects the causal (next-token) objective instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Step 5: Training arguments
training_args = TrainingArguments(
    output_dir="./poetic-sci-fi-model",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=100,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook also runs on a CPU-only runtime instead of failing at setup.
    fp16=torch.cuda.is_available(),
    # Keep every dataset column when batching. This was set in the original
    # run to work around an error; NOTE(review): confirm it is still needed.
    remove_unused_columns=False,
)

# Step 6: Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Loading tokenizer and model...
Tokenizing dataset...


Map: 100%|██████████| 20/20 [00:03<00:00,  5.16 examples/s]
  trainer = Trainer(


In [None]:
# Step 7: Train
# Folder that receives the final model and tokenizer files.
model_path = "./poetic-sci-fi-model"

print("Starting training...")
trainer.train()

# Step 8: Save locally
# Model weights and tokenizer go to the same folder so it can be reloaded as one unit.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Model saved to ./poetic-sci-fi-model


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Folder the fine-tuned model and tokenizer are loaded from.
MODEL_PATH = "./poetic-sci-fi-model"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sci-fi poetic prompt
prompt = "Beneath the rusted moons of Elarion, the last poet of Earth recited verses to the wind."

# Sampling settings for one continuation of up to 300 new tokens.
generation_kwargs = {
    "max_new_tokens": 300,
    "do_sample": True,
    "temperature": 0.95,
    "top_k": 50,
    "top_p": 0.92,
    "repetition_penalty": 1.1,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,  # optional, helps cut off
}

# Generate a full story
output = generator(prompt, **generation_kwargs)

print("\nGenerated Poetic Sci-Fi Story:\n")
print(output[0]["generated_text"])

Device set to use cpu



Generated Poetic Sci-Fi Story:

Beneath the rusted moons of Elarion, the last poet of Earth recited verses to the wind. In his poem titled ‣Sebastianism‬, Baudrillard writes that "I should be a virgin in one hundred thousand years' time (the ages before) and I will die by nature." This is something common on modern English literature:
If we are only not acquainted with certain things as well as those who have seen or read these pages for three generations it becomes obvious whether any given phrase was written using an archaic system such as this… If he has been taught about our species heretical doctrine at least some thought goes back down into ancient times – all because then there must have always been what seems like so little else except perhaps 'a very strange fact'. It turns out even though Shakespeare himself said more than once when writing lines involving human beings... There cannot be less truth; therefore if men were born from nothing they would never go through slavery.