<a href="https://colab.research.google.com/github/Hearlvein/colab/blob/main/guten_tag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning GPT-2 on Sci-Fi & Poetry with LoRA
Adapting from: https://huggingface.co/blog/dvgodoy/fine-tuning-llm-hugging-face

In [1]:
# Install dependencies
%pip install -q datasets transformers accelerate peft bitsandbytes einops
%pip install -q beautifulsoup4 requests gutenbergpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import json
import re
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from gutenbergpy.textget import get_text_by_id
from gutenbergpy.gutenbergcache import GutenbergCache
from tqdm import tqdm

In [3]:
# Utility: Extract book IDs from a Gutenberg bookshelf
def get_book_ids_from_bookshelf(url, limit=10):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    book_links = soup.select("li.booklink a.link")
    book_ids = []
    for link in book_links:
        href = link.get("href")
        if href.startswith("/ebooks/"):
            book_id = href.split("/")[-1]
            if book_id.isdigit():
                book_ids.append(int(book_id))
                if len(book_ids) == limit:
                    break
    return book_ids

# Download and cache books
def download_books(book_ids, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    print("Loading Gutenberg metadata cache...")
    cache = GutenbergCache.get_cache()
    for book_id in book_ids:
        output_path = os.path.join(output_folder, f"{book_id}.txt")
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            print(f"Book {book_id} already exists. Skipping.")
            continue
        try:
            print(f"Downloading book ID {book_id}...")
            text_bytes = get_text_by_id(book_id)
            text_str = text_bytes.decode("utf-8", errors="ignore")
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text_str)
        except Exception as e:
            print(f"Failed to download {book_id}: {e}")

def download_books_to_dataset(bookshelf_url, genre, limit=10, base_folder="gutenberg_dataset"):
    folder = os.path.join(base_folder, genre)
    ids = get_book_ids_from_bookshelf(bookshelf_url, limit)
    download_books(ids, folder)

In [4]:
# Define bookshelves
bookshelves = {
    "fiction": "https://www.gutenberg.org/ebooks/bookshelf/480",
    "poetry": "https://www.gutenberg.org/ebooks/bookshelf/60",
}

# Download books by genre
for genre, url in bookshelves.items():
    download_books_to_dataset(url, genre, limit=10)

Loading Gutenberg metadata cache...
Downloading book ID 84...
Downloading book ID 43...
Downloading book ID 345...
Downloading book ID 41445...
Downloading book ID 55...
Downloading book ID 2148...
Downloading book ID 829...
Downloading book ID 1251...
Downloading book ID 16...
Downloading book ID 36...
Loading Gutenberg metadata cache...
Downloading book ID 16328...
Downloading book ID 1322...
Downloading book ID 228...
Downloading book ID 2490...
Downloading book ID 14568...
Downloading book ID 9622...
Downloading book ID 3333...
Downloading book ID 1321...
Downloading book ID 20...
Downloading book ID 847...


In [5]:
# Clean and structure the dataset
HEADER_PATTERN = re.compile(r"\*{3}\s*START OF THIS PROJECT GUTENBERG EBOOK.*?\*{3}", re.IGNORECASE | re.DOTALL)
FOOTER_PATTERN = re.compile(r"\*{3}\s*END OF THIS PROJECT GUTENBERG EBOOK.*", re.IGNORECASE | re.DOTALL)

def clean_text(text):
    text = HEADER_PATTERN.sub("", text)
    text = FOOTER_PATTERN.sub("", text)
    return text.strip()

def build_jsonl_dataset(input_dirs, output_file):
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"{output_file} exists. Skipping creation.")
        return
    with open(output_file, "w", encoding="utf-8") as out_f:
        for genre, folder in input_dirs.items():
            for path in Path(folder).rglob("*.txt"):
                try:
                    raw = path.read_text(encoding="utf-8", errors="ignore")
                    cleaned = clean_text(raw)
                    if cleaned:
                        json.dump({"source": genre, "filename": path.name, "text": cleaned}, out_f, ensure_ascii=False)
                        out_f.write("\n")
                except Exception as e:
                    print(f"Error processing {path}: {e}")

# Prepare dataset
INPUT_DIRS = {
    "fiction": "gutenberg_dataset/fiction",
    "poetry": "gutenberg_dataset/poetry",
}
OUTPUT_FILE = "gutenberg_dataset.jsonl"
build_jsonl_dataset(INPUT_DIRS, OUTPUT_FILE)

In [6]:
# Load dataset
from datasets import Dataset
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
dataset = Dataset.from_list(data)
print(f"Loaded {len(dataset)} samples.")

Loaded 20 samples.


In [7]:
# Filter and tokenize
from transformers import AutoTokenizer

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    result = tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text", "filename", "source"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [8]:
# LoRA setup
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType

base_model = AutoModelForCausalLM.from_pretrained(model_name)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




In [9]:
# Prepare training
output_dir = "./gpt2-lora-sci-fi-poetry"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    fp16=True,
    remove_unused_columns=False,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# Train model
trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


('./gpt2-lora-sci-fi-poetry/tokenizer_config.json',
 './gpt2-lora-sci-fi-poetry/special_tokens_map.json',
 './gpt2-lora-sci-fi-poetry/vocab.json',
 './gpt2-lora-sci-fi-poetry/merges.txt',
 './gpt2-lora-sci-fi-poetry/added_tokens.json',
 './gpt2-lora-sci-fi-poetry/tokenizer.json')

In [11]:
# Generate text
from transformers import pipeline

generator = pipeline("text-generation", model=output_dir, tokenizer=output_dir)

prompt = "Beneath the rusted moons of Elarion, the last poet of Earth recited verses to the wind."

output = generator(
    prompt,
    max_new_tokens=300,
    temperature=0.95,
    top_k=50,
    top_p=0.92,
    repetition_penalty=1.1,
    eos_token_id=tokenizer.eos_token_id,
)

print("\nGenerated Poetic Sci-Fi Story:\n")
print(output[0]["generated_text"])

Device set to use cuda:0



Generated Poetic Sci-Fi Story:

Beneath the rusted moons of Elarion, the last poet of Earth recited verses to the wind. The world was filled with a thousand songs that could have made our lives better for many generations—even now we listen to them everyday as they sing and play like children in love from their old home on Jupiter's surface."
"It seemed obvious once again," said Einhorn at this time about what she'd done: "All he knew is my own life went horribly wrong when I had not been able take care — even if it didn't hurt or bother me deeply (I couldn\'t sleep long enough)." She paused before answering him bluntly: "'And you know well how often people try to blame your failure so badly.' My experience has taught us something else; though sometimes difficult things can go much worse than others…"
His eyes lit up between two figures lying next by his side : someone carrying an electric guitar where one used both hands while another played back its strings only after eying themselv