<a href="https://colab.research.google.com/github/Hearlvein/colab/blob/main/guten_tag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from gutenbergpy.textget import get_text_by_id
from gutenbergpy.gutenbergcache import GutenbergCache
from bs4 import BeautifulSoup
import requests

# Step 1: Scrape the bookshelf for book IDs
def get_book_ids_from_bookshelf(url, limit=10):
    """Collect the numeric ebook IDs linked from a bookshelf page.

    The page at ``url`` is fetched and every anchor matching
    ``li.booklink a.link`` is inspected. An anchor contributes an ID when
    its ``href`` starts with ``/ebooks/`` and its last path segment is made
    of digits only.

    Args:
        url: Address of the bookshelf page to fetch.
        limit: Maximum number of IDs to return. ``None`` disables the cap.
            Zero or a negative value yields an empty list without issuing
            any network request.

    Returns:
        A list of integer book IDs in page order, at most ``limit`` long.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Nothing can be collected, so skip the network round trip entirely.
    if limit is not None and limit <= 0:
        return []

    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page into [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    book_ids = []
    for link in soup.select('li.booklink a.link'):
        href = link.get('href')
        # Anchors without an href give None; skip them rather than crash.
        if not href or not href.startswith('/ebooks/'):
            continue
        book_id = href.split('/')[-1]
        if not book_id.isdigit():
            continue
        book_ids.append(int(book_id))
        if limit is not None and len(book_ids) >= limit:
            break
    return book_ids

# Step 2: Download and save books
def download_books(book_ids, output_folder="gutenberg_books"):
    """Download each book's text and save it as ``<book_id>.txt``.

    The output folder is created if it does not exist. Books are processed
    one at a time; a failure on one book is printed and does not stop the
    remaining downloads.

    Args:
        book_ids: Iterable of book IDs passed to ``get_text_by_id``.
        output_folder: Directory that receives the ``.txt`` files.

    Returns:
        A list with the path of every file that was written successfully,
        in processing order.
    """
    os.makedirs(output_folder, exist_ok=True)

    # GutenbergPy uses a cached metadata database; load it
    print("Loading Gutenberg metadata cache...")
    # The returned object is not used below, so it is no longer bound to a
    # local. NOTE(review): the call is retained in case it initialises
    # state needed later -- confirm whether it can be dropped.
    GutenbergCache.get_cache()

    saved_paths = []
    for book_id in book_ids:
        print(f"Downloading book ID {book_id}...")
        try:
            text_bytes = get_text_by_id(book_id)
            # Undecodable bytes are dropped so one bad byte cannot abort
            # the whole book.
            text_str = text_bytes.decode('utf-8', errors='ignore')
            output_path = os.path.join(output_folder, f"{book_id}.txt")
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text_str)
        except Exception as e:
            # Best-effort per book: report the problem and carry on.
            print(f"Error downloading book {book_id}: {e}")
        else:
            print(f"Saved book {book_id} to {output_path}")
            saved_paths.append(output_path)
    return saved_paths

# Run the script
if __name__ == "__main__":
    # Page whose listing is scraped for book IDs.
    bookshelf_url = "https://www.gutenberg.org/ebooks/bookshelf/480"
    # Take at most ten IDs from the listing, then fetch them into the
    # default output folder.
    book_ids = get_book_ids_from_bookshelf(url=bookshelf_url, limit=10)
    download_books(book_ids=book_ids)


Loading Gutenberg metadata cache...
Downloading book ID 84...
Saved book 84 to gutenberg_books/84.txt
Downloading book ID 43...
Saved book 43 to gutenberg_books/43.txt
Downloading book ID 345...
Saved book 345 to gutenberg_books/345.txt
Downloading book ID 41445...
Saved book 41445 to gutenberg_books/41445.txt
Downloading book ID 55...
Saved book 55 to gutenberg_books/55.txt
Downloading book ID 2148...
Saved book 2148 to gutenberg_books/2148.txt
Downloading book ID 829...
Saved book 829 to gutenberg_books/829.txt
Downloading book ID 1251...
Saved book 1251 to gutenberg_books/1251.txt
Downloading book ID 16...
Saved book 16 to gutenberg_books/16.txt
Downloading book ID 36...
Saved book 36 to gutenberg_books/36.txt
