In [1]:
import requests
from bs4 import BeautifulSoup
import json
import random

# Book IDs for testing (randomly selected between 1 and 100000)
book_ids = random.sample(range(1, 100000), 20)

# Base URL for Project Gutenberg books
base_url = "https://www.gutenberg.org/ebooks/"

# List to store book metadata
books_metadata = []

# Function to split subjects and remove duplicates
def process_subjects(subjects):
    # Split by "--" and flatten the list, then remove duplicates
    split_subjects = [part.strip() for subject in subjects for part in subject.split(" -- ")]
    return list(set(split_subjects))  # Remove duplicates

# Loop over each book ID and scrape the data
for book_id in book_ids:
    book_url = f"{base_url}{book_id}"
    print(f"Scraping {book_url}")
    
    try:
        response = requests.get(book_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Look for the div with the id="bibrec"
        bibrec_section = soup.find('div', {'id': 'bibrec'})

        if bibrec_section:
            # Title - from the "Title" row in the bibrec section
            title_tag = bibrec_section.find('td', itemprop="headline")
            title = title_tag.text.strip() if title_tag else 'No title available'
            
            # Author - from the "Author" row in the bibrec section
            author_tag = bibrec_section.find('th', text='Author')
            author = author_tag.find_next('td').find('a').text.strip() if author_tag else 'No author available'

            # Description - from the "Summary" row in the bibrec section
            summary_tag = bibrec_section.find('th', text='Summary')
            description = summary_tag.find_next('td').text.strip() if summary_tag else 'No description available'

            # Language - from the "Language" row in the bibrec section
            language_tag = bibrec_section.find('tr', {'property': 'dcterms:language'})
            language = language_tag.find('td').text.strip() if language_tag else 'No language available'
            
            # Find all subject rows with property 'dcterms:subject' and datatype='dcterms:LCSH'
            subject_tags = bibrec_section.find_all('td', {'datatype': 'dcterms:LCSH'})
            
            # Extract the subject text from <a> tags within these rows
            subjects = [subject_tag.find('a').text.strip() for subject_tag in subject_tags if subject_tag.find('a')]

            # Process the subjects by splitting and removing duplicates
            categories = process_subjects(subjects)

            # Store metadata in dictionary
            book_metadata = {
                'Title': title,
                'Author': author,
                'Description': description,
                'Language': language,
                'Category': categories,
                'Link': book_url
            }
            
            # Append to books metadata list
            books_metadata.append(book_metadata)

        else:
            print(f"Could not find 'About this ebook' section for {book_url}")

    except Exception as e:
        print(f"Error extracting data from {book_url}: {e}")
        continue

# Save all scraped data to a JSON file
with open('gutenberg_books_processed.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(books_metadata, jsonfile, ensure_ascii=False, indent=4)

print(f"Scraping complete! Data saved to 'gutenberg_books_processed.json'")

Scraping https://www.gutenberg.org/ebooks/30879


  author_tag = bibrec_section.find('th', text='Author')
  summary_tag = bibrec_section.find('th', text='Summary')


Scraping https://www.gutenberg.org/ebooks/49973
Scraping https://www.gutenberg.org/ebooks/3253
Scraping https://www.gutenberg.org/ebooks/43636
Scraping https://www.gutenberg.org/ebooks/6452
Scraping https://www.gutenberg.org/ebooks/56358
Scraping https://www.gutenberg.org/ebooks/68320
Scraping https://www.gutenberg.org/ebooks/59771
Scraping https://www.gutenberg.org/ebooks/50200
Scraping https://www.gutenberg.org/ebooks/98408
Could not find 'About this ebook' section for https://www.gutenberg.org/ebooks/98408
Scraping https://www.gutenberg.org/ebooks/73716
Scraping https://www.gutenberg.org/ebooks/76010
Could not find 'About this ebook' section for https://www.gutenberg.org/ebooks/76010
Scraping https://www.gutenberg.org/ebooks/14304
Scraping https://www.gutenberg.org/ebooks/64539
Scraping https://www.gutenberg.org/ebooks/15582
Scraping https://www.gutenberg.org/ebooks/75627
Could not find 'About this ebook' section for https://www.gutenberg.org/ebooks/75627
Scraping https://www.gutenb

KeyboardInterrupt: 