In [3]:
import requests
from bs4 import BeautifulSoup
import csv

# URL of the book page on Project Gutenberg
url = 'https://www.gutenberg.org/ebooks/8600'

# Make a request to the webpage. requests applies no timeout by default, so an
# explicit one keeps this cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the bibliographic-record table by its CSS class (first match only)
    table = soup.find('table', class_='bibrec')

    if table:
        # Extract rows from the table
        rows = table.find_all('tr')

        # Build headers and values together, one pair per row. Collecting them
        # in two independent passes lets a row without a <th> contribute a
        # value but no header, which shifts every later value one column left.
        headers = []
        values = []
        for row in rows:
            th = row.find('th')
            if th is None:
                continue  # not a "label: value" row; nothing to name the column
            td = row.find('td')
            headers.append(th.get_text(strip=True))
            values.append(td.get_text(strip=True) if td is not None else '')

        # Open or create a CSV file to save the data
        with open('bibrec_table.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(headers)  # Write the headers to the CSV file
            writer.writerow(values)   # Write the data to the CSV file

        print("Data from the 'bibrec' table extracted and saved to 'bibrec_table.csv'")
    else:
        print("No table with class 'bibrec' found.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Data from the 'bibrec' table extracted and saved to 'bibrec_table.csv'


In [6]:
import requests
from bs4 import BeautifulSoup
import csv

# Function to scrape data from a given URL
def scrape_data(ebook_id):
    """Fetch one Project Gutenberg ebook page and extract its 'bibrec' table.

    Args:
        ebook_id: Identifier substituted into
            ``https://www.gutenberg.org/ebooks/{ebook_id}``.

    Returns:
        A dict with one entry per CSV column. Each value is the stripped text
        of the matching table row, or ``""`` when the page has no such row.
        Returns ``None`` (after printing a message) when the request raises,
        the status code is not 200, or the page has no 'bibrec' table.
    """
    url = f'https://www.gutenberg.org/ebooks/{ebook_id}'

    # requests has no default timeout; without one a single stalled connection
    # blocks the whole crawl. Network errors are reported and turned into the
    # same None result as a non-200 response so the caller can skip the ID.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page for ebook ID: {ebook_id}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page for ebook ID: {ebook_id}. Status code: {response.status_code}")
        return None

    # Parse the HTML content and find the table with class 'bibrec'
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='bibrec')
    if not table:
        print(f"No 'bibrec' table found for ebook ID: {ebook_id}")
        return None

    # Columns filled directly from a row whose <th> text equals the column name.
    single_fields = ("Author", "Title", "Credits", "Summary", "Language",
                     "LoC Class", "Category", "EBook-No.", "Release Date",
                     "Most Recently Updated", "Copyright Status", "Downloads")
    # Repeated "Subject" rows are spread over these columns in order.
    subject_fields = ("Subject", "Subject_2", "Subject_3", "Subject_4")

    data = dict.fromkeys(
        ("Author", "Title", "Credits", "Summary", "Language", "LoC Class",
         "Subject", "Subject_2", "Subject_3", "Subject_4", "Category",
         "EBook-No.", "Release Date", "Most Recently Updated",
         "Copyright Status", "Downloads"),
        "",
    )

    # Loop through each row of the table and extract the th/td pairs
    for row in table.find_all('tr'):
        th_cell = row.find('th')
        td_cell = row.find('td')
        # A row needs both a label and a value; a header-only row used to
        # raise AttributeError on the missing <td>.
        if th_cell is None or td_cell is None:
            continue

        th = th_cell.get_text(strip=True)
        td = td_cell.get_text(strip=True)

        if th == "Subject":
            # Use the first still-empty slot among the first three; from the
            # fourth subject on, each one overwrites Subject_4 (last one wins).
            slot = next((name for name in subject_fields[:3] if not data[name]), "Subject_4")
            data[slot] = td
        elif th in single_fields:
            data[th] = td

    return data


# Driver: scrape every ebook ID in [start_id, end_id] and write one CSV row per
# page that yielded data. The output file is recreated from scratch each run.
start_id = 1  # Starting ebook ID
end_id = 2000    # Ending ebook ID (change this range as needed)

with open('gutenberg_ebooks.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Column order of the CSV; must match the keys scrape_data() returns.
    headers = [
        "Author",
        "Title",
        "Credits",
        "Summary",
        "Language",
        "LoC Class",
        "Subject",
        "Subject_2",
        "Subject_3",
        "Subject_4",
        "Category",
        "EBook-No.",
        "Release Date",
        "Most Recently Updated",
        "Copyright Status",
        "Downloads",
    ]
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()

    for ebook_id in range(start_id, end_id + 1):
        data = scrape_data(ebook_id)
        if not data:
            # scrape_data() already reported why; just note the skip.
            print(f"Skipping ebook ID: {ebook_id}")
            continue
        writer.writerow(data)

print("Data extraction completed and saved to 'gutenberg_ebooks.csv'")


<table class="bibrec">
<colgroup>
<col class="narrow"/>
<col/>
</colgroup>
<tr>
<th>Author</th>
<td>
<a about="/authors/528" href="/ebooks/author/528" itemprop="creator" rel="marcrel:aut" typeof="pgterms:agent">Zola, Émile, 1840-1902</a></td>
</tr>
<tr>
<th>Title</th>
<td itemprop="headline">
L'Assommoir
</td>
</tr><tr>
<th>Credits</th>
<td>
John Bickers, Dagny and David Widger
</td>
</tr><tr>
<th>Summary</th>
<td>
"L'Assommoir" by Émile Zola is a novel written during the late 19th century, an era characterized by the realism movement in literature. The book explores the struggles of Gervaise, a laundress trying to build a life for herself and her children amidst the oppressive and often brutal conditions of working-class Paris. The story highlights themes of poverty, domestic strife, and the impact of alcoholism on individuals and families.  The opening of "L'Assommoir" presents Gervaise in a state of despair, anxiously awaiting the return of Lantier, her partner, who has been increas

In [12]:
import requests
from bs4 import BeautifulSoup
import csv
import os

# Function to scrape data from a given URL
def scrape_data(ebook_id):
    """Fetch one Project Gutenberg ebook page and extract its 'bibrec' table.

    Args:
        ebook_id: Identifier substituted into
            ``https://www.gutenberg.org/ebooks/{ebook_id}``; also stored
            unchanged under the "Ebook ID" key of the result.

    Returns:
        A dict with one entry per CSV column. "Ebook ID" holds ``ebook_id``;
        every other value is the stripped text of the matching table row, or
        ``""`` when the page has no such row. Returns ``None`` (after printing
        a message) when the request raises, the status code is not 200, or the
        page has no 'bibrec' table.
    """
    url = f'https://www.gutenberg.org/ebooks/{ebook_id}'

    # requests has no default timeout; without one a single stalled connection
    # blocks the whole crawl. Network errors are reported and turned into the
    # same None result as a non-200 response so the caller can skip the ID.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page for ebook ID: {ebook_id}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve the page for ebook ID: {ebook_id}. Status code: {response.status_code}")
        return None

    # Parse the HTML content and find the table with class 'bibrec'
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='bibrec')
    if not table:
        print(f"No 'bibrec' table found for ebook ID: {ebook_id}")
        return None

    # Columns filled directly from a row whose <th> text equals the column name.
    # "Ebook ID" is deliberately absent: it comes from the argument, not the page.
    single_fields = ("Author", "Title", "Credits", "Summary", "Language",
                     "LoC Class", "Category", "EBook-No.", "Release Date",
                     "Most Recently Updated", "Copyright Status", "Downloads")
    # Repeated "Subject" rows are spread over these columns in order.
    subject_fields = ("Subject", "Subject_2", "Subject_3", "Subject_4")

    data = {"Ebook ID": ebook_id}
    data.update(dict.fromkeys(
        ("Author", "Title", "Credits", "Summary", "Language", "LoC Class",
         "Subject", "Subject_2", "Subject_3", "Subject_4", "Category",
         "EBook-No.", "Release Date", "Most Recently Updated",
         "Copyright Status", "Downloads"),
        "",
    ))

    # Loop through each row of the table and extract the th/td pairs
    for row in table.find_all('tr'):
        th_cell = row.find('th')
        td_cell = row.find('td')
        # A row needs both a label and a value; a header-only row used to
        # raise AttributeError on the missing <td>.
        if th_cell is None or td_cell is None:
            continue

        th = th_cell.get_text(strip=True)
        td = td_cell.get_text(strip=True)

        if th == "Subject":
            # Use the first still-empty slot among the first three; from the
            # fourth subject on, each one overwrites Subject_4 (last one wins).
            slot = next((name for name in subject_fields[:3] if not data[name]), "Subject_4")
            data[slot] = td
        elif th in single_fields:
            data[th] = td

    return data

# Function to check if the ebook ID is already in the CSV file
def is_id_in_csv(ebook_id, csv_filename):
    """Report whether ``ebook_id`` already appears in the CSV's "Ebook ID" column.

    Args:
        ebook_id: ID to look for; compared as ``str(ebook_id)`` against the
            stored text.
        csv_filename: Path of a UTF-8 CSV file whose header row contains an
            "Ebook ID" column.

    Returns:
        True if any data row's "Ebook ID" equals ``str(ebook_id)``; False if
        no row matches or the file does not exist.

    Raises:
        KeyError: if the file has data rows but no "Ebook ID" column.
    """
    if os.path.exists(csv_filename):
        wanted = str(ebook_id)
        with open(csv_filename, 'r', newline='', encoding='utf-8') as handle:
            # any() stops at the first matching row, like an early return.
            return any(record["Ebook ID"] == wanted for record in csv.DictReader(handle))
    return False

# Main code to loop through ebook IDs and append data to the CSV if not already present
start_id = 17900  # Starting ebook ID
end_id = 20000    # Ending ebook ID (change this range as needed)
csv_filename = 'gutenberg_ebooks.csv'

# Load the IDs already stored once, before the file is opened for appending.
# Re-scanning the whole CSV for every candidate ID costs one full file read per
# ID and reads a file that is simultaneously open for buffered writes.
existing_ids = set()
if os.path.exists(csv_filename):
    with open(csv_filename, 'r', newline='', encoding='utf-8') as existing_file:
        # Raises KeyError if the file has rows but no "Ebook ID" column, which
        # stops the run before rows are appended under a mismatched header.
        existing_ids = {row["Ebook ID"] for row in csv.DictReader(existing_file)}

# Open or create the CSV file for appending data
with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
    # Define the CSV column headers
    headers = ["Ebook ID", "Author", "Title", "Credits", "Summary", "Language", "LoC Class",
               "Subject", "Subject_2", "Subject_3", "Subject_4", "Category",
               "EBook-No.", "Release Date", "Most Recently Updated",
               "Copyright Status", "Downloads"]

    writer = csv.DictWriter(csvfile, fieldnames=headers)

    # Write the headers if the file is new (append mode just created it empty)
    if os.stat(csv_filename).st_size == 0:
        writer.writeheader()

    # Loop through ebook IDs and scrape data if the ID is not already in the CSV
    for ebook_id in range(start_id, end_id + 1):
        # IDs come back from the CSV as text, so compare in string form.
        if str(ebook_id) in existing_ids:
            print(f"Ebook ID: {ebook_id} is already in the CSV, skipping...")
            continue

        data = scrape_data(ebook_id)
        if data:
            writer.writerow(data)
            print(f"Appended data for ebook ID: {ebook_id}")
        else:
            print(f"Skipping ebook ID: {ebook_id} (no data)")

print("Data extraction completed.")


Ebook ID: 17900 is already in the CSV, skipping...
Ebook ID: 17901 is already in the CSV, skipping...
Ebook ID: 17902 is already in the CSV, skipping...
Ebook ID: 17903 is already in the CSV, skipping...
Ebook ID: 17904 is already in the CSV, skipping...
Ebook ID: 17905 is already in the CSV, skipping...
Ebook ID: 17906 is already in the CSV, skipping...
Ebook ID: 17907 is already in the CSV, skipping...
Ebook ID: 17908 is already in the CSV, skipping...
Ebook ID: 17909 is already in the CSV, skipping...
Ebook ID: 17910 is already in the CSV, skipping...
Ebook ID: 17911 is already in the CSV, skipping...
Ebook ID: 17912 is already in the CSV, skipping...
Ebook ID: 17913 is already in the CSV, skipping...
Ebook ID: 17914 is already in the CSV, skipping...
Ebook ID: 17915 is already in the CSV, skipping...
Ebook ID: 17916 is already in the CSV, skipping...
Ebook ID: 17917 is already in the CSV, skipping...
Ebook ID: 17918 is already in the CSV, skipping...
Ebook ID: 17919 is already in t