# Crawl Goodreads Book Pages to extract metadata

In [12]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import re

### Import the previously obtained authors dataset, filtered to include only JAFF titles

In [16]:
# Path of the previously obtained authors dataset (CSV), read below with pandas
file = 'combined_JAFF_authors_grouped.csv'

# The output directory
# NOTE(review): html_dir is not referenced by any of the visible cells — confirm it is still needed
html_dir = 'reviews/'

# Read the core dataset as a pandas DataFrame and preview its first rows
# (head() displays every column, not only the Goodreads link column).
# NOTE(review): the captured preview shows garbled accented characters in some titles,
# so the CSV may have been written with a mismatched encoding — confirm.
source_df = pd.read_csv(file) 
source_df.head()


Unnamed: 0,Author ID,Author name,Date of birth,Place of birth,Biography,Genres,URL_authors,title,URL_books,average_rating,year_of_publication,Number of works
0,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,Les Enqu√™tes de Jane Austen - tome 2 - Un vol...,https://www.goodreads.com/book/show/222068206-...,0.0,,85.0
1,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,The Austen Intrigue (Regency Secrets #4),https://www.goodreads.com/book/show/232290671-...,0.0,,85.0
2,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,Les enqu√™tes de Jane Austen - Tome 2: Un vole...,https://www.goodreads.com/book/show/220520725-...,0.0,,85.0
3,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,Jane Austen Investigates: The Abbey Mystery (J...,https://www.goodreads.com/book/show/56933218-j...,3.98,,85.0
4,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,"The Burglar's Ball (Jane Austen Investigates, #2)",https://www.goodreads.com/book/show/58445472-t...,4.11,,85.0


### Extract metadata from book/show URLs

In [None]:
# Define sleep time to prevent banning from server. Lower `base`/`jitter` for a faster (but less polite) crawl
def sleep(base: float = 1.0, jitter: float = 1.0) -> None:
    """Pause for a randomized interval between two requests.

    Args:
        base: Minimum pause, in seconds.
        jitter: Width, in seconds, of the random extra pause added on top of
            ``base``.

    The pause is drawn uniformly from ``[base, base + jitter)``. The defaults
    reproduce the original behaviour (a delay between 1 and 2 seconds), so
    existing ``sleep()`` calls are unaffected.
    """
    sleep_time = base + random.random() * jitter
    time.sleep(sleep_time)

# Book page URLs to crawl, taken from the URL_books column of the source dataset
urls = source_df["URL_books"].tolist()
total_urls = len(urls)

# Accumulator for the scraped records, plus the checkpoint settings
data = []
save_every = 50  # Checkpoint interval, counted in URLs
output_file = "scraping_progress.csv"

# Extract author id as unique identifier
def extract_author_id(url):
    """Return the numeric Goodreads author id contained in *url*, as a string.

    The id is the run of digits that follows ``/author/show/`` or
    ``/author/list/`` anywhere in the URL.

    Returns "N/A" when the URL contains no such segment, or when *url* is not
    a string at all (e.g. ``None`` or a NaN coming from an empty CSV cell),
    rather than letting ``re.search`` raise ``TypeError``.
    """
    if not isinstance(url, str):
        return "N/A"
    match = re.search(r'/author/(?:show|list)/(\d+)', url)
    return match.group(1) if match else "N/A"

# Iterate over each URL, request the page and extract book and author metadata using BeautifulSoup
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection would block the crawl forever
AUTHOR_HREF_PATTERN = re.compile(r'/author/show/\d+')  # compiled once instead of on every iteration


def _required_text(soup, tag, attrs):
    """Return the stripped text of the first <tag> element matching *attrs*.

    Raises:
        ValueError: if no such element exists. The message names the missing
            element, so the per-URL error log says which field was absent
            instead of "'NoneType' object has no attribute 'text'".
    """
    element = soup.find(tag, attrs=attrs)
    if element is None:
        raise ValueError(f"missing <{tag}> element matching {attrs}")
    return element.text.strip()


for index, url in enumerate(urls):
    print(f"Processing URL {index+1}/{total_urls} ({url})")
    try:
        # Send a GET request to the URL; treat HTTP error statuses (403, 404, 5xx...)
        # as failures instead of trying to parse an error page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        # Create a BeautifulSoup object
        soup = BeautifulSoup(response.text, "html.parser")

        # Author id comes from the first author link on the page; "N/A" when there is none
        author_link = soup.find('a', href=AUTHOR_HREF_PATTERN)
        author_id = extract_author_id(author_link['href']) if author_link else "N/A"

        # Extract metadata. A page lacking any single-valued field is skipped
        # as a whole (reported by the except clause below), as before.
        title = _required_text(soup, "h1", {"data-testid": "bookTitle"})
        author = _required_text(soup, "span", {"data-testid": "name"})
        average_rating = _required_text(soup, "div", {"class": "RatingStatistics__rating"})
        ratings_count = _required_text(soup, "span", {"data-testid": "ratingsCount"})
        reviews_count = _required_text(soup, "span", {"data-testid": "reviewsCount"})
        description = _required_text(soup, "div", {"data-testid": "description"})
        genres = [genre.text.strip() for genre in soup.find_all("span", attrs={"class": "BookPageMetadataSection__genreButton"})]
        pages_format = _required_text(soup, "p", {"data-testid": "pagesFormat"})
        publication_info = _required_text(soup, "p", {"data-testid": "publicationInfo"})
        ratings_histogram = [rating.text.strip() for rating in soup.find_all("div", attrs={"class": "RatingsHistogram__labelTotal"})]

        # Append the extracted data to the list
        data.append({
            "title": title,
            "Author ID": author_id,
            "author": author,
            "average_rating": average_rating,
            "ratings_count": ratings_count,
            "reviews_count": reviews_count,
            "description": description,
            "genres": genres,
            "pages_format": pages_format,
            "publication_info": publication_info,
            "ratings_histogram": ratings_histogram,
        })
    except Exception as e:
        # Best effort: report the failure and move on to the next URL
        print(f"Error processing {url}: {str(e)}")

    # Save progress. Done outside the try block so the checkpoint is still
    # written when the URL that lands on the interval happens to fail.
    if (index + 1) % save_every == 0:
        try:
            pd.DataFrame(data).to_csv(output_file, index=False)
            print(f"Saved progress after {index+1} URLs")
        except OSError as e:
            print(f"Could not save progress to {output_file}: {e}")

    # Always pause, including after a failed request: errors are often the
    # first sign of rate limiting, which is when hammering the server is worst.
    sleep()

# Save dataset
scraped_df = pd.DataFrame(data)
scraped_df.to_csv("final_scraped_metadata_def.csv", index=False)
print("Scraping completed!")

Processing URL 1/8226 (https://www.goodreads.com/book/show/222068206-les-enqu-tes-de-jane-austen---tome-2---un-voleur-au-bal)
Processing URL 2/8226 (https://www.goodreads.com/book/show/232290671-the-austen-intrigue)
Processing URL 3/8226 (https://www.goodreads.com/book/show/220520725-les-enqu-tes-de-jane-austen---tome-2)
Processing URL 4/8226 (https://www.goodreads.com/book/show/56933218-jane-austen-investigates)
Processing URL 5/8226 (https://www.goodreads.com/book/show/58445472-the-burglar-s-ball)
Processing URL 6/8226 (https://www.goodreads.com/book/show/59880929-jane-austen-investigates)


In [13]:
# Save to csv
# NOTE(review): `df` is not defined in any of the visible cells (the crawl
# above stores its result in `scraped_df`) — confirm which DataFrame is meant.
# to_csv is called with its default index setting, so the row index is
# written as an extra first column.
df.to_csv("JAFF_books_metadata.csv")