## Crawl Goodreads Book Pages 

### Import necessary libraries and upload the original csv containing Goodreads pages of all authors

In [1]:
import time
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import random

# The core dataset with book titles, ISBNs and Goodreads URLs 
file_authors_pages = 'JAFF authors_and_books_commercially published_.csv'

df = pd.read_csv(file_authors_pages, delimiter=';')

new_file_authors = 'JAFF authors_and_books_commercially published_comma.csv'
# Save the DataFrame as a new CSV file with comma delimiter
df.to_csv(new_file_authors, index=False)

# Read the core dataset as a pandas DataFrame and show the Goodreads link column
source_df = pd.read_csv(new_file_authors)
source_df.head()

Unnamed: 0,URL_authors,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74
0,https://www.goodreads.com/author/list/20380663...,,,,,,,,,,...,,,,,,,,,,
1,https://www.goodreads.com/author/show/1001650....,,,,,,,,,,...,,,,,,,,,,
2,https://www.goodreads.com/author/show/1005837....,,,,,,,,,,...,,,,,,,,,,
3,https://www.goodreads.com/author/show/1008186....,,,,,,,,,,...,,,,,,,,,,
4,https://www.goodreads.com/author/show/10089.Ph...,,,,,,,,,,...,,,,,,,,,,


### Next, extract book and author-level information for all URLs in the core dataset.

In [2]:
# Define sleep time between requests to avoid blocking
def sleep():
    sleep_time = 1 + random.random()
    time.sleep(sleep_time)

# Extract author id as unique identifier
def extract_author_id(url):
    match = re.search(r'/author/show/(\d+)', url)
    return match.group(1) if match else None

# Scrape author metadata from a Goodreads author/show URL
def get_author_info(author_url):
    try:
        response = requests.get(author_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract author information
        author_name = soup.find("h1", class_="authorName").get_text(strip=True) if soup.find("h1", class_="authorName") else "N/A"
        
        # Date of birth
        dob_div = soup.find("div", class_="dataItem", itemprop="birthDate")
        date_of_birth = dob_div.get_text(strip=True) if dob_div else "N/A"
        
        # Place of birth
        born_div = soup.find("div", class_="dataTitle", string=re.compile("Born"))
        place_of_birth = "N/A"
        if born_div:
            place_of_birth = born_div.find_next_sibling("div").get_text(strip=True)
            place_of_birth = re.sub(r'^in\s*', '', place_of_birth, flags=re.IGNORECASE)
        
        # Biography
        bio_div = soup.find("div", class_="aboutAuthorInfo")
        bio = " ".join([p.get_text(" ", strip=True) for p in bio_div.find_all("span")]) if bio_div else "N/A"
        
        # Genres
        genres_div = soup.find("div", class_="dataTitle", string=re.compile("Genre"))
        genres = genres_div.find_next_sibling("div").get_text("; ", strip=True) if genres_div else "N/A"

        # Number of works 
        number_of_works = "N/A"
        for a in soup.find_all('a', href=re.compile(r'/author/list/')):
            text = a.get_text(strip=True)
            match = re.search(r'(\d+)\s+distinct works', text)
            if match:
                number_of_works = int(match.group(1))
                break
        
        return {
            "Author ID": extract_author_id(author_url),
            "Author name": author_name,
            "Date of birth": date_of_birth,
            "Place of birth": place_of_birth,
            "Biography": bio,
            "Genres": genres,
            "Number of works": number_of_works,
            "Author URL": author_url
        }
        
    except Exception as e:
        print(f"Error getting author info: {str(e)}")
        return None


# Scrape all books from author/list/ pages
def get_author_books(author_id):
    books = []
    page = 1
    while True:
        try:
            # Construct general author/list/ URL, showing up to 150 books per page
            url = f"https://www.goodreads.com/author/list/{author_id}?page={page}&per_page=150"
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")
            
            # Extract books
            for row in soup.find_all('tr', itemtype="http://schema.org/Book"):
                title_tag = row.find('a', class_='bookTitle')
                title = title_tag.find('span', itemprop='name').get_text(strip=True) if title_tag else "N/A"
                book_url = "https://www.goodreads.com" + title_tag['href'] if title_tag else "N/A"
                
                # Extract rating
                rating_text = row.find('span', class_='minirating').get_text(" ", strip=True) if row.find('span', class_='minirating') else ""
                avg_rating = re.search(r'(\d+\.\d+)', rating_text).group(1) if re.search(r'(\d+\.\d+)', rating_text) else "N/A"
                
                # Extract publication year
                pub_text = row.find('span', class_='greyText').get_text() if row.find('span', class_='greyText') else ""
                pub_year = re.search(r'published\s+(\d{4})', pub_text).group(1) if re.search(r'published\s+(\d{4})', pub_text) else "N/A"
                
                books.append({
                    "Title": title,
                    "Book URL": book_url,
                    "Average Rating": avg_rating,
                    "Publication Year": pub_year
                })
                
            # Check for next page
            next_button = soup.find('a', class_='next_page')
            if not next_button or 'disabled' in next_button.get('class', []):
                break
                
            page += 1
            sleep()
            
        except Exception as e:
            print(f"Error getting books page {page}: {str(e)}")
            break
            
    return books

# Configuration
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}

# Main scraping process
author_urls = source_df.URL_authors.to_list()
data = []
save_every = 50  

for idx, author_url in enumerate(author_urls):
    print(f"Processing author {idx+1}/{len(author_urls)}: {author_url}")
    
    # Get author info from author/show page
    author_info = get_author_info(author_url)
    if not author_info:
        continue
        
    # Get books from author/list pages
    author_id = extract_author_id(author_url)
    if author_id:
        author_info["Books"] = get_author_books(author_id)
    
    data.append(author_info)
    
    # Save progress
    if (idx + 1) % save_every == 0:
        df = pd.DataFrame(data)
        exploded = df.explode("Books")
        books_df = pd.json_normalize(exploded["Books"])
        final_df = pd.concat([exploded.drop("Books", axis=1).reset_index(drop=True), books_df], axis=1)
        final_df.to_csv("progress.csv", index=False)
        print(f"Saved progress after {idx+1} authors")

# Final save
df = pd.DataFrame(data)
exploded = df.explode("Books")
books_df = pd.json_normalize(exploded["Books"])
final_df = pd.concat([exploded.drop("Books", axis=1).reset_index(drop=True), books_df], axis=1)
final_df.to_csv("final_author_data.csv", index=False)
print("Scraping completed!")
print(final_df.head())

Processing author 1/1807: https://www.goodreads.com/author/list/20380663.Carin_Grace
Processing author 2/1807: https://www.goodreads.com/author/show/1001650.Denise_Adams
Processing author 3/1807: https://www.goodreads.com/author/show/1005837.Joan_Mason_Hurley
Processing author 4/1807: https://www.goodreads.com/author/show/1008186.Myretta_Robens
Processing author 5/1807: https://www.goodreads.com/author/show/10089.Philip_Jos_Farmer
Processing author 6/1807: https://www.goodreads.com/author/show/101553.Claire_M_Johnson
Processing author 7/1807: https://www.goodreads.com/author/show/10291695.Carol_Pratt_Bradley
Processing author 8/1807: https://www.goodreads.com/author/show/1029317.Jane_Odiwe
Processing author 9/1807: https://www.goodreads.com/author/show/10318.Jo_Beverley
Processing author 10/1807: https://www.goodreads.com/author/show/1044699.Susan_Petrone
Processing author 11/1807: https://www.goodreads.com/author/show/10785364.Linda_Blanchette
Processing author 12/1807: https://www.go

In [3]:
print(len(final_df))

42743


In [75]:
final_df.to_csv("JAFF_complete_dataset.csv")