# Crawl Goodreads Book Pages 
## Import necessary libraries

In [2]:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
import random
import re

## Import dataset obtained by scraping books metadata 

In [8]:
file = 'combined_JAFF_authors_grouped.csv'

# The output directory
html_dir = 'reviews/'

# Read the core dataset as a pandas DataFrame and show the Goodreads link column
source_df = pd.read_csv(file) 
source_df.head()

Unnamed: 0,Author ID,Author name,Date of birth,Place of birth,Biography,Genres,URL_authors,title,URL_books,average_rating,year_of_publication,Number of works
0,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,Les Enqu√™tes de Jane Austen - tome 2 - Un vol...,https://www.goodreads.com/book/show/222068206-...,0.0,,85.0
1,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,The Austen Intrigue (Regency Secrets #4),https://www.goodreads.com/book/show/232290671-...,0.0,,85.0
2,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,Les enqu√™tes de Jane Austen - Tome 2: Un vole...,https://www.goodreads.com/book/show/220520725-...,0.0,,85.0
3,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,Jane Austen Investigates: The Abbey Mystery (J...,https://www.goodreads.com/book/show/56933218-j...,3.98,,85.0
4,1332.0,Julia Golding,,Website,My journey to becoming an author has been a ro...,"Young Adult; ,; Children's",https://www.goodreads.com/author/show/1332.Jul...,"The Burglar's Ball (Jane Austen Investigates, #2)",https://www.goodreads.com/book/show/58445472-t...,4.11,,85.0


## Generate URLs for the page with info about all the editions.

In [None]:
def sleep():
    time.sleep(1 + random.random())

# Placeholder to store the generated URLs
generated_urls = [] 

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}
save_every = 50  # Save progress every 50 rows
output_file = "goodreads_editions_progress.csv"
total = len(source_df['URL_books'])

for i, url in enumerate(source_df['URL_books']):
    print(f"Processing {i+1}/{total}: {url}")

    try:
        response = requests.get(url, headers=headers)
        sleep()
        if response.status_code == 200:
            html = response.text
            pattern = r'https://www\.goodreads\.com/work/editions/\d+'
            matches = re.findall(pattern, html)
            if matches:
                generated_urls.append(matches[0])
            else:
                generated_urls.append("Fail")
        else:
            generated_urls.append("Fail")
    except Exception as e:
        print(f"Error at row {i+1}: {e}")
        generated_urls.append("Fail")

    # Save progress every N rows 
    if (i + 1) % save_every == 0:
        source_df.loc[:i, 'Generated_URL'] = generated_urls[:i+1]
        source_df.iloc[:i+1].to_csv(output_file, index=False)
        print(f"Progress saved at row {i+1}")


source_df['Generated_URL'] = generated_urls
source_df.to_csv("goodreads_editions_final_grouped.csv", index=False)
print("Scraping completed and saved to goodreads_editions_final.csv")
source_df.head()

Processing 1/8226: https://www.goodreads.com/book/show/222068206-les-enqu-tes-de-jane-austen---tome-2---un-voleur-au-bal
Processing 2/8226: https://www.goodreads.com/book/show/232290671-the-austen-intrigue
Processing 3/8226: https://www.goodreads.com/book/show/220520725-les-enqu-tes-de-jane-austen---tome-2
Processing 4/8226: https://www.goodreads.com/book/show/56933218-jane-austen-investigates
Processing 5/8226: https://www.goodreads.com/book/show/58445472-the-burglar-s-ball
Processing 6/8226: https://www.goodreads.com/book/show/59880929-jane-austen-investigates
Processing 7/8226: https://www.goodreads.com/book/show/2152.The_Jane_Austen_Book_Club
Processing 8/8226: https://www.goodreads.com/book/show/18914877-the-complete-novels
Processing 9/8226: https://www.goodreads.com/book/show/136750299-persuasions
Processing 10/8226: https://www.goodreads.com/book/show/43432578-the-essential-guide-to-jane-austen
Processing 11/8226: https://www.goodreads.com/book/show/18300267-pride-and-prejudice

In [3]:
file_2 = 'goodreads_editions_final_grouped.csv'

df_2 = pd.read_csv(file_2)


Open file containing authors missing in file_2 (from authors who were missed or excluded during the initial filtering 
of JAFF books from the total dataset)

In [None]:
file_3 = 'missing_books_generated_links.csv'

df_3 = pd.read_csv(file_3)

df_3.head()

Make df_3 columns the same order as df_2 and concatenate the datasets

In [None]:
df_3 = df_3[df_2.columns]

df_concat = pd.concat([df_2, df_3], ignore_index=True)
df_concat.to_csv('ready_to_scrape.csv', index=False)

In [None]:
file_4 = 'ready_to_scrape.csv'

df_4 = pd.read_csv(file_4)

df_4.head()

## Extract metadata at edition level

In [None]:
def sleep():
    sleep_time = 1 + random.random() # decrease it to 1 sec for faster scraping 
    time.sleep(sleep_time)

# List of URLs to scrape
urls = df_4['Generated_URL'].tolist()
total = len(urls)

# Initialize lists to store each field for the DataFrame
book_titles = []
publication_dates = []
publishers = []
formats = []
page_counts = []
authors = []
author_ids = []
isbns = []
asins = []
languages = []
ratings = []
rating_counts = []
book_urls = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}

save_every = 50  # Save progress every 50 URLs
output_file = "goodreads_editions_data_progress.csv"

for i, url in enumerate(urls):
    print(f"Processing work {i+1}/{total}: {url}")
    # Remove existing ?page= to safely add our custom page param
    base_url = re.sub(r'\?page=\d+', '', url)
    page = 1

    while True:  # loop over all pages of editions!
        page_url = f"{base_url}?page={page}"
        try:
            response = requests.get(page_url, headers=headers)
            sleep()
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                book_elements = soup.find_all("div", class_="elementList clearFix")

                if not book_elements: # No more editions on this page
                    print(f"No more editions on {page_url}")
                    break

                for book in book_elements:
                    # Extract title and format
                    title_element = book.find("a", class_="bookTitle")
                    title = title_element.text.strip() if title_element else "N/A"
                    book_titles.append(title)

                    # Extract publication date and publisher
                    publication_info = book.find_all("div", class_="dataRow")
                    if len(publication_info) > 1:
                        pub_parts = publication_info[1].text.strip().split('by')
                        publication_date = pub_parts[0].replace("Published", "").strip() if len(pub_parts) > 0 else "N/A"
                        publisher = pub_parts[1].strip() if len(pub_parts) > 1 else "N/A"
                    else:
                        publication_date = "N/A"
                        publisher = "N/A"
                    publication_dates.append(publication_date)
                    publishers.append(publisher)

                    # Extract format and page count
                    if len(publication_info) > 2:
                        format_parts = publication_info[2].text.strip().split(", ")
                        book_format = format_parts[0] if len(format_parts) > 0 else "N/A"
                        page_count = format_parts[1].replace("pages", "").strip() if len(format_parts) > 1 else "N/A"
                    else:
                        book_format = "N/A"
                        page_count = "N/A"
                    formats.append(book_format)
                    page_counts.append(page_count)

                    # Extract author name
                    author_element = book.find("a", class_="authorName")
                    author = author_element.text.strip() if author_element else "N/A"
                    authors.append(author)

                    # Extract author ID from the href
                    if author_element and author_element.has_attr('href'):
                        match = re.search(r'/author/show/(\d+)', author_element['href'])
                        author_id = match.group(1) if match else "N/A"
                    else:
                        author_id = "N/A"
                    author_ids.append(author_id)

                    # Extract ISBN
                    isbn_element = book.find('div', class_='dataTitle', text=re.compile(r'ISBN:'))
                    isbn = isbn_element.find_next("div", class_="dataValue").text.strip().split()[0] if isbn_element else "N/A"
                    isbns.append(isbn)

                    # Extract ASIN
                    asin_element = book.find('div', class_='dataTitle', text=re.compile(r'ASIN:'))
                    asin = asin_element.find_next("div", class_="dataValue").text.strip() if asin_element else "N/A"
                    asins.append(asin)

                    # Extract edition language
                    language_element = book.find('div', class_='dataTitle', text=re.compile(r'Edition language:'))
                    language = language_element.find_next("div", class_="dataValue").get_text(strip=True) if language_element else "N/A"
                    languages.append(language)

                    # Extract rating and rating count
                    rating_element = book.find('div', class_='dataTitle', text=re.compile(r'Average rating:'))
                    if rating_element:
                        rating_info = rating_element.find_next("div", class_="dataValue").text.strip().split()
                        rating = rating_info[0]
                        rating_count = rating_info[1].replace("(", "").replace("ratings)", "").replace(",", "")
                    else:
                        rating = "N/A"
                        rating_count = "N/A"
                    ratings.append(rating)
                    rating_counts.append(rating_count)

                    # Extract book URL
                    book_title_link = book.find('a', class_='bookTitle')
                    if book_title_link and book_title_link.has_attr('href'):
                        book_url = f"https://www.goodreads.com{book_title_link['href']}"
                    else:
                        book_url = "N/A"
                    book_urls.append(book_url)

                # Go to the next page of editions for this book (if existing)
                page += 1

            else:
                print(f"Failed to fetch {page_url} with status code {response.status_code}")
                break 
            

        except Exception as e:
            print(f"Error: {e}")
            print(f"Skipping URL: {url}")
            break

    # Save progress every save_every URLs
    if (i + 1) % save_every == 0:
        df_partial = pd.DataFrame({
            "Title": book_titles,
            "Publication Date": publication_dates,
            "Publisher": publishers,
            "Format": formats,
            "Page Count": page_counts,
            "Author": authors,
            "Author ID": author_ids, 
            "ISBN": isbns,
            "ASIN": asins,
            "Edition Language": languages,
            "Average Rating": ratings,
            "Rating Count": rating_counts,
            "Book URL": book_urls
        })
        df_partial.to_csv(output_file, index=False)
        print(f"Progress saved at URL {i+1}")

        # Final save
df_5 = pd.DataFrame({
    "Title": book_titles,
    "Publication Date": publication_dates,
    "Publisher": publishers,
    "Format": formats,
    "Page Count": page_counts,
    "Author": authors,
    "Author ID": author_ids,
    "ISBN": isbns,
    "ASIN": asins,
    "Edition Language": languages,
    "Average Rating": ratings,
    "Rating Count": rating_counts,
    "Book URL": book_urls
})
df_5.to_csv("editions_metadata_final_def.csv", index=False)
print("Scraping completed and saved to goodreads_editions_final.csv")
df_5.head()


In [None]:
def sleep():
    sleep_time = 1 + random.random() # decrease it to 1 sec for faster scraping 
    time.sleep(sleep_time)

# List of URLs to scrape
urls = df['Generated_URL'].tolist()
total = len(urls)

# Initialize lists to store each field for the DataFrame
book_titles = []
publication_dates = []
publishers = []
formats = []
page_counts = []
authors = []
author_ids = []
isbns = []
asins = []
languages = []
ratings = []
rating_counts = []
book_urls = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}

save_every = 50  # Save progress every 50 URLs
output_file = "goodreads_editions_data_progress.csv"

for i, url in enumerate(urls):
    print(f"Processing {i+1}/{total}: {url}")
    try:
        response = requests.get(url, headers=headers)
        sleep()
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            book_elements = soup.find_all("div", class_="elementList clearFix")

            for book in book_elements:
                # Extract title and format
                title_element = book.find("a", class_="bookTitle")
                title = title_element.text.strip() if title_element else "N/A"
                book_titles.append(title)

                # Extract publication date and publisher
                publication_info = book.find_all("div", class_="dataRow")
                if len(publication_info) > 1:
                    pub_parts = publication_info[1].text.strip().split('by')
                    publication_date = pub_parts[0].replace("Published", "").strip() if len(pub_parts) > 0 else "N/A"
                    publisher = pub_parts[1].strip() if len(pub_parts) > 1 else "N/A"
                else:
                    publication_date = "N/A"
                    publisher = "N/A"
                publication_dates.append(publication_date)
                publishers.append(publisher)

                # Extract format and page count
                if len(publication_info) > 2:
                    format_parts = publication_info[2].text.strip().split(", ")
                    book_format = format_parts[0] if len(format_parts) > 0 else "N/A"
                    page_count = format_parts[1].replace("pages", "").strip() if len(format_parts) > 1 else "N/A"
                else:
                    book_format = "N/A"
                    page_count = "N/A"
                formats.append(book_format)
                page_counts.append(page_count)

                # Extract author name
                author_element = book.find("a", class_="authorName")
                author = author_element.text.strip() if author_element else "N/A"
                authors.append(author)

                # Extract author ID from the href
                if author_element and author_element.has_attr('href'):
                    match = re.search(r'/author/show/(\d+)', author_element['href'])
                    author_id = match.group(1) if match else "N/A"
                else:
                    author_id = "N/A"
                author_ids.append(author_id)

                # Extract ISBN
                isbn_element = book.find('div', class_='dataTitle', text=re.compile(r'ISBN:'))
                isbn = isbn_element.find_next("div", class_="dataValue").text.strip().split()[0] if isbn_element else "N/A"
                isbns.append(isbn)

                # Extract ASIN
                asin_element = book.find('div', class_='dataTitle', text=re.compile(r'ASIN:'))
                asin = asin_element.find_next("div", class_="dataValue").text.strip() if asin_element else "N/A"
                asins.append(asin)

                # Extract edition language
                language_element = book.find('div', class_='dataTitle', text=re.compile(r'Edition language:'))
                language = language_element.find_next("div", class_="dataValue").get_text(strip=True) if language_element else "N/A"
                languages.append(language)

                # Extract rating and rating count
                rating_element = book.find('div', class_='dataTitle', text=re.compile(r'Average rating:'))
                if rating_element:
                    rating_info = rating_element.find_next("div", class_="dataValue").text.strip().split()
                    rating = rating_info[0]
                    rating_count = rating_info[1].replace("(", "").replace("ratings)", "").replace(",", "")
                else:
                    rating = "N/A"
                    rating_count = "N/A"
                ratings.append(rating)
                rating_counts.append(rating_count)

                # Extract book URL
                book_title_link = book.find('a', class_='bookTitle')
                if book_title_link and book_title_link.has_attr('href'):
                    book_url = f"https://www.goodreads.com{book_title_link['href']}"
                else:
                    book_url = "N/A"
                book_urls.append(book_url)

        else:
            print(f"Failed to fetch {url} with status code {response.status_code}")

    except Exception as e:
        print(f"Error: {e}")
        print(f"Skipping URL: {url}")
        continue

    # Save progress every save_every URLs
    if (i + 1) % save_every == 0:
        df_partial = pd.DataFrame({
            "Title": book_titles,
            "Publication Date": publication_dates,
            "Publisher": publishers,
            "Format": formats,
            "Page Count": page_counts,
            "Author": authors,
            "Author ID": author_ids, 
            "ISBN": isbns,
            "ASIN": asins,
            "Edition Language": languages,
            "Average Rating": ratings,
            "Rating Count": rating_counts,
            "Book URL": book_urls
        })
        df_partial.to_csv(output_file, index=False)
        print(f"Progress saved at URL {i+1}")

# Final save
df = pd.DataFrame({
    "Title": book_titles,
    "Publication Date": publication_dates,
    "Publisher": publishers,
    "Format": formats,
    "Page Count": page_counts,
    "Author": authors,
    "Author ID": author_ids,
    "ISBN": isbns,
    "ASIN": asins,
    "Edition Language": languages,
    "Average Rating": ratings,
    "Rating Count": rating_counts,
    "Book URL": book_urls
})
df.to_csv("goodreads_editions_metadata_final_def.csv", index=False)
print("Scraping completed and saved to goodreads_editions_final.csv")
df.head()

Processing 1/8226: https://www.goodreads.com/work/editions/229064092


  isbn_element = book.find('div', class_='dataTitle', text=re.compile(r'ISBN:'))
  asin_element = book.find('div', class_='dataTitle', text=re.compile(r'ASIN:'))
  language_element = book.find('div', class_='dataTitle', text=re.compile(r'Edition language:'))
  rating_element = book.find('div', class_='dataTitle', text=re.compile(r'Average rating:'))


Processing 2/8226: https://www.goodreads.com/work/editions/246919373
Processing 3/8226: https://www.goodreads.com/work/editions/227465077
Processing 4/8226: https://www.goodreads.com/work/editions/85156807
Processing 5/8226: https://www.goodreads.com/work/editions/91731825
Processing 6/8226: https://www.goodreads.com/work/editions/94298597
Processing 7/8226: https://www.goodreads.com/work/editions/3498000
Processing 8/8226: https://www.goodreads.com/work/editions/2494662
Processing 9/8226: https://www.goodreads.com/work/editions/153254462
Processing 10/8226: https://www.goodreads.com/work/editions/67516039


In [13]:
file2 = 'goodreads_editions_metadata_final_def.csv'


# Read the core dataset as a pandas DataFrame and show the Goodreads link column
df = pd.read_csv(file2) # for testing a few lines, add [0:3]
df.head()
print(len(df))

20382


In [27]:
df.to_csv("editions_metadata.csv")