# Goodreads Book Scraper - Selenium Version

This notebook uses Selenium to scrape Goodreads reviews by simulating browser interactions.
This allows us to click the "Show more reviews" button and load additional pages dynamically.

## Books to be scraped:
Brandon Sanderson's 57 books (same list as the original scraper)

## Data collected:
- Book details: Author, Title, Publication Date, Page count, Genres, Overall rating, Overall reviews
- Review details: rating, review_text, likes, review_length (number of characters), book_title, word_count (whitespace-separated words)

In [None]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import random

In [None]:
def setup_driver():
    """
    Build the Chrome WebDriver used by every scraping function.

    Returns:
        A `webdriver.Chrome` instance started with the switches listed below
        and a 30-second page-load timeout.
    """
    # Command-line switches handed to Chrome, applied in this order.
    switches = (
        '--headless',  # Run in background (remove this to see the browser)
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    )

    options = Options()
    for switch in switches:
        options.add_argument(switch)
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    browser = webdriver.Chrome(options=options)
    browser.set_page_load_timeout(30)
    return browser

In [None]:
def get_book_details_selenium(driver, url):
    """
    Load a Goodreads book page and pull out its headline metadata.

    Parameters:
    - driver: Selenium WebDriver instance used to fetch the page
    - url: Book URL

    Returns:
        A dict with the keys title, author, publication_date, page_count,
        genres, overall_rating and overall_reviews (inserted in that order),
        or None when any exception is raised while fetching or parsing.
    """
    try:
        print(f"Fetching book details from: {url}")
        driver.get(url)
        time.sleep(random.uniform(2, 4))  # Wait for page to load

        page = BeautifulSoup(driver.page_source, 'lxml')
        info = {}

        # Title: prefer the known heading classes, fall back to any <h1>.
        heading = page.find('h1', class_='Text__title1')
        if not heading:
            heading = page.find('h1', class_='BookPageTitleSection__title')
        if not heading:
            heading = page.find('h1')
        info['title'] = heading.text.strip() if heading else 'Unknown'

        # Author: three alternative markups are tried in order.
        byline = page.find('span', class_='ContributorLink__name')
        if not byline:
            byline = page.find('a', class_='ContributorLink')
        if not byline:
            byline = page.find('span', {'data-testid': 'name'})
        info['author'] = byline.text.strip() if byline else 'Unknown'

        # Publication date and page count both come from the same details
        # container; they stay None when it (or the pattern) is missing.
        info['publication_date'] = None
        info['page_count'] = None
        details_box = page.find('div', {'data-testid': 'bookDetails'})
        if not details_box:
            details_box = page.find('div', {'data-testid': 'publicationInfo'})
        if details_box:
            blob = details_box.get_text()

            published = re.search(r'First published (\w+ \d+,? \d{4})', blob)
            if not published:
                published = re.search(r'Published\s+(\w+\s+\d+(?:st|nd|rd|th)?,?\s+\d{4})', blob)
            if published:
                info['publication_date'] = published.group(1)

            pages = re.search(r'(\d+)\s*pages?', blob)
            if pages:
                info['page_count'] = int(pages.group(1))

        # Genres: text of every matching genre element, [] when none match.
        genre_nodes = page.find_all('span', class_='BookPageMetadataSection__genreButton')
        if not genre_nodes:
            genre_nodes = page.find_all('span', {'data-testid': 'genreLink'})
        info['genres'] = [node.text.strip() for node in genre_nodes]

        # Overall rating: None when the element is absent or not a float.
        info['overall_rating'] = None
        rating_node = page.find('div', {'class': 'RatingStatistics__rating'})
        if not rating_node:
            rating_node = page.find('div', {'data-testid': 'average'})
        if rating_node:
            try:
                info['overall_rating'] = float(rating_node.text.strip())
            except ValueError:
                pass  # keep the None default

        # Review count: keep only the digit characters of the element text.
        info['overall_reviews'] = 0
        count_node = page.find('div', {'data-testid': 'reviewsCount'})
        if not count_node:
            count_node = page.find('span', {'data-testid': 'reviewsCount'})
        if count_node:
            digits = ''.join(ch for ch in count_node.text.strip() if ch.isdigit())
            if digits:
                info['overall_reviews'] = int(digits)

        print(f"✓ Successfully scraped details for: {info['title']}")
        return info

    except Exception as e:
        print(f"✗ Error scraping book details: {e}")
        return None

In [None]:
# Like counter such as "12 likes", "1,234 likes" or "3 people liked this".
# The first alternative accepts thousands separators so "1,234" is not
# truncated to "234".
_LIKES_RE = re.compile(r'(\d{1,3}(?:,\d{3})+|\d+)\s*(?:likes?|people liked this)', re.I)


def _extract_review(container):
    """Parse one review card into a dict with rating, review_text and likes."""
    review = {}

    # Rating: first element whose class mentions star/rating/static, else
    # any element whose aria-label looks like "N of 5".
    rating_element = container.find(class_=re.compile(r'(?i)(star|rating|static)'))
    if not rating_element:
        rating_element = container.find(attrs={'aria-label': re.compile(r'\d+\s+of\s+5')})

    review['rating'] = None
    if rating_element:
        rating_text = (rating_element.get('aria-label')
                       or rating_element.get('title')
                       or rating_element.text)
        rating_match = re.search(r"(\d+)", rating_text)
        if rating_match:
            review['rating'] = int(rating_match.group(1))

    # Review text
    review_text_elem = (
        container.find('div', class_='Formatted') or
        container.find('div', class_='ReviewText') or
        container.find('span', class_='Formatted')
    )
    review['review_text'] = review_text_elem.text.strip() if review_text_elem else ''

    # Likes: the first positive counter found in the card's text wins;
    # a "0 likes" match keeps scanning the remaining strings.
    review['likes'] = 0
    for text in container.stripped_strings:
        match = _LIKES_RE.search(text)
        if match:
            review['likes'] = int(match.group(1).replace(',', ''))
            if review['likes'] > 0:
                break

    return review


def _click_show_more(driver):
    """Click the "Show more reviews" button; return False if none is clickable."""
    wait = WebDriverWait(driver, 5)

    # Selectors are tried in priority order; each waits up to 5 seconds.
    button_selectors = [
        (By.XPATH, "//button[contains(., 'Show more reviews')]"),
        (By.XPATH, "//button[contains(@data-testid, 'loadMore')]"),
        (By.XPATH, "//button[contains(., 'more reviews')]"),
        (By.CSS_SELECTOR, "button[data-testid='loadMore']")
    ]

    button = None
    for by, selector in button_selectors:
        try:
            button = wait.until(EC.element_to_be_clickable((by, selector)))
            break
        except TimeoutException:
            continue

    if not button:
        return False

    # Scroll to button
    driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
    time.sleep(1)

    try:
        button.click()
    except ElementClickInterceptedException:
        # Try JavaScript click if regular click fails
        driver.execute_script("arguments[0].click();", button)
    return True


def get_reviews_selenium(driver, url, num_reviews=1000, max_clicks=50):
    """
    Scrape reviews from Goodreads using Selenium to click "Show more reviews" button

    The page is re-parsed after every click, including the last permitted
    one, so reviews loaded by the final click are collected too. Reviews are
    de-duplicated on their lower-cased text; cards with no text are skipped.

    Parameters:
    - driver: Selenium WebDriver instance
    - url: Book URL (a trailing slash is tolerated)
    - num_reviews: Target number of unique reviews to collect
    - max_clicks: Maximum number of times to click "Show more" button

    Returns:
        A list of dicts with the keys rating, review_text and likes. If an
        exception interrupts scraping, the reviews gathered so far are
        returned.
    """
    reviews_list = []
    seen_review_texts = set()
    clicks = 0
    consecutive_no_new = 0
    max_consecutive_no_new = 3

    try:
        # Navigate to reviews page
        reviews_url = f"{url.rstrip('/')}/reviews"
        print(f"\nNavigating to: {reviews_url}")
        driver.get(reviews_url)
        time.sleep(random.uniform(3, 5))  # Wait for initial page load

        while len(reviews_list) < num_reviews:
            # Parse current page content
            soup = BeautifulSoup(driver.page_source, 'lxml')

            # Find all review containers
            review_containers = (
                soup.find_all('div', class_='ReviewCard') or
                soup.find_all('article', class_='ReviewCard') or
                soup.find_all('div', class_='Review')
            )

            print(f"\nIteration {clicks + 1}: Found {len(review_containers)} review containers")
            new_reviews_this_iteration = 0

            for container in review_containers:
                if len(reviews_list) >= num_reviews:
                    break

                review = _extract_review(container)
                review_identifier = review['review_text'].strip().lower()

                # Keep only reviews that have text and were not seen before
                if review_identifier and review_identifier not in seen_review_texts:
                    seen_review_texts.add(review_identifier)
                    reviews_list.append(review)
                    new_reviews_this_iteration += 1

            print(f"  → Added {new_reviews_this_iteration} new unique reviews")
            print(f"  → Total unique reviews: {len(reviews_list)}/{num_reviews}")

            # Check if we got new reviews
            if new_reviews_this_iteration == 0:
                consecutive_no_new += 1
                print(f"  ⚠ No new reviews found ({consecutive_no_new}/{max_consecutive_no_new})")

                if consecutive_no_new >= max_consecutive_no_new:
                    print(f"\n⛔ Stopping: No new reviews after {max_consecutive_no_new} attempts")
                    break
            else:
                consecutive_no_new = 0

            # Check if we have enough reviews
            if len(reviews_list) >= num_reviews:
                print(f"\n✓ Target reached: {len(reviews_list)} unique reviews collected")
                break

            # The click budget is checked only after parsing, so content
            # loaded by the last allowed click has already been harvested.
            if clicks >= max_clicks:
                print(f"\n⛔ Stopping: reached the limit of {max_clicks} 'Show more' clicks")
                break

            # Try to click "Show more reviews" button
            try:
                if not _click_show_more(driver):
                    print("\n⛔ No 'Show more' button found - reached end of reviews")
                    break
            except TimeoutException:
                print("\n⛔ Timeout waiting for 'Show more' button - no more reviews available")
                break
            except Exception as e:
                print(f"\n⚠ Error clicking button: {e}")
                break

            clicks += 1
            print(f"  ✓ Clicked 'Show more' button (click #{clicks})")

            # Wait for new content to load
            time.sleep(random.uniform(2, 4))

        print(f"\n{'='*60}")
        print(f"✓ Successfully scraped {len(reviews_list)} unique reviews")
        print(f"  Total button clicks: {clicks}")
        print(f"{'='*60}")

        return reviews_list

    except Exception as e:
        print(f"\n✗ Error scraping reviews: {e}")
        return reviews_list

In [None]:
# Test with a single book first
url = "https://www.goodreads.com/book/show/68427.Elantris"

# Initialize the driver
print("Initializing Chrome WebDriver...")
driver = setup_driver()

try:
    # Get book details
    book_details = get_book_details_selenium(driver, url)

    if book_details:
        print("\nBook Details:")
        print(pd.Series(book_details))

        # Get reviews (try to get 100 for testing, then you can increase to 1000)
        print("\n" + "="*60)
        print("Starting review scraping...")
        print("="*60)
        reviews = get_reviews_selenium(driver, url, num_reviews=100, max_clicks=10)

        # Convert to DataFrame. The columns are named explicitly so that an
        # empty result still has a 'review_text' column; without this, the
        # derived-column step below raises KeyError when nothing was scraped.
        reviews_df = pd.DataFrame(reviews, columns=['rating', 'review_text', 'likes'])

        # Add additional columns
        reviews_df['book_title'] = book_details.get('title', 'Unknown')
        reviews_df['review_length'] = reviews_df['review_text'].apply(lambda x: len(str(x)))
        reviews_df['word_count'] = reviews_df['review_text'].apply(lambda x: len(str(x).split()))

        # Reorder columns
        reviews_df = reviews_df[['rating', 'review_text', 'likes', 'review_length', 'book_title', 'word_count']]

        # Display results
        print("\n" + "="*60)
        print("RESULTS SUMMARY")
        print("="*60)
        print(f"Total reviews collected: {len(reviews_df)}")
        print(f"\nColumns: {list(reviews_df.columns)}")
        print(f"\nFirst 5 reviews:")
        print(reviews_df.head())
        print(f"\nData types:")
        print(reviews_df.dtypes)
    else:
        print("Failed to retrieve book details.")

finally:
    # Always close the driver, even if scraping raised
    print("\nClosing browser...")
    driver.quit()
    print("Done!")

In [None]:
# Write the single-book test output to CSV, if the test cell produced any rows
if 'reviews_df' not in locals() or reviews_df.empty:
    print("No data to save")
else:
    # One-row frame holding the book's metadata
    book_df = pd.DataFrame([book_details])
    book_df.to_csv('elantris_details_selenium.csv', index=False)
    reviews_df.to_csv('elantris_reviews_selenium.csv', index=False)
    print("✓ Data saved to CSV files")

## Full Scraper - All 57 Books

Once you've tested the scraper above and confirmed it works, run this cell to scrape all books.

In [None]:
# URLs for all Brandon Sanderson books (57 Goodreads book pages).
# Each URL has the form .../book/show/<numeric id><separator><slug>.
# NOTE: some slugs repeat under different ids ("dark-one" twice, "legion"
# twice), so the slug alone does not uniquely identify an entry.
urls = [
    "https://www.goodreads.com/book/show/68427.Elantris",
    "https://www.goodreads.com/book/show/1268479.Warbreaker",
    "https://www.goodreads.com/book/show/28862254-white-sand-volume-1",
    "https://www.goodreads.com/book/show/33551363-white-sand-volume-2",
    "https://www.goodreads.com/book/show/39298848-white-sand-volume-3",
    "https://www.goodreads.com/book/show/60696519-white-sand-omnibus",
    "https://www.goodreads.com/book/show/28595941-arcanum-unbounded",
    "https://www.goodreads.com/book/show/13578175-the-emperor-s-soul",
    "https://www.goodreads.com/book/show/68428.Mistborn",
    "https://www.goodreads.com/book/show/68429.The_Well_of_Ascension",
    "https://www.goodreads.com/book/show/2767793-the-hero-of-ages",
    "https://www.goodreads.com/book/show/28698036-secret-history",
    "https://www.goodreads.com/book/show/10803121-the-alloy-of-law",
    "https://www.goodreads.com/book/show/16065004-shadows-of-self",
    "https://www.goodreads.com/book/show/18739426-the-bands-of-mourning",
    "https://www.goodreads.com/book/show/23947089-the-lost-metal",
    "https://www.goodreads.com/book/show/7235533-the-way-of-kings",
    "https://www.goodreads.com/book/show/17332218-words-of-radiance",
    "https://www.goodreads.com/book/show/34703445-edgedancer",
    "https://www.goodreads.com/book/show/34002132-oathbringer",
    "https://www.goodreads.com/book/show/54511226-dawnshard",
    "https://www.goodreads.com/book/show/49021976-rhythm-of-war",
    "https://www.goodreads.com/book/show/203578847-wind-and-truth",
    "https://www.goodreads.com/book/show/17182126-steelheart",
    "https://www.goodreads.com/book/show/18966322-mitosis",
    "https://www.goodreads.com/book/show/15704459-firefight",
    "https://www.goodreads.com/book/show/15704486-calamity",
    "https://www.goodreads.com/book/show/58419574-lux",
    "https://www.goodreads.com/book/show/13552643-defending-elysium",
    "https://www.goodreads.com/book/show/36642458-skyward",
    "https://www.goodreads.com/book/show/42769202-starsight",
    "https://www.goodreads.com/book/show/57903876-sunreach",
    "https://www.goodreads.com/book/show/57903879-redawn",
    "https://www.goodreads.com/book/show/58465495-evershore",
    "https://www.goodreads.com/book/show/57571215-cytonic",
    "https://www.goodreads.com/book/show/43606308-defiant",
    "https://www.goodreads.com/book/show/60531406-tress-of-the-emerald-sea",
    "https://www.goodreads.com/book/show/60531410-the-frugal-wizard-s-handbook-for-surviving-medieval-england",
    "https://www.goodreads.com/book/show/60531416-yumi-and-the-nightmare-painter",
    "https://www.goodreads.com/book/show/60531420-the-sunlit-man",
    "https://www.goodreads.com/book/show/210300489-isles-of-the-emberdark",
    "https://www.goodreads.com/book/show/49798827-dark-one",
    "https://www.goodreads.com/book/show/60373696-dark-one",
    "https://www.goodreads.com/book/show/54615879-the-original",
    "https://www.goodreads.com/book/show/40590407-alcatraz-vs-the-evil-librarians",
    "https://www.goodreads.com/book/show/3485562-alcatraz-versus-the-scrivener-s-bones",
    "https://www.goodreads.com/book/show/6366110-alcatraz-versus-the-knights-of-crystallia",
    "https://www.goodreads.com/book/show/7740659-alcatraz-versus-the-shattered-lens",
    "https://www.goodreads.com/book/show/26114421-the-dark-talent",
    "https://www.goodreads.com/book/show/59808314-bastille-vs-the-evil-librarians",
    "https://www.goodreads.com/book/show/13452375-legion",
    "https://www.goodreads.com/book/show/20886354-skin-deep",
    "https://www.goodreads.com/book/show/37640636-lies-of-the-beholder",
    "https://www.goodreads.com/book/show/39332065-legion",
    "https://www.goodreads.com/book/show/8562526-firstborn",
    "https://www.goodreads.com/book/show/25188109-perfect-state",
    "https://www.goodreads.com/book/show/31176804-snapshot"
]

# Initialize the driver
print("Initializing Chrome WebDriver...")
driver = setup_driver()

all_books_data = []    # one details dict per successfully scraped book
all_reviews_data = []  # one reviews DataFrame per successfully scraped book
failed_urls = []       # URLs whose details or reviews could not be scraped
used_file_stems = set()  # per-book file name stems already written in this run

try:
    for i, url in enumerate(urls, 1):
        # Be polite - wait between books. Doing this at the top of the
        # iteration means the pause also happens after a failed book;
        # previously every failure path skipped it and the next request was
        # sent immediately.
        if i > 1:
            wait_time = random.uniform(5, 10)
            print(f"\n⏳ Waiting {wait_time:.1f} seconds before next book...")
            time.sleep(wait_time)

        print(f"\n{'='*80}")
        print(f"Processing Book {i}/{len(urls)}")
        print(f"URL: {url}")
        print(f"{'='*80}")

        try:
            # Get book details
            book_details = get_book_details_selenium(driver, url)

            if not book_details:
                print(f"⚠ Skipping {url} - could not fetch book details")
                failed_urls.append(url)
                continue

            # Get reviews
            reviews = get_reviews_selenium(driver, url, num_reviews=1000, max_clicks=50)

            if not reviews:
                print(f"⚠ No reviews found for {book_details.get('title', 'Unknown')}")
                failed_urls.append(url)
                continue

            # Convert reviews to DataFrame and add the derived columns
            reviews_df = pd.DataFrame(reviews)
            reviews_df['book_title'] = book_details.get('title', 'Unknown')
            reviews_df['review_length'] = reviews_df['review_text'].apply(lambda x: len(str(x)))
            reviews_df['word_count'] = reviews_df['review_text'].apply(lambda x: len(str(x).split()))
            reviews_df = reviews_df[['rating', 'review_text', 'likes', 'review_length', 'book_title', 'word_count']]

            # Build the per-book file name stem from the title. Two books can
            # share a title, which would silently overwrite the earlier
            # book's CSVs, so a repeated stem gets the numeric id from the
            # URL (or the loop index as a fallback) appended.
            safe_title = re.sub(r'[^a-zA-Z0-9_-]', '_', book_details.get('title', 'unknown'))
            if safe_title in used_file_stems:
                id_match = re.search(r'/show/(\d+)', url)
                safe_title = f"{safe_title}_{id_match.group(1) if id_match else i}"
            used_file_stems.add(safe_title)

            # Save individual book files
            book_df = pd.DataFrame([book_details])
            book_df.to_csv(f'{safe_title}_details.csv', index=False)
            reviews_df.to_csv(f'{safe_title}_reviews.csv', index=False)

            # Add to combined lists
            all_books_data.append(book_details)
            all_reviews_data.append(reviews_df)

            print(f"\n✓ Successfully saved {safe_title} ({len(reviews_df)} reviews)")

        except Exception as e:
            # One bad book must not abort the whole run; record it and move on.
            print(f"\n✗ Error processing {url}: {e}")
            failed_urls.append(url)
            continue

    # Save combined files
    if all_books_data:
        combined_books = pd.DataFrame(all_books_data)
        combined_books.to_csv('Combined_Details_Selenium.csv', index=False)
        print(f"\n✓ Saved combined book details ({len(combined_books)} books)")

    if all_reviews_data:
        combined_reviews = pd.concat(all_reviews_data, ignore_index=True)
        combined_reviews.to_csv('Combined_Reviews_Selenium.csv', index=False)
        print(f"✓ Saved combined reviews ({len(combined_reviews)} reviews)")

    # Summary
    print(f"\n{'='*80}")
    print("FINAL SUMMARY")
    print(f"{'='*80}")
    print(f"Total books processed: {len(all_books_data)}/{len(urls)}")
    print(f"Total reviews collected: {sum(len(df) for df in all_reviews_data)}")
    print(f"Failed URLs: {len(failed_urls)}")
    if failed_urls:
        print("\nFailed URLs:")
        for failed_url in failed_urls:
            print(f"  - {failed_url}")

finally:
    # Always close the browser, even if the run was interrupted
    print("\n\nClosing browser...")
    driver.quit()
    print("✓ Done!")