# 6. Collect ratings from Goodreads
This isn't a fully deterministic script. How reliably the scraper works depends on its connection to Goodreads, so it's necessary to collect ratings from clusters of books rather than trying to do everything at once. The script pauses whenever the connection breaks.

Once books have more than a few hundred ratings this won't collect all of them - the system for paging through reviews doesn't get along well with automated visits.

But the purpose isn't to collect all the data, which would be unfair use of the site anyway. The purpose is just to collect enough ratings to have a useful dataset.

In [None]:
import pickle
import sqlite3
import time
import pandas as pd

from bs4 import BeautifulSoup
from pathlib import Path
from tqdm import tqdm
from pathlib import Path

from selenium import webdriver  
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.chrome.options import Options  
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException

In [None]:
# Start with the existing book data and ratings database
data_dir = Path.cwd().parent / "data"
data = pd.read_csv(data_dir / "book_data_cut.csv")
conn = sqlite3.connect(data_dir / "book_ratings.db")

# Check how much is currently in the database.
# Each query returns a single row holding one count.
summary_queries = (
    ("Book count:", "SELECT COUNT(DISTINCT book_id) FROM book_ratings"),
    ("User count:", "SELECT COUNT(DISTINCT user_id) FROM book_ratings"),
    ("Ratings:", "SELECT COUNT(*) FROM book_ratings"),
)
try:
    for label, query in summary_queries:
        print(label, conn.execute(query).fetchone()[0])
finally:
    # Release the database file even if one of the queries fails
    conn.close()

In [None]:
# Goodreads describes each star rating with a phrase; map every phrase back
# to its 1-5 star value. The phrases are listed from one star up to five.
ratings = {
    phrase: stars
    for stars, phrase in enumerate(
        (
            "did not like it",
            "it was ok",
            "liked it",
            "really liked it",
            "it was amazing",
        ),
        start=1,
    )
}

# FUNCTIONS FOR SCRAPER
def initialise(to_do, chrome_options, base, limit):
    """Creates a connection and begins collecting ratings

    Books are popped from the end of to_do one at a time. The session ends
    when the list is empty or as soon as one book fails. Uses the global
    database connection `conn` and the global list `failures`.

    Parameters:
    -----------
    to_do (list):
        List of books in the form of Goodreads IDs
        Consumed in place - every book attempted is removed from the list

    chrome_options (Options):
        Selenium object containing options for the session

    base (str):
        The first part of the web address for a Goodreads book page
        This will always be https://www.goodreads.com/book/show/

    limit (int):
        Max number of ratings to collect per book

    Returns:
    --------
    1 if the session was abandoned (modal check timed out, or a book failed
    while others were still waiting), otherwise None
    """
    # No explicit driver path: chromedriver is looked up by the library,
    # which also keeps the call valid on Selenium releases that no longer
    # accept the path as a positional argument
    driver = webdriver.Chrome(options=chrome_options)
    pbar = None
    needs_pause = False

    try:
        # Load the first page and click past any modal that appears
        current = to_do.pop()
        driver.get(base + str(current))
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'modalOpened'))).click()
            driver.refresh()
        except TimeoutException:
            print("Failed on modal check:", str(current))
            _record_failure(current)
            return 1

        # Set up a progress bar (+1 for the book already popped)
        pbar = tqdm(total=len(to_do) + 1, initial=0, leave=True)

        # Work through the books, pausing appropriately
        # Commit to the database after each book's reviews are gathered
        # Add failures to a global list so I can check them
        while to_do:
            if not get_reviews(current, driver, limit, base):
                print("Failed - pausing")
                _record_failure(current)
                needs_pause = True
                break
            conn.commit()
            pbar.update(1)
            time.sleep(5)
            current = to_do.pop()
            driver.get(base + str(current))

        # The loop stops with the final book loaded but not yet collected
        if not needs_pause:
            if get_reviews(current, driver, limit, base):
                conn.commit()
                pbar.update(1)
            else:
                print("Failed on last one")
                _record_failure(current)
    finally:
        # Always shut the browser down, including on the early return above
        # and on unexpected errors, so Chrome processes don't pile up
        if pbar is not None:
            pbar.close()
        driver.quit()

    if needs_pause:
        # Wait with the browser already closed before the caller retries
        time.sleep(300)
        return 1


def _record_failure(book_id):
    """Add a book to the global failures list unless it is already there

    get_reviews may already have recorded the same book, so check first
    to keep the failure count honest.
    """
    if book_id not in failures:
        failures.append(book_id)
    
def get_reviews(book_id, driver, limit, base):
    """Starting with the webdriver already on a book's page, collect reviews page by page

    Uses the global lists `failures` and `skipped`.

    Parameters:
    -----------
    book_id:
        Goodreads ID of the book currently loaded in the driver

    driver:
        Selenium webdriver already showing the book's page

    limit (int):
        Stop paging once at least this many ratings have been collected

    base (str):
        Not used here - kept so the signature matches the existing call

    Returns:
    --------
    False if the reviews section never appeared (book added to failures)
    True otherwise, including when paging was abandoned part way through
    (book added to skipped)
    """
    try:
        # Look for the reviews section in the html
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "bookReviews")))
    except Exception as e:
        failures.append(book_id)
        print("Failed finding reviews:", str(book_id), str(type(e)))
        return False

    counter = 0
    while counter < limit:
        # Run subfunction to get the reviews
        counter += add_reviews_to_database(driver, book_id)
        time.sleep(3)
        try:
            # Look for a next page button - this part frequently goes wrong
            next_button = driver.find_element(By.CLASS_NAME, "next_page")
        except NoSuchElementException:
            break
        # Check for the class as a token rather than an exact string match
        # so attribute ordering or extra classes don't matter
        if "disabled" in (next_button.get_attribute("class") or "").split():
            break
        try:
            # Make sure the next page button has loaded before clicking.
            # Keep hold of the element itself: click() returns None, and the
            # staleness check below needs the real element.
            next_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "next_page")))
            next_button.click()
        except Exception:
            # Best effort: keep what has been collected so far for this book
            print("Failed on click - skipping further pages:", book_id)
            skipped.append(book_id)
            return True
        try:
            # Don't collect next page of reviews until the current one is gone
            WebDriverWait(driver, 10).until(EC.staleness_of(next_button))
        except TimeoutException:
            # The button never went stale - carry on and parse whatever is
            # on the page now
            pass
    return True

def add_reviews_to_database(driver, book_id):
    """Parse through a page and send ratings and user ids to the database

    Reads the driver's current page source, pulls a (book, user, rating)
    triple out of every review header that has both a star rating and a user
    link, and writes them through the global connection `conn`. The caller
    is responsible for committing.

    Parameters:
    -----------
    driver:
        Selenium webdriver showing a page of reviews

    book_id:
        Goodreads ID stored alongside each rating

    Returns:
    --------
    Number of ratings sent to the database from this page
    """
    doc = BeautifulSoup(driver.page_source, 'lxml')
    scores = []
    for review in doc.find_all(class_='reviewHeader'):
        stars = review.find(class_='staticStars')
        user_link = review.find(class_='user')
        # Reviews without a star rating or a user link carry nothing to store
        if not stars or not user_link:
            continue
        rating = ratings.get(stars.get('title'))
        href = user_link.get('href')
        # Skip unrecognised rating text rather than aborting the whole run
        if rating is None or not href:
            continue
        # hrefs are assumed to look like "/user/show/<id>-<name>".
        # partition keeps the whole id when there is no "-" at all, where
        # slicing with find() would chop off the final character.
        user_id = href[len("/user/show/"):].partition("-")[0]
        scores.append((book_id, user_id, rating))
    # Send to the database at the end of each page rather than one by one
    conn.executemany(
        "REPLACE INTO book_ratings VALUES(?, ?, ?);",
        scores
    )
    return len(scores)

In [None]:
# PREPARE BOOK LIST 
# The smaller the review numbers, the more books, so use a small range at first
min_reviews = 5
max_reviews = 1000

# Build the selection once so the book list and the summary below describe
# exactly the same set of books (both ends of the range are included)
in_range = data['Review count'].between(min_reviews, max_reviews)
to_do = data.loc[in_range, "Goodreads ID"].to_list()
failures = []
print("Books to scrape:", len(to_do))
print(
    "Max text reviews:", 
    data.loc[in_range, "Text review count"].max())

In [None]:
# COMPARE TO CURRENT DB
# Optional cell if I'm going over previously examined books, trying again on failures
conn = sqlite3.connect(data_dir / "book_ratings.db")
try:
    select = conn.execute("SELECT DISTINCT book_id FROM book_ratings")
    # A set makes each membership test below constant time
    done = {book[0] for book in select.fetchall()}
finally:
    # Only needed for this one query - close it rather than leaving it open
    # until the name is reassigned
    conn.close()

to_do = [book for book in to_do if book not in done]
print("Books not already scraped:", len(to_do))

In [None]:
# RUN THE SCRAPER
skipped = []
chrome_options = Options()  
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-extensions")

BASE = 'https://www.goodreads.com/book/show/'
LIMIT = 300

conn = sqlite3.connect(data_dir / "book_ratings.db")
try:
    # Stop if it fails too much.
    # initialise returns 1 when a session is abandoned and None when it runs
    # to the end of the list, so a clean finish is never counted as a failure
    fails = 0
    while to_do and fails < 6:
        if initialise(to_do, chrome_options, BASE, LIMIT):
            fails += 1

    print("Failures:", len(failures))
    select = conn.execute("SELECT COUNT(DISTINCT book_id) FROM book_ratings")
    for row in select:
        print("In database:", row[0])
finally:
    # Close the database even if a scraping session raises unexpectedly
    conn.close()