This Notebook contains my quest to get genres out of Goodreads.
First, we set up the webscraping.

In [1]:
from dotenv import load_dotenv
load_dotenv()
import os
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

In [2]:
# Goodreads credentials, read from the environment (populated from .env by load_dotenv()).
# NOTE(review): 'USER' is normally already set to the OS login name on Linux/macOS, and
# load_dotenv() does not override existing variables by default — confirm the .env value
# is really the one that ends up here.
user_name = os.getenv('USER')
password = os.getenv('PASSWORD')

# Address of the Goodreads "log in by email" page, also stored in the environment.
login_url = os.getenv('URL')

In [3]:
# Start a headless Firefox browser through Selenium and open the login page.
s = Service("geckodriver.exe")  # relative path to the geckodriver executable
opts = Options()
opts.add_argument('-headless')
# The options have to be handed to the driver; otherwise the '-headless' flag
# set above is never applied and a visible Firefox window is opened.
browser = webdriver.Firefox(service=s, options=opts)
browser.get(login_url)

In [4]:
# Log into Goodreads: fill in the e-mail and password fields, then submit the form.
log_email = browser.find_element(By.ID, "ap_email")
log_pwd = browser.find_element(By.ID, "ap_password")
log_email.send_keys(user_name)
log_pwd.send_keys(password)
log_pwd.submit()
# Fixed pause so the post-login page can load before the next cell runs.
sleep(5)

This is where things get shaky. I'm able to make this work for individual titles, but as soon as I put it into the function, the itemqueue ends up an empty list. Sometimes even when it's not in a function, it's an empty list. I haven't quite been able to figure out what triggers it to actually work. 

In [5]:
# Search Goodreads for one title by typing it into the search box and submitting.
title = "The Thief"
browser.get('http://www.goodreads.com/search?q=&qid=')
search_book = browser.find_element(By.ID, 'search_query_main')
search_book.send_keys(title)
search_book.submit()
# Fixed pause while the results page loads.
sleep(5)

This has a bit more than I need (I got it from a tutorial) but when I try to take anything out, I start getting an empty list. So I removed the star ratings and called it good.

In [6]:
# Every search hit is a table row tagged with the schema.org Book itemtype.
itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
# Cover thumbnails, paired with the rows by position.
# NOTE(review): assumes one cover per result row, in the same order — confirm.
img = browser.find_elements(By.CLASS_NAME, value="bookCover")

# Visible text of each row, split into lines: line 0 is the title and
# line 1 the "by <author>" byline.
book_list = [row.text.split('\n') for row in itemqueue]

# Always build a fresh list, so an empty search yields [] instead of a
# NameError or a stale list left over from an earlier run of this cell.
# zip() stops at the shorter sequence, so a missing cover cannot raise
# IndexError; rows with fewer than two text lines are skipped.
book_list_ap = [
    (lines[0], lines[1], cover.get_property("src"))
    for lines, cover in zip(book_list, img)
    if len(lines) >= 2
]

Here's what that "book_list_ap" variable looks like. Oddly, the cover jpg is the most useful part, since it contains the book's ID, which you need to get the URL of the book's page.

Here you can see a number of books with "The Thief" in the name.

In [7]:
# Show the first four (title, byline, cover URL) tuples.
book_list_ap[:4]

[('The Lightning Thief (Percy Jackson and the Olympians, #1)',
  'by Rick Riordan (Goodreads Author)',
  'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1400602609i/28187._SY75_.jpg'),
 ('The Book Thief',
  'by Markus Zusak (Goodreads Author)',
  'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1522157426i/19063._SY75_.jpg'),
 ("The Thief (The Queen's Thief, #1)",
  'by Megan Whalen Turner (Goodreads Author)',
  'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1427740839i/448873._SY75_.jpg'),
 ('The Kiss Thief',
  'by L.J. Shen (Goodreads Author)',
  'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1544101164i/41450662._SY75_.jpg')]

And here, I narrow it down to the only book written by the correct author, extract the book's ID from the cover JPG's URL, and use that ID to build the URL of the book's page.

In [8]:
# Take the first search result whose byline names the right author.
# book_id starts as None so that a failed match is detected here, rather than
# surfacing as a NameError or silently reusing an ID from an earlier run.
book_id = None
for book in book_list_ap:
    if "by Megan Whalen Turner" in book[1]:
        # The cover file name looks like "<book id>._SY75_.jpg", so the ID is
        # everything before the first dot of the last path segment.
        book_id = book[2].split('/')[-1].split('.')[0]
        break
if book_id is None:
    raise LookupError("No search result by Megan Whalen Turner was found")
book_url = f'https://www.goodreads.com/book/show/{book_id}'
book_url

'https://www.goodreads.com/book/show/448873'

In [9]:
# Open the page of the book picked out of the search results.
browser.get(book_url)

In [10]:
# Collect the elements whose class marks them as genre buttons on the book page.
genres = browser.find_elements(By.XPATH, "//span[contains(@class, 'BookPageMetadataSection__genreButton')]")

In [11]:
# Print the visible label of every genre element, one per line.
for genre in genres:
    print(genre.text)

Fantasy
Young Adult
Fiction
Adventure
Historical Fiction
Young Adult Fantasy
High Fantasy


Success! I have found the genres for one book. All I need is a way to automate this for books from the sample.

In [13]:
def get_genres(title: str, author: str, *, wait_seconds: float = 5) -> list:
    """Look up a book on Goodreads and return the genres listed on its page.

    Types ``title`` into the Goodreads search box, picks the first result
    whose byline contains ``by {author}``, opens that book's page and reads
    the text of its genre buttons. Uses the module-level ``browser``.

    Args:
        title: Text to type into the Goodreads search box.
        author: Author name as it appears after "by " in a result's byline.
        wait_seconds: Pause after submitting the search, to give the results
            page time to load. Defaults to the original 5 seconds.

    Returns:
        The genre labels in page order; empty if the book page shows none.

    Raises:
        LookupError: If no search result is attributed to ``author``; this
            includes the case where the search returned no results at all.
    """
    browser.get('http://www.goodreads.com/search?q=&qid=')
    search_book = browser.find_element(By.ID, value='search_query_main')
    search_book.send_keys(title)
    search_book.submit()
    # NOTE(review): a fixed pause may be too short on a slow page load, which
    # would leave the result list below empty — an explicit Selenium wait
    # might be more reliable; confirm.
    sleep(wait_seconds)

    # Every search hit is a table row tagged with the schema.org Book itemtype;
    # the cover thumbnails are paired with the rows by position.
    itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
    img = browser.find_elements(By.CLASS_NAME, value="bookCover")

    # book_id starts as None so that "nothing matched" is reported explicitly
    # instead of failing with UnboundLocalError further down.
    book_id = None
    # zip() stops at the shorter sequence, so a missing cover cannot raise
    # IndexError the way index-based pairing could.
    for row, cover in zip(itemqueue, img):
        lines = row.text.split('\n')
        # Line 0 of a row is the title, line 1 the "by <author>" byline.
        if len(lines) > 1 and f'by {author}' in lines[1]:
            # The cover file name looks like "<book id>._SY75_.jpg", so the ID
            # is everything before the first dot of the last path segment.
            book_id = cover.get_property("src").split('/')[-1].split('.')[0]
            break
    if book_id is None:
        raise LookupError(
            f'No Goodreads search result for {title!r} by {author!r} '
            f'({len(itemqueue)} result rows found)'
        )

    book_url = f'https://www.goodreads.com/book/show/{book_id}'
    browser.get(book_url)
    genres = browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    return [genre.text for genre in genres]

In [14]:
get_genres("The Thief", "Megan Whalen Turner")

['Fantasy',
 'Young Adult',
 'Fiction',
 'Adventure',
 'Historical Fiction',
 'Young Adult Fantasy',
 'High Fantasy']

In [15]:
browser.quit()