In [2]:
import requests
import json
import csv
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import os
import string
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from tqdm.auto import tqdm
from ratelimit import limits, sleep_and_retry
from dotenv import load_dotenv
load_dotenv();

In [4]:
# Register tqdm with pandas so that .progress_apply() is available on DataFrames/Series
tqdm.pandas()

In [5]:
# Load the book-level dataset; the first CSV column becomes the index
books = pd.read_csv('no_null_prosecraft.csv',index_col=0)

Let's do a bit of exploring! Some books could not be found on Goodreads on my first pass. They have an empty genre list (`[]`) and a null year.

Some other books were found on Goodreads, as evidenced by the year showing up in the dataframe, but Goodreads did not have data on their genres. 

In [6]:
# Build each condition once, then count the rows selected by each combination
has_empty_genre = books['genre'] == '[]'
year_missing = books['year'].isna()
no_goodreads_count = (has_empty_genre & year_missing).sum()
no_genres_count = (has_empty_genre & ~year_missing).sum()
print(f'''Of the {len(books)} books we have data for, we were unable to find {no_goodreads_count} of them on goodreads on our first try ({round(no_goodreads_count/len(books)*100,1)}%). 
In this notebook, we'll try again. 
{no_genres_count} ({round(no_genres_count/len(books)*100,1)}%) of the books were found, but had no listed genres. 
Since genre is our target feature, these books will have to be dropped. ''')

Of the 24857 books we have data for, we were unable to find 2362 of them on goodreads on our first try (9.5%). 
In this notebook, we'll try again. 
2157 (8.7%) of the books were found, but had no listed genres. 
Since genre is our target feature, these books will have to be dropped. 


In [7]:
# Select the books that were found on Goodreads (year present) but have an empty genre list
found_without_genre = (books['genre'] == '[]') & books['year'].notna()
to_drop = books[found_without_genre]

In [8]:
# First half of an anti-join: with no `on=` given, merge() joins on every column the two
# frames share, and indicator=True adds a `_merge` column ('both' for rows that also
# appear in to_drop, 'left_only' for the rest).
books = books.merge(to_drop.drop_duplicates(), how='left',indicator=True)

In [9]:
# Peek at the merge result, including the new `_merge` indicator column
books.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year,_merge
0,The Vanished Birds,Simon Jimenez,124205.0,55.18,6.37,1.95,0.36,1.58,"['Science Fiction', 'Fiction', 'Fantasy', 'Que...",2020.0,left_only
1,The Price of Honor,Jonathan P. Brazee,77253.0,35.35,8.71,2.63,0.71,1.92,['Science Fiction'],2017.0,left_only
2,The Mathematical Murder of Innocence,Michael Carter,37688.0,24.08,8.11,4.13,1.56,2.58,[],2020.0,both
3,The Case of the Baker Street Irregulars,Anthony Boucher,80557.0,32.33,8.41,3.72,1.64,2.08,"['Mystery', 'Fiction', 'Crime', 'Humor', 'Clas...",1940.0,left_only
4,Zombie Nation,Charlie Dalton,64396.0,51.11,8.22,2.21,0.58,1.63,[],2020.0,both


In [10]:
# Second half of the anti-join: keep rows that did not match to_drop, then discard the helper column
not_in_to_drop = books['_merge'] == 'left_only'
books = books.loc[not_in_to_drop].drop(columns='_merge')

In [11]:
# Sanity check: count books that still have a year but an empty genre list
np.sum((books['genre'] == '[]') & books['year'].notna())

0

In [12]:
# Books with an empty genre list and no year: these are the ones to search for again
never_found = (books['genre'] == '[]') & books['year'].isna()
to_retry = books[never_found]

In [13]:
# Preview the first few books queued for a second search
to_retry.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year
19,The Teacher,Michael Ben-Naftali,50233.0,37.06,7.2,2.93,1.25,1.68,[],
20,Ryan's Christmas,L. J. Ross,41547.0,41.47,8.49,2.84,0.94,1.91,[],
53,The Very Best of Caitlin R. Kiernan,Caitlin R. Kiernan,165604.0,62.2,7.67,3.2,0.9,2.3,[],
55,Christmas Pig,J. K. Rowling,51218.0,61.1,7.93,3.17,0.81,2.36,[],
63,Fatal Love,Michael Patterson,85334.0,18.0,11.14,5.15,2.14,3.01,[],


In [14]:
# Number of books queued for a second search
len(to_retry)

2362

In [112]:
#Fill in my Goodreads username and password from the .env
# NOTE(review): 'USER' is also a standard login-name variable on Linux/macOS, and
# load_dotenv() does not override variables that are already set, so on those systems
# this may pick up the OS user name instead of the Goodreads one.
user_name = os.environ.get('USER')
password = os.environ.get('PASSWORD')

#This is just the URL I get when I go to goodreads and select log in by email.
login_url = os.environ.get('URL')

# Fail here with a clear message instead of later inside Selenium with a confusing one
missing_settings = [
    name
    for name, value in (('USER', user_name), ('PASSWORD', password), ('URL', login_url))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(missing_settings)}. Add them to the .env file."
    )

In [113]:
#Here I start up a headless Firefox browser through Selenium
s = Service("geckodriver.exe")
opts=Options()
opts.add_argument('-headless')
# The options object must be handed to the driver; otherwise '-headless' is never applied
browser = webdriver.Firefox(service=s, options=opts)
browser.get(login_url)

In [114]:
# Here I log into goodreads
# Fill in the e-mail field, then the password field, and submit the form from the latter
log_email = browser.find_element(By.ID, "ap_email")
log_email.send_keys(user_name)
log_pwd = browser.find_element(By.ID, "ap_password")
log_pwd.send_keys(password)
log_pwd.submit()
# Pause before continuing (presumably to let the login finish)
sleep(5)

The get_genres_legacy() function was my first, somewhat involved attempt to find books the original function had missed. It improved upon the original function by not giving up after the first page of results, checking the second and third pages too. However, I later discovered that searching for the title and author together has a much higher success rate and is much faster.

Of the first 5 books I tried, the legacy function managed to find one of them in about 5 minutes.

The new, simpler function managed to find 4 of them, in less than a minute and a half.

In [117]:
def _legacy_scrape_results():
    '''Return (first_line, second_line, cover_src) for each book row on the current results page.'''
    rows = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
    covers = browser.find_elements(By.CLASS_NAME, value="bookCover")
    results = []
    # zip() stops at the shorter list, so a row without a cover cannot raise IndexError
    for row, cover in zip(rows, covers):
        lines = row.text.split('\n')
        # Rows with fewer than two text lines cannot be matched against title/author
        if len(lines) >= 2:
            results.append((lines[0], lines[1], cover.get_property("src")))
    return results


def _legacy_search(query):
    '''Type `query` into the Goodreads search box and return the scraped first page of results.'''
    browser.get('http://www.goodreads.com/search?q=&qid=')
    search_box = browser.find_element(By.ID, value='search_query_main')
    search_box.send_keys(query)
    search_box.submit()
    sleep(random.uniform(3,10))
    return _legacy_scrape_results()


def _legacy_first_match(results, is_match):
    '''Return the book id taken from the cover URL of the first matching result, or -1.'''
    for result in results:
        if is_match(result):
            # The id is the file name of the cover image, without its extension
            return result[2].split('/')[-1].split('.')[0]
    return -1


def _legacy_scan_next_pages(is_match, pages=2):
    '''Click through up to `pages` further result pages; return the first matching book id, or -1.'''
    for _ in range(pages):
        sleep(random.uniform(3,10))
        try:
            next_page = browser.find_element(By.CLASS_NAME, value="next_page")
        except NoSuchElementException:
            # No further pages of results
            break
        next_page.click()
        book_id = _legacy_first_match(_legacy_scrape_results(), is_match)
        if book_id != -1:
            # Stop paging as soon as a match is found so a later page cannot overwrite it
            return book_id
    return -1


def get_genres_legacy(title, author):
    '''Replaced by a much simpler get_genres, this function searches the first three pages
    of title results and, if unsuccessful, searches the first three pages of author results.

    Parameters:
        title: book title, used as the first search query.
        author: author string; only the part before the first ' &' is used for matching.

    Returns:
        A (genres, year) tuple, where genres is a list of genre names scraped from the
        book page and year is an int, or np.nan when it cannot be parsed. Returns
        ([], np.nan) when no matching book is found.
    '''
    #Use only first author if multiple
    first_author = author.split(' &')[0]

    def by_author(result):
        return f"by {first_author}" in result[1]

    def by_title(result):
        return title in result[0]

    #Put the title into the search bar, retrying up to three times if nothing comes back
    results = []
    for _ in range(3):
        results = _legacy_search(title)
        if results:
            break

    #Search the first page of results for the author's name
    book_id = _legacy_first_match(results, by_author)

    #Now let's check the second and third page of results.
    if book_id == -1:
        book_id = _legacy_scan_next_pages(by_author)

    #Sometimes, a book's title is so common that the correct version isn't on the three pages
    #If that happens, try putting the author into the search bar
    #And searching the first page for the correct title
    if book_id == -1:
        book_id = _legacy_first_match(_legacy_search(author), by_title)

    #If that doesn't work, check the second and third author page.
    if book_id == -1:
        book_id = _legacy_scan_next_pages(by_title)

    if book_id == -1:
        return ([], np.nan)

    browser.get(f'https://www.goodreads.com/book/show/{book_id}')
    # Read the genre text right away, before the pause, so the elements cannot go stale
    genre_names = [
        genre.text
        for genre in browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    ]
    try:
        datestring = browser.find_element(By.CSS_SELECTOR, ".FeaturedDetails > p:nth-child(2)")
        # The year is taken from the last four characters of the publication line
        year = int(datestring.text[-4:])
    except (NoSuchElementException, ValueError):
        year = np.nan
    sleep(random.uniform(3,10))
    return (genre_names, year)

In [213]:
def _search_title_and_author(query):
    '''Submit `query` on the Goodreads search page and scrape the first page of results.

    Returns a list of (first_line, second_line, cover_src) tuples, one per result row
    that has at least two lines of text and a cover image.
    '''
    search_url = 'http://www.goodreads.com/search?q=&qid='
    browser.get(search_url)
    try:
        search_box = browser.find_element(By.ID, value='search_query_main')
    except NoSuchElementException:
        # The search box was not found; wait, reload once and look again
        sleep(random.uniform(3,10))
        browser.get(search_url)
        search_box = browser.find_element(By.ID, value='search_query_main')
    search_box.send_keys(query)
    search_box.submit()
    sleep(random.uniform(3,10))
    rows = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
    covers = browser.find_elements(By.CLASS_NAME, value="bookCover")
    results = []
    # zip() stops at the shorter list, so a row without a cover cannot raise IndexError
    for row, cover in zip(rows, covers):
        lines = row.text.split('\n')
        if len(lines) >= 2:
            results.append((lines[0], lines[1], cover.get_property("src")))
    return results


# NOTE(review): sleep_and_retry only reacts to the RateLimitException raised by @limits;
# with no @limits decorator underneath, it currently has no effect.
@sleep_and_retry
def get_genres(index):
    '''Search Goodreads for the book at `books.loc[index]` and fill in its genre and year.

    The title and author are searched together (up to three attempts). If nothing comes
    back, one more search is made with only the author's first and last name. The first
    result whose second text line contains the author's last name is used.

    Parameters:
        index: index label of the book in the module-level `books` DataFrame.

    Returns:
        A copy of the row. When a match is found, 'genre' is a list of genre names and
        'year' is an int, or np.nan when it cannot be parsed. When no match is found,
        the row is returned unchanged.
    '''
    row = books.loc[index].copy()
    title = row['title']
    author = row['author'].replace(' & ', ' ')

    #Put the title and author into the search bar
    results = []
    for _ in range(3):
        results = _search_title_and_author(title + ' ' + author)
        if results:
            break

    #Sometimes middle initials mess up the search. Try again with just first and last name.
    name_parts = author.split()
    if not results and name_parts:
        author = name_parts[0] + ' ' + name_parts[-1]
        results = _search_title_and_author(title + ' ' + author)

    book_id = -1
    # An empty author string gives no last name to match on, so there is nothing to look for
    if name_parts:
        last_name = name_parts[-1]
        for _, byline, cover_src in results:
            if last_name in byline:
                # The id is the file name of the cover image, without its extension
                book_id = cover_src.split('/')[-1].split('.')[0]
                break

    if book_id == -1:
        return row

    browser.get(f'https://www.goodreads.com/book/show/{book_id}')
    # Read the genre text right away, before the pause, so the elements cannot go stale
    genre_names = [
        genre.text
        for genre in browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    ]
    try:
        datestring = browser.find_element(By.CSS_SELECTOR, ".FeaturedDetails > p:nth-child(2)")
        # The year is taken from the last four characters of the publication line
        year = int(datestring.text[-4:])
    except (NoSuchElementException, ValueError):
        year = np.nan
    sleep(random.uniform(3,10))
    row['genre'] = genre_names
    row['year'] = year
    return row

In [116]:
# Try the legacy search on the first five unmatched books (progress_apply shows a tqdm bar)
test_genres_legacy = to_retry.head().progress_apply(lambda row: get_genres_legacy(row['title'], row['author']),axis=1)
test_genres_legacy

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:51<00:00, 58.28s/it]


In [118]:
# Show the (genres, year) tuples returned by the legacy search; new_df is not defined until a later cell
test_genres_legacy

19                                            ([], nan)
20    ([Mystery, Crime, Fiction, Christmas, Audioboo...
53                                            ([], nan)
55                                            ([], nan)
63                                            ([], nan)
dtype: object

In [134]:
# Try the simpler search on the same five books. get_genres() looks the book up in
# `books` by index label, so it is given row.name rather than the title and author.
test_genres_new = to_retry.head().progress_apply(lambda row: get_genres(row.name), axis=1)
test_genres_new

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:23<00:00, 16.79s/it]


In [154]:
# get_genres() takes a single index label (it reads title and author from `books` itself)
new_df = to_retry.head(2).progress_apply(lambda row: get_genres(row.name), axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:28<00:00, 14.10s/it]


In [167]:
# get_genres() looks each book up in `books` by index label, so pass row.name rather than the row
results = new_df.progress_apply(lambda row: get_genres(row.name), axis=1)

In [204]:
# Accumulator for the re-scraped rows; re-running this cell discards anything collected so far
full = pd.DataFrame()

In [None]:
# Re-scrape to_retry in fixed-size chunks, adding each finished chunk to `full` so that
# everything fetched so far is kept if a later chunk fails.
# `tqdm` here is the class imported from tqdm.auto, so it is called directly.
CHUNK_SIZE = 50
for start in tqdm(range(0, len(to_retry), CHUNK_SIZE)):
    part = to_retry.iloc[start:start + CHUNK_SIZE]
    part = part.apply(lambda row: get_genres(row.name), axis=1)
    full = pd.concat([full, part])
    

In [177]:
# Split the unmatched books into 50 roughly equal parts so scraping can be resumed part by part
splits = np.array_split(to_retry, 50)
# Indices of the parts that have already been scraped and added to `full`
successful_splits = []

In [217]:
# Start from the first split not yet recorded in successful_splits, so this cell can be
# re-run after an interruption without repeating finished splits.
for i in tqdm(range(len(successful_splits), len(splits))):
    part = splits[i].apply(lambda row: get_genres(row.name), axis=1)
    full = pd.concat([full, part])
    # Mark the split as done only after its rows have been added to `full`
    successful_splits.append(i)

0it [00:00, ?it/s]

In [1]:
# Disabled: appends `full` to has_goodreads.csv.
# NOTE(review): in append mode, to_csv's default header=True writes the header row again on every run.
#with open('has_goodreads.csv', 'a') as file:
#    full.to_csv(file)

In [53]:
# get_genres() returns the whole row with 'genre' and 'year' filled in, so copy those
# two columns back instead of unpacking the result into two targets.
retried = new_df.progress_apply(lambda row: get_genres(row.name), axis=1)
new_df = new_df.assign(genre=retried['genre'], year=retried['year'])

In [52]:
# NOTE(review): test_assignment is not defined in the visible part of this notebook; define it before running this cell
new_df.progress_apply(lambda row: test_assignment(row.name), axis=1)

In [215]:
# Number of rows collected in `full` so far
len(full)

1978