In [7]:
import requests
import json
import csv
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import os
import string
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from tqdm.auto import tqdm
from ratelimit import limits, sleep_and_retry
from dotenv import load_dotenv
import Levenshtein
load_dotenv();

In [16]:
tqdm.pandas()

In [17]:
with open('has_goodreads.csv') as file:
    books = pd.read_csv(file, index_col=0)

In [18]:
books.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year
19,The Teacher,Michael Ben-Naftali,50233.0,37.06,7.2,2.93,1.25,1.68,[],
20,Ryan's Christmas,L. J. Ross,41547.0,41.47,8.49,2.84,0.94,1.91,"['Mystery', 'Crime', 'Fiction', 'Christmas', '...",2019.0
53,The Very Best of Caitlin R. Kiernan,Caitlin R. Kiernan,165604.0,62.2,7.67,3.2,0.9,2.3,"['Horror', 'Short Stories', 'Fantasy', 'Fictio...",2019.0
55,Christmas Pig,J. K. Rowling,51218.0,61.1,7.93,3.17,0.81,2.36,"['Fantasy', 'Christmas', 'Fiction', 'Childrens...",2021.0
63,Fatal Love,Michael Patterson,85334.0,18.0,11.14,5.15,2.14,3.01,[],2019.0


In [19]:
#Check the length, check for duplicates.
print(f"This df has {len(books)} books of which {np.sum(books.duplicated())} are repeated.")

This df has 2288 books of which 0 are repeated.


Once again, we should check for books that have years but no genres--these have been found successfully but are missing the target feature, and must be dropped.

In [20]:
to_drop = books[(books['genre'] == '[]') & (books['year'].notna())]
print(f"{len(to_drop)} books are missing genres on Goodreads, or \
{round(len(to_drop)/len(books)*100,1)}% of our books.")

388 books are missing genres on Goodreads, or 17.0% of our books.


Wow! This is about twice the percentage of the books obtained on the first pass. But an examination of a sample of books in the to-drop list shows that Goodreads indeed show no genre data. It seemes plausible that a relationship between the ease of finding a book on a Goodreads search and the amount of data Goodreads has about the book would have reason to be correlated. 

In [21]:
to_drop.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year
63,Fatal Love,Michael Patterson,85334.0,18.0,11.14,5.15,2.14,3.01,[],2019.0
107,Child of the Night,Thomas Laird,93673.0,39.59,9.73,3.09,0.93,2.16,[],2022.0
144,The Tunnel,Baynard Kendrick,63974.0,53.65,8.93,3.64,1.37,2.26,[],2021.0
156,Danger Close,Travis Starnes,64975.0,26.42,10.41,3.15,1.09,2.06,[],2021.0
170,Be Still,Erik Carter,53957.0,52.31,7.53,2.89,0.99,1.9,[],2020.0


In [22]:
#repeated the dropping process from the previous notebook.
books = books.merge(to_drop.drop_duplicates(), how='left',indicator=True)
books.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year,_merge
0,The Teacher,Michael Ben-Naftali,50233.0,37.06,7.2,2.93,1.25,1.68,[],,left_only
1,Ryan's Christmas,L. J. Ross,41547.0,41.47,8.49,2.84,0.94,1.91,"['Mystery', 'Crime', 'Fiction', 'Christmas', '...",2019.0,left_only
2,The Very Best of Caitlin R. Kiernan,Caitlin R. Kiernan,165604.0,62.2,7.67,3.2,0.9,2.3,"['Horror', 'Short Stories', 'Fantasy', 'Fictio...",2019.0,left_only
3,Christmas Pig,J. K. Rowling,51218.0,61.1,7.93,3.17,0.81,2.36,"['Fantasy', 'Christmas', 'Fiction', 'Childrens...",2021.0,left_only
4,Fatal Love,Michael Patterson,85334.0,18.0,11.14,5.15,2.14,3.01,[],2019.0,both


In [23]:
books = books[books['_merge'] == 'left_only'].drop('_merge', axis=1)
books.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year
0,The Teacher,Michael Ben-Naftali,50233.0,37.06,7.2,2.93,1.25,1.68,[],
1,Ryan's Christmas,L. J. Ross,41547.0,41.47,8.49,2.84,0.94,1.91,"['Mystery', 'Crime', 'Fiction', 'Christmas', '...",2019.0
2,The Very Best of Caitlin R. Kiernan,Caitlin R. Kiernan,165604.0,62.2,7.67,3.2,0.9,2.3,"['Horror', 'Short Stories', 'Fantasy', 'Fictio...",2019.0
3,Christmas Pig,J. K. Rowling,51218.0,61.1,7.93,3.17,0.81,2.36,"['Fantasy', 'Christmas', 'Fiction', 'Childrens...",2021.0
5,The Institute,Jakub Zulczyk,66557.0,57.45,7.48,2.92,1.28,1.63,[],


In [24]:
#Next, let's extract the ones that need a third pass. 
to_retry = books[(books['genre'] == '[]') & (books['year'].isna())].reset_index(drop=True)
to_retry.head()

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genre,year
0,The Teacher,Michael Ben-Naftali,50233.0,37.06,7.2,2.93,1.25,1.68,[],
1,The Institute,Jakub Zulczyk,66557.0,57.45,7.48,2.92,1.28,1.63,[],
2,"Emotional Abuse, A Manual For Self-Defense","Zak Mucha, Joel Dvoskin & Marc MacYoung",69187.0,16.18,9.42,2.55,0.92,1.63,[],
3,Relentless,Scott Bartlett & Joshua James,70433.0,36.34,9.27,3.18,1.1,2.09,[],
4,Buying Time,E. M. Brown,96759.0,44.14,7.47,2.79,0.75,2.03,[],


In [25]:
len(to_retry)

523

From watching the Goodreads program run, I have reason to believe that many of the missing values are based on simple typos, such as Julie -> Julia, or Georgia -> Georgina. 

I have contacted the owner of Prosecraft, Benji Smith, and offered him a list of potential errors, so now I am doubly motivated to find them.

In [52]:
possible_errors = []

In [28]:
#Fill in my Goodreads username and password from the .env
user_name = os.environ.get('USER')
password = os.environ.get('PASSWORD')

#This is just the URL I get when I go to goodreads and select log in by email.
login_url = os.environ.get('URL')

In [29]:
#Here I start up a headless Firefox browser through Selenium
s = Service("geckodriver.exe")
opts=Options()
opts.add_argument('-headless')
browser = webdriver.Firefox(service=s)
browser.get(login_url)

In [30]:
# Here I log into goodreads
log_email = browser.find_element(By.ID, value="ap_email")
log_pwd = browser.find_element(By.ID, value="ap_password")
log_email.send_keys(user_name)
log_pwd.send_keys(password)
log_pwd.submit()
sleep(5)

In [31]:
def get_genres(index):
    book_id = -1
    itemqueue = []
    trials = 0
    row = books.loc[index].copy()
    title = row['title']
    author = row['author'].replace(' & ', ' ')
    possible_error = False
    
    #Put the title and author into the search bar
    while (len(itemqueue) == 0) and (trials <= 1):
        browser.get('http://www.goodreads.com/search?q=&qid=')
        try: 
            search_book = browser.find_element(By.ID, value='search_query_main')
        except NoSuchElementException: 
            sleep(random.uniform(3,10))
            browser.get('http://www.goodreads.com/search?q=&qid=')
            search_book = browser.find_element(By.ID, value='search_query_main')
        search_book.send_keys(title + ' ' + author)
        search_book.submit()
        sleep(random.uniform(3,10))
        itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
        img = browser.find_elements(By.CLASS_NAME, value="bookCover")
        trials += 1
        
    #Sometimes middle initials mess up the search. Try again with just first and last name.       
    if(len(itemqueue) == 0): 
        browser.get('http://www.goodreads.com/search?q=&qid=')
        author = author.split()[0] + ' ' + author.split()[-1]
        try: 
            search_book = browser.find_element(By.ID, value='search_query_main')
        except NoSuchElementException: 
            sleep(random.uniform(3,10))
            browser.get('http://www.goodreads.com/search?q=&qid=')
            search_book = browser.find_element(By.ID, value='search_query_main')
        search_book.send_keys(title + ' ' + author)
        search_book.submit()
        sleep(random.uniform(3,10))
        itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
        img = browser.find_elements(By.CLASS_NAME, value="bookCover")
        book_list = list()
        
    
    if(len(itemqueue) > 0):
        for i in range(len(itemqueue)):
            book_list.append(itemqueue[i].text.split('\n'))
            book_list_ap = list()
        for i in range(0, len(book_list)):
             book_list_ap.append((book_list[i][0],book_list[i][1],img[i].get_property("src")))
        for book in book_list_ap:
            if f"{author.split()[-1]}" in book[1]:
                book_id = book[2].split('/')[-1].split('.')[0]
                break
    
    if book_id == -1:
    
    book_url = f'https://www.goodreads.com/book/show/{book_id}'
    browser.get(book_url)
    genres = browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    try: 
        datestring = browser.find_element(By.CSS_SELECTOR, ".FeaturedDetails > p:nth-child(2)")
        year = int(datestring.text[-4:])
    except NoSuchElementException:
        year = np.nan
    except ValueError:
        year = np.nan
    
    #Keep a running list of possible typos as a list of tuples. 
    if possible_error:
        typo = row['author']
        try: 
            actual = browser.find_element(By.XPATH, value="//span[contains(@class, 'ContributorLink__name')]").text
        except:
            actual = '?'
        possible_errors.append((typo, actual))
        
    
    sleep(random.uniform(3,10))
    row['genre'] = [genre.text for genre in genres]
    row['year'] = year
    return row

IndentationError: expected an indented block after 'if' statement on line 56 (1113514771.py, line 58)

In [66]:
def get_genres_typos(index): 
    book_id = -1
    itemqueue = []
    row = to_retry.loc[index].copy()
    title = row['title']
    author = row['author'].replace(' & ', ' ')
    last_name = row['author'].split(' & ')[0].split()[-1]
    first_name = row['author'].split(' & ')[0].split()[0]
    possible_error = True
    book_list = list()
    

    browser.get('http://www.goodreads.com/search?q=&qid=')
    try: 
        search_book = browser.find_element(By.ID, value='search_query_main')
    except NoSuchElementException: 
        sleep(random.uniform(3,10))
        browser.get('http://www.goodreads.com/search?q=&qid=')
        search_book = browser.find_element(By.ID, value='search_query_main')
    search_book.send_keys(title + ' ' + author)
    search_book.submit()
    sleep(random.uniform(3,10))
    itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
    img = browser.find_elements(By.CLASS_NAME, value="bookCover")
    
    #Sometimes middle initials mess up the search. Try again with just first and last name.       
    if len(itemqueue) == 0: 
        browser.get('http://www.goodreads.com/search?q=&qid=')
        author = author.split()[0] + ' ' + author.split()[-1]
        try: 
            search_book = browser.find_element(By.ID, value='search_query_main')
        except NoSuchElementException: 
            sleep(random.uniform(3,10))
            browser.get('http://www.goodreads.com/search?q=&qid=')
            search_book = browser.find_element(By.ID, value='search_query_main')
        search_book.send_keys(title + ' ' + author)
        search_book.submit()
        sleep(random.uniform(3,10))
        itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
        img = browser.find_elements(By.CLASS_NAME, value="bookCover")
        sleep(random.uniform(3,10))

    #Try just searching for the last name, see if anything comes up.     
    if len(itemqueue) == 0:
        browser.get('http://www.goodreads.com/search?q=&qid=')
        try: 
            search_book = browser.find_element(By.ID, value='search_query_main')
        except NoSuchElementException: 
            sleep(random.uniform(3,10))
            browser.get('http://www.goodreads.com/search?q=&qid=')
            search_book = browser.find_element(By.ID, value='search_query_main')            
        
        search_book.send_keys(title + ' ' + last_name)
        search_book.submit()
        sleep(random.uniform(3,10))
        itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
        img = browser.find_elements(By.CLASS_NAME, value="bookCover")
        
        if len(itemqueue) == 0:
            try: 
                browser.get('http://www.goodreads.com/search?q=&qid=')
                search_book = browser.find_element(By.ID, value='search_query_main')
            except NoSuchElementException: 
                sleep(random.uniform(3,10))
                browser.get('http://www.goodreads.com/search?q=&qid=')
                search_book = browser.find_element(By.ID, value='search_query_main')
            search_book.send_keys(title + ' ' + first_name)
            search_book.submit()
            sleep(random.uniform(3,10))
            itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
            img = browser.find_elements(By.CLASS_NAME, value="bookCover")
                
    if len(itemqueue) > 0:
        for i in range(len(itemqueue)):
            book_list.append(itemqueue[i].text.split('\n'))
            book_list_ap = list()
        for i in range(0, len(book_list)):
             book_list_ap.append((book_list[i][0],book_list[i][1],img[i].get_property("src")))                
        
        for book in book_list_ap:
            #Check if exact first and last name are found in the author's name
            if last_name in book[1] and first_name in book[1]:
                book_id = book[2].split('/')[-1].split('.')[0]
                possible_error = False
                break
                
            #Check if first and last name with small typos are found in the author's name    
            else:
                lastname_found = False
                firstname_found = False
                for name in book[1].split():
                    if Levenshtein.distance(name, last_name) <= 2:
                        lastname_found = True
                    if Levenshtein.distance(name, first_name) <= 2:
                        firstname_found = True
                if lastname_found and firstname_found:
                    book_id = book[2].split('/')[-1].split('.')[0]
            
        
    if book_id == -1:
        row['genre'] = []
        row['year'] = np.nan
        return row
    
    book_url = f'https://www.goodreads.com/book/show/{book_id}'
    browser.get(book_url)
    genres = browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    try: 
        datestring = browser.find_element(By.CSS_SELECTOR, ".FeaturedDetails > p:nth-child(2)")
        year = int(datestring.text[-4:])
    except NoSuchElementException:
        year = np.nan
    except ValueError:
        year = np.nan
    
    #Keep a running list of possible typos as a list of tuples. 
    if possible_error:
        typo = row['author']
        try: 
            actual = browser.find_element(By.XPATH, value="//span[contains(@class, 'ContributorLink__name')]").text
        except:
            actual = '?'
        possible_errors.append((typo, actual))
        #print(f"Typo found! {typo} should be {actual}.")
        
    
    sleep(random.uniform(3,10))
    row['genre'] = [genre.text for genre in genres]
    row['year'] = year
    return row
        
        


        

In [68]:
next_trial = to_retry.progress_apply(lambda row: get_genres_typos(row.name), axis=1)

  0%|          | 0/523 [00:00<?, ?it/s]

In [70]:
next_trial.to_csv('typos_fixed.csv')

In [61]:
error_corrections = pd.DataFrame(possible_errors,columns=['Before','Corrected'])
#error_corrections.to_csv('error_corrections.csv')

In [87]:
fixed = next_trial[(next_trial['year'].notna())]
print(f"Of the {len(next_trial)} books processed this round, we were able to fix {len(fixed)}\
, or {round(len(fixed)/len(next_trial)*100, 1)}%!")

Of the 523 books processed this round, we were able to fix 202, or 38.6%!


In [64]:
next_trial = to_retry.head(2).progress_apply(lambda row: get_genres_typos(row.name), axis=1)

  0%|          | 0/2 [00:00<?, ?it/s]

Typo found! Michael Ben-Naftali should be Michal Ben-Naftali.
Typo found! Jakub Zulczyk should be Jakub Żulczyk.


In [110]:
books.iloc[0]['genre']

'[]'

In [108]:
next_trial.iloc[15]['genre']

[]