Putting it all together!

In [2]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import random
import os
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from dotenv import load_dotenv
# Load settings from the local .env file into os.environ.
# override=True lets the .env values win over variables that already exist in
# the process environment. This notebook reads a key named USER, which most
# Unix shells pre-define as the OS login name; with the default
# (override=False) that pre-existing value would silently be kept instead of
# the one written in .env.
load_dotenv(override=True)

True

In [21]:
def _prosecraft_url(title, author):
    '''Build the prosecraft.io library URL for a book from its title and author.'''
    # A slash inside a title (e.g. "11/22/63") would otherwise add a path segment.
    slug = f"{author}/{title.replace('/', '-')}/"
    # Strip punctuation before the domain is prepended, so the dots of the
    # host name are left alone.
    for char in (':', '’', '.', ',', '“', '”'):
        slug = slug.replace(char, '')
    slug = slug.replace('&', 'and').replace(' ', '-').lower()
    return "http://prosecraft.io/library/" + slug


def get_info(title, author, timeout=30):
    '''Given a title and author of a book in the list,
    returns a dictionary of prosecraft's analysis about the book.

    Parameters:
        title: book title as it appears in the book list.
        author: author name as it appears in the book list.
        timeout: seconds to wait for prosecraft.io before requests raises a
            timeout error (previously the call could block forever).

    Returns:
        A dict that always holds 'title' and 'author', plus one float entry
        per metric found on the page, keyed by the metric heading text.
        If the page contains no metric elements, only 'title' and 'author'
        are present.

    Raises:
        ValueError: if a metric value cannot be parsed as a number.
    '''
    info = {'title': title, 'author': author}

    #Get data from Prosecraft and turn it into a dict
    page = requests.get(_prosecraft_url(title, author), timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    headings = soup.find_all("div", {"class": "book-info-metric-heading"})
    values = soup.find_all("div", {"class": "book-info-metric-value"})
    for heading, value in zip(headings, values):
        # Trim whitespace first so a trailing '%' is removed even when the
        # element text is padded; commas are thousands separators.
        number_text = value.text.strip().strip('%').replace(',', '')
        info[heading.text] = float(number_text)
    return info

In [3]:
def _search_goodreads(query, wait_seconds=20):
    '''Type `query` into the Goodreads search page and return the first page of
    results as (title_line, author_line, cover_src) tuples.'''
    browser.get('http://www.goodreads.com/search?q=&qid=')
    search_box = browser.find_element(By.ID, value='search_query_main')
    search_box.send_keys(query)
    search_box.submit()
    # Fixed pause so the results page is loaded before it is read.
    sleep(wait_seconds)
    rows = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
    covers = browser.find_elements(By.CLASS_NAME, value="bookCover")

    results = []
    # zip() stops at the shorter list, so a missing cover image cannot raise
    # an IndexError.
    for row, cover in zip(rows, covers):
        lines = row.text.split('\n')
        cover_src = cover.get_property("src")
        # The first text line is matched against the title and the second
        # against the author; skip rows that do not have both or no cover URL.
        if len(lines) < 2 or not cover_src:
            continue
        results.append((lines[0], lines[1], cover_src))
    return results


def _book_id_from_cover(cover_src):
    '''Extract the book id from a cover image URL (file name up to the first dot).'''
    return cover_src.split('/')[-1].split('.')[0]


def get_genres(title, author, wait_seconds=20):
    '''Look up a book on Goodreads and return the list of its genre names.

    Uses the module-level Selenium `browser`, which must already be started.

    Parameters:
        title: book title to search for.
        author: author name; when several authors are joined with ' &',
            only the first one is used for matching.
        wait_seconds: pause after each page navigation before it is read.

    Returns:
        A list of genre strings, or an empty list when the book cannot be
        found on the first results page of either search.
    '''
    book_id = None

    #Put the title into the search bar and
    #search the first page of results for the author's name
    #Use only first author if multiple
    first_author = author.split(' &')[0]
    for _, author_line, cover_src in _search_goodreads(title, wait_seconds):
        if f"by {first_author}" in author_line:
            book_id = _book_id_from_cover(cover_src)
            break

    #Sometimes, a book's title is so common that the correct version isn't on the first page
    #If that happens, try putting the author into the search bar
    #And searching the first page for the correct title
    if book_id is None:
        for title_line, _, cover_src in _search_goodreads(author, wait_seconds):
            if title in title_line:
                book_id = _book_id_from_cover(cover_src)
                break

    #If that doesn't work, return an empty list.
    if book_id is None:
        return []

    browser.get(f'https://www.goodreads.com/book/show/{book_id}')
    # Wait BEFORE locating the genre buttons: locating first and sleeping
    # afterwards can miss elements that are not rendered yet and leaves
    # time for the located elements to go stale before .text is read.
    sleep(wait_seconds)
    genres = browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    return [genre.text for genre in genres]

In [4]:
# Load the list of candidate books; each entry is read later through its
# 't' (title) and 'a' (author) keys.
# The encoding is given explicitly because the titles contain non-ASCII
# characters (e.g. typographic apostrophes) and the platform default encoding
# is not UTF-8 everywhere.
with open('book_list.json', 'r', encoding='utf-8') as book_file:
    book_list = json.load(book_file)


In [5]:
# Draw a random sample of books to analyse.
SAMPLE_SIZE = 10
SAMPLE_SEED = 42  # fixed so that Restart & Run All selects the same books

# A dedicated generator keeps the seeding from affecting the global random
# state; the size is clamped because random sampling raises ValueError when
# asked for more items than the list holds.
_sampler = random.Random(SAMPLE_SEED)
indices = _sampler.sample(range(len(book_list)), min(SAMPLE_SIZE, len(book_list)))
books = [book_list[index] for index in indices]

In [22]:
# Collect one prosecraft record per sampled book
# ('t' is passed as the title and 'a' as the author).
sample = [get_info(book['t'], book['a']) for book in books]
    

In [23]:
# Turn the list of per-book dictionaries into a table with one row per book.
df = pd.DataFrame(sample)

In [24]:
# Display the prosecraft metrics collected for the sampled books.
df

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs
0,Serpentine,Jonathan Kellerman,88863.0,48.34,7.61,2.74,0.92,1.82
1,Mom Genes,Abigail Tucker,79688.0,53.19,6.13,3.96,1.89,2.06
2,The Fallout,Rebecca Thornton,105017.0,34.79,10.43,3.93,1.09,2.84
3,11/22/63,Stephen King,270519.0,47.8,8.4,2.8,0.82,1.99
4,The Country Guesthouse,Robyn Carr,85165.0,30.21,10.77,2.93,0.76,2.17
5,Secrets at St. Bride’s,Debbie Young,52562.0,50.69,7.97,2.95,0.96,1.99
6,Fault Lines,Voddie T. Baucham,68643.0,24.33,6.72,2.48,0.98,1.5
7,Ramona the Pest,Beverly Cleary,26582.0,47.77,7.96,2.66,0.78,1.88
8,The Lying Days,Nadine Gordimer,152597.0,51.93,7.05,3.14,1.25,1.9
9,Winter’s Heart,Robert Jordan,238949.0,54.79,8.02,3.31,1.27,2.04


In [9]:
# Goodreads credentials and the login page address, all read from the
# environment (filled in from the .env). A key that is not set yields None.
#   USER     -> Goodreads username
#   PASSWORD -> Goodreads password
#   URL      -> the URL reached by going to goodreads and selecting log in by email
user_name, password, login_url = (
    os.environ.get(key) for key in ('USER', 'PASSWORD', 'URL')
)

In [10]:
#Here I start up a headless Firefox browser through Selenium
s = Service("geckodriver.exe")
opts = Options()
opts.add_argument('-headless')
# The options object has to be handed to the driver; without options=opts the
# '-headless' argument is never applied and a visible browser window opens.
browser = webdriver.Firefox(service=s, options=opts)
# Open the Goodreads login page so that the login form can be filled in next.
browser.get(login_url)

In [11]:
# Here I log into goodreads
# "ap_email" and "ap_password" are the element ids looked up on the page that
# was opened from login_url; they receive the username and the password.
log_email = browser.find_element(By.ID, value="ap_email")
log_pwd = browser.find_element(By.ID, value="ap_password")
log_email.send_keys(user_name)
log_pwd.send_keys(password)
# Submit the form that contains the password field.
log_pwd.submit()
# Fixed pause before the following cells use the browser again.
# NOTE(review): nothing here verifies that the login succeeded — confirm.
sleep(5)

In [25]:
def _genres_for_row(row):
    '''Return the Goodreads genres for the book described by one DataFrame row.'''
    return get_genres(row['title'], row['author'])


# Add a 'genres' column holding the genre list looked up for every book.
df['genres'] = df.apply(_genres_for_row, axis=1)

In [26]:
# Display the table again, now including the 'genres' column.
df

Unnamed: 0,title,author,total words,vividness,passive voice,all adverbs,ly-adverbs,non-ly-adverbs,genres
0,Serpentine,Jonathan Kellerman,88863.0,48.34,7.61,2.74,0.92,1.82,"[Mystery, Fiction, Thriller, Crime, Mystery Th..."
1,Mom Genes,Abigail Tucker,79688.0,53.19,6.13,3.96,1.89,2.06,"[Nonfiction, Science, Parenting, Psychology, A..."
2,The Fallout,Rebecca Thornton,105017.0,34.79,10.43,3.93,1.09,2.84,"[Contemporary, Fiction, Drama, Womens Fiction,..."
3,11/22/63,Stephen King,270519.0,47.8,8.4,2.8,0.82,1.99,"[Fiction, Historical Fiction, Science Fiction,..."
4,The Country Guesthouse,Robyn Carr,85165.0,30.21,10.77,2.93,0.76,2.17,"[Romance, Fiction, Contemporary Romance, Conte..."
5,Secrets at St. Bride’s,Debbie Young,52562.0,50.69,7.97,2.95,0.96,1.99,"[Mystery, Cozy Mystery, Contemporary, British ..."
6,Fault Lines,Voddie T. Baucham,68643.0,24.33,6.72,2.48,0.98,1.5,"[Christian, Nonfiction, Theology, Politics, Ch..."
7,Ramona the Pest,Beverly Cleary,26582.0,47.77,7.96,2.66,0.78,1.88,"[Childrens, Fiction, Middle Grade, Classics, Y..."
8,The Lying Days,Nadine Gordimer,152597.0,51.93,7.05,3.14,1.25,1.9,"[Fiction, Africa, South Africa, Novels, Histor..."
9,Winter’s Heart,Robert Jordan,238949.0,54.79,8.02,3.31,1.27,2.04,"[Fantasy, Fiction, Epic Fantasy, High Fantasy,..."


In [27]:
# Shut down the Selenium browser session now that the scraping is finished.
browser.quit()