Now, rather than saving only a sample, I'm attempting to download ALL of the data onto my computer!

In [1]:
import requests
import json
import csv
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import os
import string
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
def _search_goodreads(query):
    '''Run one Goodreads search for `query` and return the first page of results.

    Each result is a (title_line, byline, cover_src) tuple: the first two lines
    of the result row's text and the `src` of the cover image paired with it.
    Rows with fewer than two text lines, and rows without a matching cover
    image, are skipped instead of raising IndexError.

    Raises NoSuchElementException if the search box is not on the page.
    '''
    browser.get('http://www.goodreads.com/search?q=&qid=')
    search_box = browser.find_element(By.ID, value='search_query_main')
    search_box.send_keys(query)
    search_box.submit()
    # Random pause after submitting, before reading the results.
    sleep(random.uniform(8, 15))
    rows = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
    covers = browser.find_elements(By.CLASS_NAME, value="bookCover")

    results = []
    # zip() stops at the shorter sequence, so a missing cover cannot index
    # past the end of the list.
    for row, cover in zip(rows, covers):
        lines = row.text.split('\n')
        if len(lines) < 2:
            continue
        results.append((lines[0], lines[1], cover.get_property("src")))
    return results


def _cover_book_id(cover_src):
    '''Extract the book id from a cover URL: the last path segment up to its first dot.'''
    return cover_src.split('/')[-1].split('.')[0]


def get_genres(title, author):
    '''Given the title and author of a book,
    return a list of genres and the publication year.

    The book is looked up on Goodreads through the module-level `browser`:
    first by searching for the title and matching the author, then (if that
    fails) by searching for the author and matching the title.

    Returns a tuple (genres, year). `genres` is a list of strings and `year`
    is an int, or np.nan when it cannot be read from the book page.
    Returns ([], np.nan) when the book cannot be found.

    Raises NoSuchElementException if the Goodreads search box is missing;
    the caller is expected to retry.
    '''
    # Retry the title search up to three times when it yields no result rows.
    results = []
    for _ in range(3):
        results = _search_goodreads(title)
        if results:
            break
    if not results:
        return ([], np.nan)

    # Search the first page of results for the author's name.
    # Use only the first author if there are multiple.
    first_author = author.split(' &')[0]
    book_id = None
    for _, byline, cover_src in results:
        if f"by {first_author}" in byline:
            book_id = _cover_book_id(cover_src)
            break

    # Sometimes a book's title is so common that the correct version isn't on
    # the first page. If that happens, search for the author instead and look
    # for the title on the first page. Only the fresh author-search results
    # are examined, so an empty author search can never fall back to matching
    # an unrelated same-titled book from the earlier title search.
    if book_id is None:
        for heading, _, cover_src in _search_goodreads(author):
            if title in heading:
                book_id = _cover_book_id(cover_src)
                break

    # If that doesn't work either, report the book as not found.
    if book_id is None:
        return ([], np.nan)

    book_url = f'https://www.goodreads.com/book/show/{book_id}'
    browser.get(book_url)
    genres = browser.find_elements(By.XPATH, value="//span[contains(@class, 'BookPageMetadataSection__genreButton')]")
    try:
        datestring = browser.find_element(By.CSS_SELECTOR, ".FeaturedDetails > p:nth-child(2)")
        # The year is taken from the last four characters of the details line.
        year = int(datestring.text[-4:])
    except (NoSuchElementException, ValueError):
        year = np.nan
    genre_names = [genre.text for genre in genres]
    # Random pause before the caller moves on to the next request.
    sleep(random.uniform(8, 15))
    return (genre_names, year)

In [3]:
def get_info(book_dict, timeout=30):
    '''Given a dict containing the title and author of a book in the list,
    returns a dictionary of prosecraft's analysis about the book,
    as well as a list of genres and the publication year from Goodreads.

    book_dict: dict with the title under 't' and the author under 'a'.
    timeout: seconds to wait for the Prosecraft server before giving up.

    The returned dict has 'title', 'author', one float entry per metric
    heading found on the Prosecraft page, plus 'genre' and 'year'.

    Raises requests.exceptions.ConnectionError if Prosecraft cannot be reached
    or does not answer within `timeout` seconds, ValueError if a metric value
    is not numeric, and whatever get_genres raises.
    '''
    title = book_dict['t']
    author = book_dict['a']
    info = {'title': title, 'author': author}

    # Build the Prosecraft URL: "/" in the title becomes "-", punctuation is
    # dropped, "&" becomes "and", spaces become "-", everything is lowercased.
    chars_to_remove = [':', '’', '.', ",", '“', '”']
    url_path = f"{author}/{title.replace('/', '-')}/"
    for char in chars_to_remove:
        url_path = url_path.replace(char, '')
    url_path = url_path.replace('&', 'and').replace(' ', '-').lower()
    url = "http://prosecraft.io/library/" + url_path

    # Get data from Prosecraft and turn it into a dict.
    # Without a timeout a stalled connection would block forever. Timeouts are
    # re-raised as ConnectionError so callers that already handle connection
    # failures treat a slow server the same way.
    try:
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout as err:
        raise requests.exceptions.ConnectionError(
            f"Timed out after {timeout}s fetching {url}"
        ) from err
    soup = BeautifulSoup(page.content, "html.parser")
    headings = soup.find_all("div", {"class": "book-info-metric-heading"})
    values = soup.find_all("div", {"class": "book-info-metric-value"})
    for heading, value in zip(headings, values):
        # Strip a percent sign from the ends and drop thousands separators.
        info[heading.text] = float(value.text.strip('%').replace(',', ''))

    # Get genre data from Goodreads and include it as a list.
    info['genre'], info['year'] = get_genres(info['title'], info['author'])

    return info

In [4]:
# Load the full list of books to process; each entry is later handed to get_info.
# NOTE(review): the file is read with the platform default encoding - confirm
# that matches how book_list.json was written.
with open('book_list.json') as lst:
    book_list = json.loads(lst.read())


In [5]:
# Read the saved position in book_list so the scrape can resume from there.
with open('counter.txt') as counter:
    saved_count = counter.read()
book_count = int(saved_count)

In [14]:
#with open ('counter.txt','w') as counter:
#    counter.write(str(book_count))

In [7]:
# Goodreads username and password, read from the environment (.env file).
# NOTE(review): USER is also a standard login-name variable on Unix-like
# systems - confirm the value picked up here is the one from the .env.
user_name = os.getenv('USER')
password = os.getenv('PASSWORD')

# The URL reached by going to Goodreads and choosing "log in by email".
login_url = os.getenv('URL')

In [8]:
# Start a headless Firefox browser through Selenium and open the login page.
s = Service("geckodriver.exe")
opts = Options()
opts.add_argument('-headless')
# The options must be handed to the driver; otherwise the -headless flag set
# above is never applied and a visible browser window is opened.
browser = webdriver.Firefox(service=s, options=opts)
browser.get(login_url)

In [9]:
# Log into Goodreads: locate the e-mail and password inputs on the login page,
# type the credentials, submit the form, then give the page time to load.
email_field = browser.find_element(By.ID, "ap_email")
password_field = browser.find_element(By.ID, "ap_password")
email_field.send_keys(user_name)
password_field.send_keys(password)
password_field.submit()
sleep(5)

In [10]:
# Column order used for every row written to the CSV.
dict_keys = (
    # Identification of the book.
    ['title', 'author']
    # Linguistic metrics (presumably the metric headings scraped from the
    # Prosecraft page - verify against the site).
    + ['total words', 'vividness', 'passive voice',
       'all adverbs', 'ly-adverbs', 'non-ly-adverbs']
    # Data added from Goodreads.
    + ['genre', 'year']
)

In [11]:
# Append one CSV row per book, starting at book_list[book_count].
#
# For each book, get_info is attempted up to three times when the page does
# not have the expected structure (NoSuchElementException / IndexError). The
# book is then skipped. book_count advances exactly once per book, whether it
# was written or skipped, so a successful retry no longer skips the next book.
#
# Connection and WebDriver errors pause for two minutes and then retry the
# same book on the next loop iteration (book_count is left unchanged). This
# also covers errors raised during a retry, which previously escaped the loop.
with open('book_data.csv', 'a', newline='', encoding="utf-8") as data:
    fail_count = 0  # number of failed get_info attempts across all books
    writer = csv.DictWriter(data, dict_keys)
    for _ in range(25000):
        # Detect the end of the list explicitly, so an IndexError raised
        # inside the scraping code is not mistaken for "all books done".
        if book_count >= len(book_list):
            print("Finished!")
            break
        try:
            for attempt in range(3):
                if attempt:
                    # Random pause before retrying the same book.
                    sleep(random.uniform(8, 15))
                try:
                    next_book = get_info(book_list[book_count])
                except (NoSuchElementException, IndexError):
                    fail_count += 1
                    continue
                writer.writerow(next_book)
                break
            # Written or given up on: either way move on to the next book.
            book_count += 1
        except requests.exceptions.ConnectionError:
            print("Connection Error")
            sleep(120)
        except WebDriverException:
            print("Web Driver Exception")
            sleep(120)
    
    

Connection Error
Connection Error
Connection Error
Connection Error
Web Driver Exception
Connection Error
Web Driver Exception
Connection Error
Connection Error
Connection Error
Connection Error
Connection Error
Connection Error
Connection Error


KeyboardInterrupt: 

In [13]:
# Display the current position in book_list (the index of the next book to process).
book_count

19815

In [None]:
# After adding about 50 books, I realized that the publication year,
# in addition to the linguistic data, might be predictive of genre.
# This is the function I used to add a year to each of the titles I'd already added.
# After that, I incorporated the year lookup into get_genres for future books.



#def add_year(title, author):
#     book_id = -1
#     browser.get('http://www.goodreads.com/search?q=&qid=')
#     try: 
#         search_book = browser.find_element(By.ID, value='search_query_main')
#     except NoSuchElementException: 
#         return(add_year(title, author))
#     search_book.send_keys(title)
#     search_book.submit()
#     sleep(random.uniform(10,20))
#     itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
#     img = browser.find_elements(By.CLASS_NAME, value="bookCover")
#     book_list = list()
    
    
#     #Search the first page of results for the author's name
#     #Use only first author if multiple
#     for i in range(len(itemqueue)):
#         book_list.append(itemqueue[i].text.split('\n'))
#         book_list_ap = list()
#     for i in range(0, len(book_list)):
#          book_list_ap.append((book_list[i][0],book_list[i][1],img[i].get_property("src")))
#     for book in book_list_ap:
#         if f"by {author.split(' &')[0]}" in book[1]:
#             book_id = book[2].split('/')[-1].split('.')[0]
#             break
            
#     #Sometimes, a book's title is so common that the correct version isn't on the first page
#     #If that happens, try putting the author into the search bar
#     #And searching the first page for the correct title
    
#     if book_id == -1:
#         browser.get('http://www.goodreads.com/search?q=&qid=')
#         search_book = browser.find_element(By.ID, value='search_query_main')
#         search_book.send_keys(author)
#         search_book.submit()
#         sleep(random.uniform(10,20))
#         itemqueue = browser.find_elements(By.XPATH, value="//table/tbody/tr[contains(@itemtype, 'http://schema.org/Book')]")
#         img = browser.find_elements(By.CLASS_NAME, value="bookCover")
#         book_list = list()
#         for i in range(len(itemqueue)):
#             book_list.append(itemqueue[i].text.split('\n'))
#             book_list_ap = list()
#         for i in range(0, len(book_list)):
#              book_list_ap.append((book_list[i][0],book_list[i][1],img[i].get_property("src")))
#         for book in book_list_ap:
#             if title in book[0]:
#                 book_id = book[2].split('/')[-1].split('.')[0]
#                 break
    
#     #If that doesn't work, return an empty list. 
#     if book_id == -1:
#         return np.nan
    
#     book_url = f'https://www.goodreads.com/book/show/{book_id}'
#     browser.get(book_url)
#     try: 
#         datestring = browser.find_element(By.CSS_SELECTOR, ".FeaturedDetails > p:nth-child(2)")
#     except NoSuchElementException: 
#         return(add_year(title, author))
    
#     year = int(datestring.text[-4:])
#     sleep(random.uniform(10,20))
#     return year

In [None]:
# Shut down the Selenium-driven Firefox browser now that scraping is done.
browser.quit()