In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
from secrets import goodreadsKey
import requests
import time
import editdistance
import re

In [3]:
# Load data so far
df = pd.read_hdf('pg-text-5-goodreads.hdf', 'pg')

In [None]:
class goodReads:  
    def __init__(self, title, author=None):
        self.title = title
        self.author = author
        self.response = self.queryAPI(title, author)
        self.bestWork = self.parseResponse(self.response)
        if self.bestWork is not None: 
            self.found = True
            self.pubDate = self.bestWork.original_publication_year.text + \
                '-' + self.bestWork.original_publication_month.text + '-' + \
                self.bestWork.original_publication_day.text
            self.rating = self.bestWork.average_rating.text
            self.id = self.bestWork.id.text
            self.numReviews = self.bestWork.text_reviews_count.text
            self.authorID = self.bestWork.author.id.text
            self.imageURL = self.bestWork.image_url.text
            self.smallImageURL = self.bestWork.small_image_url.text
            self.show()
        else: 
            self.found = False
        
    def show(self): 
        print('pubDate: ', self.pubDate)
        print('rating: ', self.rating)
        print('id: ', self.id)
        print('numReviews: ', self.numReviews)
        
    def queryAPI(self, title, author): 
        url = 'https://www.goodreads.com/search.xml'
        params = [('key', goodreadsKey), ('q', title)]
        response = requests.get(url, params)
        self.lastRequest = time.time()
        return response
        
    def parseResponse(self, response): 
        if response.status_code != 200: 
            print('Got response other than 200!')
        else: 
            self.text = response.text
            self.soup = BeautifulSoup(self.text, "lxml")
            self.works = self.soup.find_all('work')
            return self.getBestWork(self.works)
    
    def getBestWork(self, works, maxTries=3):
        for i in range(min(len(works), maxTries)): 
            authorParsed = self.parseAuthor(works[i])
            if authorParsed is not None: 
                if self.similarAuthors(self.author, authorParsed): 
                    return works[i]
        print("Couldn't find an appropriate work in the list.")
        return None
            
    def parseAuthor(self, work): 
        author = work.find('author').find('name').text
        return author
        
    def similarAuthors(self, origAuthor, parsedAuthor, threshold=3): 
#         print("origAuthor: ", origAuthor)
#         print("parsedAuthor: ", parsedAuthor)
        if (type(origAuthor) == str) and (type(parsedAuthor) == str): 
            origAuthor = origAuthor.strip().lower()
            origAuthor = re.sub('\(.*?\)', '', origAuthor)
            parsedAuthor = parsedAuthor.strip().lower()
            if ',' in origAuthor: 
                # Try to change Jefferson, Thomas to Thomas Jefferson
                nameParts = origAuthor.split(',')
                origAuthor = nameParts[1] + ' ' + nameParts[0]
                # if the last name is somewhere in the name, that's good enough
                if nameParts[0] in parsedAuthor: 
                    return True
            if editdistance.eval(origAuthor, parsedAuthor) < threshold: 
                return True
            else: 
                print("Couldn't match origAuthor: %s \n with parsedAuthor: %s" % (origAuthor, parsedAuthor))
                return False
        else: 
            print("One of these authors is None, assuming it's OK.")
            return True

In [22]:
start, end = 53043, 100000
for i, row in df.iterrows(): 
    if i > start and i < end: 
        print('Looking up #%s: %s' % (i, row.title))
        print('By: ', row.author)
        gr = goodReads(row.title, row.author)
        if gr.found: 
            df.set_value(i, 'gr_rating', gr.rating)
            df.set_value(i, 'gr_numReviews', gr.numReviews)
            df.set_value(i, 'gr_pubDate', gr.pubDate)
            df.set_value(i, 'gr_id', gr.id)
            df.set_value(i, 'gr_info', str(gr.bestWork))
            df.set_value(i, 'gr_author_id', gr.authorID)
            df.set_value(i, 'gr_image_url', gr.imageURL)
            df.set_value(i, 'gr_small_image_url', gr.smallImageURL)
        time.sleep(1)

Looking up #53044: Three Days on the Ohio River
By:  Alcott, William A. (William Andrus)
Couldn't find an appropriate work in the list.
Looking up #53045: The Irish Penny Journal, Vol. 1, No. 18, October 31, 1840
By:  Various
pubDate:  --
rating:  0.0
id:  56354618
numReviews:  0
Looking up #53046: Conscience and Sin: Daily Meditations for Lent
By:  Baring-Gould, S. (Sabine)
Couldn't find an appropriate work in the list.
Looking up #53047: What Jesus Taught
By:  Widtsoe, Osborne J. P.
Couldn't find an appropriate work in the list.
Looking up #53048: The American Missionary — Volume 33, No. 8, August, 1879
By:  Various
pubDate:  2006-3-16
rating:  0.0
id:  13035293
numReviews:  0
Looking up #53049: Charlie Codman's Cruise: A Story for Boys
By:  Alger, Horatio, Jr.
pubDate:  --
rating:  0.0
id:  56255776
numReviews:  0
Looking up #53050: Short Reasons for Communion with the Church of England: or the Churchman's answer to the question, "Why are you a Member of the Established Church?"
By:

Looking up #53103: The Slav Nations
By:  Tucić, Srgjan Pl.
Couldn't find an appropriate work in the list.
Looking up #53104: Printing in Relation to Graphic Art
By:  French, George
pubDate:  2015-8-20
rating:  0.0
id:  46767682
numReviews:  0
Looking up #53105: Jed, the Poorhouse Boy
By:  Alger, Horatio, Jr.
pubDate:  --
rating:  0.0
id:  56173586
numReviews:  0
Looking up #53106: In Beaver Cove and Elsewhere
By:  Crim, Matt
pubDate:  2007-10-1
rating:  0.0
id:  19957096
numReviews:  0
Looking up #53107: Istruzioni popolari per la buona tenuta dei bachi da seta
By:  Poggi, Tito
Couldn't find an appropriate work in the list.
Looking up #53108: Des Vaters Sünde, der Mutter Fluch
By:  Clauren, Heinrich
Couldn't find an appropriate work in the list.
Looking up #53109: Glasgow; A Sketch-Book
By:  Nisbet, John
Couldn't find an appropriate work in the list.
Looking up #53110: Harrow; A Sketch-Book
By:  Keesey, Walter M.
pubDate:  --
rating:  0.0
id:  55855713
numReviews:  0
Looking up #53111:

Looking up #53160: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.27
id:  16827462
numReviews:  140061
Looking up #53161: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.27
id:  16827462
numReviews:  140061
Looking up #53162: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.27
id:  16827462
numReviews:  140061
Looking up #53163: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.27
id:  16827462
numReviews:  140061
Looking up #53164: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.27
id:  16827462
numReviews:  140061
Looking up #53165: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.27
id:  16827462
numReviews:  140061
Looking up #53166: nan
By:  nan
One of these authors is None, assuming it's OK.
pubDate:  2012-1-10
rating:  4.2

In [None]:
df.to_hdf('pg-text-6-goodreads.hdf', 'pg')

In [None]:
len(df)