In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
from secrets import goodreadsKey
import requests
import time
import editdistance
import re

In [2]:
# Load data so far
df = pd.read_hdf('pg-text-5-goodreads.hdf', 'pg')

In [3]:
class goodReads:  
    def __init__(self, title, author=None):
        self.title = title
        self.author = author
        self.response = self.queryAPI(title, author)
        self.bestWork = self.parseResponse(self.response)
        if self.bestWork is not None: 
            self.found = True
            self.pubDate = self.bestWork.original_publication_year.text + \
                '-' + self.bestWork.original_publication_month.text + '-' + \
                self.bestWork.original_publication_day.text
            self.rating = self.bestWork.average_rating.text
            self.id = self.bestWork.id.text
            self.numReviews = self.bestWork.text_reviews_count.text
            self.authorID = self.bestWork.author.id.text
            self.imageURL = self.bestWork.image_url.text
            self.smallImageURL = self.bestWork.small_image_url.text
            self.show()
        else: 
            self.found = False
        
    def show(self): 
        print('pubDate: ', self.pubDate)
        print('rating: ', self.rating)
        print('id: ', self.id)
        print('numReviews: ', self.numReviews)
        
    def queryAPI(self, title, author): 
        url = 'https://www.goodreads.com/search.xml'
        params = [('key', goodreadsKey), ('q', title)]
        response = requests.get(url, params)
        self.lastRequest = time.time()
        return response
        
    def parseResponse(self, response): 
        if response.status_code != 200: 
            print('Got response other than 200!')
        else: 
            self.text = response.text
            self.soup = BeautifulSoup(self.text, "lxml")
            self.works = self.soup.find_all('work')
            return self.getBestWork(self.works)
    
    def getBestWork(self, works, maxTries=3):
        for i in range(min(len(works), maxTries)): 
            authorParsed = self.parseAuthor(works[i])
            if authorParsed is not None: 
                if self.similarAuthors(self.author, authorParsed): 
                    return works[i]
        print("Couldn't find an appropriate work in the list.")
        return None
            
    def parseAuthor(self, work): 
        author = work.find('author').find('name').text
        return author
        
    def similarAuthors(self, origAuthor, parsedAuthor, threshold=3): 
#         print("origAuthor: ", origAuthor)
#         print("parsedAuthor: ", parsedAuthor)
        if (type(origAuthor) == str) and (type(parsedAuthor) == str): 
            origAuthor = origAuthor.strip().lower()
            origAuthor = re.sub('\(.*?\)', '', origAuthor)
            parsedAuthor = parsedAuthor.strip().lower()
            if ',' in origAuthor: 
                # Try to change Jeferson, Thomas to Thomas Jefferson
                nameParts = origAuthor.split(',')
                origAuthor = nameParts[1] + ' ' + nameParts[0]
            return editdistance.eval(origAuthor, parsedAuthor) < threshold
        else: 
            print("One of these authors is None, assuming it's OK.")
            return True

In [14]:
df.to_hdf('pg-text-5-goodreads.hdf', 'pg')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->['LCC', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads', 'formats', 'languages', 'lcsh', 'title', 'type', '_repo', '_version', 'alternative_title', 'contributor', 'covers', 'creator', 'description', 'edition_identifiers', 'edition_note', 'gutenberg_bookshelf', 'gutenberg_issued', 'gutenberg_type', 'identifiers', 'jmdate', 'subjects', 'language_note', 'production_note', 'publication_date', 'publication_note', 'publisher', 'rights', 'rights_url', 'series_note', 'summary', 'tableOfContents', 'titlepage_image', 'url', 'wikipedia', 'filename', 'wp_publication_date', 'wp_subjects', 'wp_info', 'wp_literary_genres', 'gr_rating', 'gr_numReviews', 'gr_pubDate', 'gr_id', 'gr_info', 'gr_author_id', 'gr_image_url', 'gr_small_image_url']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [5]:
wells, sea = df.loc[34681].author, df.loc[34681].title


In [6]:
wells

'Wells, H. G. (Herbert George)'

In [7]:
sea

'The Sea Lady'

In [11]:
result = goodReads(sea)

One of these authors is None, assuming it's OK.
pubDate:  2006--
rating:  3.26
id:  1349063
numReviews:  112


In [16]:
result.soup

<?xml version="1.0" encoding="UTF-8"?><html><body><goodreadsresponse>
<request>
<authentication>true</authentication>
<key></key>
<method></method>
</request>
<search>
<query></query>
<results-start>1</results-start>
<results-end>20</results-end>
<total-results>122</total-results>
<source>Goodreads</source>
<query-time-seconds>0.20</query-time-seconds>
<results>
<work>
<id type="integer">1349063</id>
<books_count type="integer">15</books_count>
<ratings_count type="integer">532</ratings_count>
<text_reviews_count type="integer">112</text_reviews_count>
<original_publication_year type="integer">2006</original_publication_year>
<original_publication_month nil="true" type="integer"></original_publication_month>
<original_publication_day nil="true" type="integer"></original_publication_day>
<average_rating>3.26</average_rating>
<best_book type="Book">
<id type="integer">105023</id>
<title>The Sea Lady</title>
<author>
<id type="integer">60750</id>
<name>Margaret Drabble</name>
</author>
<i