In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Load the book club records and turn the millisecond-epoch columns into datetimes
book_club = pd.read_json('book_club.json', orient='records')
# 'g_published' only exists once scraped data has been saved, so convert it only when present
date_columns = ['meeting', 'g_published'] if 'g_published' in book_club.columns else ['meeting']
for date_column in date_columns:
    book_club[date_column] = pd.to_datetime(book_club[date_column], unit='ms')
# Show full cell contents instead of truncating long strings
pd.set_option('display.max_colwidth', None)

In [2]:
# Returns BeautifulSoup for the given url
def soup_cooker(url, timeout=30):
    """Fetch ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup or None
        The parsed page, or None when the request raised an exception
        (the error is printed rather than re-raised).
    """
    # Fetch
    try:
        # Without a timeout requests may block forever on an unresponsive server
        response = requests.get(url, timeout=timeout)
    except Exception as e:
        print('Connection Error', str(e))
        return None
    # Parse
    soup = BeautifulSoup(response.text, 'html.parser')
    # extracts title from soup; falls back to the url when the page has no usable <title>
    title_tag = soup.find('title')
    title = title_tag.contents[0] if title_tag is not None and title_tag.contents else url
    print('Scraping:', title)
    return soup

In [3]:
# Returns the url that matches the book name 
import webbrowser
def g_url(row):
    """Pick the search result whose title matches ``row['book']``.

    Parameters
    ----------
    row : mapping
        Must provide ``'book'`` (the title to look for) and
        ``'goodreads search'`` (the search-results url to scrape).

    Returns
    -------
    str or None
        Full book url with the query string removed, or None when the search
        page could not be fetched or no result title matched. In the
        no-match case the search url is also opened in a web browser.
    """
    book = row['book'].lower()
    search_url = row['goodreads search']
    soup = soup_cooker(search_url)
    if soup is None:
        # soup_cooker already reported the failure; nothing to search through
        return None
    # finds the anchor tags and the author tags of the first page
    anchor_tags = soup.find_all('a', {'class': 'bookTitle'})
    author_tags = soup.find_all('span', {'itemprop': 'author'})
    # pairs each title anchor with the author span at the same position
    # NOTE(review): assumes both lists are in the same result order — confirm against the page markup
    search_results = []
    for anchor_tag, author_tag in zip(anchor_tags, author_tags):
        # skips any result that lists an adapter among its authors
        if author_tag.find_all(string='(Adapter)'):
            continue
        # skips anchors without the <span> that carries the title text
        if anchor_tag.span is None:
            continue
        title = anchor_tag.span.text.lower()
        # converts href into a full url, trimming search info
        url = 'https://www.goodreads.com' + anchor_tag['href'].split('?')[0]
        search_results.append((title, url))
    # checks search results for exact matches to the title, returns the first matching url
    for title, url in search_results:
        if title == book:
            return url
    # checks search results for titles that start with the book, returns first matching url
    for title, url in search_results:
        if title.startswith(book):
            return url
    # automatically opens the search url of a book title that isn't found
    try:
        webbrowser.get('windows-default').open(search_url)
    except webbrowser.Error:
        # 'windows-default' is only registered on Windows; use the platform default elsewhere
        webbrowser.open(search_url)
    return None

In [4]:
def g_rating(soup):
    """Read the rating from a parsed book page.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed page; None is tolerated and treated as "nothing found".

    Returns
    -------
    float or None
        First content item of the ``div.RatingStatistics__rating`` element
        converted to float, or None (with a printed message) on any failure.
    """
    try:
        # Returns the soup's rating
        return float(soup.find('div', {'class': 'RatingStatistics__rating'}).contents[0])
    except Exception:
        # Building the message must not raise itself: soup may be None or lack a <title>
        title_tag = soup.find('title') if soup is not None else None
        page_title = title_tag.contents[0] if title_tag is not None and title_tag.contents else None
        print('Error finding the rating in soup:', page_title)
        return None
def g_genre(soup):
    """Collect the genre labels from a parsed book page.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed page; None is tolerated and treated as "nothing found".

    Returns
    -------
    list or None
        First content item of every ``span.Button__labelItem`` inside the
        ``ul`` labelled 'Top genres for this book', without a trailing
        '...more' entry. None (with a printed message) on any failure,
        including when the list holds no labels at all.
    """
    try:
        # generates a list of genre tags
        tags = soup.find('ul', {'aria-label': 'Top genres for this book'}).find_all('span', {'class' : 'Button__labelItem'})
        # gets the content of each tag in the list
        genres = [genre.contents[0] for genre in tags]
        # removes the last category if it is ...more
        # (an empty list raises IndexError here and is reported as "not found" below)
        if genres[-1] == '...more':
            del genres[-1]
        return genres
    except Exception:
        # Building the message must not raise itself: soup may be None or lack a <title>
        title_tag = soup.find('title') if soup is not None else None
        page_title = title_tag.contents[0] if title_tag is not None and title_tag.contents else None
        print('Error finding the categories in soup:', page_title)
        return None
def g_published(soup):
    """Read the publication date from a parsed book page.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed page; None is tolerated and treated as "nothing found".

    Returns
    -------
    pandas.Timestamp or None
        Date parsed from the ``p[data-testid=publicationInfo]`` text after
        removing the 'First published ' / 'Published ' prefix. None (with a
        printed message) when the element is missing or the remaining text
        is not in 'Month day, year' form.
    """
    try:
        contents = soup.find('p', {'data-testid' : 'publicationInfo'}).contents[0]
        date = contents.replace('First published ', '').replace('Published ', '')
        return pd.to_datetime(date, format='%B %d, %Y')
    except Exception:
        # Building the message must not raise itself: soup may be None or lack a <title>
        title_tag = soup.find('title') if soup is not None else None
        page_title = title_tag.contents[0] if title_tag is not None and title_tag.contents else None
        print('Error finding the publication info in soup:', page_title)
        return None
def g_pages(soup):
    """Read the page count from a parsed book page.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed page; None is tolerated and treated as "nothing found".

    Returns
    -------
    int or None
        The first space-separated token of the ``p[data-testid=pagesFormat]``
        text as an integer, or None (with a printed message) when the element
        is missing or the token is not a number.
    """
    try:
        contents = soup.find('p', {'data-testid' : 'pagesFormat'}).contents[0]
        # tolerates a thousands separator such as '1,024' in the leading token
        return int(contents.split(' ')[0].replace(',', ''))
    except Exception:
        # Building the message must not raise itself: soup may be None or lack a <title>
        title_tag = soup.find('title') if soup is not None else None
        page_title = title_tag.contents[0] if title_tag is not None and title_tag.contents else None
        print('Error finding the pages in soup:', page_title)
        return None
def g_author(soup):
    """Read the first contributor name from a parsed book page.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed page; None is tolerated and treated as "nothing found".

    Returns
    -------
    str or None
        First content item of the first ``span.ContributorLink__name``
        element, or None (with a printed message) when it cannot be found.
    """
    try:
        return soup.find('span', {'class' : 'ContributorLink__name'}).contents[0]
    except Exception:
        # Building the message must not raise itself: soup may be None or lack a <title>
        title_tag = soup.find('title') if soup is not None else None
        page_title = title_tag.contents[0] if title_tag is not None and title_tag.contents else None
        print('Error finding the author in soup:', page_title)
        return None
def g_scrape(url):
    """Scrape one book page and return its fields as a Series.

    Parameters
    ----------
    url : str or None
        Book page to fetch. A missing value (None/NaN, e.g. when no search
        result matched) is accepted and yields an all-None result.

    Returns
    -------
    pandas.Series
        Five positional values: rating, genres, author, pages, published.
        All five are None when there is no url or the page could not be fetched.
    """
    # a missing url cannot be fetched; skip the request instead of reporting a connection error
    soup = None if pd.isna(url) else soup_cooker(url)
    if soup is None:
        # keep the result the same length so callers can still assign it column by column
        return pd.Series([None] * 5)
    # collects all the scraped data and returns it as a Series
    return pd.Series([g_rating(soup), g_genre(soup), g_author(soup), g_pages(soup), g_published(soup)])

In [5]:
# Columns filled from the scraped pages, in the order g_scrape returns its values
g_columns = ['g_rating', 'g_genre', 'g_author', 'g_pages', 'g_published']
# NOTE(review): the urls found by g_url are not written back to a 'goodreads' column here —
# confirm whether that column is meant to be maintained elsewhere.
if 'goodreads' in book_club.columns:
    # only scrape the rows that have no goodreads value yet
    mask = book_club['goodreads'].isnull()
    if mask.any():
        scraped = book_club[mask].apply(g_url, axis=1).apply(g_scrape)
        # .loc aligns a DataFrame value on column labels; without renaming, the
        # positional labels 0..4 match nothing and every scraped value turns into NaN
        scraped.columns = g_columns
        book_club.loc[mask, g_columns] = scraped
else:
    scraped = book_club.apply(g_url, axis=1).apply(g_scrape)
    scraped.columns = g_columns
    book_club[g_columns] = scraped
# persist the result whichever branch ran
book_club.to_json('book_club.json', orient='records')

Scraping: Search results for "Do Androids Dream of Electric Sheep?" (showing 1-10 of 153 books)
Scraping: Search results for "The Call of Cthulhu" (showing 1-10 of 489 books)
Scraping: Search results for "Guards! Guards!" (showing 1-10 of 25402 books)
Scraping: Search results for "Blood Meridian" (showing 1-10 of 57 books)
Scraping: Search results for "The Blade Itself" (showing 1-10 of 26 books)
Scraping: Search results for "Circe" (showing 1-10 of 1332 books)
Scraping: Search results for "The Plague" (showing 1-10 of 5524 books)
Scraping: Search results for "The Book Thief" (showing 1-10 of 1132 books)
Scraping: Search results for "Running With Sherman" (showing 1-4 of 4 books)
Scraping: Search results for "The Things They Carried" (showing 1-10 of 95 books)
Scraping: Search results for "The Little Prince" (showing 1-10 of 3567 books)
Scraping: Search results for "The Fifth Season" (showing 1-10 of 2123 books)
Scraping: Search results for "Kings of the Wyld" (showing 1-2 of 2 books)
