In [1]:
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import requests
import matplotlib.pyplot as plt
# Reads the book_club json file
book_club = pd.read_json('book_club.json', orient='records')
book_club['meeting'] = pd.to_datetime(book_club['meeting'],unit='ms')
book_club.set_index(['season','meeting','book'], inplace=True)
pd.set_option('display.max_colwidth', None)

In [2]:
# Returns BeautifulSoup for the given url
def soup_cooker(url):
    # Fetch
    try:
        response = requests.get(url)
        response.status_code == 200
    except e:
        print(requests.status_code)
        return None
    # Parse
    soup = BeautifulSoup(response.text, 'html.parser')
    # extracts title from soup
    title = soup.find('title').contents[0]
    print('Scraping:',title)
    if title.lower() == 'page not found':
        return None
    return soup

In [3]:
# Returns the url that matches the book name 
import webbrowser
def g_url(row):
    book = row.name[2].lower()
    soup = soup_cooker(row['goodreads search'])
    # finds the results on the first page, ordered in reverse so that the highest ranked ones will overwrite lower ranked ones
    tags = soup.find_all('a', {'class' : 'bookTitle'})[::-1]
    # splits the tags into title and converts href into a full url, trimming search info
    urls = {tag.span.text.lower(): 'https://www.goodreads.com' + tag['href'].split('?')[0] for tag in tags}
    if book in urls:
        return urls[book]
    else:
        for title in urls:
            if title.startswith(book):
                return urls[title]
    browser = webbrowser.get('windows-default')
    browser.open(row['goodreads search'])
    return None

In [4]:
if 'goodreads' in book_club.columns:
    mask = book_club['goodreads'].isnull()
    book_club.loc[mask,'goodreads'] = book_club[mask].apply(g_url, axis=1)
else:
    book_club['goodreads'] = book_club.apply(g_url, axis=1)
book_club.head()

Scraping: Search results for "Do Androids Dream of Electric Sheep?" (showing 1-10 of 153 books)
Scraping: Search results for "The Call of Cthulhu" (showing 1-10 of 492 books)
Scraping: Search results for "Guards! Guards!" (showing 1-10 of 25619 books)
Scraping: Search results for "Blood Meridian" (showing 1-10 of 58 books)
Scraping: Search results for "The Blade Itself" (showing 1-10 of 27 books)
Scraping: Search results for "Circe" (showing 1-10 of 1341 books)
Scraping: Search results for "The Plague" (showing 1-10 of 5591 books)
Scraping: Search results for "The Book Thief" (showing 1-10 of 1181 books)
Scraping: Search results for "Running With Sherman" (showing 1-4 of 4 books)
Scraping: Search results for "The Things They Carried" (showing 1-10 of 94 books)
Scraping: Search results for "The Little Prince" (showing 1-10 of 3618 books)
Scraping: Search results for "The Fifth Season" (showing 1-10 of 2184 books)
Scraping: Search results for "Kings of the Wyld" (showing 1-2 of 2 books)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,suggestor,james,matt,phil,ryan,sean,kyle,goodreads search,audible search,goodreads
season,meeting,book,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
The Codpiece,2019-06-05,Do Androids Dream of Electric Sheep?,group,5.5,5.0,6.0,5.0,4.6,,https://www.goodreads.com/search?q=Do+Androids+Dream+of+Electric+Sheep?&qid=kH2MDtYH2u,https://www.audible.com/search?keywords=Do+Androids+Dream+of+Electric+Sheep?&k=Do+Androids+Dream+of+Electric+Sheep?do+androids&sprefix=%2Cna-audible-us%2C424&i=na-audible-us&url=search-alias%3Dna-audible-us&ref=nb_sb_noss_2,https://www.goodreads.com/book/show/36402034-do-androids-dream-of-electric-sheep
The Codpiece,2019-07-01,The Call of Cthulhu,group,4.0,4.0,2.5,2.0,1.7,,https://www.goodreads.com/search?q=The+Call+of+Cthulhu&qid=kH2MDtYH2u,https://www.audible.com/search?keywords=The+Call+of+Cthulhu&k=The+Call+of+Cthulhudo+androids&sprefix=%2Cna-audible-us%2C424&i=na-audible-us&url=search-alias%3Dna-audible-us&ref=nb_sb_noss_2,https://www.goodreads.com/book/show/15730101-the-call-of-cthulhu
The Codpiece,2019-07-01,Guards! Guards!,group,5.0,1.0,5.7,5.5,5.3,,https://www.goodreads.com/search?q=Guards!+Guards!&qid=kH2MDtYH2u,https://www.audible.com/search?keywords=Guards!+Guards!&k=Guards!+Guards!do+androids&sprefix=%2Cna-audible-us%2C424&i=na-audible-us&url=search-alias%3Dna-audible-us&ref=nb_sb_noss_2,https://www.goodreads.com/book/show/833462.Guards_Guards_
The Codpiece,2019-07-31,Blood Meridian,phil,6.5,6.0,6.4,,4.3,,https://www.goodreads.com/search?q=Blood+Meridian&qid=kH2MDtYH2u,https://www.audible.com/search?keywords=Blood+Meridian&k=Blood+Meridiando+androids&sprefix=%2Cna-audible-us%2C424&i=na-audible-us&url=search-alias%3Dna-audible-us&ref=nb_sb_noss_2,https://www.goodreads.com/book/show/394535.Blood_Meridian_or_the_Evening_Redness_in_the_West
The Codpiece,2019-09-15,The Blade Itself,matt,5.7,6.5,,,6.3,,https://www.goodreads.com/search?q=The+Blade+Itself&qid=kH2MDtYH2u,https://www.audible.com/search?keywords=The+Blade+Itself&k=The+Blade+Itselfdo+androids&sprefix=%2Cna-audible-us%2C424&i=na-audible-us&url=search-alias%3Dna-audible-us&ref=nb_sb_noss_2,https://www.goodreads.com/book/show/68616.The_Blade_Itself


In [5]:
def g_rating(soup):
    try:
        # Returns the soup's rating
        return float(soup.find('div', {'class': 'RatingStatistics__rating'}).contents[0])
    except:
        print('Error finding the rating in soup:', soup.find('title').contents[0])
        return None
def g_genre(soup):
    try:
        # generates a list of genre tags
        tags = soup.find('ul', {'aria-label': 'Top genres for this book'}).find_all('span', {'class' : 'Button__labelItem'})
        # gets the content of each tag in the list
        genres = [genre.contents[0] for genre in tags]
        # removes the last category if it is ...more
        if genres[-1] == '...more':
            del genres[-1]
        return genres
    except:
        print('Error finding the categories in soup:', soup.find('title').contents[0])
        return None
def g_published(soup):
    try:
        contents = soup.find('p', {'data-testid' : 'publicationInfo'}).contents[0]
        date = contents.replace('First published ', '').replace('Published ', '')
        return datetime.strptime(date, '%B %d, %Y')
    except:
        print('Error finding the publication info in soup:', soup.find('title').contents[0])
        return None
def g_pages(soup):
    try:
        contents = soup.find('p', {'data-testid' : 'pagesFormat'}).contents[0]
        return int(contents.split(' pages')[0])
    except:
        print('Error finding the pages in soup:', soup.find('title').contents[0])
        return None
def g_author(soup):
    try:
        return soup.find('span', {'class' : 'ContributorLink__name'}).contents[0]
    except:
        print('Error finding the author in soup:', soup.find('title').contents[0])
        return None
def g_scrape(url):
    soup = soup_cooker(url)
    return pd.Series([g_rating(soup), g_genre(soup), g_author(soup), g_pages(soup), g_published(soup)])

In [6]:
book_club[['g_rating', 'g_genre', 'g_author', 'g_pages', 'g_published']] = book_club['goodreads'].apply(g_scrape)
book_club.reset_index().to_json('book_club.json', orient='records')

Scraping: Do Androids Dream of Electric Sheep? by Philip K. Dick | Goodreads
Scraping: The Call of Cthulhu by H.P. Lovecraft | Goodreads
Scraping: Guards! Guards!: The Graphic Novel by Stephen Briggs | Goodreads
Scraping: Blood Meridian, or, the Evening Redness in the West by Cormac McCarthy | Goodreads
Scraping: The Blade Itself by Marcus Sakey | Goodreads
Scraping: Circe by Madeline Miller | Goodreads
Scraping: The Plague by Albert Camus | Goodreads
Scraping: The Book Thief by Markus Zusak | Goodreads
Scraping: Running with Sherman by Christopher McDougall | Goodreads
Scraping: The Things They Carried by Tim O'Brien | Goodreads
Scraping: The Little Prince by Antoine de Saint-Exupéry | Goodreads
Scraping: The Fifth Season (Malin Fors #5) by Mons Kallentoft | Goodreads
Scraping: Kings of the Wyld (The Band, #1) by Nicholas Eames | Goodreads
Scraping: The Shadow of What Was Lost by James  Islington | Goodreads
Scraping: The Drunken Botanist: an exploration of the plants that make up som

Unnamed: 0_level_0,Unnamed: 1_level_0,suggestor,james,matt,phil,ryan,sean,kyle,goodreads search,audible search,goodreads,g_rating,g_genre,g_author,g_pages,g_published
season,meeting,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A New Player,2020-09-13,james,5.3,4.5,2.8,3.0,4.0,,https://www.goodreads.com/search?q=A+Sand+County+Almanac&qid=kH2MDtYH2u,https://www.audible.com/search?keywords=A+Sand+County+Almanac&k=A+Sand+County+Almanacdo+androids&sprefix=%2Cna-audible-us%2C424&i=na-audible-us&url=search-alias%3Dna-audible-us&ref=nb_sb_noss_2,https://www.goodreads.com/book/show/86562529-a-sand-county-almanac,4.18,"[Nature, Nonfiction, Essays]",Aldo Leopold,462.0,2011-04-01
