In [1]:
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib

In [2]:
url = "http://books.toscrape.com/"

def fetch_page(page, timeout=10):
    """Download one catalogue page and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    page : int
        Zero-based page index. The request is made for page number
        ``page + 1`` (``catalogue/page-<page + 1>.html`` under ``url``).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # "\r" together with end="" rewrites the same output line, so the
    # notebook shows a single, continuously updated progress message.
    print(f"\rscraping page {page + 1}", end = "")
    response = requests.get(
        url + f'catalogue/page-{page + 1}.html',
        headers={"Accept-Language":"en-US"},
        timeout=timeout,
    )
    # Fail loudly on an error response instead of parsing an error page and
    # silently contributing zero books to the result.
    response.raise_for_status()
    # Build the soup from the raw bytes, read as HTML.
    return BeautifulSoup(response.content, "html.parser")

In [3]:
# Lookup table from the English number word used as a rating label to its numeric value (1-5).
ratings = {word: score for score, word in enumerate(('One', 'Two', 'Three', 'Four', 'Five'), start=1)}

In [4]:
def add_books_to_dict(soup, dict):
    """Append the title, price and rating of every book in ``soup`` to ``dict``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed catalogue page. Every element with the class ``product_pod``
        is treated as one book.
    dict : dict
        Mapping holding lists under the keys ``'Title'``, ``'Price'`` and
        ``'Rating'``. It is mutated in place; one value is appended to each
        list per book. (The parameter name shadows the built-in ``dict``; it
        is kept so existing callers keep working.)

    Returns
    -------
    None
    """
    # The <article /> element with the class product_pod is what we are looking for!
    # All the books on the page share the same structure, which is what makes parsing possible.
    for book_html in soup.find_all(class_="product_pod"):
        # The title lives in the <a /> link inside the <h3 /> tag.
        link = book_html.find("h3").find("a")
        # Read the full title from the link's `title` attribute: the link text
        # shown on the page can be a shortened version ending in "...".
        # Fall back to the link text only if the attribute is absent.
        dict['Title'].append(link.attrs.get("title", link.string))
        # The price is located within a <p class="price_color"></p>
        dict['Price'].append(float(book_html.find(class_="price_color").string.strip('£')))
        # There is a <p class="star-rating TEXT"></p> where TEXT is one of
        # "One", "Two", "Three", "Four" or "Five".
        stars_html = book_html.find(class_="star-rating")
        # Pick the first class that is a known rating word rather than relying
        # on it being at index 1; unknown/missing ratings are recorded as 0.
        star_classes = stars_html.attrs['class']
        dict['Rating'].append(next((ratings[name] for name in star_classes if name in ratings), 0))

In [5]:
# Empty column-oriented container: one independent list per book field.
books_dict = {column: [] for column in ('Title', 'Price', 'Rating')}

In [6]:
def create_books_df(max_page):
    """Scrape the first ``max_page`` catalogue pages into a DataFrame.

    Each page index in ``range(max_page)`` is downloaded with ``fetch_page``
    and its books are accumulated with ``add_books_to_dict`` into a fresh,
    function-local dictionary (the module-level ``books_dict`` is not touched).

    Parameters
    ----------
    max_page : int
        Number of pages to scrape, starting from page index 0.

    Returns
    -------
    pandas.DataFrame
        One row per book with the columns 'Title', 'Price' and 'Rating'.
    """
    collected = dict(Title=[], Price=[], Rating=[])
    for page_index in range(max_page):
        # Download and parse the page, then fold its books into `collected`.
        add_books_to_dict(fetch_page(page_index), collected)
    return pd.DataFrame.from_dict(collected)

In [None]:
# Scrape catalogue pages 1-50 into a single DataFrame, then display its (rows, columns) shape.
books_df = create_books_df(max_page=50)
books_df.shape

scraping page 21

In [None]:
# Preview the first five scraped books.
books_df.head(n=5)

In [None]:
from nbresult import ChallengeResult

# Bundle the values to be checked for the 'books' challenge: the dictionary,
# the DataFrame's column index, and the first row's title, price and rating.
# NOTE(review): `books_dict` here is the module-level dict; in the visible
# cells nothing populates it (create_books_df fills its own local dict) —
# confirm this is what the checker expects.
result = ChallengeResult('books',
    books_dict=books_dict,
    columns=books_df.columns,
    # str() so the title is passed as a plain Python string.
    title=str(books_df.loc[0,'Title']),
    price=books_df.loc[0,'Price'],
    rating=books_df.loc[0,'Rating']
)
# write() and check() are nbresult helpers; their exact behavior is not
# visible here. write() is called before check(), whose return value is printed.
result.write()
print(result.check())

In [None]:
def parse_rating(rating_classes):
    """Convert a collection of rating class names into a star count.

    The words 'One' through 'Five' are tested in ascending order with the
    ``in`` operator, and the first one contained in ``rating_classes`` wins.

    Parameters
    ----------
    rating_classes : container
        Anything supporting ``in`` (e.g. a list of CSS class names).

    Returns
    -------
    int
        1-5 for the first matching word, or 0 when none matches.
    """
    # Ordered pairs: lower ratings are checked first, mirroring an if/elif chain.
    stars_by_word = (('One', 1), ('Two', 2), ('Three', 3), ('Four', 4), ('Five', 5))
    for word, stars in stars_by_word:
        if word in rating_classes:
            return stars
    return 0

In [None]:
# Standalone scraping loop: collect title, price and rating for every book on
# catalogue pages 1..MAX_PAGE into a column-oriented dictionary.
all_books_dict = { 'Title': [], 'Price': [], 'Rating': [] }

MAX_PAGE = 30
for page in range(1, MAX_PAGE + 1):
    print(f"Parsing page {page}...")
    # Use a dedicated name for the page address: rebinding the global `url`
    # here would break fetch_page(), which prepends `url` as the site root.
    page_url = f"http://books.toscrape.com/catalogue/page-{page}.html"
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, "html.parser")

    for book in soup.find_all("article", class_="product_pod"):
        # Full title is stored in the link's `title` attribute.
        title = book.find("h3").find("a").attrs["title"]
        # Drop the leading currency symbol before converting to float.
        price = float(book.find("p", class_="price_color").text[1:])
        # `.attrs` is the tag's attribute dictionary. (`.attr` would be
        # interpreted by BeautifulSoup as a search for a child <attr> tag and
        # evaluate to None, making the subscript raise a TypeError.)
        rating = parse_rating(book.find("p", class_="star-rating").attrs["class"])
        all_books_dict["Title"].append(title)
        all_books_dict["Price"].append(price)
        all_books_dict["Rating"].append(rating)

print("Done!")