In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [2]:
landingPage = 'https://books.toscrape.com'

# Get response from website
response = requests.get(url=landingPage)

In [3]:
# View all attributes of the response
response.__attrs__

['_content',
 'status_code',
 'headers',
 'url',
 'history',
 'encoding',
 'reason',
 'cookies',
 'elapsed',
 'request']

In [4]:
# What is the status code?
response.status_code

200

In [5]:
# Creating a soup from the content 
soup = BeautifulSoup(markup=response.content, 
              features='html.parser')

In [6]:
bookList = soup.find(name='ol', attrs={'class': 'row'})\
                .findAll(name='li', attrs={'class': "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

In [7]:
test = bookList[0]

test

<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>
</li>

In [8]:
# Grabbing the title of the book
title = test.find(name='h3').find(name='a')['title']

# Grab the price of the book
priceInEuros = test.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'price_color'})\
                    .text\
                    .replace('£', '')

# Grab the availability in store
availability = test.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'instock availability'})\
                    .text\
                    .strip()

# Extract the star rating
rating = test.find(name='p', 
          attrs={'class': re.compile(pattern='star-rating.+')})['class'][1]

In [9]:
df = pd.DataFrame(columns=['title', 'priceInEuros', 'availability', 'rating'])


for idx, book in enumerate(bookList):
    # Grab the title
    title = book.find(name='h3').find(name='a')['title']
    # Grab the price
    priceInEuros = book.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'price_color'})\
                    .text\
                    .replace('£', '')
    # Grab the availability
    availability = book.find(name='div', attrs={'class': 'product_price'})\
                    .find(name='p', attrs={'class': 'instock availability'})\
                    .text\
                    .strip()
    # Grab the rating
    rating = book.find(name='p', 
          attrs={'class': re.compile(pattern='star-rating.+')})['class'][1]
    
    # Appending information to the dataframe
    df.loc[idx] = [title, float(priceInEuros), availability, rating]

In [10]:
df.head()

Unnamed: 0,title,priceInEuros,availability,rating
0,A Light in the Attic,51.77,In stock,Three
1,Tipping the Velvet,53.74,In stock,One
2,Soumission,50.1,In stock,One
3,Sharp Objects,47.82,In stock,Four
4,Sapiens: A Brief History of Humankind,54.23,In stock,Five


## Incorporate Pagination

In [17]:
pagesScrapped = 1
df = pd.DataFrame(columns=['title', 'priceInEuros', 'availability', 'rating'])
landingPage = 'https://books.toscrape.com/'
url = landingPage
listOfDfs = []

while pagesScrapped <= 48:
    # For each page, make a request
    response = requests.get(url=url)

    # Create a soup
    soup = BeautifulSoup(markup=response.content,
                         features='html.parser')

    # Extract the list of books
    bookList = soup.find(name='ol', attrs={'class': 'row'})\
                .findAll(name='li', attrs={'class': "col-xs-6 col-sm-4 col-md-3 col-lg-3"})

    for idx, book in enumerate(bookList):
        # Grab the title
        title = book.find(name='h3').find(name='a')['title']
        # Grab the price
        priceInEuros = book.find(name='div', attrs={'class': 'product_price'})\
                        .find(name='p', attrs={'class': 'price_color'})\
                        .text\
                        .replace('£', '')
        # Grab the availability
        availability = book.find(name='div', attrs={'class': 'product_price'})\
                        .find(name='p', attrs={'class': 'instock availability'})\
                        .text\
                        .strip()
        # Grab the rating
        rating = book.find(name='p', 
            attrs={'class': re.compile(pattern='star-rating.+')})['class'][1]
        
        # Appending information to the dataframe
        df.loc[idx] = [title, float(priceInEuros), availability, rating]
    
    # Store the pages info in a list of dataframes
    listOfDfs.append(df)


    # Get the url for the next page
    nextPageUrl = soup.find(name='li', attrs={'class': 'next'})\
                        .find(name='a')['href']
    
    # Update url 
    if pagesScrapped >= 2:
        url = landingPage + 'catalogue/' + nextPageUrl
    else:
        url = landingPage + nextPageUrl

    # Update control variable
    pagesScrapped += 1

df = pd.concat(listOfDfs)

In [19]:
df.shape

(960, 4)

In [23]:
df.head()

Unnamed: 0,title,priceInEuros,availability,rating
0,The Bhagavad Gita,57.49,In stock,Three
1,The Bette Davis Club,30.66,In stock,Three
2,The Art of Not Breathing,40.83,In stock,Four
3,Taking Shots (Assassins #1),18.88,In stock,Two
4,Starlark,25.83,In stock,Three


In [24]:
rating_dict = {'One': 1,
               'Two':2,
               'Three': 3,
               'Four':4,
               'Five':5}

In [27]:
df['rating'] = df['rating'].replace(rating_dict)