In [2]:
# import libraries
import requests
from bs4 import BeautifulSoup

In [3]:
# URL to scrape
url = 'https://books.toscrape.com/'
# A timeout keeps the cell from hanging indefinitely if the site does not respond
response = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page
response.raise_for_status()
# Hand the raw bytes to BeautifulSoup and parse them with the built-in html parser
html = response.content
scraped = BeautifulSoup(html, 'html.parser')


In [4]:
# display the <title> element of the page
scraped.title # returns the whole tag, including the surrounding html markup

<title>
    All products | Books to Scrape - Sandbox
</title>

In [5]:
# get the title text without the html tags (surrounding whitespace is kept)
scraped.title.text

'\n    All products | Books to Scrape - Sandbox\n'

In [6]:
# strip the leading and trailing whitespace (newlines and spaces) from the title text
scraped.title.text.strip()

'All products | Books to Scrape - Sandbox'

In [7]:
# keep the cleaned-up title in a variable and print it
title_text = scraped.title.text.strip()
print(title_text)

All products | Books to Scrape - Sandbox


In [8]:
# attribute access by tag name returns the first <a> tag in the document
scraped.a

<a href="index.html">Books to Scrape</a>

In [9]:
# find() with a tag name returns the same first <a> tag as scraped.a
scraped.find('a')

<a href="index.html">Books to Scrape</a>

In [10]:
# find() with just a tag name returns the first <article> element on the page
scraped.find('article')

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [11]:
# 'class' is a reserved keyword in Python, so the CSS class is passed as class_
scraped.find('article', class_ = 'product_pod') 

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [12]:
# drill down to the <a> tag inside the first book's <h3> heading;
# its visible text is a truncated version of the book title
scraped.find('article', class_ = 'product_pod').h3.a

<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [13]:
# the visible link text is truncated, but the link's 'title' attribute
# holds the full name of the book
first_book = scraped.find('article', class_='product_pod')
link = first_book.h3.a
link

<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [14]:
# read the full book title from the link's 'title' attribute
link["title"]

'A Light in the Attic'

In [17]:
# print the full title of every book listed on the page
articles = scraped.find_all('article', class_='product_pod')
for article in articles:
    # the full title lives in the 'title' attribute of the heading link
    print(article.h3.a["title"])


A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red
The Dirty Little Secrets of Getting Your Dream Job
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
The Black Maria
Starving Hearts (Triangular Trade Trilogy, #1)
Shakespeare's Sonnets
Set Me Free
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Rip it Up and Start Again
Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Olio
Mesaerion: The Best Science Fiction Stories 1800-1849
Libertarianism for Beginners
It's Only the Himalayas


In [26]:
# print the price of every book on the page as a number
# The price is looked up by its own CSS class instead of relying on it being
# the first <p> inside the price box, and the <div> tags get their own name
# so that `articles` is not reused for a different kind of element.
price_boxes = scraped.find_all('div', class_='product_price')
for price_box in price_boxes:
    price_tag = price_box.find('p', class_='price_color')
    # drop surrounding whitespace and the currency sign before converting
    price = float(price_tag.text.strip().lstrip("£"))
    print(price)

51.77
53.74
50.1
47.82
54.23
22.65
33.34
17.93
22.6
52.15
13.99
20.66
17.46
52.29
35.02
57.25
23.88
37.59
51.33
45.17


In [62]:
# collect one {title: price} mapping per book into a list
result_set = []
articles = scraped.find_all('article', class_='product_pod')
for article in articles:
    # full title comes from the 'title' attribute of the heading link
    title = article.h3.a['title']
    # walk from the price box down to the price paragraph, then parse the number
    price_box = article.find('div', class_='product_price')
    price_tag = price_box.find('p', class_='price_color')
    price = float(price_tag.text.lstrip('£'))
    # NOTE(review): the availability text is read here (with its surrounding
    # whitespace) but is never stored in result_set - confirm whether it should be
    stock_avail = article.find('p', class_='instock availability').text
    result_set.append({title: price})
print(result_set)

 

[{'A Light in the Attic': 51.77}, {'Tipping the Velvet': 53.74}, {'Soumission': 50.1}, {'Sharp Objects': 47.82}, {'Sapiens: A Brief History of Humankind': 54.23}, {'The Requiem Red': 22.65}, {'The Dirty Little Secrets of Getting Your Dream Job': 33.34}, {'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull': 17.93}, {'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics': 22.6}, {'The Black Maria': 52.15}, {'Starving Hearts (Triangular Trade Trilogy, #1)': 13.99}, {"Shakespeare's Sonnets": 20.66}, {'Set Me Free': 17.46}, {"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)": 52.29}, {'Rip it Up and Start Again': 35.02}, {'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991': 57.25}, {'Olio': 23.88}, {'Mesaerion: The Best Science Fiction Stories 1800-1849': 37.59}, {'Libertarianism for Beginners': 51.33}, {"It's Only the Himalayas": 45.17}]
