Trying out BeautifulSoup for webscraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
url = "http://books.toscrape.com/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# find all info you need by passing in the element type and the class
books_html = soup.find_all("article", class_="product_pod")
len(books_html)

20

In [None]:
books_html[0]

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [None]:
# from the html, the book title is in element h3 and element type a
title = books_html[0].find("h3").find("a").attrs["title"]
price = float(books_html[0].find("div", class_="product_price").find("p", class_="price_color").string[1:])
# availabilty = books_html[0].find("div", class_="product_price").find("p", class_="instock availability").find("i", class_="icon-ok").text[10:]
rating = books_html[0].find("p", class_="star-rating").attrs['class']

In [None]:
def parse_rating(rating_list):
    '''
    recieve a list item containing rating information and return the rating as an integer'''
    if 'One' in rating:
        return 1
    elif 'Two' in rating:
      return 2
    elif 'Three' in rating:
      return 3
    elif 'Four' in rating:
      return 4
    elif 'Five' in rating:
      return 5
    else: 
      return 0
    
    

In [None]:
books_dict = {"Title": [], "Price": [], "Rating": []}

for book in soup.find_all("article", class_="product_pod"):
  books_dict["Title"].append(book.find("h3").find("a").attrs["title"])
  books_dict["Price"].append(float(book.find("div", class_="product_price").find("p", class_="price_color").string[1:]))
  books_dict["Rating"].append(parse_rating(book.find("p", class_="star-rating").attrs['class']))

In [None]:
books_dict

{'Title': ['A Light in the Attic',
  'Tipping the Velvet',
  'Soumission',
  'Sharp Objects',
  'Sapiens: A Brief History of Humankind',
  'The Requiem Red',
  'The Dirty Little Secrets of Getting Your Dream Job',
  'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
  'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
  'The Black Maria',
  'Starving Hearts (Triangular Trade Trilogy, #1)',
  "Shakespeare's Sonnets",
  'Set Me Free',
  "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
  'Rip it Up and Start Again',
  'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
  'Olio',
  'Mesaerion: The Best Science Fiction Stories 1800-1849',
  'Libertarianism for Beginners',
  "It's Only the Himalayas"],
 'Price': [51.77,
  53.74,
  50.1,
  47.82,
  54.23,
  22.65,
  33.34,
  17.93,
  22.6,
  52.15,
  13.99,
  20.66,
  17.46,
  52.29,
  35.02,
  57.25,
  23.88,
  3

In [None]:
books_df = pd.DataFrame(books_dict)
books_df

Unnamed: 0,Title,Price,Rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
5,The Requiem Red,22.65,1
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,The Black Maria,52.15,1


In [None]:
# scrape more data from other pages
max_page = 10

pages_url = f'{url}/catalogue/'
for page in range(1, max_page+1):
  curr_url = f"{pages_url}page-{page}.html"
  print(curr_url)
  

http://books.toscrape.com//catalogue/page-1.html
http://books.toscrape.com//catalogue/page-2.html
http://books.toscrape.com//catalogue/page-3.html
http://books.toscrape.com//catalogue/page-4.html
http://books.toscrape.com//catalogue/page-5.html
http://books.toscrape.com//catalogue/page-6.html
http://books.toscrape.com//catalogue/page-7.html
http://books.toscrape.com//catalogue/page-8.html
http://books.toscrape.com//catalogue/page-9.html
http://books.toscrape.com//catalogue/page-10.html


In [None]:
# putting it all together
max_page = 10
pages_url = f'{url}/catalogue/'
books_dict = {"Title": [], "Price": [], "Rating": []}


for page in range(1, max_page+1):
  # Fetch URL data
  curr_url = f"{pages_url}page-{page}.html"
  response = requests.get(curr_url)
  soup = BeautifulSoup(response.content, "html.parser")
 
  for book in soup.find_all("article", class_="product_pod"):
    books_dict["Title"].append(book.find("h3").find("a").attrs["title"])
    books_dict["Price"].append(float(book.find("div", class_="product_price").find("p", class_="price_color").string[1:]))
    books_dict["Rating"].append(parse_rating(book.find("p", class_="star-rating").attrs['class']))

In [1]:
# import requests
# from bs4 import BeautifulSoup

# def scrape_data(url):
#     # Send a GET request to the URL
#     response = requests.get(url)
    
#     # Check if the request was successful
#     if response.status_code == 200:
#         # Parse the HTML content of the page
#         soup = BeautifulSoup(response.content, 'html.parser')
        
#         # Find all the relevant data using the HTML tags
#         data = soup.find_all('p')
        
#         # Return the data
#         return data
#     else:
#         # Return an error message if the request was not successful
#         return "Error: Could not retrieve data from the URL"

# # Example usage
# data = scrape_data("http://books.toscrape.com/")
# print(data)
