In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape every listing page of the books.toscrape.com catalogue, collect the
# title, price, availability and star rating of each book, and write the
# result to books.csv.

# Catalogue URL template; every listing page follows the same format.
link = "https://books.toscrape.com/catalogue/page-{}.html"

# Seconds to wait for the server; without a timeout a stalled connection
# would block the cell forever.
REQUEST_TIMEOUT = 10

all_books = []   # one dict per scraped book
page_no = 1      # catalogue pages are numbered from 1

# A single Session reuses the underlying connection across page requests.
with requests.Session() as session:
    while True:
        url = link.format(page_no)
        res = session.get(url, timeout=REQUEST_TIMEOUT)

        # A 404 marks the end of the catalogue. Any other HTTP error is raised
        # so that a transient failure cannot silently truncate the dataset.
        if res.status_code == 404:
            break
        res.raise_for_status()

        # Parse the raw bytes instead of res.text: when the response header
        # carries no charset, requests decodes text/* as ISO-8859-1, which
        # corrupts the pound sign in prices ("£" becomes "Â£"). Given bytes,
        # BeautifulSoup detects the encoding from the document itself.
        soup = BeautifulSoup(res.content, "html.parser")

        # Each book on a listing page is an <article class="product_pod">.
        books_on_page = soup.find_all("article", class_="product_pod")
        if not books_on_page:
            break

        for b in books_on_page:
            # The full title is stored in the link's "title" attribute.
            title = b.h3.a["title"]

            price = b.find("p", class_="price_color").get_text(strip=True)

            # Match on the single "availability" class so the lookup does not
            # depend on the exact order or set of the other CSS classes.
            stock = b.find("p", class_="availability").get_text(strip=True)

            # The rating is encoded as a second CSS class next to
            # "star-rating" (e.g. class="star-rating Three").
            star_info = b.find("p", class_="star-rating")
            stars = next(c for c in star_info["class"] if c != "star-rating")

            all_books.append({
                "Title": title,
                "Price": price,
                "Availability": stock,
                "Star Rating": stars
            })

        page_no += 1   # go to next page

# Build the DataFrame once from the accumulated records.
df = pd.DataFrame(all_books)

# Persist the result next to the notebook.
df.to_csv("books.csv", index=False)

print("Done! Scraped", len(df), "books in total.")


Done! Scraped 1000 books in total.


In [1]:
!pip install selenium





In [2]:
!pip install selenium webdriver-manager


Collecting webdriver-manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2
