In [1]:
!pip install requests beautifulsoup4



In [1]:
import requests

URL = "http://books.toscrape.com/"
response = requests.get(URL)

if response.status_code == 200:
    print("Success! We have downloaded the webpage.")

    response.text 
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Success! We have downloaded the webpage.


In [3]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
book_containers = soup.find_all('article', class_='product_pod')

print(f"Found {len(book_containers)} books on this page.")

Found 20 books on this page.


In [5]:
all_books_data = []

for book in book_containers:
    title = book.h3.a['title']
    
    price = book.find('p', class_='price_color').text
    
    rating = book.find('p', class_='star-rating')['class'][1]
    
    book_data = {
        'Title': title,
        'Price': price,
        'Rating': rating
    }
    
    all_books_data.append(book_data)

print("Successfully extracted data. Here's a sample:")
print(all_books_data[:3])

Successfully extracted data. Here's a sample:
[{'Title': 'A Light in the Attic', 'Price': 'Â£51.77', 'Rating': 'Three'}, {'Title': 'Tipping the Velvet', 'Price': 'Â£53.74', 'Rating': 'One'}, {'Title': 'Soumission', 'Price': 'Â£50.10', 'Rating': 'One'}]


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

base_url = "http://books.toscrape.com/catalogue/"
current_page_url = base_url + "page-1.html"

all_books_data = []

print("Starting the scraper...")

while current_page_url:
    response = requests.get(current_page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    book_containers = soup.find_all('article', class_='product_pod')
    
    print(f"Scraping page: {current_page_url}")

    for book in book_containers:
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text
        rating = book.find('p', class_='star-rating')['class'][1]
        
        book_data = {
            'Title': title,
            'Price': price,
            'Rating': rating
        }
        all_books_data.append(book_data)

    next_button = soup.find('li', class_='next')
    
    if next_button:
        next_page_relative_url = next_button.a['href']
        current_page_url = base_url + next_page_relative_url
    else:
    
        current_page_url = None

print("Scraping complete!")

df = pd.DataFrame(all_books_data)

df.to_csv('books_data.csv', index=False)

print("Data has been saved to books_data.csv")
print(f"Total books scraped: {len(df)}")

Starting the scraper...
Scraping page: http://books.toscrape.com/catalogue/page-1.html
Scraping page: http://books.toscrape.com/catalogue/page-2.html
Scraping page: http://books.toscrape.com/catalogue/page-3.html
Scraping page: http://books.toscrape.com/catalogue/page-4.html
Scraping page: http://books.toscrape.com/catalogue/page-5.html
Scraping page: http://books.toscrape.com/catalogue/page-6.html
Scraping page: http://books.toscrape.com/catalogue/page-7.html
Scraping page: http://books.toscrape.com/catalogue/page-8.html
Scraping page: http://books.toscrape.com/catalogue/page-9.html
Scraping page: http://books.toscrape.com/catalogue/page-10.html
Scraping page: http://books.toscrape.com/catalogue/page-11.html
Scraping page: http://books.toscrape.com/catalogue/page-12.html
Scraping page: http://books.toscrape.com/catalogue/page-13.html
Scraping page: http://books.toscrape.com/catalogue/page-14.html
Scraping page: http://books.toscrape.com/catalogue/page-15.html
Scraping page: http://boo