# Web Scraper

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_books(base_url):
    """
    Scrapes book data from all pages of a book catalog website.
    
    Args:
        base_url (str): The base URL of the website to scrape
        
    Returns:
        pandas.DataFrame: DataFrame containing book titles, prices, and ratings
    """
    # Initialize list to store all book data
    all_books = []
    
    # Start with the first page
    page_url = f"{base_url}/catalogue/page-1.html"
    page_num = 1

    # Continue scraping until no more pages are found
    while True:
        try:
            # Send HTTP GET request to the current page
            response = requests.get(page_url)
            response.raise_for_status()  # Raise an exception for bad status codes
            
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all book containers on the current page
            books_on_page = soup.find_all('article', class_='product_pod')
            
            # If no books found, we've reached the end
            if not books_on_page:
                break  # No more books found

            # Extract data from each book on the current page
            for book in books_on_page:
                # Extract book title from the anchor tag's title attribute
                title = book.h3.a['title']
                
                # Extract and convert price from text to float (remove £ symbol)
                price = float(book.find('p', class_='price_color').text.replace('£', ''))
                
                # Map star rating classes to numeric values
                rating_map = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
                
                # Extract rating class and convert to numeric rating
                rating_class = book.find('p', class_='star-rating')['class'][1]
                rating = rating_map.get(rating_class, 0)

                # Add book data to our collection
                all_books.append({'title': title, 'price': price, 'rating': rating})

            # Check if there's a "next" button to continue to the next page
            next_button = soup.find('li', class_='next')
            if next_button:
                # Increment page number and construct next page URL
                page_num += 1
                page_url = f"{base_url}/catalogue/page-{page_num}.html"
                print(f"Scraping page {page_num}...")
            else:
                # No next button found, we've scraped all pages
                break

            # Add a small delay
            time.sleep(1)  # Add a small delay

        except requests.exceptions.RequestException as e:
            # Handle any network-related errors gracefully
            print(f"An error occurred: {e}")
            break

    # Convert the list of book dictionaries to a pandas DataFrame
    return pd.DataFrame(all_books)

# Run the Scrapper

In [2]:

# Run the scraper
df_books = scrape_books('http://books.toscrape.com')
df_books.to_csv('books_data.csv', index=False)
print("Scraping complete. Data saved to books_data.csv")

ValueError: could not convert string to float: 'Â51.77'