# Web Scraper

In [3]:
# Import necessary libraries for web scraping and data processing
import re  # Regular expressions for pattern matching in text
import requests  # HTTP library for making web requests
from bs4 import BeautifulSoup  # HTML parsing library
import pandas as pd  # Data manipulation and analysis library

# Column order of the returned DataFrame; also applied when nothing was scraped
_BOOK_COLUMNS = ['title', 'price', 'rating', 'availability']

# Star ratings are encoded as a word in the rating element's CSS classes
_RATING_BY_WORD = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}


def _parse_book(book):
    """Extract one book record (title, price, rating, availability) from a product container."""
    # The title is read from the 'title' attribute of the anchor inside <h3>
    title = book.h3.a['title']

    # Take the first number in the price text, ignoring the currency symbol
    # and anything else surrounding it. Missing element or no number -> 0.0,
    # consistent with the defaults used for rating and availability.
    price_element = book.find('p', class_='price_color')
    price_match = (
        re.search(r'(\d+\.?\d*)', price_element.text)
        if price_element is not None else None
    )
    # The pattern only matches strings float() accepts, so no ValueError guard is needed
    price = float(price_match.group(1)) if price_match else 0.0

    # The second CSS class on the rating element carries the rating word;
    # anything missing or unrecognised maps to 0
    rating_element = book.find('p', class_='star-rating')
    rating_classes = rating_element.get('class', []) if rating_element is not None else []
    rating_word = rating_classes[1] if len(rating_classes) > 1 else ''
    rating = _RATING_BY_WORD.get(rating_word, 0)

    # NOTE(review): a multi-word class_ string is matched against the exact class
    # attribute value, so elements with other/reordered classes yield 'Unknown'
    availability_element = book.find('p', class_='instock availability')
    availability = (
        availability_element.text.strip()
        if availability_element is not None else 'Unknown'
    )

    return {
        'title': title,
        'price': price,
        'rating': rating,
        'availability': availability
    }


def scrape_books(base_url, timeout=10):
    """
    Scrapes book information from all pages of a book catalog website.

    Page 1 is fetched from ``base_url`` itself; later pages are fetched from
    ``<base_url>/catalogue/page-<n>.html``. Scraping stops at the first page
    that does not return HTTP 200 or that contains no book containers, so a
    server error part-way through yields a partial (not failed) result.

    Args:
        base_url (str): The base URL of the website to scrape. A trailing
            slash is tolerated.
        timeout (float): Seconds to wait for each HTTP request before
            giving up. Defaults to 10.

    Returns:
        pd.DataFrame: A DataFrame containing book information with columns:
                     title, price, rating, availability. The columns are
                     present even when no books were found.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level (connection error, timeout, ...).
    """
    # Avoid a double slash in page URLs when base_url ends with '/'
    root = base_url.rstrip('/')
    books = []  # One dict per scraped book
    page = 1    # Start with the first page

    # One session for the whole crawl so the connection can be reused across pages
    with requests.Session() as session:
        while True:
            # First page uses the base URL as given; later pages have a fixed format
            url = base_url if page == 1 else f"{root}/catalogue/page-{page}.html"

            # Always pass a timeout: without one a stalled server blocks forever
            response = session.get(url, timeout=timeout)

            # Any non-200 status (typically 404 past the last page) ends the crawl
            if response.status_code != 200:
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            book_containers = soup.find_all('article', class_='product_pod')

            # A page without books also marks the end of the catalogue
            if not book_containers:
                break

            books.extend(_parse_book(book) for book in book_containers)
            page += 1

    # Explicit columns keep the schema stable even for an empty result
    return pd.DataFrame(books, columns=_BOOK_COLUMNS)


# Run the Scraper

In [4]:

# Scrape the whole catalogue, write it to CSV, then report how many rows were collected
df_books = scrape_books('http://books.toscrape.com')
df_books.to_csv('books_data.csv', index=False)
print(
    "Scraping complete. Data saved to books_data.csv",
    f"Scraped {len(df_books)} books",
    sep="\n",
)

Scraping complete. Data saved to books_data.csv
Scraped 1000 books
