# Web Scraper

In [3]:
import re
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Star-rating word (as it appears in the element's CSS classes) -> numeric score.
_RATING_BY_WORD = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}

# Column order of the returned frame; also used so an empty scrape keeps its schema.
_BOOK_COLUMNS = ['title', 'price', 'rating', 'availability']


def _parse_price(price_text: str) -> float:
    """Return the first decimal number found in ``price_text``, or 0.0 if there is none."""
    # The pattern only ever captures digits with at most one dot, so float()
    # cannot fail on a successful match.
    match = re.search(r'(\d+\.?\d*)', price_text)
    return float(match.group(1)) if match else 0.0


def _parse_rating(rating_element) -> int:
    """Map a ``star-rating`` element's CSS classes to 1-5, or 0 when unknown."""
    if rating_element is None:
        return 0
    # Scan every class token instead of assuming the rating word is the second one.
    for token in rating_element.get('class', []):
        if token in _RATING_BY_WORD:
            return _RATING_BY_WORD[token]
    return 0


def _parse_book(book) -> dict:
    """Extract title, price, rating and availability from one ``product_pod`` article."""
    price_element = book.find('p', class_='price_color')
    availability_element = book.find('p', class_='instock availability')
    return {
        'title': book.h3.a['title'],
        # A missing price element gets the same 0.0 default as an unparseable one.
        'price': _parse_price(price_element.text) if price_element is not None else 0.0,
        'rating': _parse_rating(book.find('p', class_='star-rating')),
        'availability': (
            availability_element.text.strip()
            if availability_element is not None
            else 'Unknown'
        ),
    }


def scrape_books(base_url: str, timeout: float = 10) -> pd.DataFrame:
    """Scrape every catalogue page under ``base_url`` into a DataFrame.

    Page 1 is fetched from ``base_url`` itself; later pages come from
    ``<base_url>/catalogue/page-<n>.html``. Crawling stops at the first page
    that does not return HTTP 200 or that contains no ``product_pod`` articles.

    Parameters
    ----------
    base_url : str
        Root URL of the site. A trailing slash is tolerated.
    timeout : float, default 10
        Seconds passed to ``requests.get`` as its ``timeout`` so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per book with columns ``title``, ``price`` (float, 0.0 when
        missing), ``rating`` (1-5, 0 when missing) and ``availability``
        (stripped text, ``'Unknown'`` when missing). The columns are present
        even when no books were found.

    Warns
    -----
    RuntimeWarning
        When crawling stops on a non-200 response other than a 404 after the
        first page (which is treated as the normal end of pagination), because
        the returned data may then be incomplete.
    """
    # Strip trailing slashes so follow-up URLs never contain "//catalogue".
    root = base_url.rstrip('/')
    books = []
    page = 1

    while True:
        url = base_url if page == 1 else f"{root}/catalogue/page-{page}.html"

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            reached_end = response.status_code == 404 and page > 1
            if not reached_end:
                warnings.warn(
                    f"Stopped at page {page}: {url} returned "
                    f"HTTP {response.status_code}; results may be incomplete.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            break

        # Pass raw bytes so BeautifulSoup detects the page encoding itself.
        soup = BeautifulSoup(response.content, 'html.parser')
        book_containers = soup.find_all('article', class_='product_pod')
        if not book_containers:
            break

        books.extend(_parse_book(book) for book in book_containers)
        page += 1

    return pd.DataFrame(books, columns=_BOOK_COLUMNS)


# Run the Scraper

In [4]:

# Scrape the catalogue starting at the site root, then persist it as CSV
# (index=False leaves the DataFrame index out of the file).
df_books = scrape_books('http://books.toscrape.com')
df_books.to_csv('books_data.csv', index=False)

# Report completion and the number of rows scraped, one message per line.
print(
    "Scraping complete. Data saved to books_data.csv",
    f"Scraped {len(df_books)} books",
    sep="\n",
)

Scraping complete. Data saved to books_data.csv
Scraped 1000 books
