In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Books to Scrape — full dataset scraper (≥500 rows)
Collects title, price_gbp, availability_text, stock_count, rating_stars,
category, upc, description, product_page_url. Saves to books_dataset.csv
"""

import time
import random
import re
import sys
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup as BS
from tqdm import tqdm

# Root URL of the site; the category crawl starts here and sidebar links are
# resolved against it.
BASE = "https://books.toscrape.com/"
# Headers sent with every HTTP request made by get_soup.
HEADERS = {
    "User-Agent": "StudentScraper/1.1 (+academic use; contact: student@example.com)"
}
# Timeout value handed to requests.get for each request.
TIMEOUT = 20
# Default lower/upper bounds for the random delay drawn by polite_sleep.
SLEEP_LOW, SLEEP_HIGH = 0.75, 1.25  # polite throttle


def polite_sleep(a=SLEEP_LOW, b=SLEEP_HIGH):
    """Block for a random duration drawn uniformly between *a* and *b* seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)


def get_soup(url, retries=3):
    """GET *url* with simple retries and return the parsed BeautifulSoup tree.

    Args:
        url: Absolute URL to fetch.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        A BeautifulSoup document built with the ``html.parser`` backend.

    Raises:
        ValueError: If *retries* is less than 1.
        requests.RequestException: The error from the final attempt when every
            attempt failed (connection problem, timeout, or HTTP error status).
    """
    if retries < 1:
        raise ValueError("retries must be >= 1")

    last_err = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as e:
            last_err = e
            # Back off before the next attempt; no point waiting after the last.
            if attempt < retries:
                polite_sleep(1.0, 2.0)
        else:
            # Hand BeautifulSoup the raw bytes so it can honour the charset
            # declared inside the document. resp.text relies on the HTTP
            # header alone, and requests falls back to ISO-8859-1 for text/*
            # responses without a charset, which garbles UTF-8 pages
            # (e.g. "£" becomes "Â£").
            return BS(resp.content, "html.parser")
    raise last_err


def extract_rating(soup):
    """Return the star rating (1-5) encoded in the ``p.star-rating`` CSS classes.

    The first class that is one of "One".."Five" decides the result. Returns
    None when the element is missing or carries none of those classes.
    """
    word_to_stars = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

    node = soup.select_one("p.star-rating")
    if not node:
        return None

    css_classes = node.get("class", [])
    return next(
        (word_to_stars[name] for name in css_classes if name in word_to_stars),
        None,
    )


def extract_stock(avail_text):
    """Return the first run of digits in *avail_text* as an int.

    Returns None when the text is empty/None or contains no digits.
    """
    if not avail_text:
        return None
    found = re.search(r"(\d+)", avail_text)
    if found is None:
        return None
    return int(found.group(1))


def clean_price(raw_text):
    """Convert a price string such as '£45.17' to the float 45.17.

    Everything except ASCII digits and dots is discarded, so currency symbols
    (including mis-decoded ones like 'Â£'), spaces, non-breaking spaces and
    thousands-separator commas are ignored. Note that a comma is never
    treated as a decimal mark: '45,17' parses as 4517.0.

    Args:
        raw_text: Price text scraped from the page, or None.

    Returns:
        The price as a float, or None when *raw_text* is None, contains no
        digits, or does not form a valid number after cleaning (e.g. '1.2.3').
    """
    if raw_text is None:
        return None
    numeric = re.sub(r"[^0-9.]", "", raw_text)
    if not numeric:
        return None
    try:
        return float(numeric)
    except ValueError:
        # Leftovers like "." or "1.2.3" are not a price; report "no price"
        # instead of failing the whole record.
        return None


def parse_product_page(url):
    """Download one product detail page and flatten it into a record dict.

    Returns a dict with the keys title, price_gbp, availability_text,
    stock_count, rating_stars, category, product_page_url, upc and
    description. A field is None when its element is not found on the page.
    Exceptions raised by get_soup propagate to the caller.
    """
    page = get_soup(url)

    # Title
    title = None
    heading = page.select_one("div.product_main h1")
    if heading:
        title = heading.get_text(strip=True)

    # Price, converted to a float by clean_price
    price_gbp = None
    price_node = page.select_one("p.price_color")
    if price_node:
        price_gbp = clean_price(price_node.get_text())

    # Availability sentence with whitespace runs collapsed to single spaces,
    # plus the first integer found in it
    availability_text = None
    stock_node = page.select_one("p.instock.availability")
    if stock_node:
        raw_availability = stock_node.get_text(separator=" ", strip=True)
        availability_text = " ".join(raw_availability.split())
    stock_count = extract_stock(availability_text)

    # Category is the third breadcrumb entry: [Home] [Books] [Category] [Title]
    crumbs = page.select("ul.breadcrumb li")
    category = crumbs[2].get_text(strip=True) if len(crumbs) >= 3 else None

    # Product information table as {header text: cell text}; only UPC is used
    cell_pairs = (
        (tr.select_one("th"), tr.select_one("td"))
        for tr in page.select("table.table.table-striped tr")
    )
    info = {
        head.get_text(strip=True): cell.get_text(strip=True)
        for head, cell in cell_pairs
        if head and cell
    }

    # Description is the first <p> that follows the #product_description anchor
    description = None
    desc_anchor = page.select_one("#product_description")
    if desc_anchor:
        paragraph = desc_anchor.find_next("p")
        if paragraph:
            description = paragraph.get_text(" ", strip=True)

    record = {
        "title": title,
        "price_gbp": price_gbp,
        "availability_text": availability_text,
        "stock_count": stock_count,
        "rating_stars": extract_rating(page),
        "category": category,
        "product_page_url": url,
        "upc": info.get("UPC"),
        "description": description,
    }
    return record


def get_all_categories():
    """Return ``(name, absolute_url)`` tuples for the sidebar categories on BASE.

    Fetches the home page once, pauses via polite_sleep, then reads every
    nested link under ``div.side_categories``.
    """
    home = get_soup(BASE)
    polite_sleep()
    anchors = home.select("div.side_categories ul li ul li a")
    return [
        (anchor.get_text(strip=True), urljoin(BASE, anchor.get("href")))
        for anchor in anchors
    ]


def category_pages(cat_url):
    """Yield the URL of every listing page in a category, following "next" links.

    The first URL yielded is *cat_url* itself. After each yield the page is
    fetched (followed by a polite_sleep) to look for an ``li.next a`` link;
    iteration stops when there is none, when the link has no usable href, or
    when it points at a page that was already yielded.

    Note: this generator downloads each page only to find the next link, so a
    consumer that fetches the yielded URL itself (as list_product_links does)
    requests every listing page twice.

    Raises:
        Whatever get_soup raises when a listing page cannot be fetched.
    """
    seen = set()
    url = cat_url
    # The visited set guards against a "next" link that resolves to a page we
    # already emitted, which would otherwise loop forever.
    while url not in seen:
        seen.add(url)
        yield url
        soup = get_soup(url)
        polite_sleep()
        nxt = soup.select_one("li.next a")
        href = nxt.get("href") if nxt else None
        if not href:
            # No link, or a link without a target: urljoin would hand back the
            # current URL unchanged, so treat it as the last page.
            break
        url = urljoin(url, href)


def list_product_links(listing_url):
    """Fetch a listing page and return its product links as absolute URLs.

    Links are taken from ``article.product_pod h3 a`` and resolved against
    *listing_url*. A polite_sleep follows the request.
    """
    page = get_soup(listing_url)
    polite_sleep()
    anchors = page.select("article.product_pod h3 a")
    return [urljoin(listing_url, anchor.get("href")) for anchor in anchors]


def _dedupe_by_upc(df):
    """Drop rows repeating an earlier UPC; rows with a missing UPC are all kept."""
    if "upc" not in df.columns:
        return df
    upc = df["upc"]
    # duplicated() treats missing values as equal to each other, which would
    # collapse every book whose UPC failed to parse into a single row.
    keep = upc.isna() | ~upc.duplicated()
    return df[keep].reset_index(drop=True)


def main():
    """Scrape every category and write the result to books_dataset.csv.

    Walks each category's listing pages, parses every product page, and
    collects one record per product. A product page that fails is logged to
    stderr and skipped; a listing page that fails aborts only its own
    category, so rows gathered so far are still saved. The final table is
    de-duplicated by UPC, put in a fixed column order, written as CSV, and
    additionally as Parquet on a best-effort basis.
    """
    rows = []
    categories = get_all_categories()
    print(f"Found {len(categories)} categories")

    for cat_name, cat_url in tqdm(categories, desc="Categories"):
        try:
            for page_url in category_pages(cat_url):
                # list_product_links already returns absolute URLs.
                for purl in list_product_links(page_url):
                    try:
                        rec = parse_product_page(purl)
                    except Exception as e:
                        print(f"[warn] {e} @ {purl}", file=sys.stderr)
                    else:
                        # fill category if breadcrumb missing
                        if not rec.get("category"):
                            rec["category"] = cat_name
                        rows.append(rec)
                    finally:
                        polite_sleep()
        except Exception as e:
            # A listing page stayed unreachable after retries: give up on this
            # category only, keeping everything collected so far.
            print(
                f"[warn] category {cat_name!r} aborted: {e} @ {cat_url}",
                file=sys.stderr,
            )

    df = _dedupe_by_upc(pd.DataFrame(rows))

    # ensure columns exist & order (absent columns are created empty)
    cols = [
        "title",
        "price_gbp",
        "availability_text",
        "stock_count",
        "rating_stars",
        "category",
        "product_page_url",
        "upc",
        "description",
    ]
    df = df.reindex(columns=cols)

    print(f"Total rows scraped: {len(df)}")
    # sanity for assignment requirement
    if len(df) < 500:
        print(
            "Warning: fewer than 500 rows. Try re-running; site may have failed mid-run.",
            file=sys.stderr,
        )

    # save
    out_csv = "books_dataset.csv"
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved → {out_csv}")

    # optional parquet: best effort, but report why it was skipped
    try:
        df.to_parquet("books_dataset.parquet", index=False)
    except Exception as e:
        print(f"[warn] parquet export skipped: {e}", file=sys.stderr)


# Run the full scrape when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()


Found 50 categories


Categories:   8%|▊         | 4/50 [03:15<46:06, 60.13s/it]