<a href="https://colab.research.google.com/github/Jesicaprmta/UTS-PDAB/blob/main/Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q requests beautifulsoup4 pandas tqdm

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import time
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

BASE_URL = "https://books.toscrape.com/"

# Session dengan retry biar stabil
def make_session():
    s = requests.Session()
    retry = Retry(total=3, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    s.mount("https://", adapter)
    s.mount("http://", adapter)
    s.headers.update({"User-Agent": "Mozilla/5.0 (compatible; ColabScraper/1.0)"})
    return s

session = make_session()


In [3]:
def parse_detail_page(html, base_url):
    soup = BeautifulSoup(html, "html.parser")

    # --- Category ---
    category = soup.select_one("ul.breadcrumb li:nth-of-type(3) a").text.strip()

    # --- Title ---
    title = soup.select_one(".product_main h1").text.strip()

    # --- Rating (bisa 'One', 'Two', dst) ---
    rating_tag = soup.select_one("p.star-rating")
    rating = rating_tag["class"][1] if rating_tag and len(rating_tag["class"]) > 1 else ""

    # --- Price ---
    price_excl = soup.find("th", string="Price (excl. tax)").find_next("td").text.strip()
    price_incl = soup.find("th", string="Price (incl. tax)").find_next("td").text.strip()
    tax = soup.find("th", string="Tax").find_next("td").text.strip()

    # --- Stock info (Availability) ---
    stock_text = soup.select_one(".instock.availability").text.strip()
    stock_status = "In stock" if "In stock" in stock_text else "Out of stock"
    # ambil angka jumlah stok
    import re
    stock_available = re.search(r"\d+", stock_text)
    stock_available = int(stock_available.group()) if stock_available else 0

    # --- Code (UPC) ---
    code = soup.find("th", string="UPC").find_next("td").text.strip()

    # --- Description ---
    desc_tag = soup.select_one("#product_description")
    if desc_tag:
        desc_text = desc_tag.find_next_sibling("p").text.strip()
    else:
        desc_text = ""

    # --- Number of Reviews ---
    num_reviews = soup.find("th", string="Number of reviews").find_next("td").text.strip()

    # --- Cover (image URL absolut) ---
    img_tag = soup.select_one("div.item.active img")
    if img_tag:
        cover = urljoin(base_url, img_tag["src"])
    else:
        cover = ""

    # hasilkan dict
    return {
        "category": category,
        "code": code,
        "cover": cover,
        "title": title,
        "rating": rating,
        "price (excl. tax)": price_excl,
        "price (incl. tax)": price_incl,
        "tax": tax,
        "stock status": stock_status,
        "number of stock available": stock_available,
        "description": desc_text,
        "number of reviews": num_reviews,
    }


In [5]:
books = []
TOTAL_PAGES = 50

for page in tqdm(range(1, TOTAL_PAGES + 1), desc="Scraping pages"):
    # halaman pertama: /catalogue/page-1.html
    url = urljoin(BASE_URL, f"catalogue/page-{page}.html")
    r = session.get(url)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    product_links = [urljoin(url, a["href"]) for a in soup.select("h3 a")]

    for link in product_links:
        try:
            res = session.get(link)
            res.raise_for_status()
            data = parse_detail_page(res.text, link)
            books.append(data)
        except Exception as e:
            print(f"Error on {link}: {e}")
        time.sleep(0.1)

df = pd.DataFrame(books)
print("Jumlah buku:", len(df))
df.head(3)


Scraping pages: 100%|██████████| 50/50 [03:44<00:00,  4.49s/it]

Jumlah buku: 1000





Unnamed: 0,category,code,cover,title,rating,price (excl. tax),price (incl. tax),tax,stock status,number of stock available,description,number of reviews
0,Poetry,a897fe39b1053632,https://books.toscrape.com/media/cache/fe/72/f...,A Light in the Attic,Three,Â£51.77,Â£51.77,Â£0.00,In stock,22,It's hard to imagine a world without A Light i...,0
1,Historical Fiction,90fa61229261140a,https://books.toscrape.com/media/cache/08/e9/0...,Tipping the Velvet,One,Â£53.74,Â£53.74,Â£0.00,In stock,20,"""Erotic and absorbing...Written with starling ...",0
2,Fiction,6957f44c3847a760,https://books.toscrape.com/media/cache/ee/cf/e...,Soumission,One,Â£50.10,Â£50.10,Â£0.00,In stock,20,"Dans une France assez proche de la nÃ´tre, un ...",0


In [7]:
df.sample(10)

Unnamed: 0,category,code,cover,title,rating,price (excl. tax),price (incl. tax),tax,stock status,number of stock available,description,number of reviews
945,Sequential Art,2ac720f76384c57e,https://books.toscrape.com/media/cache/03/0a/0...,"Skip Beat!, Vol. 01 (Skip Beat! #1)",Three,Â£42.12,Â£42.12,Â£0.00,In stock,1,"Kyoko always thought that Sho, whose family to...",0
913,Default,9c96cd1329fbd82d,https://books.toscrape.com/media/cache/ca/4a/c...,The Zombie Room,Five,Â£19.69,Â£19.69,Â£0.00,In stock,1,An unlikely bond is forged between three men f...,0
55,Add a comment,4eed62cf5f8d8edf,https://books.toscrape.com/media/cache/c7/26/c...,The Torch Is Passed: A Harding Family Story,One,Â£19.09,Â£19.09,Â£0.00,In stock,16,Andrea Harding is a recent college graduate lo...,0
475,Default,0ab4b35dcffcffd1,https://books.toscrape.com/media/cache/57/a8/5...,I'll Give You the Sun,One,Â£56.48,Â£56.48,Â£0.00,In stock,8,"A brilliant, luminous story of first love, fam...",0
925,Philosophy,5c10b64db3e4f228,https://books.toscrape.com/media/cache/16/f4/1...,The Nicomachean Ethics,One,Â£36.34,Â£36.34,Â£0.00,In stock,1,âOne swallow does not make a summer; neither...,0
202,Christian Fiction,a57b1dcbd6849222,https://books.toscrape.com/media/cache/94/f4/9...,Close to You,Four,Â£49.46,Â£49.46,Â£0.00,In stock,15,A disgraced scholar running from her past and ...,0
136,Romance,623e1a180426039b,https://books.toscrape.com/media/cache/29/3b/2...,The Wedding Dress,One,Â£24.12,Â£24.12,Â£0.00,In stock,15,"Four brides. One Dress.A tale of faith, redemp...",0
281,Fiction,cc82685d9f49bc2c,https://books.toscrape.com/media/cache/2d/a3/2...,My Mrs. Brown,Three,Â£24.48,Â£24.48,Â£0.00,In stock,14,"From William Norwich, the well-known fashion w...",0
409,Young Adult,c65c25d990fc7025,https://books.toscrape.com/media/cache/fc/8c/f...,"Exit, Pursued by a Bear",Four,Â£51.34,Â£51.34,Â£0.00,In stock,11,"âI love you,â Polly says suddenly when Iâ...",0
87,Self Help,9ffa38d3b832433a,https://books.toscrape.com/media/cache/49/e1/4...,Online Marketing for Busy Authors: A Step-By-S...,One,Â£46.35,Â£46.35,Â£0.00,In stock,16,"If You Want People to Read Your Book, Writing ...",0
