In [1]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root URL of the site being scraped; the functions and script below build
# absolute book, cover and category URLs from it.
base_url = "https://books.toscrape.com/"
# NOTE(review): start_url is not referenced by any code visible in this cell.
start_url = base_url + "catalogue/category/books_1/index.html"

def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    res.raise_for_status()
    # Force UTF-8 decoding of the body regardless of what requests guessed.
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')

def get_book_links(category_url):
    """Collect the links of every book in one category, across all its pages.

    Starting at *category_url*, each listing page is fetched, the ``href`` of
    every ``h3 a`` element is resolved to an absolute URL, and the ``li.next``
    pagination link is followed until there is none.

    Args:
        category_url: Absolute URL of the first listing page of the category.

    Returns:
        A list of absolute book-page URLs in the order they were found.
    """
    links = []
    page_url = category_url
    while page_url is not None:
        soup = get_soup(page_url)
        # Resolve relative hrefs against the page they came from, so any
        # number of leading '../' segments is handled correctly.
        links.extend(urljoin(page_url, anchor['href']) for anchor in soup.select('h3 a'))
        next_page = soup.select_one('li.next a')
        page_url = urljoin(page_url, next_page['href']) if next_page else None
    return links

def get_book_details(url, category):
    """Scrape the details of a single book from its product page.

    Args:
        url: Absolute URL of the book's page.
        category: Category name to record for the book; stored unchanged.

    Returns:
        A dict with the keys ``category``, ``code``, ``cover``, ``title``,
        ``rating``, ``price (excl. tax)``, ``price (incl. tax)``, ``tax``,
        ``stock status``, ``number of stock available``, ``description`` and
        ``number of reviews``. ``number of stock available`` is an int; the
        other scraped values are strings as they appear on the page (or
        ``None`` for table entries that are absent).

    Raises:
        AttributeError: If the page has no ``<h1>`` title element.
    """
    soup = get_soup(url)

    title = soup.h1.text.strip()

    # Product information table: header cell text -> value cell text.
    # Rows lacking either cell are skipped rather than aborting the book.
    table = {
        row.th.text.strip(): row.td.text.strip()
        for row in soup.select('table tr')
        if row.th and row.td
    }
    code = table.get('UPC')
    price_excl = table.get('Price (excl. tax)')
    price_incl = table.get('Price (incl. tax)')
    tax = table.get('Tax')
    stock_text = table.get('Availability', '')
    stock_status = 'In stock' if 'In stock' in stock_text else 'Out of stock'
    # The first run of digits in the availability text is the stock count.
    stock_match = re.search(r'\d+', stock_text)
    num_stock = int(stock_match.group()) if stock_match else 0
    num_reviews = table.get('Number of reviews', '0')

    # The second class token of the star-rating element is the rating word
    # (e.g. 'Two'); fall back to the string 'None' when it is unavailable.
    rating_tag = soup.select_one('p.star-rating')
    rating_classes = rating_tag.get('class', []) if rating_tag else []
    rating = rating_classes[1] if len(rating_classes) > 1 else 'None'

    # Resolve the relative image path against the page URL; a missing image
    # yields an empty cover instead of discarding the whole record.
    cover_tag = soup.select_one('div.item.active img')
    cover_src = cover_tag.get('src') if cover_tag else None
    cover = urljoin(url, cover_src) if cover_src else ''

    # The description is the <p> sibling following the #product_description
    # heading; not every page has one.
    desc = soup.select_one('#product_description ~ p')
    description = desc.text.strip() if desc else ''

    return {
        'category': category,
        'code': code,
        'cover': cover,
        'title': title,
        'rating': rating,
        'price (excl. tax)': price_excl,
        'price (incl. tax)': price_incl,
        'tax': tax,
        'stock status': stock_status,
        'number of stock available': num_stock,
        'description': description,
        'number of reviews': num_reviews,
    }

# --- Start scraping ---
# Read the category links from the sidebar of the site's home page.
soup = get_soup(base_url)
categories = soup.select('.side_categories ul ul li a')

all_books = []
for cat in categories:
    category_name = cat.text.strip()
    category_url = base_url + cat['href']

    print(f"Scraping kategori: {category_name}")
    # A failure while listing one category must not abort the whole run and
    # discard everything collected so far; report it and move on.
    try:
        links = get_book_links(category_url)
    except Exception as e:
        print(f"Gagal memproses {category_url}: {e}")
        continue

    for link in links:
        try:
            details = get_book_details(link, category_name)
        except Exception as e:
            # Best-effort scrape: report the failing book and keep going.
            print(f"Gagal memproses {link}: {e}")
        else:
            all_books.append(details)
            if len(all_books) % 50 == 0:
                print(f"{len(all_books)} buku terkumpul...")
        finally:
            # Pause after every request, including failed ones, so errors do
            # not turn into an unthrottled burst against the server.
            time.sleep(0.2)

    # Stop once at least 1000 books are collected (checked per category).
    if len(all_books) >= 1000:
        break

# Build the dataframe
df = pd.DataFrame(all_books)
print("\nJumlah data:", len(df))
# NOTE(review): `display` is not imported in this cell; presumably it is
# supplied by the notebook environment - confirm before running as a script.
display(df.head())

# Save to CSV
df.to_csv("books_scraped.csv", index=False)
print("File 'books_scraped.csv' berhasil disimpan.")


Scraping kategori: Travel
Scraping kategori: Mystery
Scraping kategori: Historical Fiction
50 buku terkumpul...
Scraping kategori: Sequential Art
100 buku terkumpul...
Scraping kategori: Classics
150 buku terkumpul...
Scraping kategori: Philosophy
Scraping kategori: Romance
200 buku terkumpul...
Scraping kategori: Womens Fiction
Scraping kategori: Fiction
250 buku terkumpul...
Scraping kategori: Childrens
300 buku terkumpul...
Scraping kategori: Religion
Scraping kategori: Nonfiction
350 buku terkumpul...
400 buku terkumpul...
Scraping kategori: Music
450 buku terkumpul...
Scraping kategori: Default
500 buku terkumpul...
550 buku terkumpul...
600 buku terkumpul...
Scraping kategori: Science Fiction
Scraping kategori: Sports and Games
Scraping kategori: Add a comment
650 buku terkumpul...
Scraping kategori: Fantasy
700 buku terkumpul...
Scraping kategori: New Adult
Scraping kategori: Young Adult
750 buku terkumpul...
Scraping kategori: Science
800 buku terkumpul...
Scraping kategori: Po

Unnamed: 0,category,code,cover,title,rating,price (excl. tax),price (incl. tax),tax,stock status,number of stock available,description,number of reviews
0,Travel,a22124811bfa8350,https://books.toscrape.com/media/cache/6d/41/6...,It's Only the Himalayas,Two,£45.17,£45.17,£0.00,In stock,19,"“Wherever you go, whatever you do, just . . . ...",0
1,Travel,ce60436f52c5ee68,https://books.toscrape.com/media/cache/fe/8a/f...,Full Moon over Noah’s Ark: An Odyssey to Mount...,Four,£49.43,£49.43,£0.00,In stock,15,Acclaimed travel writer Rick Antonson sets his...,0
2,Travel,f9705c362f070608,https://books.toscrape.com/media/cache/c7/1a/c...,See America: A Celebration of Our National Par...,Three,£48.87,£48.87,£0.00,In stock,14,To coincide with the 2016 centennial anniversa...,0
3,Travel,1809259a5a5f1d8d,https://books.toscrape.com/media/cache/ca/30/c...,Vagabonding: An Uncommon Guide to the Art of L...,Two,£36.94,£36.94,£0.00,In stock,8,With a new foreword by Tim Ferriss •There’s no...,0
4,Travel,a94350ee74deaa07,https://books.toscrape.com/media/cache/45/21/4...,Under the Tuscan Sun,Three,£37.33,£37.33,£0.00,In stock,7,A CLASSIC FROM THE BESTSELLING AUTHOR OF UNDER...,0


File 'books_scraped.csv' berhasil disimpan.
