# Techniques: HTML parsing, XPath, CSS selectors.
# ■ Libraries: BeautifulSoup, requests.
# ■ Project: Build a scraper to extract data from a static website (e.g.,
# e-commerce product details).

In [65]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os

# Scraped Data Cleaning

In [66]:
def Scrap_page(product_data, products):
    for product in products:
        price = product.find('p', class_="price_color").text.strip()
        name = product.find("h3").find("a")['title']
        stock = product.find("p", class_="instock availability").get_text(strip=True)
        rating_tag = product.find("p", class_="star-rating")
        rating = rating_tag["class"][1] if rating_tag else "No Rating"
        image_src = product.find("div", class_="image_container").find("img")["src"]
        
        product_data.append({
            "Name": name,
            "Rating": rating,
            "Price": price,
            "Images": image_src,
            "Stock": stock
        })

# Scraping multple pages

In [72]:
def scrap_multiple_pages(url):
    number_pages = 20
    product_data = []
    
    for page in range(1, number_pages + 1):
        page_url = f"{url}/catalogue/page-{page}.html"
        response = requests.get(page_url)
        response.raise_for_status() 
        soup = BeautifulSoup(response.text, "html.parser")
        products = soup.find_all("article", class_="product_pod")
        Scrap_page(product_data, products)
    
    return product_data

# Website URL

In [73]:
url = "https://books.toscrape.com/"
ALL_products = scrap_multiple_pages(url)

# Gain Data List

In [74]:
ALL_products

[{'Name': 'A Light in the Attic',
  'Rating': 'Three',
  'Price': 'Â£51.77',
  'Images': '../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg',
  'Stock': 'In stock'},
 {'Name': 'Tipping the Velvet',
  'Rating': 'One',
  'Price': 'Â£53.74',
  'Images': '../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg',
  'Stock': 'In stock'},
 {'Name': 'Soumission',
  'Rating': 'One',
  'Price': 'Â£50.10',
  'Images': '../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg',
  'Stock': 'In stock'},
 {'Name': 'Sharp Objects',
  'Rating': 'Four',
  'Price': 'Â£47.82',
  'Images': '../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg',
  'Stock': 'In stock'},
 {'Name': 'Sapiens: A Brief History of Humankind',
  'Rating': 'Five',
  'Price': 'Â£54.23',
  'Images': '../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg',
  'Stock': 'In stock'},
 {'Name': 'The Requiem Red',
  'Rating': 'One',
  'Price': 'Â£22.65',
  'Images': '../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.j

In [75]:
data = pd.DataFrame(ALL_products)

# Scraped Data

In [76]:
data   

Unnamed: 0,Name,Rating,Price,Images,Stock
0,A Light in the Attic,Three,Â£51.77,../media/cache/2c/da/2cdad67c44b002e7ead0cc356...,In stock
1,Tipping the Velvet,One,Â£53.74,../media/cache/26/0c/260c6ae16bce31c8f8c95dadd...,In stock
2,Soumission,One,Â£50.10,../media/cache/3e/ef/3eef99c9d9adef34639f51066...,In stock
3,Sharp Objects,Four,Â£47.82,../media/cache/32/51/3251cf3a3412f53f339e42cac...,In stock
4,Sapiens: A Brief History of Humankind,Five,Â£54.23,../media/cache/be/a5/bea5697f2534a2f86a3ef27b5...,In stock
...,...,...,...,...,...
395,Take Me Home Tonight (Rock Star Romance #3),Three,Â£53.98,../media/cache/a6/4b/a64b3c559f59748bfdbbe75be...,In stock
396,Sleeping Giants (Themis Files #1),One,Â£48.74,../media/cache/c0/72/c072c1ef144d571abd25fe9cc...,In stock
397,"Setting the World on Fire: The Brief, Astonish...",Two,Â£21.15,../media/cache/8b/c4/8bc43a6b42d0283ab4bf611f1...,In stock
398,Playing with Fire,Three,Â£13.71,../media/cache/90/0b/900bd2e60d56b6480a4e8eb2d...,In stock


# Data Saved in "products.csv" file 

In [82]:
data.to_csv("products.csv", index = False)


In [87]:
os.makedirs("images", exist_ok=True)  
img_url = str(data["Images"].iloc[0])  
if not img_url.startswith("http"):
    img_url = url + img_url 
img_data = requests.get(img_url).content
img_name = os.path.join("images", img_url.split("/")[-1])  

with open(img_name, "wb") as img_file:
    img_file.write(img_data)

print(f"Downloaded {img_name}")


Downloaded images\2cdad67c44b002e7ead0cc35693c0e8b.jpg
