In [58]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [1]:
BASE_URL = "https://books.toscrape.com/catalogue/"
PAGE_URL = "https://books.toscrape.com/catalogue/page-{}.html"
HEADERS = {"User-Agent": "Mozilla/5.0"} # just to prevent the website from blocking  
RATING_MAP = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}

def extract_text(element):
    return element.text.strip() if element else "X"

In [60]:
def scrape_book(book_element, session):
    try:
        title = extract_text(book_element.find("h3").a)
        price = extract_text(book_element.find("p", class_="price_color")).replace("£", "").strip()
        availability = extract_text(book_element.find("p", class_="instock availability"))
        rating = RATING_MAP.get(book_element.find("p", class_="star-rating")["class"][1], "X")  # Extract rating as a number
        relative_url = book_element.h3.a["href"].replace("../../../", "")
        book_url = BASE_URL + relative_url

        book_page = session.get(book_url)
        book_soup = BeautifulSoup(book_page.content, "html.parser")
        genre_items = book_soup.find("ul", class_="breadcrumb").find_all("li")
        genre = extract_text(genre_items[-2]) if len(genre_items) >= 3 else "X"

        book_data = {
            "title": title,           
            "genre": genre,           
            "price": price,           
            "availability": availability,  
            "rating": rating,         
            "url": book_url           
        }

        
        print(book_data, flush=True)

        return book_data
        
    except Exception as e:
        print(f"Error scraping book: {e}", flush=True)
        return None


In [57]:
def scrape_books_lab2_format():
    session = requests.Session()
    session.headers.update(HEADERS)
    books_data = []

    for page_num in range(1, 51): 
        response = session.get(PAGE_URL.format(page_num))
        if response.status_code != 200:
            print("No more pages found. Stopping scrape.", flush=True)
            break

        soup = BeautifulSoup(response.content, "html.parser")
        book_elements = soup.find_all("article", class_="product_pod")

        if not book_elements:
            print("No more books found. Stopping scrape.", flush=True)
            break

        for book in book_elements:
            book_data = scrape_book(book, session)
            if book_data:
                books_data.append(book_data)

        time.sleep(0.1) 

    return books_data


books= scrape_books_lab2_format()
df = pd.DataFrame(books)


csv_file_path = "a1_books_816031011.csv"
df.to_csv(csv_file_path, index=False, encoding="utf-8")

print(f"Data successfully saved to {csv_file_path}", flush=True)

{'title': 'A Light in the ...', 'genre': 'Poetry', 'price': '51.77', 'availability': 'In stock', 'rating': 3, 'url': 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'}
{'title': 'Tipping the Velvet', 'genre': 'Historical Fiction', 'price': '53.74', 'availability': 'In stock', 'rating': 1, 'url': 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'}
{'title': 'Soumission', 'genre': 'Fiction', 'price': '50.10', 'availability': 'In stock', 'rating': 1, 'url': 'https://books.toscrape.com/catalogue/soumission_998/index.html'}
{'title': 'Sharp Objects', 'genre': 'Mystery', 'price': '47.82', 'availability': 'In stock', 'rating': 4, 'url': 'https://books.toscrape.com/catalogue/sharp-objects_997/index.html'}
{'title': 'Sapiens: A Brief History ...', 'genre': 'History', 'price': '54.23', 'availability': 'In stock', 'rating': 5, 'url': 'https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html'}
{'title': 'The Requiem Re

In [None]:
import os  # Os import to check if the file exsits 

CSV_FILE_PATH = "a1_books_816031011.csv"

#Checks to see if the file actaully exsists first
if not os.path.exists(CSV_FILE_PATH):
    print(f"Error: File '{CSV_FILE_PATH}' not found. Please run the scraping cell first.")
else:
    print("\n Loading and preprocessing data...")
    
    df = pd.read_csv(CSV_FILE_PATH)

    df.fillna("X", inplace=True)  

    
    df["price"] = df["price"].apply(lambda x: float(x) if str(x).replace('.', '', 1).isdigit() else "X")
    #This just ensures that price values are properly converted to float where possible
    #If a value happens to be non-numeric, it's just replaced with an "X"
    
    df["rating"] = df["rating"].apply(lambda x: int(x) if str(x).isdigit() else "X")
    # Would convert ratings to integer format

    # Availability values are extracted and converted into a numeric format
    def extract_availability(value):
        if isinstance(value, str):
            if "(" in value:  
                num = ''.join([c for c in value if c.isdigit()])  
                return int(num) if num else 1  
            elif "In stock" in value:
                return 1  
            elif "Out of stock" in value:
                return 0  
        return "X"  

    df["availability"] = df["availability"].apply(extract_availability)


Since I encountered issues processing the file directly, I decided to first verify its existence before proceeding with any data operations. 
This would ensure that the dataset is successfully saved before attempting to load and preprocess it.

In [67]:
    print("\n**Preprocessing Decisions:**")
    print("- Missing values remain represented as 'X'.")
    print("- Price has been converted to a float where possible, otherwise remains 'X'.")
    print("- Rating has been converted to an integer where possible, otherwise remains 'X'.")
    print("- Availability has been extracted and converted to an integer (actual number if specified, 1 if available but no quantity is mentioned, 0 if out of stock, 'X' if missing).")
     
    numeric_df = df[df["price"] != "X"].copy()  
    numeric_df = numeric_df[numeric_df["rating"] != "X"]
    numeric_df = numeric_df[numeric_df["availability"] != "X"]

    print("\nDataset Information:")
    print(df.info())  

    if not numeric_df.empty:
        print("\nSummary Statistics (Numerical Fields):")
        print(numeric_df[["price", "availability", "rating"]].astype(float).describe()) 

    print("\nMissing Values:")
    print(df.isin(["X"]).sum()) 

    print("Data preprocessing and summary statistics completed.")


**Preprocessing Decisions:**
- Missing values remain represented as 'X'.
- Price has been converted to a float where possible, otherwise remains 'X'.
- Rating has been converted to an integer where possible, otherwise remains 'X'.
- Availability has been extracted and converted to an integer (actual number if specified, 1 if available but no quantity is mentioned, 0 if out of stock, 'X' if missing).

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   genre         1000 non-null   object 
 2   price         1000 non-null   float64
 3   availability  1000 non-null   int64  
 4   rating        1000 non-null   int64  
 5   url           1000 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 47.0+ KB
None

Summary Statistics (Numerical Fields):
            price