<a href="https://colab.research.google.com/github/Moh-aleid/Big-Data/blob/main/TP1_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv

def fetch_webpage(url):
    """Fetch the content of a web page and return the HTML source code"""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

def extract_quotes(html_content):
    """Extract all quotes from the HTML content using BeautifulSoup"""
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all quote containers
    quotes = []

    # For books.toscrape.com, we'll extract book information instead of quotes
    # since the site doesn't have quotes but has book data
    books = soup.find_all('article', class_='product_pod')

    for book in books:
        title = book.h3.a['title']
        price = book.find('p', class_='price_color').text
        stock = book.find('p', class_='instock availability').text.strip()

        quotes.append({
            'title': title,
            'price': price,
            'stock': stock
        })

    return quotes

def save_to_csv(data, filename='books_data.csv'):
    """Save the extracted data to a CSV file using pandas"""
    if not data:
        print("No data to save")
        return

    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

# Set the URL to scrape
url = "https://books.toscrape.com"

# Question 1: Fetch and display HTML source code
print("Fetching webpage content...")
html_content = fetch_webpage(url)

if html_content:
    print("\n--- HTML Source Code (first 500 characters) ---")
    print(html_content[:500] + "...")

    # Question 2: Extract information using BeautifulSoup
    print("\n--- Extracting book information ---")
    books_data = extract_quotes(html_content)

    # Display some of the extracted data
    print(f"\nFound {len(books_data)} books")
    if books_data:
        print("\nFirst 3 books:")
        for i, book in enumerate(books_data[:3], 1):
            print(f"{i}. Title: {book['title']}")
            print(f"   Price: {book['price']}")
            print(f"   Stock: {book['stock']}")

    # Question 3: Save to CSV using pandas
    print("\n--- Saving data to CSV ---")
    save_to_csv(books_data)

Fetching webpage content...

--- HTML Source Code (first 500 characters) ---
<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->
    <head>
        <title>
    All products | Books to Scrape - Sandbox
</title>

        <meta http-equiv="content-type" content="text/html; charset=UTF-8" /...

--- Extracting book information ---

Found 20 books

First 3 books:
1. Title: A Light in the Attic
   Price: Â£51.77
   Stock: In stock
2. Title: Tipping the Velvet
   Price: Â£53.74
   Stock: In stock
3. Title: Soumission
   Price: Â£50.10
   Stock: In stock

--- Saving data to CSV ---
Data saved to books_data.csv
