# Part 1: Access the main website

## Basic import

In [99]:
import requests as rq
import pandas as pd
from scrapy import Selector
import re
import os

In [100]:
# Lookup table from an English number word ("zero" .. "ten") to the same
# number written as a digit string. The position of each word in the tuple
# is the number it stands for.
number_words = {
    word: str(value)
    for value, word in enumerate(
        (
            "zero",
            "one",
            "two",
            "three",
            "four",
            "five",
            "six",
            "seven",
            "eight",
            "nine",
            "ten",
        )
    )
}

## Access the website

In [101]:
# Root URL of the site; the helper functions of this notebook prefix it to
# the relative paths they build.
main_url = "https://books.toscrape.com/"

# The timeout keeps the notebook from hanging forever if the site stops
# answering (requests waits indefinitely by default).
main_response = rq.get(main_url, timeout=10)

# Bare expression: displays the response as the output of the notebook cell.
main_response

<Response [200]>

## Get the list of categories

In [None]:
# Parse the home page downloaded above.
main_selector = Selector(text=main_response.text)

# Raw href of every link nested under "ul.nav-list ul li".
categorie_url_list_crude = main_selector.css(
    "ul.nav-list ul li a::attr(href)"
).getall()

# Remove the "index.html" file name from each href so that only the
# category folder path is left.
categorie_url_list = [
    crude_href.replace("index.html", "")
    for crude_href in categorie_url_list_crude
]

## Access each category's links and books

In [None]:
def book_linker(link: str, timeout: float = 10) -> list:
    """Return the book links found on a listing page.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The ``href`` value of every ``article.product_pod h3 a`` element,
        exactly as written in the page (no URL resolution is done here).
    """
    response = rq.get(link, timeout=timeout)
    selector = Selector(text=response.text)

    return selector.css("article.product_pod h3 a::attr(href)").getall()

# test_list = book_linker("https://books.toscrape.com/catalogue/category/books/travel_2/index.html")

In [104]:
def book_link_access(book_link, url_categorie):
    """Build the absolute URL of a book from a link found on a category page.

    Args:
        book_link: href of the book as found in the category page
            (presumably relative, e.g. starting with "../" -- verify
            against the pages being scraped).
        url_categorie: Path of the category folder relative to ``main_url``;
            expected to end with "/".

    Returns:
        The absolute URL of the book page, with any "." / ".." path
        segments resolved.
    """
    from urllib.parse import urljoin

    # Resolve the link against the category folder instead of gluing the
    # strings together, so "../" segments are collapsed rather than left
    # inside the resulting URL.
    return urljoin(f"{main_url}{url_categorie}", book_link)


# for link in test_list:
#    book_link_access(link)

In [105]:
def image_link_access(link):
    """Turn a relative image path into an absolute URL under ``main_url``.

    Every leading "../" segment of ``link`` is removed and what remains is
    appended to ``main_url``.
    """
    path_from_root = re.sub(r'^(\.\./)+', '', link)
    return f"{main_url}{path_from_root}"

# print(clean_link("../../../1000-places-to-see-before-you-die_1/index.html"))

In [106]:
def download_image(link, categorie, upc, timeout=10):
    """Download an image to ``output/image/<categorie>/<upc>.jpg``.

    The destination folder is created when it does not exist yet.

    Args:
        link: URL of the image to download.
        categorie: Name of the sub-folder of ``output/image``.
        upc: File name to use, without the ``.jpg`` extension.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    img_folder = f"output/image/{categorie}"
    os.makedirs(img_folder, exist_ok=True)

    img_filename = f"{upc}.jpg"
    img_path = os.path.join(img_folder, img_filename)

    # A streamed response keeps its connection open until it is closed; the
    # context manager releases it even when writing the file fails.
    with rq.get(link, stream=True, timeout=timeout) as img_data:
        # Fail loudly rather than saving an error page under a .jpg name.
        img_data.raise_for_status()
        with open(img_path, "wb") as f:
            for chunk in img_data.iter_content(1024):
                f.write(chunk)

In [107]:

def book_scrapper(book_link: str, timeout: float = 10) -> dict:
    """Scrape one book page, download its image and return the book details.

    Args:
        book_link: URL of the book page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``titre``, ``prix``, ``disponibilite``,
        ``note``, ``url``, ``url_image``, ``upc`` and ``categorie``.
    """
    response = rq.get(book_link, timeout=timeout)
    # A stray "Â" in front of "£" is what UTF-8 text looks like once decoded
    # as Latin-1, so decode the page explicitly as UTF-8. This also keeps
    # non-ASCII characters in titles intact.
    response.encoding = "utf-8"
    selector = Selector(text=response.text)

    # Title
    title = selector.css(".product_main h1::text").get()
    # Price without the currency sign (the "Â" removal is kept as a harmless
    # guard against mis-decoded input)
    price = selector.css(".product_main .price_color::text").get().replace("Â","").replace("£","")
    # Availability: the second text node of the element is used
    availability = selector.css(".product_main .availability::text").getall()[1].strip()
    # Rating: the class attribute holds "star-rating" plus a number word,
    # which is converted to a digit string through number_words
    note_class = selector.css(".product_main .star-rating").attrib.get("class", "")
    note = note_class.replace("star-rating", "").strip()
    note = number_words.get(note.lower())
    # Image URL (as written in the page; made absolute below)
    url_image = selector.css(".carousel-inner img::attr(src)").get()
    # UPC: first cell of the first row of the striped table
    upc = selector.css("table.table.table-striped tr:nth-child(1) td::text").get()
    # Category: text of the third breadcrumb link
    categorie = selector.css("ul.breadcrumb li a::text").getall()[2]

    # Gather everything in one dictionary
    book_detail = {
        "titre": title,
        "prix": price,
        "disponibilite": availability,
        "note": note,
        "url": book_link,
        "url_image": image_link_access(url_image),
        "upc": upc,
        "categorie": categorie,
    }

    # Download the image
    download_image(book_detail["url_image"],book_detail["categorie"],book_detail["upc"])

    return book_detail

# book_scrapper("https://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html")


In [111]:
def extract_category_from_url(url):
    """Return the category name contained in a category URL.

    The second-to-last "/"-separated piece of ``url`` is taken and cut at
    its first underscore, e.g. ".../travel_2/" gives "travel".
    """
    folder = url.split("/")[-2]
    name, _, _ = folder.partition("_")
    return name

In [118]:
def export_to_csv(data_list, category_url):
    """Write the given records to ``output/csv/category_<name>.csv``.

    ``<name>`` is obtained from ``category_url`` through
    ``extract_category_from_url``. The output folder is created if needed.
    The file is written without the index column, with ";" as separator and
    the "utf-8-sig" encoding.
    """
    name = extract_category_from_url(category_url)

    csv_dir = "output/csv"
    os.makedirs(csv_dir, exist_ok=True)
    csv_path = os.path.join(csv_dir, f"category_{name}.csv")

    pd.DataFrame(data_list).to_csv(
        csv_path,
        index=False,
        sep=";",
        encoding="utf-8-sig",
    )

# export_to_csv(book_scrapper("https://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html"))

In [None]:
# Crawl every category: walk through its listing pages, scrape each book
# and export the whole category to one CSV file.
for url_categorie in categorie_url_list:
    page_number = 1
    has_next = True

    # One list of book dicts per listing page of the current category
    all_category_books = []
    while has_next:

        # The first listing page is index.html, the following ones page-N.html
        if page_number == 1:
            page_url = f"{main_url}{url_categorie}index.html"
        else:
            page_url = f"{main_url}{url_categorie}page-{page_number}.html"

        # Get the book links of the page
        book_link_list = book_linker(page_url)

        # Turn them into full URLs
        book_link_list_clean = [
            book_link_access(book_link, url_categorie)
            for book_link in book_link_list
        ]

        # Scrape the books
        book_info_list = [
            book_scrapper(book_link_clean)
            for book_link_clean in book_link_list_clean
        ]

        # Download the listing page again to look for a "next" link
        # (book_linker only returns the links, not the parsed page).
        # The timeout keeps a stalled connection from blocking the crawl.
        page_response = rq.get(page_url, timeout=10)
        page_selector = Selector(text=page_response.text)

        all_category_books.append(book_info_list)

        # Keep going while the page advertises a following page
        if page_selector.css("li.next a::text").get() == "next":
            page_number += 1
        else:
            has_next = False

    # Flatten the per-page lists into one list for the whole category
    flat_books = [book for page_books in all_category_books for book in page_books]

    export_to_csv(flat_books, url_categorie)
    