In [3]:
%pip install psycopg2 requests beautifulsoup4

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine

# -----------------------
# PostgreSQL configuration
# -----------------------
# NOTE(review): the password is hard-coded in this URI; prefer loading it from
# the environment before sharing the notebook.
DB_URI = "postgresql+psycopg2://postgres:admin@localhost:5432/booksdb"

# Imported here because this cell's import block does not provide urljoin.
from urllib.parse import urljoin

base_url = "https://books.toscrape.com/catalogue/page-{}.html"
request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever
livres = []

# Second CSS class of the star-rating <p> -> numeric score.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

# ======= SCRAPING =======
for page in range(1, 51):
    url = base_url.format(page)
    response = requests.get(url, timeout=request_timeout)
    # Fail loudly instead of silently parsing an error page into zero books.
    response.raise_for_status()
    # Decode as UTF-8 up front. Decoding as latin-1 and re-encoding afterwards
    # with errors="ignore" silently dropped characters and forced the price
    # string to be patched by hand.
    response.encoding = "utf-8"

    soup = BeautifulSoup(response.text, "html.parser")

    for article in soup.find_all("article", class_="product_pod"):
        titre = article.h3.a["title"]

        # Keep only digits and the decimal point, whatever the currency prefix.
        prix_str = article.find("p", class_="price_color").text.strip()
        prix = float("".join(ch for ch in prix_str if ch in "0123456789."))

        stock = article.find("p", class_="instock availability").text.strip()

        # Resolve the image path against the page URL: a plain
        # .replace("../../", ...) leaves the URL relative whenever the src
        # does not start with exactly that prefix.
        img_url = urljoin(url, article.find("img")["src"])

        rating_class = article.find("p", class_="star-rating")["class"][1]
        rating = rating_map.get(rating_class, 0)  # 0 = unknown rating word

        livres.append([titre, prix, stock, img_url, rating])

# Save to CSV as UTF-8
csv_file = "livres.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Titre", "Prix", "Stock", "Image URL", "Rating"])
    writer.writerows(livres)

print("‚úÖ Donn√©es sauvegard√©es dans livres.csv")

# ======= LOAD CSV & STORE IN DB =======
try:
    # Read the CSV back
    df = pd.read_csv(csv_file, encoding="utf-8")

    # PostgreSQL connection through SQLAlchemy
    engine = create_engine(DB_URI)

    # Write into the "livres" table (any existing table is replaced)
    df.to_sql("livres", engine, if_exists="replace", index=False)

    print("‚úÖ Donn√©es sauvegard√©es dans PostgreSQL avec to_sql()")
except Exception as e:
    # Best effort: report the database problem without failing the cell.
    print("‚ùå Erreur lors de la sauvegarde dans PostgreSQL :", e)


‚úÖ Donn√©es sauvegard√©es dans livres.csv
‚úÖ Donn√©es sauvegard√©es dans PostgreSQL avec to_sql()


In [18]:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import urljoin

# -----------------------
# PostgreSQL configuration
# -----------------------
# NOTE(review): the password is hard-coded in this URI; prefer loading it from
# the environment before sharing the notebook.
DB_URI = "postgresql+psycopg2://postgres:admin@localhost:5432/booksdb"

base_url = "https://books.toscrape.com/catalogue/page-{}.html"
site_base = "https://books.toscrape.com/"
request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever
livres = []

# Second CSS class of the star-rating <p> -> numeric score.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

# ======= SCRAPING =======
# One Session reuses the HTTP connection across the listing pages and the
# per-book detail pages instead of reconnecting for every request.
with requests.Session() as session:
    for page in range(1, 51):
        url = base_url.format(page)
        response = session.get(url, timeout=request_timeout)
        # Fail loudly instead of silently parsing an error page into zero books.
        response.raise_for_status()
        # Decode as UTF-8 up front rather than decoding as latin-1 and
        # repairing each field afterwards with a lossy errors="ignore" pass.
        response.encoding = "utf-8"

        soup = BeautifulSoup(response.text, "html.parser")

        for article in soup.find_all("article", class_="product_pod"):
            titre = article.h3.a["title"]

            # Keep only digits and the decimal point, whatever the currency prefix.
            prix_str = article.find("p", class_="price_color").text.strip()
            prix = float("".join(ch for ch in prix_str if ch in "0123456789."))

            stock = article.find("p", class_="instock availability").text.strip()

            img_url = urljoin(site_base, article.find("img")["src"])  # absolute URL

            rating_class = article.find("p", class_="star-rating")["class"][1]
            rating = rating_map.get(rating_class, 0)  # 0 = unknown rating word

            # --- Fetch the description from the detail page ---
            detail_url = urljoin(site_base + "catalogue/", article.h3.a["href"])
            detail_resp = session.get(detail_url, timeout=request_timeout)
            detail_resp.raise_for_status()
            # Previously this text was decoded as latin-1 and never repaired,
            # so descriptions were stored as mojibake.
            detail_resp.encoding = "utf-8"
            detail_soup = BeautifulSoup(detail_resp.text, "html.parser")

            # The description is the <p> following div#product_description;
            # either element may be missing, in which case it stays empty.
            desc_tag = detail_soup.find("div", id="product_description")
            desc_paragraph = desc_tag.find_next_sibling("p") if desc_tag is not None else None
            description = desc_paragraph.text.strip() if desc_paragraph is not None else ""

            livres.append([titre, prix, stock, img_url, rating, description])

# Save to CSV as UTF-8
csv_file = "livres.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Titre", "Prix", "Stock", "Image URL", "Rating", "Description"])
    writer.writerows(livres)

print("‚úÖ Donn√©es sauvegard√©es dans livres.csv")

# ======= LOAD CSV & STORE IN DB =======
try:
    # Read the CSV back
    df = pd.read_csv(csv_file, encoding="utf-8")

    # PostgreSQL connection through SQLAlchemy
    engine = create_engine(DB_URI)

    # Write into the "livres" table (any existing table is replaced)
    df.to_sql("livres", engine, if_exists="replace", index=False)

    print("‚úÖ Donn√©es sauvegard√©es dans PostgreSQL avec to_sql()")
except Exception as e:
    # Best effort: report the database problem without failing the cell.
    print("‚ùå Erreur lors de la sauvegarde dans PostgreSQL :", e)


‚úÖ Donn√©es sauvegard√©es dans livres.csv
‚úÖ Donn√©es sauvegard√©es dans PostgreSQL avec to_sql()


In [22]:
import csv
import pandas as pd
import numpy as np
import re
import time
import joblib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from sqlalchemy import create_engine, Column, Integer, String, Float, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import urljoin

# -----------------------
# Settings
# -----------------------
# NOTE(review): DB_URI embeds the database password in clear text.
BASE_URL = "https://books.toscrape.com"
CSV_FILE = "livres_bruts.csv"
DB_URI = "postgresql+psycopg2://postgres:admin@localhost:5432/booksdb"

# Star-rating class word -> numeric score (1 to 5)
RATING_MAP = dict(zip(("One", "Two", "Three", "Four", "Five"), range(1, 6)))

# SQLAlchemy configuration: declarative base shared by the ORM models below
Base = declarative_base()

class Livre(Base):
    """ORM model for one scraped book, mapped to the ``livres`` table.

    NOTE(review): ``save_to_database`` in this cell writes the same table with
    ``DataFrame.to_sql(..., if_exists='replace')``; that presumably recreates
    the table from the DataFrame, so the ``id`` key and column types declared
    here may not survive a run - confirm.
    """
    __tablename__ = 'livres'
    
    id = Column(Integer, primary_key=True, autoincrement=True)  # surrogate primary key
    titre = Column(String(500), nullable=False)  # book title (required)
    description = Column(Text)  # cleaned product description
    prix = Column(Float)  # price as a plain number, currency symbol removed
    disponibilite = Column(Integer)  # stock count (see extract_stock_number)
    image_url = Column(String(500))  # cover image URL
    note = Column(Integer)  # star rating from RATING_MAP; the scraper uses 0 when unknown

def setup_driver():
    """Build a headless Chrome WebDriver for the scraper.

    Returns:
        A ``webdriver.Chrome`` instance started with the flags below and a
        10-second implicit wait for element lookups.
    """
    opts = Options()
    for flag in (
        "--headless",               # no visible browser window
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    ):
        opts.add_argument(flag)

    browser = webdriver.Chrome(options=opts)
    browser.implicitly_wait(10)
    return browser

def extract_stock_number(stock_text):
    """Turn an availability label into a stock count.

    Args:
        stock_text: Availability text such as ``"In stock (22 available)"``;
            may be empty or ``None``.

    Returns:
        The first integer found in the text; otherwise 1 when the text
        contains "in stock" (case-insensitive); otherwise 0.
    """
    if not stock_text:
        return 0

    # The first run of digits anywhere in the label is taken as the count.
    found = re.search(r'(\d+)', stock_text)
    if found is not None:
        return int(found.group(1))

    # "In stock" without a number counts as a single copy.
    return 1 if "in stock" in stock_text.lower() else 0

def clean_description(description):
    """Normalise a scraped book description.

    Steps, in order: collapse whitespace runs to single spaces, drop every
    character that is not a word character, whitespace or one of
    ``. , ! ? ; : ' -``, trim the ends, then remove a trailing "...more"
    marker together with the whitespace in front of it.

    Args:
        description: Raw description text; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` for empty/``None`` input.
    """
    if not description:
        return ""

    # Collapse runs of whitespace (including newlines) into single spaces.
    text = re.sub(r'\s+', ' ', description)

    # Remove unwanted special characters.
    text = re.sub(r'[^\w\s.,!?;:\'-]', '', text)

    # Trim BEFORE matching the end-anchored marker: with a trailing space the
    # "$" anchor never matched and "...more" was left in place.
    text = text.strip()

    # Remove the trailing "...more" marker and any whitespace preceding it.
    text = re.sub(r'\s*\.\.\.more$', '', text)

    return text.strip()

def scrape_book_details(driver, book_url):
    """Load one product page and collect its fields.

    Args:
        driver: Selenium WebDriver used to load the page.
        book_url: URL of the book's detail page.

    Returns:
        A dict with keys ``titre``, ``description``, ``prix``,
        ``disponibilite``, ``image_url`` and ``note``, or ``None`` when any
        step fails (the error is printed, not raised).
    """
    try:
        driver.get(book_url)

        # Block until the <h1> heading is present (10 seconds at most).
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        titre = driver.find_element(By.TAG_NAME, "h1").text

        # The description is the <p> right after #product_description; when it
        # is absent the title is used in its place.
        try:
            description = driver.find_element(
                By.CSS_SELECTOR, "#product_description + p"
            ).text
        except NoSuchElementException:
            description = titre

        # Price: strip everything except digits and the decimal point.
        raw_price = driver.find_element(By.CSS_SELECTOR, ".price_color").text
        prix = float(re.sub(r'[^\d.]', '', raw_price))

        # Availability label -> integer stock count.
        disponibilite = extract_stock_number(
            driver.find_element(By.CSS_SELECTOR, ".instock").text
        )

        # Cover image, resolved against the site root.
        cover = driver.find_element(By.CSS_SELECTOR, "#product_gallery img")
        image_url = urljoin(BASE_URL, cover.get_attribute("src"))

        # Rating: first class word known to RATING_MAP, 0 when none matches
        # or when the element is missing.
        try:
            css_classes = driver.find_element(
                By.CSS_SELECTOR, ".star-rating"
            ).get_attribute("class")
            note = next(
                (RATING_MAP[token] for token in css_classes.split() if token in RATING_MAP),
                0,
            )
        except NoSuchElementException:
            note = 0

        return {
            "titre": titre,
            "description": clean_description(description),
            "prix": prix,
            "disponibilite": disponibilite,
            "image_url": image_url,
            "note": note,
        }

    except Exception as exc:
        # Best effort: one broken page must not abort the whole crawl.
        print(f"Erreur lors du scraping de {book_url}: {exc}")
        return None

def scrape_all_books():
    """Crawl the catalogue pages and scrape every book found on them.

    Visits ``/catalogue/page-N.html`` for N = 1..50, stopping early at the
    first page where no ``.product_pod`` element appears within 5 seconds.
    The driver is always closed, even when an error escapes.

    Returns:
        A list of dicts, one per book successfully scraped by
        ``scrape_book_details``.
    """
    driver = setup_driver()
    livres = []

    try:
        print("D√©but du scraping...")

        # The upper bound of 50 pages guards against an endless crawl.
        for page in range(1, 51):
            print(f"Scraping page {page}...")
            driver.get(f"{BASE_URL}/catalogue/page-{page}.html")

            # No product card within the timeout is treated as "page missing".
            try:
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".product_pod"))
                )
            except TimeoutException:
                print(f"Page {page} non trouv√©e. Arr√™t du scraping.")
                break

            # Collect all hrefs first: navigating away invalidates the elements.
            anchors = driver.find_elements(By.CSS_SELECTOR, ".product_pod h3 a")
            book_links = [
                href
                for href in (anchor.get_attribute("href") for anchor in anchors)
                if href
            ]

            print(f"Trouv√© {len(book_links)} livres sur la page {page}")

            total = len(book_links)
            for position, book_url in enumerate(book_links, start=1):
                print(f"  Scraping livre {position}/{total}...")
                details = scrape_book_details(driver, book_url)
                if details:
                    livres.append(details)

                # Short pause so the server is not hammered.
                time.sleep(0.5)

    finally:
        driver.quit()

    print(f"Scraping termin√©. {len(livres)} livres r√©cup√©r√©s.")
    return livres

def save_to_csv(livres, filename):
    """Write the scraped records to a UTF-8 CSV file.

    Args:
        livres: Records accepted by ``pd.DataFrame`` (e.g. a list of dicts or
            an existing DataFrame).
        filename: Destination path of the CSV file.

    Returns:
        The DataFrame built from ``livres``.
    """
    frame = pd.DataFrame(livres)
    # The index is omitted so the file holds only the data columns.
    frame.to_csv(filename, encoding='utf-8', index=False)
    print(f"‚úÖ Donn√©es sauvegard√©es dans {filename}")
    return frame

def create_database_table(engine):
    """Create the tables declared on ``Base`` using the given engine.

    Args:
        engine: SQLAlchemy engine connected to the target database.
    """
    schema = Base.metadata
    schema.create_all(engine)
    print("‚úÖ Table 'livres' cr√©√©e dans la base de donn√©es")

def save_to_database(df, engine):
    """Write the DataFrame into the ``livres`` table, replacing any existing one.

    Errors are reported on stdout and not re-raised.

    Args:
        df: DataFrame holding the book records.
        engine: Database connection/engine accepted by ``DataFrame.to_sql``.
    """
    try:
        # if_exists='replace' overwrites a table left by a previous run.
        df.to_sql(name='livres', con=engine, if_exists='replace', index=False)
        print("‚úÖ Donn√©es sauvegard√©es dans PostgreSQL")
    except Exception as exc:
        print(f"‚ùå Erreur lors de la sauvegarde dans PostgreSQL: {exc}")

def preprocess_data(df):
    """Prepare the scraped records for modelling.

    Missing or empty descriptions are replaced by the book title. Price,
    availability and rating need no work here: the scraper already delivers
    them as numbers.

    Args:
        df: DataFrame with at least the ``titre`` and ``description`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    print("Pr√©traitement des donn√©es...")

    # Work on a copy so the caller's frame is not silently modified and
    # re-running the step on the raw data stays possible.
    df = df.copy()

    # Normalise missing descriptions to the empty string ...
    df['description'] = df['description'].fillna('')

    # ... then fall back to the title wherever the description is empty.
    # (No isna() test is needed after the fillna above.)
    mask = df['description'] == ''
    df.loc[mask, 'description'] = df.loc[mask, 'titre']

    print("‚úÖ Pr√©traitement termin√©")
    return df

def create_recommendation_model(df):
    """Build and persist a content-based recommendation model.

    Descriptions are vectorised with TF-IDF (English stop words, unigrams and
    bigrams, at most 5000 features) and compared pairwise with cosine
    similarity. The result is also dumped to ``recommendation_model.pkl``.

    Args:
        df: DataFrame with the ``titre`` and ``description`` columns.

    Returns:
        A dict with keys ``vectorizer``, ``tfidf_matrix``,
        ``similarity_matrix``, ``titles`` (row order of ``df``) and
        ``indices`` (title -> row position in the similarity matrix).
    """
    print("Cr√©ation du mod√®le de recommandation...")

    # Text input for TF-IDF: no missing values, everything as str.
    descriptions = df['description'].fillna('').astype(str)

    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        lowercase=True,
        ngram_range=(1, 2)
    )

    tfidf_matrix = vectorizer.fit_transform(descriptions)

    # Square matrix: entry [i, j] is the similarity of rows i and j.
    similarity_matrix = cosine_similarity(tfidf_matrix)

    titles = df['titre'].tolist()

    model_data = {
        'vectorizer': vectorizer,
        'tfidf_matrix': tfidf_matrix,
        'similarity_matrix': similarity_matrix,
        'titles': titles,
        # Map each title to its row POSITION, not to its index label: the
        # similarity matrix is positional, so labels from a filtered or
        # re-indexed frame would point at the wrong rows. As before, the last
        # occurrence wins when a title appears more than once.
        'indices': {title: position for position, title in enumerate(titles)}
    }

    joblib.dump(model_data, 'recommendation_model.pkl')
    print("‚úÖ Mod√®le de recommandation sauvegard√© dans 'recommendation_model.pkl'")

    return model_data

def get_recommendations(titre, model_data, n_recommendations=5):
    """Return the titles most similar to the given book.

    Args:
        titre: Title of the reference book.
        model_data: Dict with the keys ``indices`` (title -> row position),
            ``similarity_matrix`` and ``titles``.
        n_recommendations: Maximum number of titles to return; values below
            zero are treated as zero.

    Returns:
        A list of titles ordered by decreasing similarity, never containing
        the reference row itself. When ``titre`` is unknown, a message string
        is returned instead (kept for backward compatibility).
    """
    # Only the title lookup is guarded: a malformed model should raise rather
    # than be reported as "book not found".
    try:
        idx = model_data['indices'][titre]
    except KeyError:
        return f"Livre '{titre}' non trouv√© dans la base de donn√©es"

    # Exclude the book by position. Dropping "the first sorted entry" is wrong
    # when another row ties at the top score (e.g. an identical description):
    # the stable sort can put that row first and leave the book in the list.
    candidates = [
        (position, score)
        for position, score in enumerate(model_data['similarity_matrix'][idx])
        if position != idx
    ]

    # Highest similarity first; ties keep their row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    limit = max(n_recommendations, 0)
    return [model_data['titles'][position] for position, _ in candidates[:limit]]

def main():
    """Run the whole pipeline: scrape, preprocess, persist, model, demo.

    Stops right after scraping when no book was retrieved. A database failure
    is printed and does not prevent the model from being built.
    """
    print("=== SCRAPING BOOKS TO SCRAPE AVEC SELENIUM ===\n")

    # 1. Scrape the site.
    livres = scrape_all_books()
    if not livres:
        print("‚ùå Aucune donn√©e r√©cup√©r√©e. Arr√™t du programme.")
        return

    # 2-3. Build the DataFrame and preprocess it.
    df = preprocess_data(pd.DataFrame(livres))

    # 4. Save to CSV.
    save_to_csv(df, CSV_FILE)

    # 5. Save to the database (best effort).
    try:
        engine = create_engine(DB_URI)
        create_database_table(engine)
        save_to_database(df, engine)
    except Exception as db_error:
        print(f"‚ùå Erreur avec la base de donn√©es: {db_error}")

    # 6. Build the recommendation model.
    model_data = create_recommendation_model(df)

    # 7. Show sample recommendations for the first book.
    if len(df):
        premier_livre = df.iloc[0]['titre']
        suggestions = get_recommendations(premier_livre, model_data)
        print(f"\nüìö Recommandations pour '{premier_livre}':")
        for rank, suggestion in enumerate(suggestions, start=1):
            print(f"  {rank}. {suggestion}")

    print("\n‚úÖ Processus termin√© avec succ√®s!")

if __name__ == "__main__":
    main()

  Base = declarative_base()


=== SCRAPING BOOKS TO SCRAPE AVEC SELENIUM ===

D√©but du scraping...
Scraping page 1...
Trouv√© 20 livres sur la page 1
  Scraping livre 1/20...
  Scraping livre 2/20...
  Scraping livre 3/20...
  Scraping livre 4/20...
  Scraping livre 5/20...
  Scraping livre 6/20...
  Scraping livre 7/20...
  Scraping livre 8/20...
  Scraping livre 9/20...
  Scraping livre 10/20...
  Scraping livre 11/20...
  Scraping livre 12/20...
  Scraping livre 13/20...
  Scraping livre 14/20...
  Scraping livre 15/20...
  Scraping livre 16/20...
  Scraping livre 17/20...
  Scraping livre 18/20...
  Scraping livre 19/20...
  Scraping livre 20/20...
Scraping page 2...
Trouv√© 20 livres sur la page 2
  Scraping livre 1/20...
  Scraping livre 2/20...
  Scraping livre 3/20...
  Scraping livre 4/20...
  Scraping livre 5/20...
  Scraping livre 6/20...
  Scraping livre 7/20...
  Scraping livre 8/20...
  Scraping livre 9/20...
  Scraping livre 10/20...
  Scraping livre 11/20...
  Scraping livre 12/20...
  Scraping liv

In [23]:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
from urllib.parse import urljoin

# -----------------------
# PostgreSQL configuration
# -----------------------
# NOTE(review): the password is hard-coded in this URI; prefer loading it from
# the environment before sharing the notebook.
DB_URI = "postgresql+psycopg2://postgres:admin@localhost:5432/booksdb"

# Imported once here: this cell's import block does not provide `re`, and the
# import used to sit inside the per-article loop.
import re

base_url = "https://books.toscrape.com/catalogue/page-{}.html"
site_base = "https://books.toscrape.com/"
request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever
livres = []

# Second CSS class of the star-rating <p> -> numeric score.
rating_map = {
    "One": 1,
    "Two": 2,
    "Three": 3,
    "Four": 4,
    "Five": 5
}

# ======= SCRAPING =======
print("D√©but du scraping...")
# One Session reuses the HTTP connection across all listing and detail pages.
with requests.Session() as session:
    for page in range(1, 51):
        url = base_url.format(page)
        print(f"Scraping page {page}/50...")
        try:
            response = session.get(url, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            # Same best-effort policy as for single articles: report and move on.
            print(f"Erreur lors du chargement de la page {page}: {e}")
            continue
        # Decode as UTF-8 up front rather than decoding as latin-1 and
        # repairing each field afterwards with a lossy errors="ignore" pass.
        response.encoding = "utf-8"

        soup = BeautifulSoup(response.text, "html.parser")

        for article in soup.find_all("article", class_="product_pod"):
            try:
                # Title
                titre = article.h3.a["title"]

                # Price: keep only digits and the decimal point.
                prix_str = article.find("p", class_="price_color").text.strip()
                prix = float("".join(ch for ch in prix_str if ch in "0123456789."))

                # Image URL
                img_url = urljoin(site_base, article.find("img")["src"])

                # Rating
                rating_class = article.find("p", class_="star-rating")["class"][1]
                note = rating_map.get(rating_class, 0)

                # --- Detail page: description and stock count ---
                detail_url = urljoin(site_base + "catalogue/", article.h3.a["href"])
                detail_resp = session.get(detail_url, timeout=request_timeout)
                detail_resp.raise_for_status()
                detail_resp.encoding = "utf-8"
                detail_soup = BeautifulSoup(detail_resp.text, "html.parser")

                # Stock/availability: the count in "In stock (22 available)" is
                # looked up on the detail page first; the listing entry, which
                # was the only source before, is used as a fallback.
                # NOTE(review): the listing label is believed to read just
                # "In stock" (so the count was always 0) - confirm on the site.
                stock_tag = detail_soup.find("p", class_="instock availability")
                if stock_tag is None:
                    stock_tag = article.find("p", class_="instock availability")
                stock_text = stock_tag.text.strip() if stock_tag is not None else ""
                stock_match = re.search(r'\((\d+)', stock_text)
                disponibilite = int(stock_match.group(1)) if stock_match else 0

                # The description is the <p> following div#product_description;
                # either element may be missing, in which case it stays empty.
                desc_tag = detail_soup.find("div", id="product_description")
                desc_paragraph = desc_tag.find_next_sibling("p") if desc_tag is not None else None
                description = desc_paragraph.text.strip() if desc_paragraph is not None else ""

                livres.append({
                    'titre': titre,
                    'description': description,
                    'prix': prix,
                    'disponibilite': disponibilite,
                    'image_url': img_url,
                    'note': note
                })

            except Exception as e:
                # Best effort: one broken article must not abort the crawl.
                print(f"Erreur lors du scraping d'un article: {e}")
                continue

print(f"Scraping termin√©. {len(livres)} livres r√©cup√©r√©s.")

# Build the DataFrame outside the database try-block so that the CSV export
# below still has its data when the database step fails.
df = pd.DataFrame(livres)

# ======= DIRECT LOAD INTO DB =======
try:
    # Quick look at the data
    print("\nAper√ßu des donn√©es:")
    print(df.head())
    print(f"\nNombre de livres: {len(df)}")
    print(f"Colonnes: {list(df.columns)}")
    print(f"Types: {df.dtypes}")

    # PostgreSQL connection through SQLAlchemy
    engine = create_engine(DB_URI)

    # Write into the "livres" table (any existing table is replaced)
    df.to_sql("livres", engine, if_exists="replace", index=False)

    print("‚úÖ Donn√©es sauvegard√©es dans PostgreSQL avec to_sql()")

    # Verification: count the rows actually stored
    verification_query = "SELECT COUNT(*) as total FROM livres"
    result = pd.read_sql_query(verification_query, engine)
    print(f"‚úÖ V√©rification: {result['total'].iloc[0]} livres dans la base")

except Exception as e:
    # Best effort: report the database problem without failing the cell.
    print("‚ùå Erreur lors de la sauvegarde dans PostgreSQL :", e)

# ======= OPTIONAL CSV EXPORT =======
try:
    csv_file = "livres.csv"
    df.to_csv(csv_file, index=False, encoding="utf-8")
    print(f"‚úÖ Donn√©es √©galement sauvegard√©es dans {csv_file}")
except Exception as e:
    print(f"‚ùå Erreur sauvegarde CSV: {e}")

D√©but du scraping...
Scraping page 1/50...
Scraping page 2/50...
Scraping page 3/50...
Scraping page 4/50...
Scraping page 5/50...
Scraping page 6/50...
Scraping page 7/50...
Scraping page 8/50...
Scraping page 9/50...
Scraping page 10/50...
Scraping page 11/50...
Scraping page 12/50...
Scraping page 13/50...
Scraping page 14/50...
Scraping page 15/50...
Scraping page 16/50...
Scraping page 17/50...
Scraping page 18/50...
Scraping page 19/50...
Scraping page 20/50...
Scraping page 21/50...
Scraping page 22/50...
Scraping page 23/50...
Scraping page 24/50...
Scraping page 25/50...
Scraping page 26/50...
Scraping page 27/50...
Scraping page 28/50...
Scraping page 29/50...
Scraping page 30/50...
Scraping page 31/50...
Scraping page 32/50...
Scraping page 33/50...
Scraping page 34/50...
Scraping page 35/50...
Scraping page 36/50...
Scraping page 37/50...
Scraping page 38/50...
Scraping page 39/50...
Scraping page 40/50...
Scraping page 41/50...
Scraping page 42/50...
Scraping page 43/50..