In [None]:
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import psycopg2
from psycopg2.extras import RealDictCursor
import os
from dotenv import load_dotenv

load_dotenv()

class BookScraper:
    def __init__(self):
        self.driver = None
        self.books_data = []
        self.setup_driver()
        
    def setup_driver(self):
        """Configure the Selenium Chrome driver and store it on ``self.driver``.

        Chrome is started with a fixed set of command-line switches and the
        driver is given a 10-second implicit wait.
        """
        switches = (
            "--headless",  # headless mode for performance
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
        )

        chrome_options = Options()
        for switch in switches:
            chrome_options.add_argument(switch)

        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.implicitly_wait(10)
    
    def extract_rating_number(self, rating_class):
        """Convert a textual star rating into its numeric value.

        Args:
            rating_class: Text containing the rating word, e.g. the CSS class
                string ``"star-rating Three"``. May be ``None`` or empty.

        Returns:
            The rating as an int from 1 to 5, or 0 when ``rating_class`` is
            empty/``None`` or contains no rating word.
        """
        if not rating_class:
            return 0

        rating_map = {
            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5
        }
        # Compare whole alphabetic words rather than substrings so that
        # unrelated text such as "money" is not mistaken for "one".
        for word in re.findall(r'[a-z]+', rating_class.lower()):
            if word in rating_map:
                return rating_map[word]
        return 0
    
    def clean_price(self, price_text):
        """Clean a displayed price and convert it to a float.

        Every character other than digits and dots is dropped before the
        conversion. Returns 0.0 when ``price_text`` is empty/``None`` or when
        what remains is not a valid float.
        """
        if not price_text:
            return 0.0

        # Drop currency symbols and anything else that is not a digit or dot.
        digits_and_dots = re.sub(r'[^\d.]', '', price_text)
        try:
            amount = float(digits_and_dots)
        except ValueError:
            amount = 0.0
        return amount
    
    def clean_availability(self, availability_text):
        """Extract the number of available copies from an availability label.

        Returns the first run of digits found in ``availability_text`` as an
        int, or 0 when the text is empty/``None`` or contains no digits.
        """
        # Only the first number in the text is used.
        first_number = re.search(r'\d+', availability_text) if availability_text else None
        return int(first_number.group()) if first_number else 0
    
    def clean_description(self, description):
        """Normalise a book description.

        Characters other than word characters, whitespace and ``.,!?;:-`` are
        removed, then every run of whitespace is collapsed to a single space
        and the result is trimmed.

        Args:
            description: Raw description text; may be ``None`` or blank.

        Returns:
            The cleaned text, or ``""`` when the input is empty, ``None`` or
            only whitespace.
        """
        if not description or not description.strip():
            return ""

        # Remove special characters FIRST: deleting a symbol that sits between
        # two spaces would otherwise leave a double space behind.
        without_symbols = re.sub(r'[^\w\s.,!?;:-]', '', description)
        return re.sub(r'\s+', ' ', without_symbols).strip()
    
    def scrape_books(self):
        """Scrape every listing page of books.toscrape.com and each book on it.

        For each listing page the book URLs and the "next page" URL are
        collected first, then every book is visited through
        ``scrape_book_details`` (which appends to ``self.books_data``), and
        finally the driver moves on to the next listing page.

        Errors are printed rather than raised, and the driver is always quit
        when the method ends.
        """
        print("Démarrage du scraping de books.toscrape.com...")

        try:
            self.driver.get("https://books.toscrape.com/")

            page = 1
            while True:
                print(f"Scraping page {page}...")

                # Wait for the books to load.
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.product_pod"))
                )

                # Collect the link to every book on this listing page.
                book_urls = [
                    link.get_attribute('href')
                    for link in self.driver.find_elements(By.CSS_SELECTOR, "article.product_pod h3 a")
                ]

                # Resolve the next-page URL NOW, while the driver is still on
                # the listing page: scrape_book_details() navigates away, and
                # the pager must not be looked up on a book's detail page.
                # find_elements returns an empty list instead of raising when
                # there is no next link.
                next_links = self.driver.find_elements(By.CSS_SELECTOR, "li.next a")
                next_url = next_links[0].get_attribute('href') if next_links else None

                # Visit each book to extract its details.
                for url in book_urls:
                    self.scrape_book_details(url)
                    time.sleep(1)  # pause between requests

                if not next_url:
                    print("Pas de page suivante trouvée. Scraping terminé.")
                    break

                self.driver.get(next_url)
                page += 1
                time.sleep(2)

        except Exception as e:
            print(f"Erreur lors du scraping: {e}")

        finally:
            if self.driver:
                self.driver.quit()
    
    def scrape_book_details(self, book_url):
        """Extract the details of one book and append them to ``self.books_data``.

        Title, price and availability are required: if any of them cannot be
        read, the error is printed and the book is skipped. Description, image
        and rating are optional and fall back to ``""``, ``""`` and ``0``.

        Args:
            book_url: URL of the book's detail page; the driver navigates to it.
        """
        # Local import keeps this method self-contained.
        from urllib.parse import urljoin

        try:
            self.driver.get(book_url)

            # Required fields: a failure here skips the whole book.
            title = self.driver.find_element(By.TAG_NAME, "h1").text

            # Description (optional).
            # `except Exception` keeps the best-effort fallback without
            # swallowing KeyboardInterrupt / SystemExit as a bare except would.
            try:
                description_elem = self.driver.find_element(By.CSS_SELECTOR, "#product_description + p")
                description = description_elem.text
            except Exception:
                description = ""

            # Price
            price_elem = self.driver.find_element(By.CSS_SELECTOR, "p.price_color")
            price = self.clean_price(price_elem.text)

            # Availability
            availability_elem = self.driver.find_element(By.CSS_SELECTOR, "p.availability")
            availability = self.clean_availability(availability_elem.text)

            # Image (optional)
            try:
                img_elem = self.driver.find_element(By.CSS_SELECTOR, "#product_gallery img")
                src = img_elem.get_attribute('src')
                # urljoin resolves a relative src against the page URL and
                # leaves an already-absolute src untouched.
                image_url = urljoin(book_url, src) if src else ""
            except Exception:
                image_url = ""

            # Rating (optional)
            try:
                rating_elem = self.driver.find_element(By.CSS_SELECTOR, "p.star-rating")
                rating_class = rating_elem.get_attribute('class')
                rating = self.extract_rating_number(rating_class)
            except Exception:
                rating = 0

            # Clean the description; use the title when there is none.
            cleaned_description = self.clean_description(description)
            if not cleaned_description:
                cleaned_description = title

            book_data = {
                'titre': title,
                'description': cleaned_description,
                'prix': price,
                'stock': availability,
                'image_url': image_url,
                'rating': rating
            }

            self.books_data.append(book_data)
            print(f"Livre scraped: {title}")

        except Exception as e:
            print(f"Erreur lors du scraping du livre {book_url}: {e}")
    
    def create_dataframe(self):
        """Créer un DataFrame à partir des données scrapées"""
        if not self.books_data:
            print("Aucune donnée à traiter")
            return None
        
        df = pd.DataFrame(self.books_data)
        
        # Gestion des valeurs manquantes
        df['description'] = df.apply(
            lambda row: row['titre'] if pd.isna(row['description']) or row['description'] == '' 
            else row['description'], axis=1
        )
        
        # S'assurer que les types sont corrects
        df['prix'] = pd.to_numeric(df['prix'], errors='coerce').fillna(0.0)
        df['stock'] = pd.to_numeric(df['stock'], errors='coerce').fillna(0)
        df['rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(0)
        
        return df
    
    def save_to_csv(self, df, filename='livres_bruts.csv'):
        """Write the data to a CSV file as temporary storage.

        Args:
            df: DataFrame to write, or ``None``.
            filename: Destination path of the CSV file.

        Returns:
            ``True`` after the file has been written (UTF-8, without the
            index), ``False`` when ``df`` is ``None``.
        """
        if df is None:
            return False

        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Données sauvegardées dans {filename}")
        return True
    
    # def save_to_database(self, df):
    #     """Sauvegarder les données dans PostgreSQL"""
    #     try:
    #         # Configuration de la base de données
    #         conn = psycopg2.connect(
    #             host=os.getenv('DB_HOST', 'localhost'),
    #             database=os.getenv('DB_NAME', 'bib_readers'),
    #             user=os.getenv('DB_USER', 'postgres'),
    #             password=os.getenv('DB_PASSWORD', 'password'),
    #             port=os.getenv('DB_PORT', '5432')
    #         )
            
    #         cursor = conn.cursor()
            
    #         # Créer la table si elle n'existe pas
    #         create_table_query = """
    #         CREATE TABLE IF NOT EXISTS livres (
    #             id SERIAL PRIMARY KEY,
    #             titre VARCHAR(255) NOT NULL,
    #             description TEXT,
    #             image_url VARCHAR(500),
    #             stock INTEGER DEFAULT 0,
    #             rating INTEGER DEFAULT 0,
    #             prix DECIMAL(10,2) DEFAULT 0.00,
            #     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            # );
            # """
            # cursor.execute(create_table_query)
            
            # # Insérer les données
            # for _, row in df.iterrows():
            #     insert_query = """
            #     INSERT INTO livres (titre, description, image_url, stock, rating, prix)
            #     VALUES (%s, %s, %s, %s, %s, %s)
            #     ON CONFLICT DO NOTHING;
            #     """
            #     cursor.execute(insert_query, (
            #         row['titre'], 
            #         row['description'], 
            #         row['image_url'], 
            #         row['stock'], 
            #         row['rating