# In [None]:
import asyncio
import aiohttp
import logging
from bs4 import BeautifulSoup
import aiomysql
import nest_asyncio

import sys

# Send INFO-and-above records from the root logger to scraper.log, UTF-8 encoded.
logging.basicConfig(
    filename="scraper.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    encoding="utf-8",
)

# Records are written to disk immediately without further setup: the stream-based
# file handler flushes its stream after every emitted record. Do NOT rebind the
# handler's ``flush`` to ``sys.stdout.flush`` -- that flushes stdout instead of
# the log file and leaves records sitting in the file buffer.


# Patch asyncio so that asyncio.run() can be called while an event loop is
# already running (presumably needed because this runs inside a notebook kernel).
nest_asyncio.apply()

# Semaphore that caps how many categories are processed concurrently.
SEMÁFORO = asyncio.Semaphore(value=5)

# Headers sent with every HTTP request (desktop Chrome user agent).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

def extract_text(element, selector, default="N/A"):
    """Return the stripped text of the first node under *element* matching *selector*.

    *default* is returned when ``element.select_one(selector)`` yields nothing.
    """
    match = element.select_one(selector)
    if not match:
        return default
    return match.get_text(strip=True)

def _parse_price(raw):
    """Convert scraped price text to a float, returning 0.0 when it is not numeric.

    Only "," and "$" are removed before conversion.
    NOTE(review): if the site renders thousands with "." (e.g. "45.900"),
    float() reads that as 45.9 -- confirm the price format against the live HTML.
    """
    try:
        return float(raw.replace(",", "").replace("$", ""))
    except ValueError:
        return 0.0


def _parse_product(product):
    """Collect the fields stored for one ``div.box-producto`` element into a dict.

    ``url`` is ``None`` when the element has no link or the link has no usable
    ``href``; ``image_url`` falls back to "N/A" when no image source is found.
    """
    link = product.select_one("a")
    image = product.select_one("img")

    # Tag.get() yields None for a missing attribute instead of raising KeyError,
    # so one absent attribute no longer discards the whole product.
    href = link.get("href") if link else None
    image_src = (image.get("data-src") or image.get("src")) if image else None

    price_now = _parse_price(extract_text(product, "p.precio-ahora strong", "0"))
    price_before = _parse_price(extract_text(product, "p.precio-antes del", "0"))
    discount = extract_text(product, "div.descuento-v2")
    if not discount or discount == "N/A":
        # Without a discount badge the "before" price is the current price.
        price_before = price_now

    return {
        "name": extract_text(product, "h3.nombre"),
        "author": extract_text(product, "div.autor"),
        "price_now": price_now,
        "price_before": price_before,
        "discount": discount,
        "url": (href.strip() or None) if href else None,
        "details": extract_text(product, "div.autor.color-dark-gray.metas"),
        "stock": extract_text(product, "div.stock.color-green"),
        "image_url": image_src.strip() if image_src else "N/A",
    }


async def _save_product(cursor, categoria, item):
    """Insert *item*, or update its price/discount/stock columns when they changed.

    Rows are matched on ``url_producto``.
    """
    await cursor.execute(
        """SELECT precio_actual, precio_anterior, descuento, unidades_disponibles
           FROM productos WHERE url_producto = %s""",
        (item["url"],),
    )
    existing = await cursor.fetchone()

    if not existing:
        await cursor.execute(
            """INSERT INTO productos
               (categoria, nombre, autor, precio_actual, precio_anterior, descuento, url_producto,
                detalles_adicionales, unidades_disponibles, imagen_url)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (categoria, item["name"], item["author"], item["price_now"],
             item["price_before"], item["discount"], item["url"], item["details"],
             item["stock"], item["image_url"]),
        )
        logging.info(f"Producto insertado: {item['name']}")
        return

    # Scraped values keyed by column, in the same order as the SELECT above.
    scraped = {
        "precio_actual": item["price_now"],
        "precio_anterior": item["price_before"],
        "descuento": item["discount"],
        "unidades_disponibles": item["stock"],
    }
    stored = dict(zip(scraped, existing))

    # NOTE(review): stored values are compared with the scraped floats/strings
    # as-is; if the price columns are DECIMAL this may report a change on every
    # run -- confirm against the table schema.
    cambios = [
        f"{column}: {stored[column]} -> {value}"
        for column, value in scraped.items()
        if stored[column] != value
    ]

    if not cambios:
        logging.info(f"El producto '{item['name']}' no tiene cambios.")
        return

    logging.info(f"Actualizando producto: {item['name']} | Cambios: {', '.join(cambios)}")
    await cursor.execute(
        """UPDATE productos
           SET precio_actual = %s, precio_anterior = %s, descuento = %s, unidades_disponibles = %s
           WHERE url_producto = %s""",
        (item["price_now"], item["price_before"], item["discount"], item["stock"], item["url"]),
    )


async def procesar_categoria(categoria, categoria_url, session, db_pool):
    """Scrape every page of one category and store its products in ``productos``.

    Pages are requested as ``{categoria_url}?page=N`` starting at 1 until a page
    returns a non-200 status, contains no ``div.box-producto`` element, or the
    request fails or times out. Each product is inserted, or updated when its
    prices, discount or stock differ from the stored row.

    Args:
        categoria: Category name, stored with newly inserted products.
        categoria_url: Absolute URL of the category listing.
        session: Open ``aiohttp.ClientSession`` used for the requests.
        db_pool: ``aiomysql`` pool; one connection is held for the whole category.
    """
    # Total per-request budget, expressed with aiohttp's timeout object.
    timeout = aiohttp.ClientTimeout(total=10)

    # The module-level semaphore bounds how many categories run at once.
    async with SEMÁFORO:
        async with db_pool.acquire() as conn:
            async with conn.cursor() as cursor:
                logging.info(f"Procesando categoría: {categoria}")
                page = 1

                while True:
                    url = f"{categoria_url}?page={page}"

                    # Only the network round-trip is guarded here; parsing and
                    # DB work happen after the HTTP connection is released.
                    try:
                        async with session.get(url, timeout=timeout) as response:
                            if response.status != 200:
                                logging.warning(f"Error {response.status} al acceder a {url}")
                                break
                            html = await response.text()
                    except aiohttp.ClientError as e:
                        logging.error(f"Error en la solicitud a {url}: {e}")
                        break
                    except asyncio.TimeoutError:
                        logging.warning(f"Timeout en {url}")
                        break

                    product_elements = BeautifulSoup(html, "html.parser").select("div.box-producto")
                    if not product_elements:
                        logging.info(f"No más productos en '{categoria}' (página {page}).")
                        break

                    for product in product_elements:
                        # A failure on one product must not stop the rest of the page.
                        try:
                            item = _parse_product(product)
                            if item["url"] is None:
                                # The URL is the lookup key; without it every such
                                # product would collide on the same row.
                                logging.warning(f"Producto sin URL omitido: {item['name']}")
                                continue
                            await _save_product(cursor, categoria, item)
                        except Exception as e:
                            logging.exception(f"Error insertando producto: {e}")

                    page += 1
                    # Pause between pages to avoid hammering the site.
                    await asyncio.sleep(2)

async def main():
    """Discover the categories on the home page and scrape each one.

    Opens the MySQL pool and an HTTP session, reads the ``li.category-li a``
    links from the home page, and runs ``procesar_categoria`` for every category
    found. The pool is always closed on exit, including the early return on a
    non-200 response and any unexpected error.
    """
    base_url = "https://www.buscalibre.com.co/"
    categories = []

    # NOTE(review): database credentials are hard-coded here; consider loading
    # them from the environment or a config file kept out of version control.
    db_pool = await aiomysql.create_pool(
        host="localhost",
        user="root",
        password="43340837",
        db="libreria",
        port=3306,
        autocommit=True
    )

    try:
        async with aiohttp.ClientSession(headers=HEADERS) as session:
            try:
                async with session.get(base_url, timeout=aiohttp.ClientTimeout(total=10)) as response:
                    if response.status != 200:
                        logging.warning(f"Error {response.status} al acceder a {base_url}")
                        return

                    soup = BeautifulSoup(await response.text(), "html.parser")
                    category_elements = soup.select("li.category-li a")

                    for element in category_elements:
                        try:
                            name = element.find("span").get_text(strip=True)
                            category_url = element["href"].strip()
                            # Site-relative links are made absolute.
                            if category_url.startswith("/"):
                                category_url = f"https://www.buscalibre.com.co{category_url}"
                            categories.append((name, category_url))
                        except Exception as e:
                            logging.error(f"Error procesando categoría: {e}")

                tasks = [procesar_categoria(name, url, session, db_pool) for name, url in categories]
                # Collect failures instead of raising so one broken category does
                # not abort the others while they are still running.
                results = await asyncio.gather(*tasks, return_exceptions=True)
                for (name, _), result in zip(categories, results):
                    if isinstance(result, BaseException):
                        logging.error(f"Error procesando categoría '{name}': {result!r}")

            except aiohttp.ClientError as e:
                logging.error(f"Error en la conexión: {e}")
            except asyncio.TimeoutError:
                logging.warning(f"Timeout en {base_url}")
    finally:
        # Runs on every exit path so pooled connections are never leaked.
        db_pool.close()
        await db_pool.wait_closed()

# Run the scraper only when executed directly (script or notebook cell), so that
# importing this module does not start network and database work.
if __name__ == "__main__":
    asyncio.run(main())

    logging.info("Proceso completado.")
