In [None]:
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
#3.6
from dataclasses import dataclass, field
from typing import List, Dict
from datetime import datetime

# Modelos

In [None]:
from sqlalchemy import create_engine, Column, Integer, String, Float, Boolean
from sqlalchemy.orm import sessionmaker, declarative_base

# Engine for the local SQLite file 'top.db'; echo=True asks SQLAlchemy to log the SQL it emits.
engine = create_engine('sqlite:///top.db', echo=True)

class Logger:
    """Tiny append-only file logger that prefixes each entry with a timestamp."""

    def __init__(self, filename):
        """Remember the target file and immediately record an 'Init' entry.

        Args:
            filename: Path of the log file; entries are appended to it.
        """
        self.filename = filename
        self.log('Init')

    def log(self, message):
        """Append one line of the form ``[YYYY-MM-DD HH:MM:SS] message``.

        Args:
            message: Text written after the timestamp.
        """
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Append mode: earlier entries are kept across calls and across runs.
        with open(self.filename, "a") as handle:
            handle.write(f"[{stamp}] {message}\n")

# Declarative base class that the ORM models below inherit from.
Base = declarative_base()


class Product(Base):
    """ORM model for one scraped product, stored in the ``product`` table."""

    __tablename__ = 'product'

    id = Column(Integer, primary_key=True)
    name = Column(String)
    price = Column(Float)
    product_url = Column(String)
    product_img = Column(String)
    department = Column(String)
    in_stock = Column(Boolean)

    def to_dict(self):
        """Return the column values as a plain dict keyed by column name."""
        column_names = (
            'id',
            'name',
            'price',
            'product_url',
            'product_img',
            'department',
            'in_stock',
        )
        return {column: getattr(self, column) for column in column_names}

# Unfinished sketch of a Department ORM model, left commented out:
# class Department(Base):
#     __tablename__ = 'department'
#     id = Column
# Create the tables declared on Base (currently only `product`) in the database behind `engine`.
Base.metadata.create_all(engine)


In [None]:
@dataclass
class Department:
    """A best-seller department and the products scraped from its pages.

    Typical use: ``setSoup(html)`` with the department's first page, then
    ``getPageNumber()`` and finally ``getAllElements()``.
    """
    name: str  # department label; copied onto every Product created here
    url: str  # URL of the department's first page
    products: List[Product] = field(default_factory=list)  # filled by getPageElements()
    pages: List = field(default_factory=list)  # pagination <li> tags, set by getPageNumber()
    soup: BeautifulSoup = None  # parse tree of the page currently being scraped

    def setSoup(self, response_text):
        """Parse ``response_text`` as HTML and make it the current page."""
        self.soup = BeautifulSoup(response_text, 'html.parser')

    def getPageNumber(self):
        """Store the pagination entries of the remaining pages in ``self.pages``.

        The first two ``<li>`` items and the last one are dropped by the
        ``[2:-1]`` slice. When the current page has no ``ul.a-pagination``
        element, ``self.pages`` becomes an empty list instead of raising.
        """
        pagination = self.soup.find('ul', 'a-pagination')
        if pagination is None:
            self.pages = []
            return
        self.pages = pagination.find_all('li')[2:-1]

    @staticmethod
    def _parsePrice(card):
        """Return ``(price, in_stock)`` for one product card.

        Both known price class names are tried in order. A card with no
        parseable price yields ``(0.0, False)``.
        """
        for price_class in ('p13n-sc-price', '_cDEzb_p13n-sc-price_3mJ9Z'):
            tag = card.find('span', price_class)
            if tag is None:
                continue
            try:
                return float(tag.text.replace('$', '').replace(',', '')), True
            except ValueError:
                # Text under this selector is not a plain number; try the next one.
                continue
        return 0.0, False

    def getPageElements(self):
        """Append one Product per ``gridItemRoot`` card on the current page.

        A card with a missing image, title, link or price still produces a
        Product; the missing value falls back to ``''`` (or ``0.0`` and
        ``in_stock=False`` for the price) so one odd card cannot abort the run.
        """
        for card in self.soup.find_all('div', id='gridItemRoot'):
            try:
                image = card.find('img')['src']
            except (TypeError, KeyError):
                # No <img> tag (TypeError) or no src attribute (KeyError).
                image = ''

            # Title and product href are both read from the second 'a-link-normal' anchor.
            links = card.find_all('a', 'a-link-normal')
            product_link = links[1] if len(links) > 1 else None

            try:
                name = product_link.find('span').find('div').text
            except AttributeError:
                name = ''

            href = product_link.get('href', '') if product_link is not None else ''
            # lstrip avoids the double slash produced by joining a '/'-prefixed href.
            product_url = 'https://www.amazon.com.mx/' + href.lstrip('/') if href else ''

            price, stock = self._parsePrice(card)

            self.products.append(
                Product(
                    name=name,
                    price=price,
                    product_url=product_url,
                    product_img=image,
                    department=self.name,
                    in_stock=stock,
                )
            )

    def getAllElements(self):
        """Scrape the current page, then fetch and scrape every page in ``self.pages``.

        Each page is requested until the server answers 200, waiting 3 seconds
        before every retry. There is no upper bound on the number of retries.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

        self.getPageElements()
        for page in self.pages:
            link = page.find('a')
            if link is None or not link.get('href'):
                # Pagination entry without a link: nothing to fetch.
                continue
            page_url = 'https://www.amazon.com.mx' + link['href']

            response = requests.get(page_url, headers=headers)
            while response.status_code != 200:
                print(response.status_code, ' || ', page_url)
                sleep(3)
                response = requests.get(page_url, headers=headers)

            self.setSoup(response_text=response.text)
            self.getPageElements()





        
    

# Main Code

In [None]:
# File logger used by the request cells; creating it appends an 'Init' line to test.log.
logger = Logger('test.log')

In [None]:
### GET request for the best-sellers landing page
URL = 'https://www.amazon.com.mx/gp/bestsellers/?ref_=nav_cs_bestsellers'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0',
    'Accept': 'text/html',
    'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
    'Referer': 'https://www.example.com',
    'Connection': 'close'
}

# Keep requesting until the server answers 200, logging every status code.
# The 3-second pause now comes before every retry (the first retry used to be immediate).
# NOTE: there is no upper bound on the number of attempts.
while True:
    response = requests.get(URL, headers=headers)
    logger.log(str(response.status_code) + ' | Best Sellers')
    if response.status_code == 200:
        break
    sleep(3)


In [None]:
### Parse the response of the first request to the Amazon best-sellers page
soup = BeautifulSoup(response.text, 'html.parser')
# One 'a-carousel-header-row' div is collected per carousel on the page.
# NOTE(review): this rebinds `headers`, which held the HTTP request-header dict
# until now; later cells can no longer reach that dict under this name.
headers = soup.find_all('div', class_='a-carousel-header-row')

In [None]:
## Build the list of departments shown in the Amazon best-sellers page:
## one Department object per carousel header, collected in `department_list`.
department_list = []
for carousel_header in headers:
    anchor = carousel_header.find('a')
    # The aria-label carries the department name followed by a ' - Ver más' suffix.
    department_name = str(anchor['aria-label'].replace(' - Ver más', ''))
    department_url = 'https://www.amazon.com.mx' + str(anchor['href'])
    department_list.append(Department(department_name, department_url))

department_list

## Scraping de cada departamento

In [None]:
## Iterate over every department detected on the landing page.
## Amazon can answer with different status codes, and scraping is only possible
## on a 200, so each department page is requested again every 5 seconds until
## the request succeeds. There is no upper bound on the number of attempts.

# Same User-Agent that Department.getAllElements sends for the follow-up pages,
# so the first page of each department is requested consistently.
department_request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for department in department_list:
    print(f'================{department.name}================')

    while True:
        response = requests.get(url=department.url, headers=department_request_headers)
        if response.status_code == 200:
            break
        print(response.status_code, department.name)
        sleep(5)

    ## Once the request is accepted, hand the response text to the department
    ## and prepare it for extraction.
    ## TODO: rework this to use an explicit list of URLs; each page has its own
    ## URL instead of following a URL.com.mx/endpoint=page_number convention.
    department.setSoup(response.text)
    department.getPageNumber()
    department.getAllElements()


## Guardando la data en la DB

In [None]:
from sqlalchemy.orm import Session, sessionmaker

# Bind the session factory to the engine created together with the models.
# No second engine is created here: the session would never have used it.
# NOTE: the name `Session` is rebound from the imported class to this factory.
Session = sessionmaker(bind=engine)
session = Session()

# The session is intentionally left open. With SQLAlchemy's default
# expire-on-commit setting, Product attributes are reloaded through it when
# later cells read them.
try:
    for department in department_list:
        session.add_all(department.products)
        # One commit per department, so earlier departments stay saved if a later one fails.
        session.commit()
except Exception:
    # Leave the session usable for a re-run instead of stuck in a failed transaction.
    session.rollback()
    raise



### Formato de df

In [None]:
# Flatten every department's products into one list of plain dicts.
records = [
    product.to_dict()
    for department in department_list
    for product in department.products
]
records

In [None]:
df = pd.DataFrame.from_records(records)
# `id` is the per-row primary key, so comparing it would make every saved row
# look unique and nothing would be dropped. Compare the remaining columns.
# `or None` falls back to all columns when the frame is empty.
dedupe_columns = [column for column in df.columns if column != 'id']
df.drop_duplicates(subset=dedupe_columns or None)

In [None]:
# Scratch check of the [2:-1] slice: it drops the first two entries and the last one,
# leaving [2, 3, 4] for this sample list.
['pagina anterior',1,2,3,4,'pagina siguiente'][2:-1]