In [16]:
from datetime import date, datetime, timedelta
import json
from time import localtime, sleep, strftime, time
import os

from dotenv import load_dotenv
from openpyxl import load_workbook
import pandas as pd
from seleniumwire import webdriver
from seleniumwire.utils import decode
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, ElementNotInteractableException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from timeit import default_timer
from webdriver_manager.chrome import ChromeDriverManager

In [17]:
fb_mkp_ropa = {
    "Fecha Extraccion": [],
    "titulo_marketplace": [],
    "tiempo_creacion": [],
    "tipo_delivery": [],
    "delivery_data": [],
    "delivery_direccion": [],
    "descripcion": [],
    "disponible": [],
    "vendido": [],
    "fecha_union_vendedor": [],
    "cantidad": [],
    "precio": [],
    "tipo_moneda": [],
    "amount_with_concurrency": [],
    "latitud": [],
    "longitud": [],
    "locacion": [],
    "locacion_id": [],
    "name_vendedor": [],
    "tipo_vendedor": [],
    "id_vendedor": []
}

In [18]:
fb_mkp_ropa_time = {
    "Fecha" : None,
    "Hora Inicio": None,
    "Hora Termino": None,
    "Cantidad": None,
    "Tiempo(HHMMSS)": None,
    "Productos/min": None,
    "Enlace": None,
    "Observaciones": None,
}

In [27]:
class ScraperFb:
    """Representa a un bot para hacer web scarping en fb marketplace.

    Attributes:
        driver (Object): Maneja un navegador para hacer web scraping
        wait (Object): Maneja el Tiempo de espera durante la ejecución del bot
    """
    
    def __init__(self):
        """Inicializa un objeto de tipo ScraperFb.

        Args:
            driver (Object): [Driver]
            wait (Object): [Wait]
        """
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.default_content_setting_values.notifications" : 2}
        chrome_options.add_experimental_option("prefs",prefs)
        self.driver = webdriver.Chrome(chrome_options=chrome_options,service=Service(ChromeDriverManager().install()))
        self.wait = WebDriverWait(self.driver, 10)

    def iniciar_sesion(self, url):
        """Inicia sesión en una página web usando un usuario y contraseña

        Args:
            url (str): [Url]
        """
        self.driver.get(url)
        self.driver.maximize_window()
        username = self.wait.until(EC.presence_of_element_located((By.ID, "email")))
        password = self.wait.until(EC.presence_of_element_located((By.ID, "pass")))
        username.clear()
        password.clear()
        username.send_keys(os.getenv('FB_USERNAME'))
        password.send_keys(os.getenv('FB_PASSWORD'))
        self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[name='login']"))).click()
    
    def mapear_datos(self):
        sleep(10)
        self.driver.execute_script("window.open('about:blank', 'newtab');")
        self.driver.switch_to.window("newtab")
        self.driver.get("https://www.facebook.com/marketplace/category/apparel/?sortBy=creation_time_descend&exact=false")
        
        sleep(8)        
        ropa = self.driver.find_elements(By.XPATH, '//*[@class="xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3"]')
        tiempo = fecha = "2023-01-05" #fb_mkp_ropa_time["Fecha"]
        fecha_extraccion = datetime.strptime(fb_mkp_ropa_time["Fecha"],"%Y-%m-%d").strftime('%d/%m/%Y')
        i=0
        e=0
        while tiempo >= fecha:
            print("Scrapeando item", i + 1)
            try:
                ropa[i].click()
                sleep(3)
                for request in self.driver.requests:
                    if not request.response or 'graphql' not in request.url:
                        continue
                    
                    body = decode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
                    decoded_body = body.decode('utf-8')
                    json_data = json.loads(decoded_body)
                    
                    if 'prefetch_uris_v2' not in json_data['extensions']:
                        continue

                    print(localtime(json_data['data']['viewer']['marketplace_product_details_page']['target']['creation_time']))
                    tiempo = strftime('%Y-%m-%d', localtime(json_data['data']['viewer']['marketplace_product_details_page']['target']['creation_time']))
                    
                    if tiempo == '2023-01-06':
                        dato = json_data['data']['viewer']['marketplace_product_details_page']
                        print(dato["target"]["marketplace_listing_title"])
                        self.extraer_datos(dato, fecha_extraccion)
                    break
                self.driver.execute_script("window.history.go(-1)");
                
            except (NoSuchElementException, StaleElementReferenceException) as error:
                print("Error:",error)
                print('No se hallo el item N '+str(i + 1)+'se pasará al siguiente')
                e=e+1
                
            except (KeyError, ElementNotInteractableException) as error:
                print("Error:",error)
                print('No se puede obtener la data del item N '+str(i + 1)+'se pasará al siguiente')
                e=e+1
                self.driver.execute_script("window.history.go(-1)");
                
            except Exception as error:
                print("Error:", error)
                e = e+1
                
            i = i + 1
            if i == len(ropa):
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                sleep(7)
                ropa = self.driver.find_elements(By.XPATH, '//*[@class="xt7dq6l xl1xv1r x6ikm8r x10wlt62 xh8yej3"]')
            del self.driver.requests
            sleep(3)
            if i == 5:
                break
            print('-------------------------------------------------------------------')
        fb_mkp_ropa_time["Cantidad"]= i - e
        print("Se halló", e, "errores")
        print('-------------------------------------------------------------------')
        print('Fin de la extraccion')
        print('-------------------------------------------------------------------')
    
    def extraer_datos(self, item, fecha_extraccion):
        fb_mkp_ropa["titulo_marketplace"].append(item['target'].get('marketplace_listing_title'))
        fb_mkp_ropa["tiempo_creacion"].append(item['target'].get('creation_time'))
        fb_mkp_ropa["tipo_delivery"].append(item['target'].get('delivery_types', [None])[0])
        fb_mkp_ropa["delivery_data"].append(item['target'].get("delivery_data", {}).get('carrier'))
        fb_mkp_ropa["delivery_direccion"].append(item['target'].get("delivery_data", {}).get('delivery_address'))
        fb_mkp_ropa["descripcion"].append(item['target'].get('redacted_description', {}).get('text'))
        fb_mkp_ropa["disponible"].append(item['target'].get('is_live'))
        fb_mkp_ropa["vendido"].append(item['target'].get('is_sold'))
        fb_mkp_ropa["fecha_union_vendedor"].append(item['target'].get('marketplace_listing_seller', {}).get('join_time'))
        fb_mkp_ropa["cantidad"].append(item['target'].get('listing_inventory_type'))
        fb_mkp_ropa["precio"].append(item['target'].get('listing_price', {}).get('amount'))
        fb_mkp_ropa["tipo_moneda"].append(item['target'].get('listing_price', {}).get('currency'))
        fb_mkp_ropa["amount_with_concurrency"].append(item['target'].get('listing_price', {}).get('amount_with_offset_in_currency'))
        fb_mkp_ropa["latitud"].append(item['target'].get('location', {}).get('latitude'))
        fb_mkp_ropa["longitud"].append(item['target'].get('location', {}).get('longitude'))
        fb_mkp_ropa["locacion_id"].append(item['target'].get('location_vanity_or_id'))
        fb_mkp_ropa["name_vendedor"].append(item['target'].get('story', {}).get('actors', [{}])[0].get('name'))
        fb_mkp_ropa["tipo_vendedor"].append(item['target'].get('story', {}).get('actors', [{}])[0]['__typename'])
        fb_mkp_ropa["id_vendedor"].append(item['target'].get('story', {}).get('actors', [{}])[0]['id'])
        fb_mkp_ropa["locacion"].append(item['target'].get('location_text', {}).get('text'))
        fb_mkp_ropa["Fecha Extraccion"].append(fecha_extraccion)
        
    def guardar_datos(self):
        df_fb_mkp_ropa = pd.DataFrame(fb_mkp_ropa)
        df_fb_mkp_ropa.fillna("null")
        fb_mkp_ropa_time["Cantidad"] = len(df_fb_mkp_ropa)
        datetime_obj = datetime.strptime(fb_mkp_ropa_time["Fecha"],"%Y-%m-%d")
        path = "Data/" + datetime_obj.strftime('%d-%m-%Y') + "/"
        filename = "fb_ropa_" + datetime_obj.strftime('%d%m%Y') + "_" + str(fb_mkp_ropa_time["Cantidad"]) + ".xlsx"
        if not os.path.exists(path):
            os.mkdir(path)
        df_fb_mkp_ropa.to_excel(path + filename, index = False)
        
    def guardar_tiempos(self):
        filename = 'Tiempos.xlsx'
        tiempos = load_workbook(filename)
        sheet_name = "Ropa"
        header_exist = True
        if sheet_name not in [ws.title for ws in tiempos.worksheets]:
            tiempos.create_sheet(sheet_name)
            header_exist = False
        worksheet = tiempos[sheet_name]
        if not header_exist:
            worksheet.append(list(fb_mkp_ropa_time.keys()))
        worksheet.append(list(fb_mkp_ropa_time.values()))
        tiempos.save(filename)
        tiempos.close()

In [28]:
def set_params_inicio():
    fb_mkp_ropa_time["Fecha"] = str(datetime.now().date() - timedelta(days=1))
    start = time()
    fb_mkp_ropa_time["Hora Inicio"] = strftime("%H:%M:%S", localtime(start))
    print("Hora de inicio:",fb_mkp_ropa_time["Hora Inicio"])
    return start

def set_params_final(start):
    end = time()
    fb_mkp_ropa_time["Hora Termino"] = strftime("%H:%M:%S", localtime(end))
    print("Hora Termino:",fb_mkp_ropa_time["Hora Termino"])
    total = end - start
    print("Duracion: ",total, 'seconds')
    fb_mkp_ropa_time["Tiempo(HHMMSS)"] = str(timedelta(seconds=total)).split(".")[0]
    fb_mkp_ropa_time["Productos/min"] = int(fb_mkp_ropa_time["Cantidad"]/(total / 60))

In [29]:
def main():
    start = set_params_inicio()
    scraper = ScraperFb()
    url = 'https://www.facebook.com/'
    load_dotenv()
    scraper.iniciar_sesion(url)
    scraper.mapear_datos()
    scraper.guardar_datos()
    set_params_final(start)
    scraper.guardar_tiempos()

In [25]:
if __name__ == '__main__':
    main()

Hora de inicio: 19:31:44
Scrapeando item 1
Error: 'marketplace_product_details_page'
No se puede obtener la data del item N 1se pasará al siguiente
-------------------------------------------------------------------
Scrapeando item 2
time.struct_time(tm_year=2023, tm_mon=1, tm_mday=6, tm_hour=19, tm_min=30, tm_sec=38, tm_wday=4, tm_yday=6, tm_isdst=0)
Vestido Semibolsillo
-------------------------------------------------------------------
Scrapeando item 3
time.struct_time(tm_year=2023, tm_mon=1, tm_mday=6, tm_hour=19, tm_min=29, tm_sec=33, tm_wday=4, tm_yday=6, tm_isdst=0)
Top con detalle al frente nuevo
-------------------------------------------------------------------
Scrapeando item 4
time.struct_time(tm_year=2023, tm_mon=1, tm_mday=6, tm_hour=19, tm_min=28, tm_sec=42, tm_wday=4, tm_yday=6, tm_isdst=0)
Zapatillas de Fútbol talla 36 - original
-------------------------------------------------------------------
Scrapeando item 5
time.struct_time(tm_year=2023, tm_mon=1, tm_mday=6, tm