In [49]:
import curl_cffi
from curl_cffi import requests
import json
import pandas as pd
import numpy as np
import requests
import scraper_functions
from scraper_functions import VPNTester, ScraperClient
import openpyxl
import pydantic_model
from pydantic_model import categories, products
import pydantic
from pydantic import ValidationError

import random
from datetime import datetime
from zoneinfo import ZoneInfo
from pathlib import Path

##### VPN Checker

In [50]:
# Run the VPN check before any request is sent to the target API; the cell
# output reports the public IP currently in use.
tester = VPNTester()
tester.vpn_check()

[🛡️  VPN Check] Your current IP: 185.239.150.6


##### Fixing impersonate agent for the session

In [51]:
# Draw one impersonation profile at random and keep it in a constant so every
# request made later in this notebook reuses the same value.
FIXED_IMPERSONATE = random.choice(scraper_functions.IMPERSONATE_OPTIONS)
print(FIXED_IMPERSONATE)

safari172_ios


## Product Categories

##### Get json

In [52]:
# Classification (category) listing endpoint.
# NOTE(review): take=-1 presumably asks for every entry in one page — confirm.
url = "https://api.app.biggie.com.py/api/classifications/web?take=-1&storeType="

# One request, using the session-wide impersonation profile and a random delay.
scraper = ScraperClient(url, impersonate=FIXED_IMPERSONATE, random_wait=True)
json_response = scraper.get_json()

[⏱️] Waiting 5.50s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios


##### Saving raw json file

In [53]:
# Persist the category payload before any parsing is attempted.
# NOTE(review): the parse cell guards against a falsy response, which suggests
# get_json() can return None on failure — confirm. Fail with an explicit
# message in that case instead of an AttributeError raised by `.get`.
if json_response is None:
    raise RuntimeError(
        "No category response available; re-run the 'Get json' cell before saving."
    )

ScraperClient.save_json_raw(
    json_response.get("items", []),
    supermarket="biggie",
    subfolder='/workspaces/proyecto-tesis/outputs/biggie/categories',  # plain string: nothing to interpolate
    name='categories'
)

[💾] Archivo guardado en: /workspaces/proyecto-tesis/outputs/biggie/categories/biggie_categories_2025-07-17_20-14-59.json


##### Parse JSON to Model

In [54]:
# Validate the raw payload against the `categories` pydantic model.
if not json_response:
    # Stop here rather than leaving `categories_model` undefined (or stale from
    # a previous run) for the DataFrame cell that follows.
    raise ValueError("Empty category response: nothing to parse into the model.")

# The model class is referenced through its module because a later cell rebinds
# the bare name `categories` to an array of slugs; re-running this cell after
# that point would otherwise pass the array as `model_class`.
categories_model = ScraperClient.parse_json_to_model(
    json_data=json_response,
    model_class=pydantic_model.categories,
    supermarket='biggie'
)

##### Model -> Dataframe | Validation

In [55]:
# Dump each validated category to a plain dict, then build the frame in one go.
category_records = [category_entry.model_dump() for category_entry in categories_model]
df = pd.DataFrame(category_records)
df.head()

Unnamed: 0,id,name,slug,supermarket,ingestion_time
0,6,Alimentos Especiales,alimentos-especiales,biggie,2025-07-17 20:14:59.317960-03:00
1,1,Almacén,almacen,biggie,2025-07-17 20:14:59.317960-03:00
2,246,Asado,asado,biggie,2025-07-17 20:14:59.317960-03:00
3,41,Bebes,bebes,biggie,2025-07-17 20:14:59.317960-03:00
4,3,Bebidas con Alcohol,bebidas-con-alcohol,biggie,2025-07-17 20:14:59.317960-03:00


In [56]:
# Snapshot of the categories frame for manual inspection; with pandas' default
# settings the row index is written as the first column.
categories_csv_path = '/workspaces/proyecto-tesis/outputs/biggie/categories/biggie_categories_test.csv'
df.to_csv(categories_csv_path)

# Product Data

In [57]:
# Original sorting: unique category slugs in the order they appear in `df`.
# NOTE(review): this rebinds `categories`, which the import cell bound to the
# pydantic model class used by the "Parse JSON to Model" cell. Re-running that
# cell after this one would hand it this array instead of the class.
categories = df["slug"].unique() # append a slice such as [15:-3] for a small test subset
print(categories.size)
categories

20


array(['alimentos-especiales', 'almacen', 'asado', 'bebes',
       'bebidas-con-alcohol', 'bebidas-sin-alcohol', 'carniceria',
       'chocolates-y-golosinas', 'congelados', 'fiambreria',
       'fruteria-y-verduleria', 'heladeria-y-confiteria',
       'higiene-personal', 'lacteos', 'libreria', 'limpieza', 'mascotas',
       'panaderia', 'snacks', 'varios'], dtype=object)

In [58]:
# Shuffled: same slugs, visited in a random order that changes on every run
# (no seed is set).
# NOTE(review): like the cell above, this rebinds the imported `categories`
# model class to an array of slugs.
categories = df["slug"].unique() # append a slice such as [15:-3] for a small test subset
np.random.shuffle(categories)  # reorders the array in place
print(categories.size)
categories

20


array(['lacteos', 'panaderia', 'limpieza', 'alimentos-especiales',
       'almacen', 'bebidas-con-alcohol', 'fruteria-y-verduleria',
       'chocolates-y-golosinas', 'snacks', 'varios',
       'bebidas-sin-alcohol', 'libreria', 'carniceria', 'asado',
       'mascotas', 'heladeria-y-confiteria', 'fiambreria',
       'higiene-personal', 'congelados', 'bebes'], dtype=object)

#### Get JSON
4 min 47s for the last three categories //
1 min for the last category //
35 min 11s for a full scrape (1st test on 29/06)

In [None]:
# Crawl every category page by page and collect the raw product dicts.
#
# Loop temporaries have their own names (page_url / page_scraper /
# page_response) so this cell does not overwrite `url`, `scraper` and
# `json_response` from the categories section. Re-running those earlier cells
# after a crawl would otherwise save and parse the last product page as if it
# were category data.

# SUPER = "Biggie" # [Done at parsing to pydantic model]
NOW = datetime.now(ZoneInfo("America/Asuncion"))  # one ingestion timestamp shared by the whole crawl
# FIXED_IMPERSONATE = random.choice(scraper_functions.IMPERSONATE_OPTIONS) [Done at session level]

all_items = []
failed_categories = []  # categories whose crawl stopped early on a missing response
NumberResults = 24      # page size sent as `take`

for category in categories:
    skip_value = 0

    while True:
        page_url = (
            "https://api.app.biggie.com.py/api/articles"
            f"?take={NumberResults}&skip={skip_value}&classificationName={category}"
        )

        page_scraper = ScraperClient(
            url=page_url,
            impersonate=FIXED_IMPERSONATE,  # fixed for the whole session
            random_wait=True                 # simulates human behaviour
        )
        page_response = page_scraper.get_json()

        if page_response is None:
            # NOTE(review): get_json() presumably returns None when a request
            # fails (the categories section guards for a falsy response) —
            # confirm. Keep what has been collected, record the gap and move
            # on, instead of aborting the whole crawl with an AttributeError.
            print(f"[⚠️] No response for '{category}' at skip={skip_value}; category may be incomplete.")
            failed_categories.append(category)
            break

        items = page_response.get("items", [])

        # An empty page marks the end of the category.
        if not items:
            break

        for item in items:
            item["category"] = category
            # item["supermarket"] = SUPER
            item["ingestion_time"] = NOW

        all_items.extend(items)
        skip_value += NumberResults


[⏱️] Waiting 3.64s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 7.74s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 6.02s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 9.42s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 6.48s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 6.99s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 4.08s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 5.07s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 6.43s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 2.84s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 4.73s to simulate human behavior...
[🎭 Impersonation] Using: safari172_ios
[⏱️] Waiting 5.72s to simulate h

##### Saving raw json file

In [None]:
# Write the collected product records (API items plus the category and
# ingestion_time tags added during the crawl) to disk.
ScraperClient.save_json_raw(
    data=all_items,
    name='products',
    supermarket="biggie",
    subfolder='/workspaces/proyecto-tesis/outputs/biggie/products',
)

[💾] Archivo guardado en: /workspaces/proyecto-tesis/outputs/biggie/products/biggie_products_2025-07-16_23-48-30.json


##### Parse JSON to Model

In [None]:
# Manual JSON Load — currently disabled. The loader is wrapped in a bare string
# literal, so nothing is read from disk; the notebook only echoes the string as
# this cell's output. Remove the triple quotes to reload a previous crawl into
# `all_items` instead of scraping again.
# NOTE(review): the path below contains an extra 'biggie/' folder compared with
# the subfolder the save cell writes to — confirm before enabling.
'''
with open('/workspaces/proyecto-tesis/outputs/biggie/products/biggie/biggie_products_2025-06-29_15-21-52.json', 'r') as f:
        all_items = json.load(f)

all_items
'''

"\nwith open('/workspaces/proyecto-tesis/outputs/biggie/products/biggie/biggie_products_2025-06-29_15-21-52.json', 'r') as f:\n        all_items = json.load(f)\n\nall_items\n"

In [None]:
# Validate the scraped items against the `products` pydantic model.
if not all_items:
    # Stop here rather than leaving `products_model` undefined (or stale from a
    # previous run) for the DataFrame cell that follows.
    raise ValueError("No product items collected: nothing to parse into the model.")

products_model = ScraperClient.parse_json_to_model(
    json_data=all_items,
    model_class=products,
    supermarket='biggie'
)

##### Model -> Dataframe | Validation

In [None]:
# Dump each validated product to a plain dict, then build the frame in one go.
# NOTE: this rebinds `df`, replacing the categories frame built earlier.
product_records = [product_entry.model_dump() for product_entry in products_model]
df = pd.DataFrame(product_records)
df.head()

Unnamed: 0,code,name,price,category,supermarket,ingestion_time
0,60168,Costilla de Primera Biggie Envasado al Vacio x Kg,38950,carniceria,biggie,2025-07-16 23:10:41.005708-03:00
1,60136,Carne Vacio Biggie x kg.,52450,carniceria,biggie,2025-07-16 23:10:41.005708-03:00
2,60133,Costilla Especial Biggie envasado al vacio x kg.,40450,carniceria,biggie,2025-07-16 23:10:41.005708-03:00
3,60140,Carne falda Biggie x kg.,31250,carniceria,biggie,2025-07-16 23:10:41.005708-03:00
4,60161,Costilla Ancha Biggie envasado al vacio x kg.,41950,carniceria,biggie,2025-07-16 23:10:41.005708-03:00


In [None]:
# Snapshot of the products frame for manual inspection; with pandas' default
# settings the row index is written as the first column.
products_csv_path = '/workspaces/proyecto-tesis/outputs/biggie/products/biggie_products_test.csv'
df.to_csv(products_csv_path)