In [7]:
import pdfplumber
import pandas as pd
import re
from pathlib import Path

In [8]:
# Input PDFs and output ("bronze") CSV locations, relative to the notebook.
pdf_dir = Path("../data/raw_pdfs")
bronze_dir = Path("../data/bronze")
# parents=True so the output folder is created even when ../data is missing.
bronze_dir.mkdir(parents=True, exist_ok=True)

# glob() yields entries in arbitrary, filesystem-dependent order; sorting makes
# the PDF selected below the same on every run and every machine.
pdf_files = sorted(pdf_dir.glob("*.pdf"))
if not pdf_files:
    raise FileNotFoundError("No hay PDFs en raw_pdfs/")
# Only the first PDF (in sorted name order) is processed by this notebook.
pdf_file = pdf_files[0]

print(f"ðŸ“„ Procesando: {pdf_file.name}")

ðŸ“„ Procesando: 2025-10-03.pdf


In [17]:
# Lines containing any of these words are treated as table headers and skipped.
# NOTE(review): this also drops any product line that contains the word
# "unidad" (e.g. as its presentation) -- confirm against the PDF layout.
_HEADER_RE = re.compile(r'precio|unidad|calidad|nombre', re.IGNORECASE)

# A price is "$" followed by a digit, a dot-thousands/comma-decimal amount
# ("1.234,56"), or a comma-thousands amount ("16,000"). Bare numbers such as a
# quantity "50" deliberately do NOT count as prices.
_PRICE_RE = re.compile(r'\$\s?\d|(?:\d{1,3}(?:\.\d{3})*,\d{2})|(?:\d{1,3}(?:,\d{3})+)')

# Output schema; passed to DataFrame so an empty result still has these columns.
_COLUMNS = [
    "producto",
    "presentacion",
    "unidad",
    "precio_extra",
    "precio_primera",
    "precio_unidad",
    "variacion",
]


def _parse_price_line(line):
    """Turn one whitespace-normalised text line into a row dict.

    Args:
        line: A single line of page text with runs of whitespace already
            collapsed to one space.

    Returns:
        A dict keyed by ``_COLUMNS``, or ``None`` when the line is a header
        line or contains no token that looks like a price.
    """
    if _HEADER_RE.search(line):
        return None

    # Glue a detached currency sign to its amount ("$ 16,000" -> "$16,000") so
    # the price is a single token and does not shift the positional columns.
    line = re.sub(r'\$\s+(?=\d)', '$', line)
    parts = line.split()

    # Index of the first token that looks like a price.
    price_idx = next(
        (idx for idx, token in enumerate(parts) if _PRICE_RE.search(token)),
        None,
    )
    if price_idx is None:
        return None

    # Columns are located by position relative to the first price: the token
    # right before it is ignored, the two before that are taken as "unidad"
    # and "presentacion", and whatever precedes them is the product name.
    producto = " ".join(parts[:price_idx - 3]) if price_idx > 3 else parts[0]
    presentacion = parts[price_idx - 3] if price_idx > 2 else None
    unidad = parts[price_idx - 2] if price_idx > 1 else None

    # Every price-looking token from the first price onwards, in order.
    precios = [token for token in parts[price_idx:] if _PRICE_RE.search(token)]
    # The last token is kept as the variation label only when it has no digits.
    variacion = parts[-1] if not re.search(r'\d', parts[-1]) else None

    return {
        "producto": producto.strip(),
        "presentacion": presentacion,
        "unidad": unidad,
        "precio_extra": precios[0] if len(precios) > 0 else None,
        # Second price column (previously read index 2, duplicating precio_unidad).
        "precio_primera": precios[1] if len(precios) > 1 else None,
        "precio_unidad": precios[2] if len(precios) > 2 else None,
        "variacion": variacion,
    }


rows = []

with pdfplumber.open(pdf_file) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if not text:
            # Page produced no extractable text; nothing to parse.
            continue

        for raw_line in text.split('\n'):
            line = re.sub(r'\s+', ' ', raw_line.strip())
            if not line:
                continue
            row = _parse_price_line(line)
            if row is not None:
                rows.append(row)

df = pd.DataFrame(rows, columns=_COLUMNS)
display(df)

Unnamed: 0,producto,presentacion,unidad,precio_extra,precio_primera,precio_unidad,variacion
0,ALAS DE POLLO,KILO,1,"$16,000","$16,000","$16,000",Estable
1,MENUDENCIAS,KILO,1,"$7,000","$7,000","$7,000",Estable
2,PECHUGA DE POLLO,KILO,1,"$17,500","$17,500","$17,500",Estable
3,PERNILES DE POLLO,KILO,1,"$15,000","$15,000","$15,000",Estable
4,POLLO SIN VICERAS,KILO,1,"$21,000","$21,000","$21,000",Estable
...,...,...,...,...,...,...,...
162,QUESO CAMPESINO,KILO,1,"$21,000","$21,000","$21,000",Estable
163,QUESO COSTE,KILO,1,"$24,000","$24,000","$24,000",Estable
164,QUESO DOBLE CREMA,KILO,1,"$22,000","$22,000","$22,000",Estable
165,QUESO PAIPA,KILO,1,"$35,000","$35,000","$35,000",Estable


In [18]:
def parse_price(value):
    """Convert a price as printed in the PDF into a number.

    Accepted text forms (currency signs and any other characters are ignored):
      * comma or dot thousands separators: "$16,000", "16.000" -> 16000
      * dot thousands with a two-digit comma decimal: "1.234,50" -> 1234.5

    Args:
        value: The raw cell value. Strings are parsed; values that are already
            numeric are passed through so the conversion can be re-applied to
            an already-converted column without wiping it.

    Returns:
        An ``int`` for whole amounts, a ``float`` when there is a non-zero
        decimal part, or ``None`` when no number can be extracted (including
        ``None``, NaN, booleans and strings without digits).
    """
    import math
    import numbers

    if isinstance(value, bool):
        # bool is an Integral subtype, but True/False are not prices.
        return None
    if isinstance(value, numbers.Integral):
        return int(value)
    if isinstance(value, numbers.Real):
        number = float(value)
        if not math.isfinite(number):
            # NaN is how pandas represents a missing value in a float column.
            return None
        return int(number) if number.is_integer() else number
    if not isinstance(value, str):
        return None

    cleaned = re.sub(r'[^\d.,]', '', value)

    # A trailing ",dd" (exactly two digits) is a decimal part, not a thousands
    # group; blindly dropping the comma would inflate the value a hundredfold.
    decimal_form = re.fullmatch(r'(\d{1,3}(?:\.\d{3})*|\d+),(\d{2})', cleaned)
    if decimal_form:
        whole = int(decimal_form.group(1).replace('.', ''))
        cents = int(decimal_form.group(2))
        return whole if cents == 0 else whole + cents / 100

    # Otherwise every separator is a thousands separator: keep the digits only.
    digits = re.sub(r'\D', '', cleaned)
    return int(digits) if digits else None

# Every column whose name contains "precio" holds a textual price; replace
# each one with the values returned by parse_price, then preview the result.
price_cols = [name for name in df.columns if 'precio' in name]
for name in price_cols:
    converted = df[name].map(parse_price)
    df[name] = converted
display(df.head(10))

Unnamed: 0,producto,presentacion,unidad,precio_extra,precio_primera,precio_unidad,variacion
0,ALAS DE POLLO,KILO,1,16000,16000,16000,Estable
1,MENUDENCIAS,KILO,1,7000,7000,7000,Estable
2,PECHUGA DE POLLO,KILO,1,17500,17500,17500,Estable
3,PERNILES DE POLLO,KILO,1,15000,15000,15000,Estable
4,POLLO SIN VICERAS,KILO,1,21000,21000,21000,Estable
5,BAGRE DORADO,KILO,1,26000,26000,26000,Estable
6,BAGRE PINTADO,KILO,1,24000,24000,24000,Estable
7,BLANQUILLO GALLEGO,KILO,1,16000,16000,16000,Estable
8,BOCA CHICO,KILO,1,17600,17600,17600,Estable
9,CACHAMA,KILO,1,12000,12000,12000,Estable


In [None]:
# Write the table to the bronze folder as "<source PDF stem>.csv", without the
# DataFrame index column.
csv_path = bronze_dir / f"{pdf_file.stem}.csv"
df.to_csv(csv_path, index=False)
print(f"ðŸ’¾ Guardado: {csv_path}")

ðŸ’¾ Guardado: ../data/bronze/2025-10-03.csv
