<a href="https://colab.research.google.com/github/Juan-Draghi/relevamiento-boletin-oficial-caba/blob/main/Busqueda_Boletin_Oficial_CABA_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📄 Análisis del Boletín Oficial de CABA
Este notebook procesa **PDF(s) del Boletín Oficial de la Ciudad de Buenos Aires** que vos subas manualmente.
Realiza una **búsqueda por términos clave y patrones de pertinencia (verbos de acción normativa)** y genera un **Excel** con:
1) **Pertinentes (keywords+patrones)**: páginas donde coexisten *al menos un keyword* y *al menos un patrón de pertinencia*.
2) **Todos los hallazgos**: páginas donde aparece *algún keyword* **o** *algún patrón de pertinencia*.

## 🔧 Instalación de dependencias

In [None]:
!pip install -qq pdfplumber openpyxl tqdm python-dotenv

## 📦 Importaciones y configuración

In [None]:
import io
import os
import re
import json
import zlib
import hashlib
import unicodedata
import pdfplumber
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

def normalize_spaces(s: str) -> str:
    """Collapse each whitespace run in *s* to one space and trim both ends.

    A falsy *s* (``None`` or ``""``) yields the empty string.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def clean_text(s: str) -> str:
    """Return *s* flattened to a single, whitespace-normalised line.

    Steps, in order:
      1. NFKC normalisation (e.g. the U+FB01 ligature becomes "fi").
      2. NUL and NBSP become spaces; BOM, soft hyphens and zero-width
         characters are dropped.
      3. A word character, a hyphen, whitespace and another word character
         are joined, undoing end-of-line hyphenation.
      4. Line breaks become spaces and whitespace runs are collapsed.

    A falsy input yields ``""``.
    """
    if not s:
        return ""
    text = unicodedata.normalize("NFKC", s)
    # (character, replacement) pairs, applied in this order.
    for junk, substitute in (
        ("\x00", " "),
        ("\xa0", " "),
        ("\ufeff", ""),
        ("\u00ad", ""),  # soft hyphen
    ):
        text = text.replace(junk, substitute)
    text = re.sub(r"[\u200B-\u200D\u2060]", "", text)  # zero-width characters
    # "Fi-\njar" -> "Fijar"
    text = re.sub(r"(\w)-\s+(\w)", r"\1\2", text)
    text = text.replace("\r", " ").replace("\n", " ")
    return normalize_spaces(text)

def extract_snippet(text, start_idx, window=220):
    """Return the stripped slice of *text* within *window* characters of *start_idx*.

    The slice is clamped to the bounds of *text* on both sides.
    """
    lo = start_idx - window
    if lo < 0:
        lo = 0
    hi = start_idx + window
    if hi > len(text):
        hi = len(text)
    return text[lo:hi].strip()

## ⬆️ Subí el/los PDF(s) del Boletín Oficial

In [None]:
# In Colab, run this cell and pick one or more Boletín Oficial PDFs.
try:
    from google.colab import files
    uploaded = files.upload()
    # Keep only the uploaded names that look like PDFs (case-insensitive).
    pdf_paths = [name for name in uploaded if name.lower().endswith(".pdf")]
    if len(pdf_paths) == 0:
        raise ValueError("No se subieron archivos PDF.")
except Exception as exc:
    # Reached both when Colab is unavailable and when no PDF was uploaded:
    # warn and leave the list empty so it can be set by hand for local runs.
    print("Advertencia:", exc)
    pdf_paths = []
    # Local example:
    # pdf_paths = ["/path/a/tu/boletin.pdf"]

## 🔎 Listas de búsqueda: keywords y patrones de pertinencia

In [None]:
# Search terms. Entries without regex metacharacters are matched as literal,
# case-insensitive substrings; the raw-string entries are regular expressions
# compiled with re.IGNORECASE.
# In the regex entries, "\.?" makes the thousands separator optional, so both
# "6.099" and "6099" match. Inside a raw string the dot must be escaped with a
# single backslash: a doubled one would require a literal "\" in the text.
keywords = [
    "Código Urbanístico",
    "Código de Edificación",
    "Código de Habilitaciones",
    "compendio normativo",
    "Reglamentos Técnicos",
    "Reglamento Técnico",
    "Autorización de actividad económica",
    "Autorización de actividades económicas",
    "Autorizaciones de actividades económicas",
    "Impacto ambiental",
    "Ley Marco de Regulación de Actividades Económicas de la Ciudad Autónoma de Buenos Aires",
    "Ley Tarifaria",
    "publicidad exterior",
    "Unidad Tarifaria",
    "Sistema de Autoprotección",
    "Sistemas de Autoprotección",
    "Catastro",
    "Derecho para el Desarrollo Urbano y el Hábitat Sustentable",
    "Código Fiscal",
    "Área Céntrica",
    "planos de mensura",
    "obras en contravención",
    "UERESGP",
    r"[Dd]isposici[oó]n(?: [Nn]°?)? ?3\.?500(?:[-/]?GCABA)?[-/]?DGOEP[-/]?16",
    r"[Dd]isposici[oó]n(?: [Nn]°?)? ?331(?:[-/]?GCABA)?[-/]?DGDCIV[-/]?25",
    r"[Dd]isposici[oó]n(?: [Nn]°?)? ?89(?:[-/]?GCABA)?[-/]?DGROC[-/]?24",
    r"[Dd]isposici[oó]n(?: [Nn]°?)? ?526(?:[-/]?GCABA)?[-/]?DGFYCO[-/]?24",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?275(?:[-/]?GCABA)?[-/]?APRA[-/]?23",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?188(?:[-/]?GCABA)?[-/]?SSGU[-/]?24",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?160(?:[-/]?GCABA)?[-/]?SSHA[-/]?24",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?96(?:[-/]?GCABA)?[-/]?AGC[-/]?25",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?345(?:[-/]?GCABA)?[-/]?AGC[-/]?21",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?103(?:[-/]?GCABA)?[-/]?APRA[-/]?25",
    r"[Rr]esoluci[oó]n(?: [Nn]°?)? ?1(?:[-/]?GCABA)?[-/]?MEPHUGC[-/]?25",
    r"[Dd]ecreto(?: [Nn]°?)? ?51/18",
    r"[Dd]ecreto(?: [Nn]°?)? ?86/19",
    r"[Dd]ecreto(?: [Nn]°?)? ?87/19",
    r"[Dd]ecreto(?: [Nn]°?)? ?99/19",
    r"[Dd]ecreto(?: [Nn]°?)? ?105/19",
    r"[Dd]ecreto(?: [Nn]°?)? ?475/20",
    r"[Dd]ecreto(?: [Nn]°?)? ?129/25",
    r"[Dd]ecreto(?: [Nn]°?)? ?116/25",
    r"[Dd]ecreto(?: [Nn]°?)? ?164/25",
    r"[Dd]ecreto(?: [Nn]°?)? ?189/25",
    r"[Ll]ey(?: [Nn]°?)? ?123",
    r"[Ll]ey(?: [Nn]°?)? ?2\.?936",
    r"[Ll]ey(?: [Nn]°?)? ?5\.?920",
    r"[Ll]ey(?: [Nn]°?)? ?6\.?101",
    r"[Ll]ey(?: [Nn]°?)? ?6\.?776",
    r"[Ll]ey(?: [Nn]°?)? ?6\.?779",
    r"[Ll]ey(?: [Nn]°?)? ?6\.?099",
    r"[Ll]ey(?: [Nn]°?)? ?6\.?100",
]

# Normative-action verbs used to judge relevance. Every entry is a
# whole-word regex (\b...\b); the accented imperative forms are written with
# their real accented letters so they match the extracted text.
patrones_pertinencia = [
    # Modify
    r"\bmodifica\b", r"\bmodificar\b", r"\bmodifícase\b",
    # Repeal
    r"\bderoga\b", r"\bderogar\b", r"\bderógase\b",
    # Approve (both formulas)
    r"\baprueba\b", r"\baprobar\b", r"\bapruébese\b", r"\bapruébase\b",
    # Leave without effect
    r"\bdeja sin efecto\b", r"\bdejar sin efecto\b", r"\bdéjase sin efecto\b",
    # Substitute
    r"\bsustituye\b", r"\bsustituir\b", r"\bsustitúyese\b", r"\bsustitúyase\b",
    # Establish
    r"\bestablece\b", r"\bestablecer\b", r"\bestablécese\b", r"\bestablécase\b",
    # Set / fix (includes the "Fíjase" formula)
    r"\bfija\b", r"\bfijar\b", r"\bfíjese\b", r"\bfíjase\b",
    # Determine
    r"\bdetermina\b", r"\bdeterminar\b", r"\bdetermínase\b", r"\bdetermínese\b",
    # Regulate (both formulas)
    r"\breglamenta\b", r"\breglamentar\b", r"\breglaméntese\b", r"\breglaméntase\b", r"\breglamentación\b",
    # Extend
    r"\bprorroga\b", r"\bprorrogar\b", r"\bprorrógase\b", r"\bprorrógese\b",
    # Incorporate / create
    r"\bincorpora\b", r"\bincorporar\b", r"\bincorpórase\b", r"\bincorpórese\b",
    r"\bcrea\b", r"\bcrear\b", r"\bcréase\b", r"\bcréese\b",
    # Other common formulas
    r"\bdeclara\b", r"\bdeclarar\b", r"\bdeclárase\b", r"\bdeclárese\b",
    r"\botorga\b", r"\botorgar\b", r"\botórgase\b", r"\botórguese\b",
    r"\brectifica\b", r"\brectificar\b", r"\brectifícase\b", r"\brectifíquese\b",
]
patrones_pertinencia_comp = [re.compile(p, re.IGNORECASE) for p in patrones_pertinencia]

def es_patron_regex(s: str) -> bool:
    """Tell whether keyword *s* should be handled as a regular expression.

    Minimal heuristic: *s* counts as a regex when it contains a backslash,
    a square bracket, a parenthesis, or one of ``? + * |``; otherwise it is
    treated as literal text.
    """
    return re.search(r"[\\\[\]\(\)\?\+\*\|]", s) is not None

# Split the keywords into regex-like and literal ones, preserving their order.
keywords_regex = list(filter(es_patron_regex, keywords))
keywords_simples = [term for term in keywords if not es_patron_regex(term)]

# Pre-compile the relevance patterns and the regex keywords (case-insensitive).
patrones_pertinencia_comp = [
    re.compile(patron, re.IGNORECASE) for patron in patrones_pertinencia
]
keywords_regex_comp = [
    re.compile(patron, re.IGNORECASE) for patron in keywords_regex
]

## ▶️ Procesamiento y exportación

In [None]:
# ---------- CONFIG ----------
WINDOW_WORDS = 300          # words kept on each side of a keyword hit
MERGE_GAP_WORDS = 100       # context windows at most this many words apart are merged
SIMHASH_BITS = 64           # SimHash fingerprint width, in bits
SIMHASH_THRESH = 3          # max Hamming distance at which two extracts count as duplicates
# Cache location: the BO_CACHE_DIR environment variable when set, otherwise
# the Colab default path.
CACHE_DIR = os.environ.get("BO_CACHE_DIR", "/content/bo_cache")
try:
    os.makedirs(CACHE_DIR, exist_ok=True)
except OSError:
    # The Colab path could not be created (e.g. a local run without write
    # access to "/content"): fall back to the system temp directory.
    import tempfile
    CACHE_DIR = os.path.join(tempfile.gettempdir(), "bo_cache")
    os.makedirs(CACHE_DIR, exist_ok=True)

# ---------- NORMALIZATION ----------
def normalize_spaces(s: str) -> str:
    """Collapse whitespace runs in *s* to single spaces and strip both ends.

    A falsy *s* yields the empty string.
    """
    return re.sub(r"\s+", " ", s or "").strip()


# Unicode hyphen/dash variants (U+2010..U+2014) folded to ASCII "-". They are
# written as escapes so the table never depends on how this source file is
# encoded or decoded; a mis-decoded literal here would silently map ordinary
# accented letters to "-".
_DASH_TO_ASCII = str.maketrans(dict.fromkeys("\u2010\u2011\u2012\u2013\u2014", "-"))


def clean_text(s: str) -> str:
    """Return *s* flattened to a single, whitespace-normalised line.

    Steps, in order:
      1. NFKC normalisation.
      2. NUL and NBSP become spaces; BOM, soft hyphens and zero-width
         characters are dropped.
      3. A word character, an ASCII hyphen, whitespace and another word
         character are joined ("Fi-\\njar" -> "Fijar").
      4. Line breaks become spaces.
      5. Unicode hyphens and dashes are replaced by "-".
      6. Whitespace runs are collapsed and the ends trimmed.

    A falsy input yields ``""``.
    """
    if not s:
        return ""
    s = unicodedata.normalize("NFKC", s)
    s = (
        s.replace("\x00", " ")
        .replace("\xa0", " ")
        .replace("\ufeff", "")
        .replace("\u00ad", "")  # soft hyphen
    )
    s = re.sub(r"[\u200B-\u200D\u2060]", "", s)  # zero-width characters
    s = re.sub(r"(\w)-\s+(\w)", r"\1\2", s)
    s = s.replace("\r", " ").replace("\n", " ")
    s = s.translate(_DASH_TO_ASCII)
    return normalize_spaces(s)

# ---------- TOKENIZATION ----------
def tokenize_with_spans(text):
    """Split *text* on whitespace, keeping each token's character offsets.

    Returns a list of ``(token, start, end)`` tuples in order of appearance,
    where ``text[start:end] == token``.
    """
    return [(hit.group(), *hit.span()) for hit in re.finditer(r"\S+", text)]

def charpos_to_word_index(spans, pos):
    """Map character offset *pos* to the index of the token containing it.

    *spans* holds ``(token, start, end)`` tuples. When no token contains
    *pos*, the index of the token whose start is closest to *pos* is
    returned (the earliest one on ties). An empty *spans* yields 0.
    """
    if not spans:
        return 0
    # Plain linear scan for the token whose [start, end) range holds pos.
    for idx, (_token, begin, finish) in enumerate(spans):
        if begin <= pos < finish:
            return idx
    # No containing token: pick the nearest token start.
    nearest = 0
    nearest_dist = abs(spans[0][1] - pos)
    for idx, span in enumerate(spans):
        dist = abs(span[1] - pos)
        if dist < nearest_dist:  # strict "<" keeps the earliest on ties
            nearest, nearest_dist = idx, dist
    return nearest

# ---------- SIMHASH DEDUP ----------
def _hash64(x: str) -> int:
    """Return a deterministic 64-bit integer hash of *x*.

    The value is the 8-byte BLAKE2b digest of the UTF-8 encoding of *x*,
    read as a big-endian unsigned integer.
    """
    digest = hashlib.blake2b(x.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, "big")

def simhash(text: str, bits: int = SIMHASH_BITS) -> int:
    """Compute a SimHash fingerprint of *text* as an integer of *bits* bits.

    Tokens are the lower-cased runs of three or more word characters. Each
    token's ``_hash64`` value votes +1 or -1 on every bit position; a bit of
    the result is set when its tally is zero or positive, so a text with no
    tokens yields all bits set.
    """
    tally = [0] * bits
    for word in re.findall(r"\w{3,}", text.lower()):
        digest = _hash64(word)
        for pos in range(bits):
            tally[pos] += 1 if (digest >> pos) & 1 else -1
    # Every position contributes a distinct power of two, so summing them is
    # the same as OR-ing the individual bits together.
    return sum(1 << pos for pos, votes in enumerate(tally) if votes >= 0)

def hamming(a: int, b: int) -> int:
    """Return the number of bit positions in which *a* and *b* differ."""
    differing = a ^ b
    return bin(differing).count("1")

def dedup_by_simhash(rows, text_key="Extracto", keep="max_score"):
    """Drop near-duplicate rows by comparing SimHash fingerprints.

    Rows are visited in order and fingerprinted on ``row[text_key]`` (missing
    or falsy values count as ""). A row within ``SIMHASH_THRESH`` bits
    (Hamming distance) of an already kept row is a duplicate of the first
    such row. With ``keep="max_score"`` a duplicate whose "Score" is strictly
    higher replaces the kept row, and its fingerprint, in place; with any
    other value of *keep* the earlier row always wins.

    Returns the list of surviving rows.
    """
    survivors = []
    fingerprints = []  # fingerprints[i] is the fingerprint of survivors[i]
    for row in rows:
        fp = simhash(row.get(text_key, "") or "")
        match = next(
            (i for i, seen in enumerate(fingerprints)
             if hamming(fp, seen) <= SIMHASH_THRESH),
            None,
        )
        if match is None:
            survivors.append(row)
            fingerprints.append(fp)
        elif keep == "max_score" and row.get("Score", 0) > survivors[match].get("Score", 0):
            survivors[match] = row
            fingerprints[match] = fp
    return survivors

# ---------- REGEX COMPILATION (ONCE) ----------
patrones_pertinencia_comp = [
    re.compile(patron, flags=re.IGNORECASE) for patron in patrones_pertinencia
]

def es_patron_regex(s: str) -> bool:
    """Return True when *s* contains a character that marks it as a regex.

    The marker characters are the backslash, square brackets, parentheses
    and ``? + * |``.
    """
    marker = re.search(r"[\\\[\]\(\)\?\+\*\|]", s)
    return marker is not None

# Regex-like keywords are compiled case-insensitively; literal ones are also
# kept lower-cased for fast substring search.
keywords_regex = [expr for expr in keywords if es_patron_regex(expr)]
keywords_simples = [term for term in keywords if not es_patron_regex(term)]
keywords_regex_comp = [re.compile(expr, flags=re.IGNORECASE) for expr in keywords_regex]
keywords_simples_lower = [term.lower() for term in keywords_simples]

# ---------- CACHE ----------
def pdf_sha256(path: str) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks, so it is never loaded whole.
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        while block_bytes := stream.read(1 << 20):
            hasher.update(block_bytes)
    return hasher.hexdigest()

def cache_path(pdf_hash: str) -> str:
    """Return the path of the JSON cache file for *pdf_hash* inside CACHE_DIR."""
    filename = pdf_hash + ".json"
    return os.path.join(CACHE_DIR, filename)

def load_cache(pdf_hash: str):
    """Load the cached scan for *pdf_hash*, or return ``None`` if unusable.

    ``None`` is returned when the cache file does not exist and also when it
    cannot be decoded as UTF-8 JSON (for example after an interrupted write),
    so the caller falls back to scanning the PDF instead of aborting.
    """
    try:
        with open(cache_path(pdf_hash), "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Truncated or corrupt cache file: treat it as a cache miss.
        return None

def save_cache(pdf_hash: str, data):
    """Write *data* as UTF-8 JSON to the cache file for *pdf_hash*.

    Non-ASCII characters are stored as-is rather than as ``\\u`` escapes.
    """
    serialized = json.dumps(data, ensure_ascii=False)
    with open(cache_path(pdf_hash), "w", encoding="utf-8") as out:
        out.write(serialized)

# ---------- QUICK PASS (collects keyword hits per page) ----------
def quick_scan_page(args):
    """Extract one PDF page, clean its text and locate every keyword hit.

    *args* is a ``(pdf_path, page_index)`` pair; *page_index* indexes
    ``pdf.pages``. On success returns a dict with ``ok=True``,
    ``page_index``, the cleaned ``text`` and ``hits``, a list of
    ``(kind, keyword_or_pattern, start, end)`` tuples where *kind* is
    ``"simple"`` or ``"regex"``. Any exception is reported as
    ``{"ok": False, "page_index": ..., "error": str(exc)}`` instead of being
    raised.
    """
    pdf_path, page_index = args
    try:
        with pdfplumber.open(pdf_path) as pdf:
            raw = pdf.pages[page_index].extract_text() or ""
        text = clean_text(raw)
        haystack = text.lower()
        hit_positions = []

        # Literal keywords: case-insensitive substring search. After each hit
        # the search resumes half a keyword further on (at least 1 char).
        for kw, needle in zip(keywords_simples, keywords_simples_lower):
            stride = max(1, len(needle) // 2)
            idx = haystack.find(needle)
            while idx != -1:
                hit_positions.append(("simple", kw, idx, idx + len(kw)))
                idx = haystack.find(needle, idx + stride)

        # Regex keywords: every match, recorded under the pattern's source.
        hit_positions.extend(
            ("regex", rx.pattern, found.start(), found.end())
            for rx in keywords_regex_comp
            for found in rx.finditer(text)
        )

        return {
            "ok": True,
            "page_index": page_index,
            "text": text,
            "hits": hit_positions,
        }
    except Exception as exc:
        return {"ok": False, "page_index": page_index, "error": str(exc)}

# ---------- MERGING OF OVERLAPPING CONTEXTS ----------
def merge_contexts(spans, tokens, hit_positions, window_words=WINDOW_WORDS, gap=MERGE_GAP_WORDS):
    """Build a word window around every hit and merge windows that are close.

    Each hit ``(kind, keyword, start, end)`` gets a window of *window_words*
    tokens on each side of the token located at *start*. Windows are sorted
    by their first token; a window starting no more than *gap* tokens after
    the end of the previous merged window is folded into it.

    Returns a list of dicts with ``a``/``b`` (token slice bounds),
    ``keywords`` (sorted list of the keywords involved) and ``context`` (the
    tokens of the slice joined by single spaces). No hits yields ``[]``.
    """
    total = len(tokens)
    windows = []
    for _kind, kw, start, _end in hit_positions:
        center = charpos_to_word_index(spans, start)
        windows.append({
            "a": max(0, center - window_words),
            "b": min(total, center + window_words + 1),
            "keywords": {kw},
        })

    if not windows:
        return []

    windows.sort(key=lambda win: win["a"])
    merged = []
    for win in windows:
        if merged and win["a"] <= merged[-1]["b"] + gap:
            # Overlapping or near enough: extend the previous window.
            previous = merged[-1]
            previous["b"] = max(previous["b"], win["b"])
            previous["keywords"] |= win["keywords"]
        else:
            merged.append(win)

    return [
        {
            "a": win["a"],
            "b": win["b"],
            "keywords": sorted(win["keywords"]),
            "context": " ".join(tokens[win["a"]:win["b"]]),
        }
        for win in merged
    ]

# ---------- PATTERNS IN CONTEXT + SCORE ----------
# "Strong" verbs, matched as whole words so that e.g. "prefijado" does not
# count as "fija". The imperative forms accept both the accented and the
# unaccented spelling.
VERBOS_FUERTES = re.compile(
    r"\b(aprueba|apru[eé]b[ae]se|deroga|der[oó]gase|establece|establ[eé]cese|fija|f[ií]j[ae]se)\b",
    re.IGNORECASE,
)

# Markers that earn one extra point: "Artículo 1", "Art. 1", "RESUELVE" or
# "DISPÓNESE", with or without accents. Compiled once, not on every call.
_MARCADOR_DISPOSITIVO = re.compile(
    r"Art[ií]culo\s*1|Art\.\s*1|RESUELVE|DISP[OÓ]NESE",
    re.IGNORECASE,
)


def patrones_en_contexto(ctx_text):
    """Return the source of every relevance pattern found in *ctx_text*.

    Patterns are reported in the order of ``patrones_pertinencia_comp``.
    """
    return [prx.pattern for prx in patrones_pertinencia_comp if prx.search(ctx_text)]


def score_context(ctx_text, pats):
    """Score a context from 0 to 4.

    +2 when a strong verb (``VERBOS_FUERTES``) occurs in *ctx_text*, +1 when
    *pats* (the relevance patterns already found in it) is non-empty, and +1
    when the text contains one of the markers listed above.
    """
    score = 0
    if VERBOS_FUERTES.search(ctx_text):
        score += 2
    if pats:
        score += 1
    if _MARCADOR_DISPOSITIVO.search(ctx_text):
        score += 1
    return score

# ---------- DETAIL PASS (over candidate windows) ----------
def detail_scan_page(pdf_path, page_index, text, hits):
    """Build one result row per merged keyword context of a page.

    *text* is the cleaned page text and *hits* the keyword hits located in
    it. Hits are grouped into word windows by ``merge_contexts``; each window
    is checked for relevance patterns and scored.

    Returns ``(rows_all, rows_pert)``: every context row, and the subset
    whose context contains at least one relevance pattern. Rows are dicts
    keyed by the Excel column names; "Página" is the 1-based page number.
    """
    spans = tokenize_with_spans(text)
    tokens = [token for token, _start, _end in spans]
    contexts = merge_contexts(spans, tokens, hits, WINDOW_WORDS, MERGE_GAP_WORDS)
    rows_all = []
    rows_pert = []
    for c in contexts:
        pats = patrones_en_contexto(c["context"])
        sc = score_context(c["context"], pats)
        row = {
            "Archivo": os.path.basename(pdf_path),
            "Página": page_index + 1,
            "Keywords_en_contexto": "; ".join(c["keywords"]),
            "Coincidencias_patrones_en_contexto": "; ".join(pats),
            "Hay_patron_en_contexto": bool(pats),
            "Score": sc,
            "Extracto": c["context"],
        }
        rows_all.append(row)
        if pats:
            rows_pert.append(row)
    return rows_all, rows_pert

# =========================
#     EXECUTION
# =========================
all_rows = []
pertinent_rows = []

# Cached hits are only valid for the keyword list that produced them, so the
# cache stores a fingerprint of that list and is rebuilt whenever it changes.
keywords_signature = hashlib.sha256(
    json.dumps(keywords, ensure_ascii=False).encode("utf-8")
).hexdigest()

for pdf_path in pdf_paths:
    pdf_hash = pdf_sha256(pdf_path)
    cache = load_cache(pdf_hash)

    # Rebuild when there is no cache, when it lacks the page list, or when it
    # was produced with a different keyword list.
    cache_usable = (
        isinstance(cache, dict)
        and isinstance(cache.get("pages"), list)
        and cache.get("keywords_signature") == keywords_signature
    )
    if not cache_usable:
        # QUICK PASS IN PARALLEL (one task per page)
        with pdfplumber.open(pdf_path) as pdf:
            n_pages = len(pdf.pages)
        tasks = [(pdf_path, i) for i in range(n_pages)]
        results = [None] * len(tasks)
        with ProcessPoolExecutor() as ex:
            futures = {ex.submit(quick_scan_page, t): t for t in tasks}
            for fut in tqdm(as_completed(futures), total=len(tasks), desc=f"Quick-scan {os.path.basename(pdf_path)}"):
                res = fut.result()
                # Tasks finish in any order; store each result at its page slot.
                results[res["page_index"]] = res
        # Minimal cache: normalised text + hits for every page.
        cache = {"keywords_signature": keywords_signature, "pages": []}
        for r in results:
            if not r or not r.get("ok"):
                cache["pages"].append({"ok": False, "error": r.get("error", "") if r else "unknown"})
            else:
                cache["pages"].append({
                    "ok": True,
                    "text": r["text"],
                    "hits": r["hits"],  # list of (kind, keyword/pattern, start, end)
                })
        save_cache(pdf_hash, cache)

    # DETAIL PASS ONLY ON CANDIDATE PAGES (those with hits)
    for idx, pg in enumerate(cache["pages"]):
        if not pg.get("ok"):
            continue
        text = pg.get("text", "")
        hits = pg.get("hits", [])
        if not hits:
            continue  # no keywords on this page -> nothing to detail
        rows_all, rows_pert = detail_scan_page(pdf_path, idx, text, hits)
        all_rows.extend(rows_all)
        pertinent_rows.extend(rows_pert)

# ---------- GLOBAL DEDUP BY SIMHASH ----------
all_rows = dedup_by_simhash(all_rows, text_key="Extracto", keep="max_score")
pertinent_rows = dedup_by_simhash(pertinent_rows, text_key="Extracto", keep="max_score")

# ---------- EXPORT TO EXCEL ----------
df_all = pd.DataFrame(all_rows)
df_pert = pd.DataFrame(pertinent_rows)

# Header row written for a sheet that has no rows, so the workbook always
# shows the same columns.
COLUMNAS_RESULTADO = [
    "Archivo",
    "Página",
    "Keywords_en_contexto",
    "Coincidencias_patrones_en_contexto",
    "Hay_patron_en_contexto",
    "Score",
    "Extracto",
]

fecha_tag = pd.Timestamp.now().strftime("%Y%m%d_%H%M")
nombre_xlsx = f"Resultados_BO_CABA_sin_LLM_{fecha_tag}.xlsx"

with pd.ExcelWriter(nombre_xlsx, engine="openpyxl") as writer:
    for hoja, df in (
        ("Pertinentes (keywords+patrones)", df_pert),
        ("Todos los hallazgos", df_all),
    ):
        salida = df if not df.empty else pd.DataFrame(columns=COLUMNAS_RESULTADO)
        salida.to_excel(writer, sheet_name=hoja, index=False)

print("Archivo generado:", nombre_xlsx)

try:
    from google.colab import files
    files.download(nombre_xlsx)
except ImportError:
    # Not running in Colab: the workbook stays in the working directory.
    pass
except Exception as exc:
    # The workbook is already written; report the failed download instead of
    # hiding it, without aborting the cell.
    print("No se pudo iniciar la descarga automática:", exc)