In [17]:
!pip install pdfminer

Collecting pdfminer
  Downloading pdfminer-20191125.tar.gz (4.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.2/4.2 MB[0m [31m141.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycryptodome (from pdfminer)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pdfminer
  Building wheel for pdfminer (setup.py) ... [?25l[?25hdone
  Created wheel for pdfminer: f

In [31]:
import argparse
import json
import csv
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Iterable, Dict, Tuple
import pdfminer

In [32]:
@dataclass
class SumulaRecord:
    number: Optional[str]
    topic: Optional[str]
    enunciado: Optional[str]
    referencias_legislativas: Optional[str]
    orgao_julgador: Optional[str]
    data_decisao: Optional[str]
    fonte: Optional[str]
    excertos_precedentes: Optional[str]

    def quality_score(self) -> int:
        return sum(1 for f in [
            self.topic, self.enunciado, self.referencias_legislativas,
            self.orgao_julgador, self.data_decisao, self.fonte, self.excertos_precedentes
        ] if f)

In [33]:
def extract_text_from_pdf(pdf_path: str) -> str:
    text = ""
    try:
        import PyPDF2
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            parts = []
            for p in reader.pages:
                parts.append(p.extract_text() or "")
            text = "\n".join(parts)
    except Exception:
        text = ""
    if not text or len(text.strip()) < 1000:
        try:
            from pdfmine import extract_text as pdfminer_extract_text
            text2 = pdfminer_extract_text(pdf_path)
            if text2 and len(text2.strip()) > len(text.strip()):
                text = text2
        except Exception:
            pass

    return text or ""

In [34]:
LABELS = [
    "Referências Legislativas:", "Referencias Legislativas:",
    "Órgão Julgador:", "Orgao Julgador:",
    "Data da decisão:", "Data da decisao:",
    "Fonte:",
    "Excerto dos Precedentes Originários:", "Excerto dos Precedentes Originarios:",
]

def _preclean(text: str) -> str:
    """Normalize newlines and strip noisy headers/footers."""
    t = text.replace("\r\n", "\n").replace("\r", "\n")
    junk_patterns = [
        r"^\s*Inteiro Teor das\s+Súmulas.*?$",
        r"^\s*scon\.stj\.jus\.br/SCON/sumstj/.*?$",
        r"^\s*Página\s+\d+\s+de\s+\d+.*?$",
        r"^\s*\d+\s*$",
    ]
    for pat in junk_patterns:
        t = re.sub(pat, "", t, flags=re.MULTILINE | re.IGNORECASE)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t

In [35]:
def extract_with_label(block: str, labels: List[str], next_labels: List[str]) -> Optional[str]:
    lbl_union = "|".join([re.escape(l) for l in labels])
    next_union = "|".join([re.escape(nl) for nl in next_labels]) if next_labels else None
    if next_union:
        pattern = rf"(?is)({lbl_union})\s*(.*?)(?=(?:\s*\n)?(?:{next_union})|\n\s*S[úu]mula\s+\d+\s*$|\Z)"
    else:
        pattern = rf"(?is)({lbl_union})\s*(.*)"
    m = re.search(pattern, block)
    if not m:
        return None
    val = m.group(2).strip()
    for lab in LABELS:
        idx = val.find(lab)
        if idx != -1:
            val = val[:idx].rstrip()
            break
    return val

In [36]:
def parse_block(block: str) -> SumulaRecord:
    block = _preclean(block)
    mnum = re.search(r"S[ÚUúu]MULA\s+(\d+)", block, flags=re.IGNORECASE)
    number = mnum.group(1) if mnum else None
    topic = None
    lines = block.splitlines()
    for idx, ln in enumerate(lines[:10]):
        if re.search(r"S[ÚUúu]MULA\s+\d+", ln, flags=re.IGNORECASE):
            topic_lines = []
            for j in range(idx + 1, min(idx + 10, len(lines))):
                if re.match(r"(?i)\s*Enunciado\s*:", lines[j]):
                    break
                if not lines[j].strip():
                    continue
                topic_lines.append(lines[j].strip())
            topic = " ".join(topic_lines).strip() or None
            break
    if topic and re.search(r"(?i)S[úu]mula\s+\d+", topic):
        topic = re.split(r"(?i)S[úu]mula\s+\d+", topic)[0].strip(" -\n")

    enunciado = extract_with_label(block, ["Enunciado:"], LABELS)
    referencias = extract_with_label(block, ["Referências Legislativas:", "Referencias Legislativas:"], LABELS)
    orgao = extract_with_label(block, ["Órgão Julgador:", "Orgao Julgador:"], LABELS)
    data_decisao = extract_with_label(block, ["Data da decisão:", "Data da decisao:"], LABELS)
    fonte = extract_with_label(block, ["Fonte:"], LABELS)
    excertos = extract_with_label(block, ["Excerto dos Precedentes Originários:", "Excerto dos Precedentes Originarios:"], [])

    return SumulaRecord(
        number=number,
        topic=topic,
        enunciado=enunciado,
        referencias_legislativas=referencias,
        orgao_julgador=orgao,
        data_decisao=data_decisao,
        fonte=fonte,
        excertos_precedentes=excertos,
    )

In [37]:
def segment_blocks(text: str) -> List[str]:
    blocks: List[str] = []
    matches = list(re.finditer(r"\n?\s*S[úu]mula\s+(\d+)\s*$", text, flags=re.IGNORECASE | re.MULTILINE))
    for m in matches:
        start = m.start()
        men = re.search(r"Enunciado\s*:", text[start:], flags=re.IGNORECASE)
        if not men:
            continue
        en_pos = start + men.start()
        mnext = re.search(r"\n\s*S[úu]mula\s+\d+\s*$", text[en_pos:], flags=re.IGNORECASE | re.MULTILINE)
        end = en_pos + mnext.start() if mnext else len(text)
        blocks.append(text[start:end].strip())
    return blocks


In [38]:
def consolidate_best(records: List[SumulaRecord]) -> List[SumulaRecord]:
    best: Dict[str, SumulaRecord] = {}
    for r in records:
        if not r.number:
            continue
        current = best.get(r.number)
        if current is None or r.quality_score() > current.quality_score():
            best[r.number] = r
    return [best[n] for n in sorted(best.keys(), key=lambda x: int(re.sub(r"\D", "", x) or "0"))]


In [40]:
def run_extraction(pdf_path: str, outdir: str, min_quality: int = 0) -> Tuple[List["SumulaRecord"], Path, Path]:
    pdf_path = Path(pdf_path)
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    raw = extract_text_from_pdf(str(pdf_path))
    if not raw.strip():
        raise ValueError("Could not extract text from PDF. Is the file readable?")
    precise_blocks = segment_blocks(raw)
    precise_records = [parse_block(b) for b in precise_blocks]

    naive_blocks = re.split(r"(?im)^\s*S[úu]mula\s+\d+\s*$", raw)[1:]
    naive_blocks = [("Súmula X\n" + b).strip() for b in naive_blocks]
    naive_records = [parse_block(b) for b in naive_blocks if b.strip()]
    merged = precise_records + naive_records
    consolidated = consolidate_best(merged)

    if min_quality > 0:
        consolidated = [r for r in consolidated if r.quality_score() >= min_quality]

    json_path = outdir / "sumulas_stj.json"
    csv_path = outdir / "sumulas_stj.csv"

    with json_path.open("w", encoding="utf-8") as f:
        json.dump([asdict(r) for r in consolidated], f, ensure_ascii=False, indent=2)

    with csv_path.open("w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=[
                "number",
                "topic",
                "enunciado",
                "referencias_legislativas",
                "orgao_julgador",
                "data_decisao",
                "fonte",
                "excertos_precedentes",
            ],
        )
        writer.writeheader()
        for r in consolidated:
            writer.writerow(asdict(r))

    print(f"Parsed {len(consolidated)} records.")
    print(f"JSON: {json_path}")
    print(f"CSV:  {csv_path}")

    return consolidated, json_path, csv_path

In [41]:
PDF_PATH = "/content/SumulasSTJ (1).pdf"
OUT_PUT = "/content/"
run_extraction(PDF_PATH, OUT_PUT)

Parsed 680 records.
JSON: /content/sumulas_stj.json
CSV:  /content/sumulas_stj.csv


([SumulaRecord(number='1', topic='DIREITO CIVIL - INVESTIGAÇÃO DE PATERNIDADE 39', enunciado='O foro do domicílio ou da residência do alimentando é o\ncompetente para a ação de investigação de paternidade,\nquando cumulada com a de alimentos.', referencias_legislativas='LEG:FED LEI:005869 ANO:1973\n***** CPC-73 CÓDIGO DE PROCESSO CIVIL DE 1973\nART:00100 INC:00002', orgao_julgador='SEGUNDA SEÇÃO', data_decisao='25/04/1990', fonte='DJ DATA:02/05/1990 PG:03619\nRSTJ VOL.:00016 PG:00015', excertos_precedentes='"CONFLITO DE COMPETÊNCIA. ALIMENTOS. CUMULAÇÃO COM INVESTIGAÇÃO DE PATERNIDADE.\nPREVALÊNCIA DO FORO ESPECIAL DO DOMICÍLIO DO ALIMENTANDO. [...] EM SE TRATANDO DE\nCUMULAÇÃO DE AÇÕES DE ALIMENTOS E INVESTIGAÇÃO DE PATERNIDADE, MAIS RAZOÁVEL E\nADEQUADO SE MOSTRA O ENTENDIMENTO DE QUE A REGRA ESPECIAL DO FORO DO DOMICÍLIO DO\nALIMENTANDO (CPC, ART. 100, II) DEVA PREVALECER SOBRE A REGRA GERAL DO ART. 94, CPC." (CC\n683 SP, Rel. MIN. SALVIO DE FIGUEIREDO TEIXEIRA, SEGUNDA SEÇÃO, julga