In [75]:
import re

from pathlib import Path
from bs4 import BeautifulSoup

In [256]:
def remove_tables_from_soup(soup):
    """Delete every ``<table>`` element from the parsed document.

    The tree is modified in place: each tag returned by
    ``soup.find_all('table')`` is destroyed with ``decompose()``.

    Args:
        soup: parsed document exposing ``find_all``.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    table_tags = soup.find_all('table')
    for table_tag in table_tags:
        table_tag.decompose()
    return soup

def remove_special_links_from_soup(soup):
    """Remove editorial annotation links (``<a>`` tags) from the document.

    Each link's text is normalized (whitespace collapsed, lower-cased,
    parentheses dropped) and the tag is destroyed with ``decompose()`` when
    the text is one of a few exact labels or starts with one of the known
    annotation prefixes. All other links are left untouched.

    Args:
        soup: parsed document exposing ``find_all``; modified in place.

    Returns:
        The same ``soup`` object, to allow call chaining.
    """
    # Labels that must match the whole (normalized) link text.
    exact_labels = {'regulamento', 'vigência', 'revogado', 'vetado'}

    # Prefixes are compared AFTER parentheses are stripped, so none of them
    # may contain '(' -- the original '(vide decreto' could never match.
    # 'pel' covers both "pela lei" and "pelo decreto-lei".
    prefixes = (
        'vide decreto',
        'redação dada pel',
        'incluído pel',
        'revogado pel',
        'vide lei',
        'vide adi',
        'vide adpf',
    )

    for a in soup.find_all('a'):
        text = a.text
        text = re.sub(r'\t+', ' ', text)
        text = re.sub(r'\n+', ' ', text)
        text = re.sub(r'&nbsp', ' ', text)
        # Non-breaking spaces are treated as plain spaces, as normalize_text does.
        text = re.sub(r'\xa0', ' ', text)
        text = re.sub(r' {2,}', ' ', text)
        text = text.strip().lower()

        if text == '':
            continue

        # Strip again: removing "( " / " )" can expose leading/trailing spaces.
        text = text.replace('(', '').replace(')', '').strip()

        if text in exact_labels or text.startswith(prefixes):
            a.decompose()

    return soup

In [257]:
def normalize_text(text):
    """Normalize whitespace and strip decorative characters from a paragraph.

    Applied in order: runs of tabs and runs of newlines become one space,
    the literal ``&nbsp`` and non-breaking spaces become a space, runs of
    four or more dots are deleted, curly double quotes are deleted, runs of
    spaces collapse to one, and the result is stripped.

    Args:
        text: raw paragraph text.

    Returns:
        The cleaned text (possibly an empty string).
    """
    # Raw strings throughout: '\.' in a normal string literal is an invalid
    # escape sequence and triggers a SyntaxWarning on Python 3.12+.
    substitutions = (
        (r'\t+', ' '),
        (r'\n+', ' '),
        (r'&nbsp', ' '),
        (r'\xa0', ' '),
        (r'\.{4,}', ''),   # dot leaders
        (r'[“”]', ''),
        (r' {2,}', ' '),   # must run last: earlier steps can create space runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def remove_section_ref_from_begining(text):
    """Strip the leading structural marker from a paragraph of legal text.

    Recognized markers, checked in this order:

    * ``Parágrafo único.``
    * ``Art. <number>`` -- the number may carry a thousands separator
      ("Art. 1.723."), an ordinal sign or a letter suffix ("Art. 5º-A")
    * ``§ <number>`` followed by an ordinal sign or dot and a space
    * otherwise a Roman-numeral item ("IV - ") and/or a lettered item ("a) ")

    Only the marker at the very start is removed; references to articles or
    paragraphs later in the text are preserved.

    Args:
        text: paragraph text, expected to be already normalized.

    Returns:
        The text without its leading marker and without leading spaces.
    """
    if text.startswith('Parágrafo único.'):
        text = text[len('Parágrafo único.'):]
    elif text.startswith('Art.'):
        # Anchored so in-text cross-references survive; (?:\.\d{3})* consumes
        # "1.723" whole instead of leaving "723." behind.
        text = re.sub(r"^Art\.\s*\d+(?:\.\d{3})*[º\w-]*\s*\.*", "", text)
    elif text.startswith("§"):
        # Anchored for the same reason: keep "§ 1º" references in the body.
        text = re.sub(r"^§ \d+[º°o.] ", "", text)
    else:
        text = re.sub(r"^[IVXLCDM]+\s*-\s*", "", text)
        text = re.sub(r"^[a-z]\) ", "", text)

    return text.lstrip(' ')


def remove_special_headers(text):
    """Strip a leading structural heading keyword and its Roman numeral.

    Removes ``TÍTULO``, ``CAPÍTULO``, ``SEÇÃO`` or ``LIVRO`` at the start of
    the text, the whitespace after it, and a following Roman numeral when
    one is present (e.g. "CAPÍTULO IV DOS DIREITOS" -> "DOS DIREITOS").

    Args:
        text: paragraph text, expected to be already normalized.

    Returns:
        The text without the heading prefix; unchanged if none matches.
    """
    # The numeral must end at a word boundary; otherwise the character class
    # would swallow the first letter of an ordinary word such as
    # "COMPLEMENTAR" or "DOS" when no numeral follows the keyword.
    return re.sub(
        r"^(TÍTULO|CAPÍTULO|SEÇÃO|LIVRO)\s+(?:[IVXLCDM]+\b)?\s*",
        "",
        text,
    )

In [266]:
def process_paragraphs_from_soup(soup):
    """Extract the cleaned text of every ``<p>`` element in the document.

    Each paragraph's text goes through ``normalize_text``,
    ``remove_section_ref_from_begining`` and ``remove_special_headers``, in
    that order. Paragraphs that end up empty are dropped.

    Args:
        soup: parsed document exposing ``find_all``.

    Returns:
        A list of non-empty cleaned paragraph strings, in document order.
    """
    cleaned = (
        remove_special_headers(
            remove_section_ref_from_begining(normalize_text(tag.text))
        )
        for tag in soup.find_all('p')
    )
    return [paragraph for paragraph in cleaned if paragraph != '']

In [267]:
def process_document(file: Path, encoding='windows-1252', errors='ignore'):
    """Read an HTML file and return its cleaned paragraphs.

    The file is decoded with ``encoding``/``errors``, parsed with
    BeautifulSoup's ``html.parser``, stripped of tables and of the special
    annotation links, and finally reduced to a list of paragraph strings.

    Args:
        file: path of the HTML document to read.
        encoding: text encoding passed to ``Path.read_text``.
        errors: decoding error policy passed to ``Path.read_text``; the
            default ``'ignore'`` silently drops undecodable bytes.

    Returns:
        The list produced by ``process_paragraphs_from_soup``.
    """
    markup = file.read_text(encoding=encoding, errors=errors)
    document = BeautifulSoup(markup, 'html.parser')

    # Tables go first, then the annotation links, as both mutate the tree.
    document = remove_special_links_from_soup(remove_tables_from_soup(document))

    return process_paragraphs_from_soup(document)
    

In [273]:
# Folder with the HTML inputs, relative to the notebook's working directory.
INPUT_FOLDER = Path('..', 'data', 'leis')

# glob() yields entries in arbitrary, filesystem-dependent order; sorting keeps
# the processing order (and the printed log) stable between runs and machines.
files = sorted(INPUT_FOLDER.glob('*.htm*'))

len(files)

53

In [292]:
# Destination for the cleaned text files (one .txt per input document).
OUT_FOLDER = Path('..', 'data', 'step1')
# Create it on first run; otherwise the write below raises FileNotFoundError
# on a fresh checkout.
OUT_FOLDER.mkdir(parents=True, exist_ok=True)

for file in files:
    output = OUT_FOLDER / f'{file.stem}.txt'
    lines = process_document(file)
    print(file.name, len(lines))

    # One cleaned paragraph per line, always written as UTF-8.
    output.write_text('\n'.join(lines), encoding='utf8')
    

Constituicao-Compilado.html 3217
D10854.html 702
D11129.html 365
D24643compilado.html 545
Decreto nº 8777.html 72
Del0227compilado.html 406
DEL1001Compilado.html 1951
DEL1002Compilado.html 2929
DEL2848compilado.html 2120
Del3689Compilado.html 2204
Del4657compilado.html 94
Del5452compilado.html 3147
Emenda Constitucional nº 103.html 323
L10406compilada.html 4276
L11788.html 93
L12527.html 279
L12651compilado.html 639
L12846.html 178
L12965.html 245
L13105compilada.html 4227
L13146.html 622
L13303.html 661
L13709compilado.html 499
L14124.html 130
L14133.html 1561
L14217.html 99
L4320compilado.html 424
L4595compilado.html 267
L4737compilado.html 1410
L5172COMPILADO.html 725
L7064.html 67
L7492.html 94
L8069compiladoa.html 1397
L8078compilado.html 535
L8080.html 405
L8112compilado.html 1046
L8429compilada.html 273
L8666compilado.html 789
L8742compilado.html 306
L9096Consol.html 347
L9099.html 313
L9394compilado.html 635
L9503Compilado.html 2197
L9605.html 381
L9784.html 248
L9868.html 132
