# Importa dependências

In [2]:
import pdfplumber as pdfpb
import json
import os
import re
import os.path as osp
import glob
from os.path import abspath, dirname

# Funções necessárias para detectar documentos escaneados

In [3]:
def get_features(doc):
    """Derive scanned-document heuristics from a document's PDF metadata.

    Args:
        doc: Mapping with a ``"metadata"`` entry holding the PDF metadata
            mapping. Only the ``"Author"``, ``"Creator"`` and ``"Producer"``
            keys are inspected. A missing or empty ``"metadata"`` entry is
            treated as an empty mapping.

    Returns:
        dict with:
            ``"has_autor"``: int, 1 when an ``"Author"`` key is present, else 0.
            ``"word"``, ``"cad"``, ``"naps2"``: bool, whether that substring
                occurs in the lower-cased ``Creator`` value.
            ``"ilovepdf"``: bool, whether that substring occurs in the
                lower-cased ``Producer`` value.
    """
    metadata = doc.get("metadata") or {}

    def _lowered(key):
        # A key may be present with a None or bytes value; calling .lower()
        # on None raises, and a str-in-bytes membership test raises too.
        value = metadata.get(key)
        if value is None:
            return ""
        if isinstance(value, bytes):
            value = value.decode("utf-8", errors="replace")
        return str(value).lower()

    creator = _lowered("Creator")
    producer = _lowered("Producer")
    return {
        # The key is spelled "has_autor"; kept because it is part of the returned schema.
        "has_autor": int("Author" in metadata),
        "word": "word" in creator,
        "cad": "cad" in creator,
        "naps2": "naps2" in creator,
        "ilovepdf": "ilovepdf" in producer,
    }

def count_cid(doc):
    """Count the ``(cid:N)`` markers found in a document's page texts.

    The markers are presumably glyphs the text extractor could not map to
    characters -- confirm against the extraction library.

    Args:
        doc: Mapping whose ``"text_content"`` entry is an iterable of page
            texts. Pages that are ``None`` or empty are skipped.

    Returns:
        int: total number of ``(cid:<digits>)`` occurrences over all pages,
        0 when there are none.
    """
    # Raw string: "\(" and "\d" are regex escapes, not Python string escapes.
    cid_pattern = r"\(cid:\d+\)"
    return sum(
        len(re.findall(cid_pattern, page))
        for page in doc['text_content']
        if page  # pages without extracted text contribute nothing
    )

def is_scanned(doc):
    """Heuristically decide whether a document looks like a scanned PDF.

    A document is reported as scanned when any of these rules matches:
      * its Creator metadata mentions "cad" or "naps2" and does not mention
        "word";
      * it has at least one page, the first page text is ``None`` and its
        Producer metadata mentions "ilovepdf".

    Args:
        doc: Mapping with a ``"metadata"`` entry (consumed by
            ``get_features``) and a ``"text_content"`` sequence of page texts.

    Returns:
        bool: ``True`` when a rule matches, ``False`` otherwise. The result is
        always a real bool, never the ``0`` that a bare ``len(...) and ...``
        chain would yield for an empty page list.
    """
    features = get_features(doc)

    # Creator-based rules: a CAD tool or the NAPS2 scanner, unless Word is also named.
    if not features["word"] and (features["cad"] or features["naps2"]):
        return True

    # Producer-based rule: only looked at when the creator rules did not match.
    pages = doc['text_content']
    return bool(len(pages) > 0 and pages[0] is None and features["ilovepdf"])

In [4]:
def json_opath(fd_path):
    """Return the absolute ``files_json`` output directory for a file descriptor.

    The result is ``<directory containing fd_path>/../files_json``, made
    absolute and always terminated by exactly one ``/``.

    Args:
        fd_path: Path of the file descriptor; trailing slashes are ignored.

    Returns:
        str: absolute directory path ending in ``/``.
    """
    descriptor_dir = dirname(fd_path.rstrip("/"))
    # Join instead of concatenating strings: for a bare file name dirname() is
    # "" and concatenation would produce "/../files_json", which is anchored
    # at the filesystem root instead of the current working directory.
    output_dir = abspath(osp.join(descriptor_dir, osp.pardir, "files_json"))
    return output_dir.rstrip("/") + "/"

def list_FD(fd):
    """List the PDF paths declared in a JSON-lines file descriptor.

    Each non-blank line of the descriptor is parsed as one JSON object. Entries
    whose ``"type"`` is ``"application/pdf"``, ``"pdf"`` or ``"pdf;"`` are kept
    and their ``"file_name"`` is resolved against the descriptor's directory.

    Args:
        fd: Path of the JSON-lines descriptor; trailing slashes are ignored.

    Returns:
        list[str]: ``<descriptor dir>/<file_name>`` for every PDF entry, with
        any trailing ``;`` stripped from the path.

    Raises:
        OSError: if the descriptor cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    fd = fd.rstrip("/")
    pdf_types = ("application/pdf", "pdf", "pdf;")
    base_dir = dirname(fd)

    # The context manager closes the descriptor even if a line fails to parse.
    with open(fd, encoding="utf-8") as descriptor:
        # Blank lines (e.g. a trailing empty line) are not JSON records.
        entries = [json.loads(line) for line in descriptor if line.strip()]

    return [
        f"{base_dir}/{entry['file_name']}".rstrip(";")
        for entry in entries
        if entry.get("type") in pdf_types  # entries without a type are not PDFs
    ]

def list_pdf(directory):
    """Collect the PDFs and JSON output directory for each file descriptor under a directory.

    A descriptor is looked up first at
    ``<directory>/data/files/file_description.jsonl``. If none exists there,
    every ``<directory>/*/data/files/file_description.jsonl`` is used instead.

    Args:
        directory: Directory to search; trailing slashes are ignored.

    Returns:
        list[dict]: one ``{"docs": [...], "output": "..."}`` entry per
        descriptor, where ``docs`` comes from ``list_FD`` and ``output`` from
        ``json_opath``.

    Raises:
        FileNotFoundError: when no descriptor is found at either location.
    """
    directory = directory.rstrip("/")
    descriptor_suffix = "data/files/file_description.jsonl"

    direct = glob.glob(f"{directory}/{descriptor_suffix}")
    if direct:
        return [{"docs": list_FD(direct[0]), "output": json_opath(direct[0])}]

    nested = glob.glob(f"{directory}/*/{descriptor_suffix}")
    if not nested:
        # A built-in exception: the previously referenced `Error` name is not defined.
        raise FileNotFoundError(f"No file descriptor under {directory!r}")

    return [{"docs": list_FD(file_d), "output": json_opath(file_d)} for file_d in nested]
        
def get_name(directory):
    """Extract the city name from a ``...licitacoes-<city>/`` directory path.

    The name is the text between ``licitacoes-`` and the last ``/`` that
    follows it (the match is greedy), with every ``-`` replaced by ``_``.
    Raises ``TypeError`` when the path does not contain that pattern.
    """
    match = re.search("licitacoes-(.*)/", directory)
    city_slug = match[1]
    return city_slug.replace("-", "_")

In [5]:
# Map each city name to the PDF listings found under its data directory.
# The city named "bh" is skipped.
cities_dir = glob.glob("../data/*licitacoes*/")
cities_docs = {}

for city_dir in cities_dir:
    city = get_name(city_dir)
    if city != "bh":
        cities_docs[city] = list_pdf(city_dir)

# Converte Formatos: PDF → JSON

In [10]:
def exception_wrap(*, callback, default):
    """Call ``callback`` and return its result, or ``default`` if it raises.

    Args:
        callback: Zero-argument callable to invoke.
        default: Value returned when ``callback`` raises an ``Exception``.

    Returns:
        Whatever ``callback()`` returns, or ``default`` on failure.

    Raises:
        TypeError: if ``callback`` is not callable.
    """
    if not callable(callback):
        # A built-in exception: the previously referenced `Error` name is not defined.
        raise TypeError("callback must be a callable.")
    try:
        return callback()
    except Exception:
        # Only ordinary errors are absorbed; KeyboardInterrupt and SystemExit
        # still propagate so a long run can be interrupted.
        return default
    
def convert(*, filename):
    """Convert one PDF file into the JSON-serialisable record used by this notebook.

    Args:
        filename: Path of the PDF, using ``/`` as separator.

    Returns:
        dict with the keys:
            ``"file_id"``: base name up to its first ``.``.
            ``"original_name"``: base name of ``filename``.
            ``"n_pages"``: number of pages, or ``-1`` if the file could not be opened.
            ``"metadata"``: PDF metadata, ``{}`` if unavailable.
            ``"text_content"``: list of non-empty page texts (``""`` if the
                file could not be opened).
            ``"table_content"``: list of non-empty page tables.
            ``"status"``: ``"SUCCESS"`` when some text or table was extracted
                and ``is_scanned`` does not flag the document; otherwise
                ``"FAILED"``.

    Any ``Exception`` raised while reading the PDF is reported on stdout and
    leaves the record with status ``"FAILED"``; it is never re-raised.
    """
    original_name = filename.rsplit("/")[-1]  # foo/bar/baz.any -> baz.any
    result = {
        # Everything before the FIRST dot: baz.any -> baz (and a.b.pdf -> a).
        "file_id": original_name.split(".")[0],
        "original_name": original_name,
        "n_pages": -1,
        "metadata": {},
        "text_content": "",
        "table_content": [],
        "status": "FAILED"
    }
    try:
        with pdfpb.open(filename) as pdf:
            pages = pdf.pages
            result["n_pages"] = len(pages)

            result["metadata"] = exception_wrap(callback=lambda: pdf.metadata, default={})

            # Extraction is best-effort: a failure yields an empty list.
            page_texts = exception_wrap(
                callback=lambda: [page.dedupe_chars().extract_text() for page in pages],
                default=[],
            )
            result["text_content"] = [text for text in page_texts if text]

            page_tables = exception_wrap(
                callback=lambda: [page.extract_table() for page in pages],
                default=[],
            )
            result["table_content"] = [table for table in page_tables if table]

            # NOTE(review): empty page texts are dropped above, so a rule in
            # is_scanned that tests the first page text for None cannot match
            # on this record -- confirm whether the unfiltered texts were meant.
            if result["text_content"] or result["table_content"]:
                if not is_scanned(result):
                    result["status"] = "SUCCESS"
                else:
                    print(result["file_id"], result["metadata"])
    except Exception as exc:
        # Catch Exception, not everything: Ctrl-C must still stop a batch run.
        # Report the reason so it is not silently lost.
        print(f"{result['file_id']}: conversion failed ({exc!r})")

    return result

In [13]:
# Convert every listed PDF and write one JSON file per document into the
# output directory of its file descriptor.
for city, documents_dirs in cities_docs.items():
    print(city)
    for document_dir in documents_dirs:
        base_path = document_dir["output"]

        # Create the output directory on first use.
        output_missing = not os.path.isdir(base_path)
        if output_missing:
            os.mkdir(base_path)

        for doc in document_dir["docs"]:
            doc_json = convert(filename=doc)
            file_id = doc_json["file_id"]
            print(f"    {file_id}")

            # base_path already ends with "/".
            json_path = f"{base_path}{file_id}.json"
            with open(json_path, "wt", encoding="utf-8") as f:
                json.dump(doc_json, f)

cristais
*** COMISSÃO PERMANENTE DE LICITAÇÕES
A T A  D A  D I S P E N S A
No dia 1 de fevereiro de 2018, reuniu-se a Comissão Permanente de Licitação, o seu Presidente: Pablo José da Silva e
membros da comissão de licitação: Leonardo Luiz Oliveira e Edinamara Ribeiro Silva,nomeados pela Portaria  001/2018 de
01 de Janeiro de 2018. Após procedida avaliação para dispensa de licitação para CONTRATAÇÃO DE PROCEDIMENTO
CIRÚRGICO DE URETEROLITOTRIPSIA FLEXÍVEL A LASER PARA A PACIENTE TATIANA DOS SANTOS., entre
a Prefeitura Municipal de Cristais e a WGA MEDICINA DO TRABALHO LTDA
Aberta a etapa de julgamento, chegou-se então ao seguinte resultado:
WGA MEDICINA DO TRABALHO LTDA
CNPJ/CPF: 01.026.623/0001-04
ENDEREÇO: CONEGO ULISSES, 37 - SALA 02               , 37 - CENTRO - CAMPO BELO - MG - 37275-000
Seq. Item Descrição Marca UN Qtd. Valor Unitário Valor Total
1 2556 PROCEDIMENTO CIRÚRGICO DE SV 1 7800,0000 7800,00
URETEROLITOTRIPSIA FLEXÍVEL A
LASER -
Sub Total 7800,00
Total Geral 7800,00
At

In [None]:
# cities_docs = {'cristais': [{'docs': ['../data/290-licitacoes-cristais/data/files/56a09c5d1d04cc95ada4d68ad22dcbd9.pdf',]}]}
# cities_docs

# Exemplo de uso

In [None]:
# NOTE(review): `cities` is not defined in any cell visible in this notebook,
# so this cell would raise NameError on a fresh kernel; `cities_docs` may have
# been intended -- confirm.
cities

In [None]:
# Print the file id and stored conversion status of every JSON document
# found in each output directory.
for city, documents_dirs in cities_docs.items():
    for document_dir in documents_dirs:
        base_path = document_dir["output"]
        for json_path in glob.glob(f"{base_path}/*.json"):
            with open(json_path) as f:
                doc_json = json.load(f)
                print(f"{doc_json['file_id']:>12s} {doc_json['status']}")