In [None]:
from IPython.display import display
from os.path import dirname, abspath
from sklearn.decomposition import PCA
import glob
import json
import os
import pandas as pd
import pdfplumber
import re
import seaborn as sns

# Funções para listagem de docs no formato JSON

In [None]:
def json_opath(fd_path):
    return abspath(dirname(fd_path.rstrip("/")) + "/../files_json").rstrip("/") + "/"

def list_FD(fd):
    fd = fd.rstrip("/")
    file_descriptor = [json.loads(line) for line in open(fd).readlines()]
    files = [item["file_name"] for item in file_descriptor if item["type"] in ["application/pdf", "pdf", "pdf;"]]
    return [f"{dirname(fd)}/{f}".rstrip(";") for f in files]

def list_pdf(directory):
    directory = directory.rstrip("/")
    file_d = glob.glob(f"{directory}/data/files/file_description.jsonl")
    if len(file_d):
        return [{"docs": list_FD(file_d[0]), "output": json_opath(file_d[0])}]
             
    multi_file_d = glob.glob(f"{directory}/*/data/files/file_description.jsonl")
    if len(multi_file_d) == 0:
        raise Error("No file descriptor")
             
    file_descriptors = []
    for file_d in multi_file_d:
        file_descriptors.append({"docs": list_FD(file_d), "output": json_opath(file_d)})
    return file_descriptors
        
def get_name(directory):
    return re.search("licitacoes-(.*)/", directory)[1].replace("-", "_")

### Preenche um dict com o município e os arquivos json

In [3]:
cities_dir = glob.glob("../data/*licitacoes*/")

cities_docs = {}

for city_dir in cities_dir:
    city = get_name(city_dir)
    if city == "bh":
        continue
    cities_docs[city] = list_pdf(city_dir)

# Funções para a dectção de docs escaneados

> Boa parte dos documentos são detectados através dos campos de metadata.


> Alguns documentos escaneados possuem um campo "(cid:\d+)", porém eles já\
>foram detectados através dos metadados, portanto, não foram incluidos.

In [4]:
def get_features(doc):
    metadata = doc["metadata"]
    has_author = int("Author" in metadata)
    word_creator = "Creator" in metadata and "word" in metadata["Creator"].lower()
    anycad_creator = "Creator" in metadata and "cad" in metadata["Creator"].lower()
    naps2_creator = "Creator" in metadata and "naps2" in metadata["Creator"].lower()
    ilovepdf = "Producer" in metadata and "ilovepdf" in metadata["Producer"].lower()
    return {
        "has_autor": has_author,
        "word": word_creator,
        "cad": anycad_creator,
        "naps2": naps2_creator,
        "ilovepdf": ilovepdf
    }

def count_cid(doc):
    n = 0
    for page in doc['text_content']:
        m = re.search("(\(cid:\d+\))", page)
        if m:
            return len(m.groups())
    return 0

def is_scanned(doc):
    features = get_features(doc)
    
    return (not features["word"] and features["cad"]) or \
           (not features["word"] and features["naps2"]) or \
           (len(doc['text_content']) and doc['text_content'][0] == None and features["ilovepdf"]) or \
           (doc['status'] == 'FAILED')


### Lista todos os prováveis documentos escaneados

In [5]:
header = f"{'#':>5}{'DOC_ID':>35s} {'STATUS': >12} {'MUNICIPIO': >20}"
print(header)
print("-" * len(header))
n = 0
for city, documents_dirs in cities_docs.items():
    for document_dir in documents_dirs:
        base_path = document_dir["output"]
        docs = glob.glob(f"{base_path}/*.json")
        for doc in docs:
            with open(doc) as f:
                doc_json = json.load(f)
                c = count_cid(doc_json)                
                if is_scanned(doc_json) and doc_json['status'] != 'FAILED':
                    n += 1
                    print(f"{n:>5} {doc_json['file_id']:>35s} {doc_json['status']:>12} {city:>20}")

    #                             DOC_ID       STATUS            MUNICIPIO
--------------------------------------------------------------------------
    1    79a70b2e386a7ae63c8435c2a41f12a6      SUCCESS               olaria
    2    11a6281a3e73f0dc7ae77fb22fc00315      SUCCESS               olaria
    3    a1feebf7fece41653a65ace04f9de6ad      SUCCESS               olaria
    4    a7a6050bec5349fc6b5395839bac7e31      SUCCESS               olaria
    5    65ca0c6888f52634fd63a1097d8dff9e      SUCCESS               olaria
    6    2314b47af4b8d17443c5c6c2fc401024      SUCCESS               olaria
    7    19a904b26613294dfa0e13a04730f5ef      SUCCESS               olaria
    8    79a1c51973f2f1f52b9874d2615b287a      SUCCESS               olaria
    9    950560a7461609a7df81cc37bed789b6      SUCCESS            coqueiral
   10    2a034a7ff07d518edf2ec7bdb0290346      SUCCESS            coqueiral
   11    ed6f00eef27e2e05f30c4f27208e541d      SUCCESS            coqueiral
   12    871ae

# Apenas funções para exploração (Apenas use para desenvolvimento)

In [None]:
df = pd.DataFrame.from_records(documents)

In [None]:
sheets = pd.read_excel("../data/docs_escaneados/input/resultado_parcial.xlsx", sheet_name=None)
sheets = {k.lower():v for k,v in sheets.items() if "doc_id" in v.columns.str.lower()}
for s in sheets.values():
    s.columns = s.columns.str.lower()
    if 'escaneado' not in s.columns:
        s['escaneado'] = float('nan')
    s.escaneado = s.escaneado.apply(lambda v: v == v) # Como NaN != NaN, v == v converte os escaneados para True e os não para False

In [None]:
path_for = {}
for f in glob.glob("../data/*/data/files/*.pdf"):
    doc_id = re.search("/([a-z0-9]+).pdf", f)[1]
    path_for[doc_id] = os.path.abspath(f)
    
for f in glob.glob("../data/*/*/data/files/*.pdf"):
    doc_id = re.search("/([a-z0-9]+).pdf", f)[1]
    path_for[doc_id] = os.path.abspath(f)

In [None]:
path_for['2d0b66a94463bd83e6f68d33cf91d38a']