# Agrupa documentos que pertence a mesma licitação

In [1]:
import pandas as pd
import numpy as np
import json
import bz2
import argparse
import pdfplumber
import os
import re
from enum import Enum
from unidecode import unidecode

In [20]:
# funções úteis
def list_by_file_format(path, file_format):
    path = path.rstrip("/") + "/"
    file_descriptor = [json.loads(line) for line in open(path + "file_description.jsonl").readlines()]
    files = [item["file_name"] for item in file_descriptor if item["type"] in file_format]
    return [f"{path}{f}".rstrip(";") for f in files]

def list_pdf(path):
    return list_by_file_format(path, ["application/pdf", "pdf", "pdf;"])

def list_csv(path):
    return list_by_file_format(path, ["csv"])

def beauty_print(grouped):
    for city, processes in grouped.items():
        #print(city + ":")
        total_processos, total_arquivos = 0, 0
        for no_process, documents in processes.items():
            total_arquivos += len(documents)
            total_processos += 1
            #print(f"\t{no_process:>8s}: {', '.join([d[0] for d in documents])}")
        print(f"{city} -> ({total_processos}, {total_arquivos})")

# Inicialização

In [3]:
# list path
base_path = "../data/"
pattern = "\\d+[-_]licitacoes-(([a-z]|-)+)"
sel_path = os.listdir(base_path)
sel_path = list(filter(lambda f: re.match(pattern, f), sel_path)) # Seleciona os que iniciam com <cod>-licitacoes-<cidade>
cities = [re.match(pattern, p)[1] for p in sel_path if not p.endswith("bh")] # Remove BH da lista

# arquivos por cidade
files = {city: list_pdf(f"{base_path}{p}/data/files/")  for city, p in zip(cities, sel_path)}
print("cities: " + ", ".join(files.keys()))
print([(key, len(docs)) for key, docs in files.items()])

cities: sao-bento-abade, olaria, coqueiral, cristais, pirapetinga, passa-vinte, arantina, ijaci
[('sao-bento-abade', 232), ('olaria', 42), ('coqueiral', 1528), ('cristais', 1736), ('pirapetinga', 1007), ('passa-vinte', 395), ('arantina', 983), ('ijaci', 451)]


## Versão 1: Lê direto dos arquivos pdf

In [4]:
# list documents
def load_peek(files, n=None, verbose=False):
    result_text = dict()
    errors = list()
    
    for city, files in files.items():
        result_text[city] = {}
        
        for i, filename in enumerate(files):
            try:
                with pdfplumber.open(filename) as f:
                    text = f.pages[0].dedupe_chars().extract_text()
                    if text != None:
                        result_text[city][filename] = text

            except:
                errors.append(filename)
                continue
            
    if verbose and len(errors):
        print(f"Errors found while reading:")
        for filename in errors:
            print(f"\t{filename}")
                
    return result_text

content_text = load_peek(files)

## Versão 2: Lê dos arquivos JSON (+ rápido)

In [23]:
def load_peek(files, verbose=False):
    result_text = dict()
    errors = list()
    
    for file in files:
        city_name = file.rsplit("/")[-1].replace(".json", "")
        result_text[city_name] = {}
        print(city_name)
        with open(file) as f:
            i = 0
            for line in f:
                i += 1
                document = json.loads(line)
                doc_id = document["file_id"] # retorna o id do documento
                status = document["status"] # indica se a extração do pdf ocorreu corretamente
                pages_count = len(document["text_content"]) # a quantidade de páginas contendo
                                                            # texto (não necessáriamente re-
                                                            # flete a quantidade de pags do
                                                            # arquivo em pdf.
                if status == "SUCCESS" and pages_count > 0:
                    result_text[city_name][doc_id] = document["text_content"][0]
                else:
                    errors.append(doc_id)
        print(i)  
    if verbose and len(errors):
        print(f"Errors found while reading:")
        for filename in errors:
            print(f"\t{filename}")
                
    return result_text

base_path = f"../data/documentos_json/"
files = [f"{base_path}{filename}" for filename in os.listdir(base_path)]
print(files)
content_text_f = load_peek(files)

['../data/documentos_json/sao-bento-abade.json', '../data/documentos_json/olaria.json', '../data/documentos_json/coqueiral.json', '../data/documentos_json/cristais.json', '../data/documentos_json/pirapetinga.json', '../data/documentos_json/passa-vinte.json', '../data/documentos_json/arantina.json', '../data/documentos_json/ijaci.json']
sao-bento-abade
232
olaria
42
coqueiral
1528
cristais
1736
pirapetinga
1007
passa-vinte
395
arantina
983
ijaci
451


# Amostra

In [None]:
# Amostra
#print(files["../data/documentos_json/olaria.json"])
#print(list(content_text["cristais"].values())[:3])
print(content_text["olaria"])

# Processamento

In [81]:
pattern = "(Processo(?:\s{1,7})(?:Administrativo:?(?:\s{1,7}))?(?:Licitatorio:?(?:\s{1,7}))?(?:.{1,5})?(\d{3}/\d{4}))"

def search_for_pattern(data, pattern):
    grouped = {}
    uncertain = {}
    empty = {}
    
    for city, documents in data.items():
        grouped[city] = {}
        uncertain[city] = {}
        empty[city] = set()
        
        for filename, text in documents.items():
            text = re.sub("[\s\t\n]+", " ", text)

            matches = re.findall(pattern, unidecode(text), flags=re.IGNORECASE)
            
            if not matches:
                empty[city].add(filename)
                continue
            
            unique_values = set([m[1] for m in matches])
            
            if len(unique_values) == 1:
                no_process = unique_values.pop()
                
                if no_process not in grouped[city]:
                    grouped[city][no_process] = []
                
                matched_expression = matches[0][0]
                grouped[city][no_process].append((filename, matched_expression))
            else:
                if filename not in uncertain[city]:
                    uncertain[city][filename] = set()
                    
                uncertain[city][filename] = uncertain[city][filename].union(unique_values)
    
    return grouped, uncertain, empty

grouped, uncertain, empty = search_for_pattern(content_text_f, pattern) 

# Resultados

In [82]:
beauty_print(grouped) # Quantidade de arquivos que contém associados ao mesmo processo licitatório

sao-bento-abade -> (139, 185)
olaria -> (15, 18)
coqueiral -> (390, 1044)
cristais -> (286, 632)
pirapetinga -> (162, 567)
passa-vinte -> (50, 80)
arantina -> (220, 252)
ijaci -> (176, 261)


In [65]:
n_good = {}
n_uncertain = {}
n_bad = {}
n_total = {}

for city, bids in grouped.items():
    total = 0
    for bidding, documents in bids.items():
        total += len(documents)
    n_good[city] = total
    
for city, documents in uncertain.items():
    n_uncertain[city] = len(documents.keys())
    
for city, documents in empty.items():
    n_bad[city] = len(documents)
    
for city, documents in content_text.items():
    n_total[city] = len(documents)

print("RESULTADOS".center(80, '-'))
cities = list(content_text.keys())
for city in cities:
    if n_total[city] == 0:
        continue
    print(f"{city:<15s}:")
    print(f"\tencontrado: {100 * n_good[city]/n_total[city]:>.2f}% ({n_good[city]})")
    print(f"\tambiguo: {100 * n_uncertain[city]/n_total[city]:>.2f}% ({n_uncertain[city]})")
    print(f"\tnão detectado: {100 * n_bad[city]/n_total[city]:>.2f}% ({n_bad[city]})")
    print(f"\ttotal: {100 * n_total[city]/n_total[city]:>.2f}% ({n_total[city]})")
    print()
    

-----------------------------------RESULTADOS-----------------------------------
sao-bento-abade:
	encontrado: 81.22% (186)
	ambiguo: 2.18% (5)
	não detectado: 16.59% (38)
	total: 100.00% (229)

olaria         :
	encontrado: 50.00% (18)
	ambiguo: 0.00% (0)
	não detectado: 50.00% (18)
	total: 100.00% (36)

coqueiral      :
	encontrado: 72.20% (1078)
	ambiguo: 2.21% (33)
	não detectado: 25.79% (385)
	total: 100.00% (1493)

cristais       :
	encontrado: 51.57% (657)
	ambiguo: 0.86% (11)
	não detectado: 48.59% (619)
	total: 100.00% (1274)

pirapetinga    :
	encontrado: 75.76% (597)
	ambiguo: 2.54% (20)
	não detectado: 21.70% (171)
	total: 100.00% (788)

passa-vinte    :
	encontrado: 22.76% (84)
	ambiguo: 3.79% (14)
	não detectado: 73.44% (271)
	total: 100.00% (369)

arantina       :
	encontrado: 76.83% (252)
	ambiguo: 0.00% (0)
	não detectado: 23.17% (76)
	total: 100.00% (328)

ijaci          :
	encontrado: 69.10% (275)
	ambiguo: 0.00% (0)
	não detectado: 30.90% (123)
	total: 100.00% (398)

# Salvar resultados

In [84]:
# Resultados 
for city, bids in grouped.items():
    lines = []
    for bidding, documents in bids.items():
        docs = [doc[0] for doc in documents]
        regex = [doc[1] for doc in documents]        
        lines.append({
            "no. licitação": bidding,
            "n_arquivos": len(documents),
            "arquivos": ', '.join(docs),
            "expressão": ', '.join(regex)
        })
    lines.sort(key=lambda x: [int(v) for v in x["no. licitação"].split("/")[::-1]])
    df = pd.DataFrame.from_dict(lines)
    df.to_csv(f"resultado_processo_licitatorio/{city}.csv", index=False)

In [13]:
for city, documents in uncertain.items():
    result = list()
    for document, possibilities in documents.items():
        row = {"document": document.rsplit("/")[-1], "possibilities": ', '.join(possibilities)}
        result.append(row)
    if len(result) == 0:
        continue
        
    df = pd.DataFrame.from_dict(result)
    df.to_csv(f"resultado_processo_licitatorio/{city}_ambiguos.csv", index=False)
    print(df)
    


                               document       possibilities
0  ad013fd852ac592b1d778ad63a3bb116.pdf  034/2018, 024/2018
                                document  \
0   49f5d4b211bf6dad76304c831b75afd0.pdf   
1   ae46ee32d6c857e398a1f99bb9c74c71.pdf   
2   bed0b84d999b051568e6836884c26daf.pdf   
3   237115227fc23be2beba7b6d3a769f66.pdf   
4   ea4de9bbc3754d365f7f6e648f978cba.pdf   
5   0eb4ea0d75b06364b0a280ca2543a05c.pdf   
6   dc2f32710168b33ce998b9cc28e03439.pdf   
7   98cf40676ec82f1775ce9fd536897883.pdf   
8   4652201b47087054b9521046008b8db3.pdf   
9   9c4d753f9ceac3f6ed6874d3e988e84d.pdf   
10  6a24d7c176c9d0bf21b898b82efa657d.pdf   
11  5b942bc25602d9e32bbde50720b400e8.pdf   
12  5354b7bc2160801b43d2b39ba4c45ef5.pdf   
13  49d7ab0ea0c9449a88ee3b3a2567ecd8.pdf   
14  ce99c9e451d10ea1b89bc1247feaa143.pdf   
15  a465f1445659ae15b2eafd1ae8142496.pdf   
16  ff6b27a4ec6ec651150999425af603b9.pdf   

                                       possibilities  
0                               

In [14]:
for city, documents in empty.items():
    print(f"{city}:")
    resultado = list()
    for document in documents:
        resultado.append({"documento": document.rsplit("/")[-1]})
    df = pd.DataFrame.from_dict(resultado)
    df.to_csv(f"resultado_processo_licitatorio/{city}_sem_resultados.csv", index=False)
    print(df)

sao-bento-abade:
                               documento
0   be95f1267d98b85b789595387e2cb71e.pdf
1   294cd12ad49f5255e140535dd252b894.pdf
2   e58afaaca85d24ec10e23325b331fa80.pdf
3   85091b6e4fc49bb06fb619a45e661669.pdf
4   dcf3a12244fcb5b2ff8c61661ffc8370.pdf
..                                   ...
93  45af5cc070110fb2b6dc1782daeb1175.pdf
94  549190f035c1bb97ba9359b63e9c1a7c.pdf
95  99729b966cb005c54a744c7052278f67.pdf
96  a10707ab7606d6c0c001e402c977278b.pdf
97  7b29bbbfad06b6908cd6529177ff24e8.pdf

[98 rows x 1 columns]
olaria:
                               documento
0   281f464653c7b5ac70f8c975e467884f.pdf
1   7087aaed7cb40a722df310e1dbd7d881.pdf
2   79a70b2e386a7ae63c8435c2a41f12a6.pdf
3   55d1e42ab1c6b2d1f5337b7ae3a2d55e.pdf
4   6f64f76f7ad5c127b9f1492c5d67747d.pdf
5   4e2cd1dce059f2afaee5b837ad53d61a.pdf
6   79a1c51973f2f1f52b9874d2615b287a.pdf
7   11a6281a3e73f0dc7ae77fb22fc00315.pdf
8   2b184379f31f160faf8d7fcd82cdd226.pdf
9   a7a6050bec5349fc6b5395839bac7e31.pdf
10  2314b