#### Imports

In [84]:
import glob
import json
import os
import random
import re
import shutil
from collections import defaultdict
from pandas import ExcelWriter 
import pandas as pd
import pdfplumber
import seaborn as sns

import utils.preprocessing_portuguese as preprossPT

#### Useful variables

In [85]:
text_preprocessing = preprossPT.TextPreProcessing()

ATA, HOMOLOG, EDITAL, OUTROS = "ATA", "HOMOLOG", "EDITAL", "OUTROS"

title_keys = ["ata_title_count", "homolog_title_count", "edital_title_count", "outros_title_count"]
content_keys = ["ata_content_count", "homolog_content_count", "edital_content_count","outros_content_count"]
json_path = "../data/documentos_json/*.json"
folder_paths = [
    "../data/288_licitacoes-pirapetinga/data/files/*.pdf",
    "../data/289-licitacoes-coqueiral/data/files/*.pdf",
    "../data/290-licitacoes-cristais/data/files/*.pdf",
    "../data/302-licitacoes-olaria/data/files/*.pdf",
    "../data/304-licitacoes-passa-vinte/data/files/*.pdf",
    "../data/306-licitacoes-arantina/data/files/*.pdf",
    "../data/353-licitacoes-ijaci/data/files/*.pdf",
    "../data/381-licitacoes-sao-bento-abade/data/files/*.pdf",
    "../data/48-licitacoes-bh/data/files/*.pdf",
]

keywords = [
    {
        "word": "ata",
        "title_regex": str(r"\bata\b"),
        "content_regex": str(r"\bata\b"),
        "class": ATA,
    },
    {
        "word": "sessão pública",
        "title_regex": str(r"\bsessão pública\b"),
        "content_regex": str(r"\bsessão pública\b"),
        "class": ATA,
    },
    {
        "word": "homolog",
        "title_regex": str(r"\bhomologação\b"),
        "content_regex": str(r"\bhomologação\b"),
        "class": HOMOLOG,
    },
    {
        "word": "adjudicação",
        "title_regex": str(r"\badjudicação\b"),
        "content_regex": str(r"\badjudicação\b"),
        "class": HOMOLOG,
    },
    {
        "word": "convite",
        "title_regex": str(r"\bconvite\b"),
        "content_regex": str(r"\bconvite\b"),
        "class": EDITAL,
    },
    {
        "word": "edital",
        "title_regex": str(r"\bedital\b"),
        "content_regex": str(r"\bedital\b"),
        "class": EDITAL,
    },
    {
        "word": "cronograma",
        "title_regex": str(r"\bcronograma\b"),
        "content_regex": str(r"\bcronograma\b"),
        "class": OUTROS,
    },
    {
        "word": "aditamento",
        "title_regex": str(r"\baditamento\b"),
        "content_regex": str(r"\baditamento\b"),
        "class": OUTROS,
    },
    {
        "word": "retificação",
        "title_regex": str(r"\bretificação\b"),
        "content_regex": str(r"\bretificação\b"),
        "class": OUTROS,
    },
    {
        "word": "contrato administrativo",
        "title_regex": str(r"\bcontrato administrativo\b"),
        "content_regex": str(r"\bcontrato administrativo\b"),
        "class": OUTROS,
    },
    {
        "word": "ordem de serviço",
        "title_regex": str(r"\bordem de serviço\b"),
        "content_regex": str(r"\bordem de serviço\b"),
        "class": OUTROS,
    },
    {
        "word": "resposta",
        "title_regex": str(r"\bresposta\b"),
        "content_regex": str(r"\bresposta\b"),
        "class": OUTROS,
    },
    {
        "word": "extrato",
        "title_regex": str(r"\bextrato\b"),
        "content_regex": str(r"\bextrato\b"),
        "class": OUTROS,
    },
    {
        "word": "diário oficial",
        "title_regex": str(r"\bdiário oficial\b"),
        "content_regex": str(r"\bdiário oficial\b"),
        "class": OUTROS,
    },
    {
        "word": "aviso de",
        "title_regex": str(r"\baviso de\b"),
        "content_regex": str(r"\baviso de\b"),
        "class": OUTROS,
    },
]
# default_match_dict = {
#     "doc_id": pdf_id, "title": title_s, "city": city, "all_matches": [],
#     "ata_title_matches": [], "ata_content_matches": [], "ata_title_count": 0, "ata_content_count": 0,
#     "homolog_title_matches": [], "homolog_content_matches": [], "homolog_title_count": 0, "homolog_content_count": 0,
#     "edital_title_matches": [], "edital_content_matches": [], "edital_title_count": 0, "edital_content_count": 0,
#     "outros_title_matches": [], "outros_content_matches": [], "outros_title_count": 0, "outros_content_count": 0,
#     }
# pdfs = []

#### Analysis parameters

In [289]:
# [ATA, HOMOLOG, EDITAL, OUTROS]
classes_of_interest = [ATA]
keywords_of_interest = [key_word for key_word in keywords if key_word["class"] in classes_of_interest]
limit = 400
minimum = 20

#### Content extraction methods

In [86]:
def get_pages(pdf):
    return pdf.pages

def title_extraction_breaklines(pdf, words_limit=20):
    page = pdf.pages[0]
    content = page.extract_text()
    first_lines = []
    if bool(content):
        content = text_preprocessing.remove_special_characters(content, exceptions=["\n"]).lower()
        content = text_preprocessing.remove_excessive_spaces(content)
        content = re.sub(r"\n\s*\n", "\n", content)
        first_lines = content.split("\n", 6)[:-1]
    else:
        return None
    return first_lines

def get_first_page_content(pdf):
    page = pdf.pages[0]
    content = page.extract_text()
    if bool(content):
        content = text_preprocessing.remove_special_characters(content).lower()
        content = text_preprocessing.remove_excessive_spaces(content)
    else:
        return None
    return content

#### Saves a bunch of DataFrames in the same Spreadsheet

In [87]:
def save(dict_of_dataframes, path_to_save):
    with ExcelWriter(path_to_save) as f:
        for _key, _df in dict_of_dataframes.items():
            _df.to_excel(f, sheet_name=_key)

#### Read from .json

In [88]:
# \brief Read the all documents of all cities
#
# @param base_dir Path to the directory that contains the JSONs
# return A dictionary whose keys are city names and the contents are a list of documents
def load_content(base_dir):
    files_json = [f"{base_dir}/{f}" for f in os.listdir(base_dir) if f.endswith("json")]
    
    cities = dict()
    for city in files_json:
        city_name = city.split("/")[-1].split(".")[0]
        
        documents = list()
        with open(city) as f:
            for line in f:
                documents.append(json.loads(line))
        
        cities[city_name] = documents
        documents = list()
        
    for key, values in cities.items():    
        print(f"{key} contains {len(values)} documents")
        
    return cities
    
#load_content("../data/documentos_json")

#### Meta-classes methods

In [111]:
def get_meta_classe(matches_dict):
    title_counts = dict((k, matches_dict[k]) for k in matches_dict if k in title_keys)
    # se houver uma palavra que "anule" alguma das meta classes (ex: retificação de edital)
    if title_counts["outros_title_count"] > 0:
        return OUTROS
    if title_counts["homolog_title_count"] > 0:
        return HOMOLOG
    
    # ordena as contagens de palavras que ocorreram no título em ordem decrescente
    title_counts = [(k, v) for k, v in sorted(title_counts.items(), key=lambda item: item[1], reverse=True)]
    
    # ordena as contagens de palavras que ocorreram no conteúdo em ordem decrescente
    content_counts = dict((k, matches_dict[k]) for k in matches_dict if k in content_keys)
    content_counts = [(k, v)for k, v in sorted(content_counts.items(), key=lambda item: item[1], reverse=True)]

    doc_class = ""
    ## se a palavra chave estiver no título tem um peso maior
    if title_counts[0][1] > 0:
        doc_class = key_to_class(title_counts[0][0])
    elif content_counts[0][1] > 0:
        doc_class = key_to_class(content_counts[0][0])
    else:
        doc_class = OUTROS
    return doc_class

def key_to_class(key):
    return {
        "ata_title_count": ATA,
        "ata_content_count": ATA,
        "homolog_title_count": HOMOLOG,
        "homolog_content_count": HOMOLOG,
        "edital_title_count": EDITAL,
        "edital_content_count": EDITAL,
    }.get(key, OUTROS)


def update_class_count(doc_class, ata_count, homolog_count, edital_count, others_count):
    if doc_class == ATA:
        ata_count += 1
    if doc_class == HOMOLOG:
        homolog_count += 1
    if doc_class == EDITAL:
        edital_count += 1
    if doc_class == OUTROS:
        others_count += 1
    return ata_count, homolog_count, edital_count, others_count

# constrói a condicional para interromper a busca por arquivos
def conditions_of_interest(ata_count, edital_count, homolog_count, others_count):
    condition = False
    if ATA in classes_of_interest:
        condition = condition or ata_count < minimum
    if EDITAL in classes_of_interest:
        condition = condition or edital_count < minimum
    if HOMOLOG in classes_of_interest:
        condition = condition or homolog_count < minimum
    if OUTROS in classes_of_interest:
        condition = condition or others_count < minimum
    return condition

def get_content_matches(title, content):
    matches_dict = {
        "doc_id": pdf_id, "title": title_s, "city": city, "all_matches": [],
        "ata_title_matches": [], "ata_content_matches": [], "ata_title_count": 0, "ata_content_count": 0,
        "homolog_title_matches": [], "homolog_content_matches": [], "homolog_title_count": 0, "homolog_content_count": 0,
        "edital_title_matches": [], "edital_content_matches": [], "edital_title_count": 0, "edital_content_count": 0,
        "outros_title_matches": [], "outros_content_matches": [], "outros_title_count": 0, "outros_content_count": 0,
    }
    
    for word_dict in keywords:
        word = word_dict["word"]
        title_regex = word_dict["title_regex"]
        content_regex = word_dict["title_regex"]
        doc_class = word_dict["class"].lower()
        title_matches = []
        
        for index in range(len(title)):
            line = title[index]
            match = re.findall(title_regex, line.lower())
            
            if bool(match) and len(match) > 0:
                title_matches.append({"match": match, "line": index + 1})
                matches_dict[f"{doc_class}_title_matches"] += title_matches
                matches_dict[f"{doc_class}_title_count"] += len(title_matches)
                
        content_matches = re.findall(content_regex, content.lower())
        matches_dict[f"{doc_class}_content_matches"] += content_matches
        matches_dict[f"{doc_class}_content_count"] += max(len(content_matches) - len(title_matches), 0)
        matches_dict["all_matches"] += content_matches
    return matches_dict

#### Analysis

In [292]:
cities = set()
for folder_path in folder_paths:
    for _, filepath in zip(range(limit), glob.iglob(folder_path)):
        try:
            pdf_id = filepath.split("/")[-1]
            pdf_city = (
                filepath.split("/")[2]
                .split("licitacoes")[-1]
                .replace("-", "", 1)
                .replace("-", " ")
            )
            cities.add(pdf_city)
            pdfs.append({"id": pdf_id, "city": pdf_city, "path": filepath})
        except Exception as e:
            print("**", e)

In [293]:
cities_data = defaultdict(list)
empty = 0

for city in ["pirapetinga", "coqueiral", "cristais", "olaria", "passa vinte", "arantina", "ijaci", "sao bento abade"]:
    city_pdfs = [info for info in pdfs if info["city"] == city]
    ata_count, homolog_count, edital_count, others_count = (0,)*4
    
    pdf_i = 0
    print('-'*80)
    print(f'## {city} ##')
    
    while pdf_i < len(city_pdfs) and conditions_of_interest(ata_count, edital_count, homolog_count, others_count):
        try:
            pdf = city_pdfs[pdf_i]
            pdf_file = pdfplumber.open(pdf['path'])
            pdf_id = pdf["id"].split(".")[0]
        
            pages = get_pages(pdf_file)
            content = get_first_page_content(pdf_file)
            title = title_extraction_breaklines(pdf_file)
            
            if (bool(title) and len(title)) == 0 or not (content):
                empty += 1
                print("ERRO! DOCUMENTO ESCANEADO")
            else:
                matches_dict = get_content_matches(title, content)
                doc_class = get_meta_classe(matches_dict)
                ata_count, homolog_count, edital_count, others_count = update_class_count(doc_class, ata_count, homolog_count, edital_count, others_count)
                matches_dict["class"] = doc_class
                cities_data[city].append(matches_dict)
                pdf_file.close()
        except Exception as e:
            print("*-*-*", e)
        
        print('ata_count:', ata_count)
        print('homolog_count:', homolog_count)
        print('edital_count:', edital_count)
        print('others_count:', others_count)
        pdf_i += 1

--------------------------------------------------------------------------------
## pirapetinga ##
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 1
others_count: 0
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 1
others_count: 0
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 1
others_count: 1
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 1
others_count: 2
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 2
others_count: 2
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 2
others_count: 2
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 2
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 3
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 4
others_count: 3
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 5
others_count: 3
doc_class: ATA
ata_count: 1
homolog_count: 0
edital_count: 5
others_count: 3
ERRO! DO

doc_class: ATA
ata_count: 3
homolog_count: 6
edital_count: 22
others_count: 44
doc_class: HOMOLOG
ata_count: 3
homolog_count: 7
edital_count: 22
others_count: 44
doc_class: EDITAL
ata_count: 3
homolog_count: 7
edital_count: 23
others_count: 44
ERRO! DOCUMENTO ESCANEADO
ata_count: 3
homolog_count: 7
edital_count: 23
others_count: 44
doc_class: OUTROS
ata_count: 3
homolog_count: 7
edital_count: 23
others_count: 45
doc_class: HOMOLOG
ata_count: 3
homolog_count: 8
edital_count: 23
others_count: 45
doc_class: HOMOLOG
ata_count: 3
homolog_count: 9
edital_count: 23
others_count: 45
doc_class: HOMOLOG
ata_count: 3
homolog_count: 10
edital_count: 23
others_count: 45
doc_class: HOMOLOG
ata_count: 3
homolog_count: 11
edital_count: 23
others_count: 45
doc_class: OUTROS
ata_count: 3
homolog_count: 11
edital_count: 23
others_count: 46
doc_class: OUTROS
ata_count: 3
homolog_count: 11
edital_count: 23
others_count: 47
doc_class: EDITAL
ata_count: 3
homolog_count: 11
edital_count: 24
others_count: 47
d

doc_class: OUTROS
ata_count: 9
homolog_count: 20
edital_count: 40
others_count: 90
ERRO! DOCUMENTO ESCANEADO
ata_count: 9
homolog_count: 20
edital_count: 40
others_count: 90
doc_class: OUTROS
ata_count: 9
homolog_count: 20
edital_count: 40
others_count: 91
ERRO! DOCUMENTO ESCANEADO
ata_count: 9
homolog_count: 20
edital_count: 40
others_count: 91
doc_class: HOMOLOG
ata_count: 9
homolog_count: 21
edital_count: 40
others_count: 91
doc_class: OUTROS
ata_count: 9
homolog_count: 21
edital_count: 40
others_count: 92
doc_class: HOMOLOG
ata_count: 9
homolog_count: 22
edital_count: 40
others_count: 92
doc_class: OUTROS
ata_count: 9
homolog_count: 22
edital_count: 40
others_count: 93
doc_class: HOMOLOG
ata_count: 9
homolog_count: 23
edital_count: 40
others_count: 93
ERRO! DOCUMENTO ESCANEADO
ata_count: 9
homolog_count: 23
edital_count: 40
others_count: 93
doc_class: OUTROS
ata_count: 9
homolog_count: 23
edital_count: 40
others_count: 94
ERRO! DOCUMENTO ESCANEADO
ata_count: 9
homolog_count: 23
edi

doc_class: OUTROS
ata_count: 19
homolog_count: 30
edital_count: 53
others_count: 125
doc_class: OUTROS
ata_count: 19
homolog_count: 30
edital_count: 53
others_count: 126
doc_class: ATA
ata_count: 20
homolog_count: 30
edital_count: 53
others_count: 126
--------------------------------------------------------------------------------
## coqueiral ##
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 0
others_count: 1
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 0
others_count: 2
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 0
others_count: 3
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 0
others_count: 4
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 0
others_count: 5
doc_class: ATA
ata_count: 1
homolog_count: 0
edital_count: 0
others_count: 5
doc_class: OUTROS
ata_count: 1
homolog_count: 0
edital_count: 0
others_count: 6
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 1
others_count: 6
doc_class: ATA

doc_class: ATA
ata_count: 1
homolog_count: 0
edital_count: 1
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 1
others_count: 1
doc_class: OUTROS
ata_count: 1
homolog_count: 0
edital_count: 1
others_count: 2
doc_class: OUTROS
ata_count: 1
homolog_count: 0
edital_count: 1
others_count: 3
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 2
others_count: 3
doc_class: OUTROS
ata_count: 1
homolog_count: 0
edital_count: 2
others_count: 4
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 3
others_count: 4
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 3
others_count: 4
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 3
others_count: 4
doc_class: OUTROS
ata_count: 1
homolog_count: 0
edital_count: 3
others_count: 5
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 4
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 4
others_count: 5
ERRO! DOCUM

doc_class: OUTROS
ata_count: 13
homolog_count: 0
edital_count: 32
others_count: 30
ERRO! DOCUMENTO ESCANEADO
ata_count: 13
homolog_count: 0
edital_count: 32
others_count: 30
doc_class: ATA
ata_count: 14
homolog_count: 0
edital_count: 32
others_count: 30
ERRO! DOCUMENTO ESCANEADO
ata_count: 14
homolog_count: 0
edital_count: 32
others_count: 30
doc_class: OUTROS
ata_count: 14
homolog_count: 0
edital_count: 32
others_count: 31
doc_class: ATA
ata_count: 15
homolog_count: 0
edital_count: 32
others_count: 31
ERRO! DOCUMENTO ESCANEADO
ata_count: 15
homolog_count: 0
edital_count: 32
others_count: 31
doc_class: EDITAL
ata_count: 15
homolog_count: 0
edital_count: 33
others_count: 31
ERRO! DOCUMENTO ESCANEADO
ata_count: 15
homolog_count: 0
edital_count: 33
others_count: 31
doc_class: EDITAL
ata_count: 15
homolog_count: 0
edital_count: 34
others_count: 31
doc_class: EDITAL
ata_count: 15
homolog_count: 0
edital_count: 35
others_count: 31
doc_class: OUTROS
ata_count: 15
homolog_count: 0
edital_count

doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 2
others_count: 2
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 2
others_count: 3
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 3
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 4
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 4
others_count: 4
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 4
others_count: 5
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 4
others_count: 6
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 5
others_count: 6
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 6
others_count: 6
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 6
others_count: 7
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 7
others_count: 7
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 7
others_count: 8
doc_class: EDITAL
ata_count: 0
homolog_c

doc_class: EDITAL
ata_count: 1
homolog_count: 2
edital_count: 46
others_count: 48
doc_class: OUTROS
ata_count: 1
homolog_count: 2
edital_count: 46
others_count: 49
doc_class: EDITAL
ata_count: 1
homolog_count: 2
edital_count: 47
others_count: 49
doc_class: OUTROS
ata_count: 1
homolog_count: 2
edital_count: 47
others_count: 50
doc_class: EDITAL
ata_count: 1
homolog_count: 2
edital_count: 48
others_count: 50
doc_class: OUTROS
ata_count: 1
homolog_count: 2
edital_count: 48
others_count: 51
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 2
edital_count: 48
others_count: 51
doc_class: EDITAL
ata_count: 1
homolog_count: 2
edital_count: 49
others_count: 51
doc_class: EDITAL
ata_count: 1
homolog_count: 2
edital_count: 50
others_count: 51
doc_class: ATA
ata_count: 2
homolog_count: 2
edital_count: 50
others_count: 51
doc_class: OUTROS
ata_count: 2
homolog_count: 2
edital_count: 50
others_count: 52
doc_class: OUTROS
ata_count: 2
homolog_count: 2
edital_count: 50
others_count: 53
doc_class: 

doc_class: EDITAL
ata_count: 2
homolog_count: 5
edital_count: 84
others_count: 100
doc_class: OUTROS
ata_count: 2
homolog_count: 5
edital_count: 84
others_count: 101
doc_class: EDITAL
ata_count: 2
homolog_count: 5
edital_count: 85
others_count: 101
doc_class: ATA
ata_count: 3
homolog_count: 5
edital_count: 85
others_count: 101
doc_class: EDITAL
ata_count: 3
homolog_count: 5
edital_count: 86
others_count: 101
doc_class: OUTROS
ata_count: 3
homolog_count: 5
edital_count: 86
others_count: 102
doc_class: EDITAL
ata_count: 3
homolog_count: 5
edital_count: 87
others_count: 102
doc_class: EDITAL
ata_count: 3
homolog_count: 5
edital_count: 88
others_count: 102
doc_class: EDITAL
ata_count: 3
homolog_count: 5
edital_count: 89
others_count: 102
doc_class: OUTROS
ata_count: 3
homolog_count: 5
edital_count: 89
others_count: 103
doc_class: EDITAL
ata_count: 3
homolog_count: 5
edital_count: 90
others_count: 103
doc_class: EDITAL
ata_count: 3
homolog_count: 5
edital_count: 91
others_count: 103
doc_cla

doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 119
others_count: 154
doc_class: EDITAL
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 154
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 155
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 156
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 157
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 158
ERRO! DOCUMENTO ESCANEADO
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 158
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 159
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 160
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 161
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
others_count: 162
doc_class: OUTROS
ata_count: 6
homolog_count: 5
edital_count: 120
ot

doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 1
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 1
others_count: 1
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 2
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 2
others_count: 1
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 3
others_count: 1
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 4
others_count: 1
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 5
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 5
others_count: 1
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 6
others_count: 1
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 6
others_c

doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 34
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 34
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 34
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 34
others_count: 5
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 35
others_count: 5
ERRO! DOCUMENTO ESCANEADO
ata_count: 

ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 11
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 11
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 11
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 11
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 12
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 12
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 13
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 13
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 13
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 13
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 13
doc_class: EDITAL
ata_coun

ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 79
others_count: 16
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 80
others_count: 16
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 81
others_count: 16
doc_class: EDITAL
ata_count: 1
homolog_count: 0
edital_count: 82
others_count: 16
ERRO! DOCUMENTO ESCANEADO
ata_coun

ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19
ERRO! DOCUMENTO ESCANEADO
ata_count: 1
homolog_count: 0
edital_count: 106
others_count: 19

doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 52
others_count: 26
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 53
others_count: 26
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 54
others_count: 26
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 54
others_count: 26
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 55
others_count: 26
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 56
others_count: 26
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 26
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 27
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 57
others_count: 28
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 58
others_count: 28
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 58
others_count: 29
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 59
others_count: 29
doc_clas

doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 108
others_count: 58
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 109
others_count: 58
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 110
others_count: 58
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 110
others_count: 59
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 111
others_count: 59
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 112
others_count: 59
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 113
others_count: 59
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 113
others_count: 60
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 114
others_count: 60
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 115
others_count: 60
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 115
others_count: 60
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 115
others_count:

ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 162
others_count: 90
ERRO! DOCUMENTO ESCANEADO
ata_count: 0
homolog_count: 0
edital_count: 162
others_count: 90
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 163
others_count: 90
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 164
others_count: 90
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 165
others_count: 90
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 166
others_count: 90
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 166
others_count: 91
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 167
others_count: 91
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 168
others_count: 91
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 168
others_count: 92
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 169
others_count: 92
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 169
other

doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 217
others_count: 119
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 218
others_count: 119
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 218
others_count: 120
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 219
others_count: 120
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 220
others_count: 120
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 220
others_count: 121
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 221
others_count: 121
doc_class: OUTROS
ata_count: 0
homolog_count: 0
edital_count: 221
others_count: 122
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 222
others_count: 122
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 223
others_count: 122
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 224
others_count: 122
doc_class: EDITAL
ata_count: 0
homolog_count: 0
edital_count: 225
others_cou

In [294]:
documents_list = []
for city in list(cities_data.keys()):
    documents_list = documents_list + cities_data[city]
documents_df = pd.DataFrame(documents_list)[["doc_id", "title", "city", "all_matches", "class"]]
atas_df = documents_df[documents_df['class']==ATA].groupby(['city'])

In [295]:
for city, group in atas_df:
    print(f'## {city} ##')
    for _, row in group.iterrows():
        doc_id = row['doc_id']
        doc_path = [info['path'] for info in pdfs if info['id'] == f'{doc_id}.pdf'][0]
        pdf_file = pdfplumber.open(doc_path)
        pages = get_pages(pdf_file)
        has_table = False
        for index in range(len(pages)):
            page_tables = table_extraction(pdf_file, index)
            if len(page_tables) > 0:
                has_table = True
                break
                
        if has_table:
            file_name = doc_path.split('/')[-1].replace('.pdf', '_com_tabela.pdf')
        else:
            file_name = doc_path.split('/')[-1].replace('.pdf', '_sem_tabela.pdf')
        
        new_path = str(f"../data/ata_documents/{city}_files/{file_name}.pdf")
        try:
            os.mkdir(str(f"../data/ata_documents/{city}_files"))
        except Exception as e:
            print(e)
        print(file_name)
        shutil.copyfile(doc_path, new_path)
        #table_extraction

## arantina ##
4becab8695c598376574a4452973ec32_com_tabela.pdf
## coqueiral ##
35dc0bb8dd2dedbff03f2360b4b8c6ed_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
9256d0fb4dcb29c108981381763b3669_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
73d8120f264477207498d9c91c7bbdc7_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
5d3064bda50bd93b9f5f5628e1509116_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
3f16496856fed314801b4dd3aece0c3f_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
6a24d7c176c9d0bf21b898b82efa657d_sem_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
fa58b08e736c68b2b5eb25f97f663f33_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
5f7c36a0f5898517051b0913c5d56158_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/coqueiral_files'
239274f51830851f94505ef16

[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
fa486b16718aa5ff7d9a9de3ad82a4d4_sem_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
3f2bfb1ae90cad81b9d6849b9e715a0f_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
08b43f26baa37068d9128b9b248c1c6b_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
c7059c691dc4a51074ee5105cb2b5a69_sem_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
3346e5dde8c391674022d13db6e68867_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
1e194e1136e56d47f2e9bfc5ab79b97e_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
580e6393365c2957e221de56844dd011_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento abade_files'
0944ae72c0f0efa8f4d5e8cc499c131a_com_tabela.pdf
[Errno 17] File exists: '../data/ata_documents/sao bento

In [250]:
ata_df

Unnamed: 0,doc_id,title,city,all_matches,class
1354,1e194e1136e56d47f2e9bfc5ab79b97e,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, edital]",ATA
1358,0944ae72c0f0efa8f4d5e8cc499c131a,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, sessão pública, edital, edital, edital]",ATA
1321,d9d98477b8a563b4fe686767156c0a80,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, sessão pública, edital]",ATA
1309,b4391b7028124fe2941417a750eb2390,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, ata, ata, ata, ata, edital, edital]",ATA
1343,fa486b16718aa5ff7d9a9de3ad82a4d4,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, ata, edital, edital]",ATA
1292,91700bfc0d3e9cab7f6febbb93730bd1,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, ata, sessão pública]",ATA
1294,32bcc95227da6b2afdd7fa14edc75e1b,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, ata, ata, ata, ata, edital, edital]",ATA
1354,1e194e1136e56d47f2e9bfc5ab79b97e,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, edital]",ATA
1330,c7d2371dbc67b51c8e350e03d0c46b8a,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,[ata],ATA
1309,b4391b7028124fe2941417a750eb2390,cia da flor ltda epp cnpj 13 164 990 0001 4...,sao bento abade,"[ata, ata, ata, ata, ata, edital, edital]",ATA


In [253]:
for city in cities_data.keys():
    try:
        ata_df = documents_df[(documents_df["class"] == ATA) & (documents_df["city"] == city)].sample(min(len(documents_df[(documents_df["class"] == ATA) & (documents_df["city"] == city)]),minimum,),replace=True)
#         edital_df = documents_df[(documents_df["class"] == "EDITAL") & (documents_df["city"] == city)].sample(min(len(documents_df[(documents_df["class"] == "EDITAL") & (documents_df["city"] == city)]),minimum,),replace=True)
#         homolog_df = documents_df[(documents_df["class"] == "HOMOLOGAÇÃO") & (documents_df["city"] == city)].sample(min(len(documents_df[(documents_df["class"] == "HOMOLOGAÇÃO") & (documents_df["city"] == city)]),minimum,),replace=True)
#         others_df = documents_df[(documents_df["class"] == "OUTROS") & (documents_df["city"] == city)].sample(min(len(documents_df[(documents_df["class"] == "OUTROS") & (documents_df["city"] == city)]),minimum,),replace=True)
        
        df = ata_df.append(edital_df).append(homolog_df).append(others_df)
#         city_doc = list(ata_df.doc_id.unique()) + list(edital_df.doc_id.unique()) + list(homolog_df.doc_id.unique()) + list(others_df.doc_id.unique())
        city_doc = list(ata_df.doc_id.unique())
        for doc in city_doc:
            doc_path = [info['path'] for info in pdfs if info['id'] == f'{doc}.pdf'][0]
            file_name = doc.split("/")[-1]
            new_path = str(f"../data/ata_documents/{city}_files/{file_name}.pdf")
            try:
                os.mkdir(str(f"../data/ata_documents/{city}_files"))
            except Exception as e:
                print(e)
            print(new_path)
            shutil.copyfile(doc_path, new_path)

#         df.to_csv(str(f"cities_ata_documents/{city}_doc_classification.csv"), sep=",", encoding="utf-8")
    except Exception as e:
        print("**", e)

../data/ata_documents/pirapetinga_files/bbd89bca24b1fe7bcf7b791a7d4767ea.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'
../data/ata_documents/pirapetinga_files/6423156f2479b83267beae37664b8559.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'
../data/ata_documents/pirapetinga_files/f4decdfaff333dcdf476d0d0bc8a2a4a.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'
../data/ata_documents/pirapetinga_files/b67877144aeb0a59f6e63ecf9d914bc1.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'
../data/ata_documents/pirapetinga_files/2c892bb52aa3ba1d158473820673c636.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'
../data/ata_documents/pirapetinga_files/7228fb0214d8ce043a657df41dd316b8.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'
../data/ata_documents/pirapetinga_files/4c3777c684dfca6171414a06a0f03279.pdf
[Errno 17] File exists: '../data/ata_documents/pirapetinga_files'

In [None]:
shutil.make_archive(
    "cities_doc_classification/cities_doc_classification",
    "zip",
    "cities_doc_classification",
)

In [None]:
sns.histplot(documents_df["class"], kde=False, bins=8, stat="count")

In [None]:
documents_df[documents_df["city"] == "sao bento abade"]

In [None]:
len(
    documents_df[
        (documents_df["class"] == "ATA") & (documents_df["city"] == "ijaci")
    ]
)

In [None]:
documents_df[
    (documents_df["class"] == "ATA") & (documents_df["city"] == "arantina")
].sample(
    min(
        len(
            documents_df[
                (documents_df["class"] == "ATA") & (documents_df["city"] == "coqueiral")
            ].sample(10, replace=True)
        ),
        10,
    ),
    replace=True,
)

In [None]:
documents_df[
    (documents_df["class"] == "HOMOLOGAÇÃO") & (documents_df["city"] == "olaria")
].sample(10, replace=True)

In [None]:
len(documents_df[(documents_df["class"] == "ATA") & (documents_df["city"] == "ijaci")])

#### Adaptations...

In [112]:
def title_extraction_breaklines(pages, words_limit=20):
    content = pages[0]
    content = text_preprocessing.remove_special_characters(content, exceptions=["\n"]).lower()
    content = text_preprocessing.remove_excessive_spaces(content)
    content = re.sub(r"\n\s*\n", "\n", content)
    first_lines = content.split("\n", 6)[:-1]
    return first_lines

def get_first_page_content(pages):
    content = pages[0]
    content = text_preprocessing.remove_special_characters(content).lower()
    content = text_preprocessing.remove_excessive_spaces(content)
    return content

def get_content_matches(doc_id, title_s, title, content):
    matches_dict = {
        "doc_id": doc_id, "title": title_s, "city": city, "all_matches": [],
        "ata_title_matches"    : [], "ata_content_matches"    : [], "ata_title_count"    : 0, "ata_content_count"    : 0,
        "homolog_title_matches": [], "homolog_content_matches": [], "homolog_title_count": 0, "homolog_content_count": 0,
        "edital_title_matches" : [], "edital_content_matches" : [], "edital_title_count" : 0, "edital_content_count" : 0,
        "outros_title_matches" : [], "outros_content_matches" : [], "outros_title_count" : 0, "outros_content_count" : 0,
    }
    
    for word_dict in keywords:
        word = word_dict["word"]
        title_regex = word_dict["title_regex"]
        content_regex = word_dict["title_regex"]
        doc_class = word_dict["class"].lower()
        title_matches = []
        
        for index in range(len(title)):
            line = title[index]
            match = re.findall(title_regex, line.lower())
            
            if bool(match) and len(match) > 0:
                title_matches.append({"match": match, "line": index + 1})
                matches_dict[f"{doc_class}_title_matches"] += title_matches
                matches_dict[f"{doc_class}_title_count"] += len(title_matches)
                
        content_matches = re.findall(content_regex, content.lower())
        matches_dict[f"{doc_class}_content_matches"] += content_matches
        matches_dict[f"{doc_class}_content_count"] += max(len(content_matches) - len(title_matches), 0)
        matches_dict["all_matches"] += content_matches
    return matches_dict

In [155]:
cities_data = defaultdict(list)
empty = 0

cities_documents = load_content("../data/documentos_json")
ata_count, homolog_count, edital_count, others_count = [0, 0, 0, 0]

for city, documents in cities_documents.items():
    for document in documents:
        if document["status"] != "SUCCESS":
            continue
            
        pages = document["text_content"]
        if len(pages) == 0:
            continue
        
        doc_id = document["file_id"]
        content = get_first_page_content(pages)
        title = title_extraction_breaklines(pages)
        title_s = " ".join(title)
        
        if (bool(title) and len(title)) == 0 or not (content):
            empty += 1
            #print("ERRO! DOCUMENTO ESCANEADO")
        else:
            matches_dict = get_content_matches(doc_id, title_s, title, content)
            doc_class = get_meta_classe(matches_dict)
            ata_count, homolog_count, edital_count, others_count = update_class_count(doc_class, ata_count, homolog_count, edital_count, others_count)
            matches_dict["class"] = doc_class
            cities_data[city].append(matches_dict)

        #print('ata_count:', ata_count)
        #print('homolog_count:', homolog_count)
        #print('edital_count:', edital_count)
        #print('others_count:', others_count)

sao-bento-abade contains 232 documents
olaria contains 42 documents
coqueiral contains 1528 documents
cristais contains 1736 documents
pirapetinga contains 1007 documents
passa-vinte contains 395 documents
arantina contains 983 documents
ijaci contains 451 documents
itamarati contains 1110 documents
ribeirao-vermelho contains 684 documents


In [156]:
dataframes = dict()
for city in cities_data:
    _df = pd.DataFrame(data=cities_data[city])
    _df = _df.loc[:, ["doc_id", "title", "city", "all_matches", "class"]].rename(columns={"class": "meta-class"})
    _df["real_meta-class"] = ""
    _df["real_class"] = ""
    _df["key_words"] = ""
    _df["obs"] = ""
    dataframes[city] = _df

In [152]:
dataframes["olaria"]

Unnamed: 0,doc_id,title,city,all_matches,meta-class,real_meta-class,real_class,key_words,obs
0,4d53c2fbba114e2e594a3feb6d210c44,2014 plano municipal de gestão integrada de...,olaria,[],OUTROS,,,,
1,7087aaed7cb40a722df310e1dbd7d881,prefeitura municipal de olaria 2013 2016 num...,olaria,[ordem de serviço],OUTROS,,,,
2,3947c322bad70b9843133a03e1986eea,comunicado está suspenso por tempo indeter...,olaria,[edital],EDITAL,,,,
3,281f464653c7b5ac70f8c975e467884f,resposta ao pedido de esclarecimento ao edit...,olaria,"[edital, edital, resposta, resposta, resposta]",OUTROS,,,,
4,83d54594c1ec629f6138b613a8574e4d,termo de retificação de edital processo lic...,olaria,"[edital, edital, edital, retificação]",OUTROS,,,,
5,2d0172c38b553df7ceb653c791d59732,resposta ao pedido de esclarecimento ao edit...,olaria,"[edital, edital, edital, resposta, resposta]",OUTROS,,,,
6,0307dcdf9d1d057713a180f61d5daa1f,termo de retificação de edital processo lic...,olaria,"[edital, edital, edital, edital, cronograma, r...",OUTROS,,,,
7,60a76df5be4434ae1aadfa62c98ad1bd,termo de retificação de edital processo lic...,olaria,"[edital, edital, retificação]",OUTROS,,,,
8,8856d28873f1463ce4e1c7626148baf0,termo de retificação de edital processo lic...,olaria,"[edital, edital, edital, retificação]",OUTROS,,,,
9,11ebc58df6363a7c883d54d14f4add65,termo de retificação de edital processo lic...,olaria,"[edital, edital, edital, edital, edital, edita...",OUTROS,,,,


In [157]:
from IPython.display import display, HTML
pre_classified_xlsx = "./resultado_m03_meta_classes_extraction/doc_classification.xlsx"

for city, _df in dataframes.items():
    try:
        adapted_name = city.replace("-", " ")
        pre_classified = pd.read_excel(pre_classified_xlsx, adapted_name)
        pre_classified = pre_classified.loc[:, ~pre_classified.columns.str.contains('^Unnamed')]
        pre_classified.drop_duplicates(subset="doc_id", keep="first", inplace=True)
        pre_classified = pre_classified.loc[:, ["doc_id", "real_meta-class", "real_class", "key_words", "obs"]]
        _df_intermediary = _df.merge(pre_classified, on="doc_id", how="left", suffixes=("_to_drop", None))
        _df_intermediary = _df_intermediary.drop(columns=_df_intermediary.columns[_df_intermediary.columns.str.endswith("_to_drop")])
        _df_intermediary = _df_intermediary.sort_values(by="real_meta-class")
        _df_intermediary.reset_index(drop=True, inplace=True)
        dataframes[city] = _df_intermediary
    except Exception as e:
        print(city, e)

itamarati No sheet named <'itamarati'>
ribeirao-vermelho No sheet named <'ribeirao vermelho'>


In [158]:
save(dataframes, "./resultado_m03_meta_classes_extraction/resultado_parcial.xlsx")