# files/file_description.jsonl
- cada linha representa um arquivo coletado (pdf, docx, ...)
- url: indica o link de download do arquivo
- referer: indica a url da página onde o arquivo foi coletado

# raw_pages/file_description.jsonl
- cada linha representa uma página html que foi coletada
- url: indica o link da página coletada
- referer: indica o link da página (mãe) que foi utilizada para acessar a página baixada

## Encontrar ancestral comum
- É possível encontrar o html utilizado para baixar um arquivo a partir do join abaixo:
    - raw_pages['url'] == files['referer']
- Primeira abordagem: 
    - encontramos os ancestrais comuns utilizando apenas os registros do raw_pages
    - podamos os nós folha que não foram utilizados para baixar arquivos pdf

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Tree

from anytree import Node, RenderTree, search
from anytree.dotexport import DotExporter

def remove_leaves_without_pdf(nodes, root):
    """Detach the leaves below ``nodes[root]`` that carry no pdf.

    A node is pruned when it is a leaf and its ``pdf`` attribute equals
    ``['Null']``. Matching is done by a single ``search.findall`` call before
    any node is detached, so a node that only becomes a leaf because of this
    pruning is not removed by the same call.

    Args:
        nodes: Mapping from node name to tree node.
        root: Key in ``nodes`` of the node where the search starts.
    """
    def _is_leaf_without_pdf(candidate):
        return candidate.is_leaf and candidate.pdf == ['Null']

    for orphan in search.findall(nodes[root], filter_=_is_leaf_without_pdf):
        # Clearing the parent unlinks the leaf from the tree.
        orphan.parent = None

def find_siblings(nodes, root, file_name):
    """Return the nodes below ``nodes[root]`` whose ``pdf`` contains ``file_name``.

    Args:
        nodes: Mapping from node name to tree node.
        root: Key in ``nodes`` of the node where the search starts.
        file_name: Value looked up with ``in`` against each node's ``pdf``.

    Returns:
        Whatever ``search.findall`` yields for that filter.
    """
    def _holds_file(candidate):
        return file_name in candidate.pdf

    return search.findall(nodes[root], filter_=_holds_file)

def find_parent_with_pdf(nodes, root, file_name):
    """Return the parent of the first node whose ``pdf`` contains ``file_name``.

    Args:
        nodes: Mapping from node name to tree node.
        root: Key in ``nodes`` of the node where the search starts.
        file_name: Value looked up with ``in`` against each node's ``pdf``.

    Raises:
        IndexError: If no node below ``nodes[root]`` matches.
    """
    matches = search.findall(
        nodes[root],
        filter_=lambda candidate: file_name in candidate.pdf,
    )
    first_match = matches[0]
    return first_match.parent

def add_nodes(nodes, parent, child, file_name_pdf, file_name_html):
    """Register ``parent`` and ``child`` in ``nodes`` and hang child under parent.

    A parent seen for the first time is created with placeholder ``["Null"]``
    payloads. The child always ends up carrying ``file_name_pdf`` and
    ``file_name_html``, whether it is created here or already existed.

    Args:
        nodes: Mapping from node name to tree node; updated in place.
        parent: Name of the parent node.
        child: Name of the child node.
        file_name_pdf: Value stored in the child's ``pdf`` attribute.
        file_name_html: Value stored in the child's ``html`` attribute.
    """
    # Imported here so the block does not depend on an edit to the import cell.
    from anytree import LoopError

    if parent not in nodes:
        nodes[parent] = Node(parent, pdf=["Null"], html=["Null"])

    if child not in nodes:
        nodes[child] = Node(child, pdf=file_name_pdf, html=file_name_html)
    else:
        # Existing node (possibly a placeholder parent): overwrite its payload.
        nodes[child].pdf = file_name_pdf
        nodes[child].html = file_name_html

    try:
        nodes[child].parent = nodes[parent]
    except LoopError:
        # The link would create a cycle (including child == parent); skip
        # it and keep the tree as it is. Any other error now propagates
        # instead of being silently swallowed.
        pass

In [3]:

from bs4 import BeautifulSoup
import re
import collections
import tqdm

def get_tag(soup, url):
    """Return the first ``<a>`` element whose ``href`` contains ``url``.

    ``url`` is matched literally: every regex metacharacter is escaped with
    ``re.escape`` before the pattern is handed to the href filter, so
    characters such as ``.``, ``+``, ``[`` or ``?`` in a link can neither
    act as wildcards nor produce an invalid pattern.

    Args:
        soup: Parsed document exposing ``find_all``.
        url: Literal text to look for inside the ``href`` attribute.

    Returns:
        The first matching element, or ``None`` when nothing matches.

    Raises:
        TypeError: If ``url`` is not a string (e.g. ``None``).
    """
    href_pattern = re.compile(re.escape(url))
    anchors = soup.find_all('a', href=href_pattern)
    return anchors[0] if anchors else None

def lowest_common_ancestor(parents=None, depth=0, *args):
    """Walk the given tags upwards level by level until one node is shared.

    Every visited node gets a visit counter; the first node whose counter
    reaches the module-level ``NUM_OF_NODES`` (set by the caller before the
    call) is returned. ``depth`` is incremented once per visited node, not
    once per level, so it is a cost measure rather than a tree height.

    The walk is iterative, so deep documents cannot exhaust the interpreter
    stack, and a search that can never succeed ends with an explicit error
    instead of recursing until ``RecursionError``.

    Args:
        parents: Optional mapping node -> visit count to continue from;
            a fresh counter is created when ``None``.
        depth: Starting value of the visit counter returned with the node.
        *args: Nodes exposing a ``parent`` attribute (``None`` at the root).

    Returns:
        Tuple ``(node, depth)`` for the first node visited ``NUM_OF_NODES``
        times.

    Raises:
        ValueError: If all chains reach their root without any node being
            visited ``NUM_OF_NODES`` times.
    """
    visit_counts = collections.defaultdict(int) if parents is None else parents
    frontier = list(args)

    while frontier:
        for tag in frontier:
            visit_counts[tag] += 1
            depth += 1
            if visit_counts[tag] == NUM_OF_NODES:
                return tag, depth
        # Move every chain one level up; chains that reached the root drop out.
        frontier = [tag.parent for tag in frontier if tag.parent is not None]

    raise ValueError("no common ancestor found for the given tags")

def remove_common_prefix(m):
    """Return the part of ``m['url_pdf']`` that follows its common prefix with ``m['url_html']``.

    The two strings are compared character by character; at the first
    position where they differ, the remainder of ``url_pdf`` from that
    position on is returned.

    Args:
        m: Mapping-like row with ``'url_html'`` and ``'url_pdf'`` entries.

    Returns:
        The differing tail of ``url_pdf``, or ``None`` when either value
        prints as ``"nan"`` or when one string is a prefix of the other
        (no differing position exists).
    """
    s1 = m['url_html']
    s2 = m['url_pdf']
    if str(s1) == "nan" or str(s2) == "nan":
        return None
    # zip stops at the shorter string, so a url_pdf shorter than url_html can
    # no longer trigger an IndexError.
    for i, (c_html, c_pdf) in enumerate(zip(s1, s2)):
        if c_html != c_pdf:
            return s2[i:]
    return None

In [44]:
def remove_file_name_extension(df):
    """Strip ``.pdf`` and ``;`` from the ``file_name_pdf`` column, in place.

    Both replacements are literal (``regex=False``). Without that flag the
    ``'.pdf'`` argument is interpreted as a regular expression on older
    pandas versions, where ``.`` matches any character, so the outcome
    depended on the installed pandas version.

    Args:
        df: DataFrame with a string column ``file_name_pdf``; modified in place.

    Returns:
        The same ``df`` object.
    """
    names = df['file_name_pdf'].str
    df['file_name_pdf'] = names.replace('.pdf', '', regex=False).str.replace(';', '', regex=False)
    return df

In [45]:
def get_processo_licitatorio_from_tag(tags):
    """Map each tag's string form to the last process number found in its text.

    The module-level ``pattern`` is applied case-insensitively to
    ``tag.text``; the last capture group of the last match is kept. Tags
    with an identical string form share one entry (the later one wins).

    Args:
        tags: Iterable of objects exposing ``text`` and a string form.

    Returns:
        dict mapping ``str(tag)`` to the extracted number.

    Raises:
        IndexError: If ``pattern`` does not match a tag's text.
    """
    return {
        str(element): re.findall(pattern, element.text, re.IGNORECASE)[-1][-1]
        for element in tags
    }

In [46]:
def check_processo_licitatorio_in_list(x):
    """Check whether the ground-truth process number is among the numbers found in the HTML.

    ``x['processo_licitatorio_lca_html']`` is expected to be the string form
    of a list of numbers. Brackets, quotes and spaces are stripped, the text
    is split on commas and every item is left-padded with zeros to 8
    characters before the comparison.

    Args:
        x: Mapping-like row with ``'num_processo_licitatorio_doc'`` and
            ``'processo_licitatorio_lca_html'`` entries.

    Returns:
        ``pd.Series([hit, size])`` where ``hit`` is 1 if the ground truth is
        in the list (else 0) and ``size`` is the number of items. When the
        HTML value is missing or is not a string, ``pd.Series([0, 0])``.
    """
    ground_truth = x['num_processo_licitatorio_doc']
    try:
        raw = x['processo_licitatorio_lca_html']
        candidates = raw.replace("[", "").replace("]", "").replace("'", "").replace(" ", "").split(",")
    except (AttributeError, KeyError):
        # Missing column, or a non-string value such as None/NaN: nothing
        # was found. Other errors are no longer hidden.
        return pd.Series([0, 0])
    padded = [num_proc.zfill(8) for num_proc in candidates]
    return pd.Series([int(ground_truth in padded), len(padded)])

def retrieve_list_size(x):
    """Return how many process numbers were found in the HTML for this row.

    ``x['processo_licitatorio_lca_html']`` is expected to be the string form
    of a list; brackets, quotes and spaces are stripped and the text is
    split on commas.

    Args:
        x: Mapping-like row with a ``'processo_licitatorio_lca_html'`` entry.

    Returns:
        ``pd.Series([size])``; ``pd.Series([0])`` when the value is missing
        or is not a string.
    """
    try:
        raw = x['processo_licitatorio_lca_html']
        candidates = raw.replace("[", "").replace("]", "").replace("'", "").replace(" ", "").split(",")
    except (AttributeError, KeyError):
        # Missing column, or a non-string value such as None/NaN.
        return pd.Series([0])
    return pd.Series([len(candidates)])

In [None]:
### Raw pages

# Dataset root directory per city. The pipeline loop reads
# data/raw_pages/ and data/files/ below each root. Commented-out entries are
# not processed. "PL" in the notes stands for "processo licitatório".
#raw_pages_path = "../data/288_licitacoes-pirapetinga/data/raw_pages/"
cities_path = {
    #'arantina': '../data/306-licitacoes-arantina/',
    #'coqueiral': '../data/289-licitacoes-coqueiral/',
    'cristais': '../data/290-licitacoes-cristais/', # one page per PL; documents belonging to the same PL can be found through the tree
    'ijaci': '../data/353-licitacoes-ijaci/', # everything on a single page, table with one PL per row; the file cannot be linked to the table in the html
    'olaria': '../data/302-licitacoes-olaria/', # everything on a single page, table with one PL per row
    'passa-vinte': '../data/304-licitacoes-passa-vinte/', # links do not match
    'pirapetinga': '../data/288_licitacoes-pirapetinga/', # 
    'sao-bento-abade': '../data/381-licitacoes-sao-bento-abade/' # everything on a single page, can be recovered via div
}
# Run the whole pipeline once per configured city: load the crawl metadata,
# build the page tree, compute LCA-based groupings inside each html page,
# join with the ground-truth CSV and append one summary row to df_acertos.
for city in cities_path.keys():
    print("-"*100)
    print(city)
    print("-"*100)
    root_path = cities_path[city]

    df_raw_pages = pd.read_json(root_path+"data/raw_pages/file_description.jsonl", lines=True)
    # drop columns
    columns_to_keep = ['file_name', 'url', 'referer']
    df_raw_pages = df_raw_pages[columns_to_keep]
    # Result is discarded here (leftover from interactive use).
    df_raw_pages.head()

    ### Files

    df_files = pd.read_json(root_path+"/data/files/file_description.jsonl", lines=True)
    # select only pdf
    # NOTE(review): `!= None` on a Series is an element-wise comparison and
    # probably does not filter missing values as intended - confirm.
    df_files = df_files.loc[(df_files['type'].str.contains("pdf")) & (df_files['type'] != None)]
    # drop columns
    columns_to_keep = ['file_name', 'url', 'referer']
    df_files = df_files[columns_to_keep]

    # Only used by the commented-out print below.
    num_referers = df_files['referer'].nunique()
    num_urls = df_files['url'].nunique()
    #print("{} referers and {} urls".format(num_referers, num_urls))
    df_files.head()

    # Search for pdf

    # Left join: every page, plus the pdfs whose referer is that page.
    # Overlapping columns get the _html (page) / _pdf (file) suffixes.
    data = df_raw_pages.merge(df_files, left_on='url', right_on='referer', suffixes=['_html', '_pdf'], how='left' )
    # Group file names in list
    # Missing values become the string 'Null'; the tree helpers compare the
    # pdf list against ['Null'] to recognise pages without pdf.
    data.fillna('Null', inplace=True)
    data = pd.DataFrame(data.groupby(['url_html', 'referer_html', 'referer_pdf', 'file_name_html'])['file_name_pdf'].apply(list)).reset_index()
    data.head()

    nodes = {}  # store references to created nodes 
    # data.apply(lambda x: add_nodes(nodes, x["referer_html"], x["url_html"]), axis=1)  # 1-liner
    for parent, child, file_name_pdf, file_name_html in zip(data["referer_html"],data["url_html"], data["file_name_pdf"], data["file_name_html"]):
        add_nodes(nodes, parent, child, file_name_pdf, file_name_html)

    # Roots: referers that never appear as the url of a collected page.
    roots = list(data[~data["referer_html"].isin(data["url_html"])]["referer_html"].unique())
    # remove leaves without pdfs
    # NOTE(review): only the first root is pruned; roots[0] fails if the list
    # is empty.
    remove_leaves_without_pdf(nodes, roots[0])

    #print(RenderTree(nodes[roots[0]]))

    # Hard-coded example lookup; the result is discarded.
    file_name = "0b96de38bae73ce0cc9f749d4732ee1a.pdf"
    find_siblings(nodes, roots[0], file_name)

    # Parsing html files


    # Same page/pdf join as above, kept unaggregated: one row per pdf.
    df_data_lca = df_raw_pages.merge(df_files, left_on='url', right_on='referer', suffixes=['_html', '_pdf'], how='left' )
    #print(df_data_lca.shape)
    df_data_lca.drop_duplicates(subset=['file_name_pdf'], inplace=True)
    #print(df_data_lca.shape)

    # Columns filled by the first LCA pass (pdf link vs. other pdf links).
    df_data_lca['lca_url'] = None
    df_data_lca['lca_min'] = None
    df_data_lca['lca_dist'] = np.inf
    # href_pdf: pdf url with the prefix it shares with the page url removed.
    df_data_lca['href_pdf'] = df_data_lca.apply(lambda x: remove_common_prefix(x), axis=1)

    html_files = df_data_lca['file_name_html'].unique()
    #html_files = ["0066af496d38f92ee926726027d88cbe.html"]
    #html_files = html_files[:5]
    # NOTE: the captured output shows tqdm asking for `tqdm.notebook.tqdm`
    # instead of `tqdm.tqdm_notebook`.
    for file_name_html in tqdm.tqdm_notebook(html_files):
        #url_pdf = df_data_lca.loc[df_data_lca['file_name_html'] == file_name_html, 'url_pdf'].unique()
        url_pdf = df_data_lca.loc[df_data_lca['file_name_html'] == file_name_html, 'href_pdf'].unique()
        # Parse the page with the default encoding first, then retry as
        # windows-1252; pages that fail both attempts are skipped.
        # NOTE(review): the file handles are never closed.
        try:
            fp =  open(root_path+"data/raw_pages/"+file_name_html)
            soup = BeautifulSoup(fp)
        except Exception as e:
            #print(e)
            try:
                fp =  open(root_path+"data/raw_pages/"+file_name_html, encoding="windows-1252")
                soup = BeautifulSoup(fp)
            except:
                #print(file_name_html)
                continue
        #for id_a, url_a in enumerate(url_pdf[:-1]):
        # For every pdf link on the page, compare it with every other pdf
        # link and keep the ones reached with the smallest LCA cost.
        for id_a, url_a in enumerate(url_pdf):
            lca_min_size = np.inf
            lca_min = None
            lca_url = [url_a]
            #print(url_a)
            #tag_a = get_tag(soup, url_a.split("/")[-1])
            try:
                #tag_a = get_tag(soup, url_a.split("/",3)[-1])
                tag_a = get_tag(soup, url_a)
                #print(tag_a)
            except Exception as e:
                #print(e)
                continue
            # ignore page without link
            if tag_a != None:
                #for url_b in url_pdf[id_a+1:]:
                for id_b, url_b in enumerate(url_pdf):
                    if id_a == id_b:
                        continue
                    #tag_b = get_tag(soup, url_b.split("/")[-1])
                    try:
                        #tag_b = get_tag(soup, url_b.split("/",3)[-1])
                        tag_b = get_tag(soup, url_b)
                        #print(tag_b)
                    except Exception as e:
                        #print(e)
                        continue

                    if tag_b != None:                    
                        # List of tag to be searched
                        list_of_tag = [tag_a, tag_b]
                        # Module-level global read by lowest_common_ancestor.
                        NUM_OF_NODES = len(list_of_tag)
                        # find lca
                        lca, lca_size = lowest_common_ancestor(None,0, *list_of_tag)

                        # A strictly smaller cost restarts the list; a tie on
                        # the same ancestor node extends it.
                        if lca_size < lca_min_size:
                            lca_min_size = lca_size
                            lca_url = [url_b]
                            lca_min = lca
                        elif lca_min == lca:
                            lca_url.append(url_b)

            # Store the result for url_a, and for every url in the group, when
            # it improves on the cost already recorded for that row.
            # NOTE(review): `.values > x` is used as a condition; this relies
            # on the selection holding exactly one row - confirm.
            if df_data_lca.loc[df_data_lca['href_pdf'] == url_a, 'lca_dist'].values > lca_min_size:
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['lca_url']] = str(lca_url)
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['lca_dist']] = lca_min_size
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['lca_min']] = str(lca_min)
            for url in lca_url:
                if df_data_lca.loc[df_data_lca['href_pdf'] == url, 'lca_dist'].values > lca_min_size:
                    df_data_lca.loc[df_data_lca['href_pdf'] == url, ['lca_dist']] = lca_min_size
                    df_data_lca.loc[df_data_lca['href_pdf'] == url, ['lca_url']] = str(lca_url)
                    df_data_lca.loc[df_data_lca['href_pdf'] == url, ['lca_min']] = str(lca_min)
            """if df_data_lca.loc[df_data_lca['href_pdf'] == lca_url, 'lca_dist'].values > lca_min_size:
                df_data_lca.loc[df_data_lca['href_pdf'] == lca_url, ['lca_url', 'lca_dist']] = [url_a, lca_min_size]"""


    # Find PL

    

    # Read file
    # Per-city CSV, resolved relative to the current working directory.
    df_processo_lic = pd.read_csv("./resultado_processo_licitatorio/{}.csv".format(city))
    # Explode file name
    # 'arquivos' becomes a list of base names (text after the last "/").
    df_processo_lic['arquivos'] = df_processo_lic["arquivos"].apply(lambda x: [file.split("/")[-1] for file in x.split(",")])
    # 'expressão' is split as well but dropped by the column selection below.
    df_processo_lic['expressão'] = df_processo_lic["expressão"].apply(lambda x: [file for file in x.split(",")])
    df_processo_lic = df_processo_lic.explode("arquivos")[['no. licitação', 'arquivos']]
    # rename columns
    df_processo_lic.columns = ["num_processo_licitatorio", "file_name_pdf"]
    # remove extension
    # presumably the CSV lists file names without the ".pdf" suffix - verify
    df_data_lca = remove_file_name_extension(df_data_lca)
    # merge data
    df_data_lca_pl = df_data_lca.merge(df_processo_lic, on="file_name_pdf", how='left')
    #print(df_data_lca_pl.shape)
    # Result is discarded here (leftover from interactive use).
    df_data_lca_pl['num_processo_licitatorio'].isnull().sum()

    #df_data_lca_pl = df_data_lca_pl.copy()#.loc[df_data_lca_pl['file_name_html'] == '7f2c78b4493818628d2649a7320cfa81.html'].copy()
    # Attach to every document the process number of labelled documents that
    # share the same lca_min: *_doc is the document's own label, *_lca_doc the
    # one obtained through the shared ancestor.
    df_data_lca_pl_2 = df_data_lca_pl.dropna(axis=0, subset=["num_processo_licitatorio"])
    df_data_lca_pl = df_data_lca_pl.merge(df_data_lca_pl_2[["num_processo_licitatorio", "lca_min"]].drop_duplicates(), on="lca_min", how='left', suffixes=('_doc', '_lca_doc'))
    #df_data_lca_pl['num_processo_licitatorio_x'].fillna(df_data_lca_pl['num_processo_licitatorio_y'], inplace=True)
    #print(df_data_lca_pl.shape)
    df_data_lca_pl.drop_duplicates("file_name_pdf", inplace=True)
    df_data_lca_pl.reset_index(drop=True, inplace=True)
    #df_data_lca_pl['validação'] = False
    #print(df_data_lca_pl.shape)
    #print(df_data_lca_pl["num_processo_licitatorio_doc"].isnull().sum())

    # Find PL in HTML files

    # Reset the LCA columns for the second pass (pdf link vs. text nodes that
    # mention a process number).
    df_data_lca['lca_url'] = None
    df_data_lca['lca_min'] = None
    df_data_lca['lca_dist'] = np.inf
    df_data_lca['href_pdf'] = df_data_lca.apply(lambda x: remove_common_prefix(x), axis=1)
    df_data_lca['processo_licitatorio_lca_html'] = None

    html_files = df_data_lca['file_name_html'].unique()
    #html_files = html_files[:5]
    for file_name_html in tqdm.tqdm_notebook(html_files):
        #url_pdf = df_data_lca.loc[df_data_lca['file_name_html'] == file_name_html, 'url_pdf'].unique()
        url_pdf = df_data_lca.loc[df_data_lca['file_name_html'] == file_name_html, 'href_pdf'].unique()
        # Same two-step parsing as in the first pass.
        try:
            fp =  open(root_path+"data/raw_pages/"+file_name_html)
            soup = BeautifulSoup(fp)
        except Exception as e:
            #print(e)
            try:
                fp =  open(root_path+"data/raw_pages/"+file_name_html, encoding="windows-1252")
                soup = BeautifulSoup(fp)
            except:
                #print(file_name_html)
                continue


        #pattern = "(Processo(?:.{1})?(?:Licitatório)?(?:Licitatorio)?(?:.{1,5})(\d{3}/\d{4}))"
        # Expected matchs:
        # Processo Licitatorio XXX/XXXX - Processo Licitatorio XX/XXXX
        # Processo XXX/XXXX - Processo XX/XXXX
        # Processo de Licitação XXX/XXXX - Processo de Licitação XX/XXXX
        # Module-level global also read by get_processo_licitatorio_from_tag.
        # NOTE(review): written as a plain (non-raw) string containing "\d".
        pattern = "(Processo(?:.{1,20})?(?:Licitatório)?(?:Licitatorio)?(?:.{1,5})((?:\d{3})?(?:\d{2})/\d{4}))"

        # Text nodes matching the pattern, replaced by their parent elements.
        tags = soup.find_all(string=re.compile(pattern, re.IGNORECASE), recursive=True)
        # drop duplicated
        #tags = set([tag.findParent() for tag in tags])
        tags = [tag.findParent() for tag in tags]
        # get pl number from tag
        tag_proc_lic = get_processo_licitatorio_from_tag(tags)

        # For every pdf link, keep the process-number tags reached with the
        # smallest LCA cost.
        for id_a, url_a in enumerate(url_pdf):
            lca_min_size = np.inf
            lca_min = None
            lca_url = [url_a]
            try:
                tag_a = get_tag(soup, url_a)
            except Exception as e:
                #print(e)
                continue
            # ignore page without link
            if tag_a != None:
                for tag_b in tags:
                    # List of tag to be searched
                    list_of_tag = [tag_a, tag_b]
                    NUM_OF_NODES = len(list_of_tag)
                    # find lca
                    lca, lca_size = lowest_common_ancestor(None,0, *list_of_tag)
                    if lca_size < lca_min_size:
                        lca_min_size = lca_size
                        lca_url = [tag_b]
                        lca_min = lca
                    elif lca_min == lca:
                        lca_url.append(tag_b)                    


            # Only entered when a finite cost was found, i.e. lca_url holds
            # tags (not the initial url) when tag_proc_lic is indexed below.
            if df_data_lca.loc[df_data_lca['href_pdf'] == url_a, 'lca_dist'].values > lca_min_size:
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['lca_url']] = str(lca_url)
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['lca_dist']] = lca_min_size
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['lca_min']] = str(lca_min)
                df_data_lca.loc[df_data_lca['href_pdf'] == url_a, ['processo_licitatorio_lca_html']] = str([tag_proc_lic[str(tag)] for tag in lca_url])

    # Bring the numbers extracted from the html next to the document labels.
    df_data_lca_pl = df_data_lca_pl.merge(df_data_lca[['file_name_pdf', 'processo_licitatorio_lca_html']], on='file_name_pdf')
    df_data_lca_pl

    # Comparison between methods

    #print(df_data_lca_pl.shape)
    # Ground truth: documents whose process number is known from the CSV.
    df_ground_truth = df_data_lca_pl.loc[~df_data_lca_pl['num_processo_licitatorio_doc'].isnull()].copy()
    #print(df_ground_truth.shape)

    # Result is discarded here (leftover from interactive use).
    df_ground_truth.loc[df_ground_truth.num_processo_licitatorio_doc != df_ground_truth.num_processo_licitatorio_lca_doc]

    

    df_ground_truth['found_num_processo_html'] = None
    df_ground_truth['found_num_processo_lca_doc'] = 0
    df_ground_truth['size_processos_found'] = None
    # The same apply is evaluated twice: once for printing, once for storing.
    print(df_ground_truth.apply(lambda x: check_processo_licitatorio_in_list(x), axis=1))
    df_ground_truth[['found_num_processo_html', 'size_processos_found']] = df_ground_truth.apply(lambda x: check_processo_licitatorio_in_list(x), axis=1)
    df_ground_truth.loc[df_ground_truth['num_processo_licitatorio_doc'] == df_ground_truth['num_processo_licitatorio_lca_doc'], 'found_num_processo_lca_doc'] =1
    df_ground_truth[['found_num_processo_lca_doc','found_num_processo_html', 'size_processos_found']]

    # Number of documents for which exactly one process number was found
    df_data_lca_pl['size_processos_found'] = None
    df_data_lca_pl['size_processos_found'] = df_data_lca_pl.apply(lambda x: retrieve_list_size(x), axis=1)


    # Append this city's summary row. If the first attempt raises (e.g.
    # df_acertos does not exist yet), the table is (re)created and the row
    # is appended to it.
    # NOTE(review): the bare except also catches unrelated errors and then
    # replaces any existing df_acertos.
    try:
        df_acertos.loc[len(df_acertos)] = [city,  
                                           df_data_lca_pl.shape[0],
                                           df_data_lca_pl.loc[df_data_lca_pl['size_processos_found'] == 1].shape[0],
                                           df_data_lca_pl.loc[df_data_lca_pl['size_processos_found'] != 1].shape[0],
                                           df_ground_truth.shape[0],
                                           df_ground_truth.loc[(df_ground_truth.found_num_processo_html == 1)&(df_ground_truth.size_processos_found == 1)].shape[0]
                                          ]
    except:
        df_acertos = pd.DataFrame(columns = ["city",'Total documentos' ,'Achou apenas 1', 'Achou != 1', 'Tamanho GT', 'Achou 1 e acertou'])
        df_acertos.loc[len(df_acertos)] = [city, 
                                           df_data_lca_pl.shape[0],
                                           df_data_lca_pl.loc[df_data_lca_pl['size_processos_found'] == 1].shape[0],
                                           df_data_lca_pl.loc[df_data_lca_pl['size_processos_found'] != 1].shape[0],
                                           df_ground_truth.shape[0],
                                           df_ground_truth.loc[(df_ground_truth.found_num_processo_html == 1)&(df_ground_truth.size_processos_found == 1)].shape[0]
                                          ]
    # `display` is not imported in this file; presumably the notebook builtin.
    display(df_acertos)
    

----------------------------------------------------------------------------------------------------
cristais
----------------------------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=711.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=711.0), HTML(value='')))


      0  1
18    1  1
19    1  1
20    1  1
23    1  1
26    0  1
...  .. ..
1375  0  1
1391  0  1
1452  0  1
1579  1  1
1678  1  1

[286 rows x 2 columns]


Unnamed: 0,city,Total documentos,Achou apenas 1,Achou != 1,Tamanho GT,Achou 1 e acertou
0,arantina,938,798,140,220,113
1,coqueiral,1529,194,1335,390,35
2,cristais,1737,1648,89,286,199


----------------------------------------------------------------------------------------------------
ijaci
----------------------------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


     0  1
0    0  0
2    0  0
7    0  0
12   0  0
14   0  0
..  .. ..
445  0  0
446  0  0
447  0  0
448  0  0
449  0  0

[176 rows x 2 columns]


Unnamed: 0,city,Total documentos,Achou apenas 1,Achou != 1,Tamanho GT,Achou 1 e acertou
0,arantina,938,798,140,220,113
1,coqueiral,1529,194,1335,390,35
2,cristais,1737,1648,89,286,199
3,ijaci,451,0,451,176,0


----------------------------------------------------------------------------------------------------
olaria
----------------------------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


    0  1
3   1  1
6   1  1
8   1  1
9   0  1
10  1  1
11  1  1
12  1  1
16  1  1
18  1  1
20  0  1
25  1  1
27  1  1
33  1  1
35  1  1
40  1  1


Unnamed: 0,city,Total documentos,Achou apenas 1,Achou != 1,Tamanho GT,Achou 1 e acertou
0,arantina,938,798,140,220,113
1,coqueiral,1529,194,1335,390,35
2,cristais,1737,1648,89,286,199
3,ijaci,451,0,451,176,0
4,olaria,43,39,4,15,13


----------------------------------------------------------------------------------------------------
passa-vinte
----------------------------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=329.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=329.0), HTML(value='')))


     0  1
18   0  0
35   0  0
44   0  0
47   0  0
52   0  0
56   0  0
74   0  0
102  0  0
111  0  0
112  0  0
116  0  0
121  0  0
130  0  0
138  0  0
140  0  0
148  0  0
152  0  0
153  0  0
155  0  0
163  0  0
164  0  0
166  0  0
180  0  0
200  0  0
215  0  0
223  0  0
235  0  0
238  0  0
257  0  0
264  0  0
270  0  0
272  0  0
275  0  0
280  0  0
289  0  0
298  0  0
300  0  0
310  0  0
312  0  0
314  0  0
316  0  0
351  0  0
353  0  0
358  0  0
360  0  0
362  0  0
364  0  0
373  0  0
375  0  0
381  0  0


Unnamed: 0,city,Total documentos,Achou apenas 1,Achou != 1,Tamanho GT,Achou 1 e acertou
0,arantina,938,798,140,220,113
1,coqueiral,1529,194,1335,390,35
2,cristais,1737,1648,89,286,199
3,ijaci,451,0,451,176,0
4,olaria,43,39,4,15,13
5,passa-vinte,396,0,396,50,0


----------------------------------------------------------------------------------------------------
pirapetinga
----------------------------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=236.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=236.0), HTML(value='')))


      0  1
11    0  1
12    0  1
13    0  1
15    0  1
16    0  1
...  .. ..
987   0  1
989   0  1
990   0  1
1004  0  1
1006  0  1

[162 rows x 2 columns]


Unnamed: 0,city,Total documentos,Achou apenas 1,Achou != 1,Tamanho GT,Achou 1 e acertou
0,arantina,938,798,140,220,113
1,coqueiral,1529,194,1335,390,35
2,cristais,1737,1648,89,286,199
3,ijaci,451,0,451,176,0
4,olaria,43,39,4,15,13
5,passa-vinte,396,0,396,50,0
6,pirapetinga,1008,866,142,162,43


----------------------------------------------------------------------------------------------------
sao-bento-abade
----------------------------------------------------------------------------------------------------


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

- Coqueiral: utiliza Processo: XX/XXXX que não é contemplado pelo regex
- São Bento Abade: utiliza PL XX/XXXX que não é contemplado pelo regex
- Pirapetinga: utiliza Processo XX/XXXX que não é contemplado pelo regex
- Ijaci: on-click
- Passa-Vinte: on-click

In [48]:
df_acertos

Unnamed: 0,city,Total documentos,Achou apenas 1,Achou != 1,Tamanho GT,Achou 1 e acertou
0,arantina,938,798,140,220,113
1,coqueiral,1529,194,1335,390,35
2,cristais,1737,1648,89,286,199
3,ijaci,451,0,451,176,0
4,olaria,43,39,4,15,13
5,passa-vinte,396,0,396,50,0
6,pirapetinga,1008,866,142,162,43
7,sao-bento-abade,232,12,220,139,3
