In [1]:
import ast
import json
import math
import pickle
import re
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
plt.rcParams['figure.dpi'] = 300

### Selecione qual base usar

In [2]:
# Licitação (public bidding) dataset.
#   df      - item table read from the zipped CSV
#   groups  - clustering result; a later cell iterates it as {cluster name: item ids}
#   groups_ - regrouping output; later cells index it with "<metric>_<n>" keys
df = pd.read_csv("../data/output/druid_fasttext/f03_items.csv.zip", sep=";")  # bidding items

# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open("../data/output/druid_fasttext/results.pkl", "rb") as clusters_file:
    groups = pickle.load(clusters_file)

with open("../data/heuristicas_reagrupamento/new_groups_licitacao.json") as regrouping_file:
    groups_ = json.load(regrouping_file)

In [None]:
# Nota fiscal (invoice) dataset - alternative to the licitação cell; run only one of them.
#   df      - item table read from the zipped CSV
#   groups  - clustering result; a later cell iterates it as {cluster name: item ids}
#   groups_ - regrouping output; later cells index it with "<metric>_<n>" keys
df = pd.read_csv("../data/output/nota_fiscal_fasttext/f03_items.csv.zip", sep=";")  # invoice items

# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open("../data/output/nota_fiscal_fasttext/results.pkl", "rb") as clusters_file:
    groups = pickle.load(clusters_file)

with open("../data/heuristicas_reagrupamento/new_groups_nf.json") as regrouping_file:
    groups_ = json.load(regrouping_file)

In [3]:
# Build two lookups from item id to cluster label and store them as columns of `df`:
#   - `inverse`: the cluster name, or "outlier" when the item sits in an outlier cluster
#     (code "-1" after the underscore, or a name with no "_<code>" suffix at all);
#   - `real_cluster`: always the original cluster name, outlier clusters included.
inverse = {}
real_cluster = {}

for cluster, items in groups.items():
    # The code is whatever follows the LAST underscore, so a cluster name that itself
    # contains underscores does not break the unpacking.
    _name, separator, cod = cluster.rpartition("_")
    is_outlier = not separator or cod == "-1"
    label = "outlier" if is_outlier else cluster

    for item in items:
        inverse[item] = label
        real_cluster[item] = cluster

# Rows whose index appears in no cluster are labelled "outlier" in both columns.
df["group"] = [inverse.get(i, "outlier") for i in df.index]

# Keep the original clusters for later analysis.
df["real_cluster"] = [real_cluster.get(i, "outlier") for i in df.index]

In [4]:
# Flatten the unit-of-measure lists of the item table into one list of unit tokens
# (duplicates across rows are kept) and print the first 15 as a sanity check.
# Each `unidades_medida` cell is expected to hold the textual form of a Python list;
# ast.literal_eval parses it without ever executing file content as code (eval would).
unidades_medidas = df.unidades_medida.drop_duplicates().apply(ast.literal_eval)
unidades_medidas = list(chain.from_iterable(unidades_medidas.values))
print(", ".join(unidades_medidas[:15]) + " [...]")

mg, ml, cm, mm, litro, gr, mg, ml, lt, kg, mes, ml, ano, cm, mt [...]


In [None]:
#

# Top 20-descrições canônicas

In [5]:
def get_description_and_n_items(groups, *, max_length=70, n=20):
    """Summarise the item descriptions found in a set of original clusters.

    Reads the notebook-level ``df`` (columns ``group`` and ``original_prep``) and
    formats descriptions with the notebook-level ``fmt`` helper.

    Args:
        groups: iterable of cluster names, compared against ``df.group``.
        max_length: length limit handed to ``fmt`` for each description.
        n: number of most frequent descriptions to return.

    Returns:
        A pair ``(group_n_itens, top_n_descriptions)``:
        ``group_n_itens`` has one ``"<group> (<item count>)"`` string per group, and
        ``top_n_descriptions`` has the ``n`` most frequent descriptions over all the
        groups together, each followed by its share of the total as a percentage.
    """
    group_n_itens = []
    description_count = Counter()
    for group in groups:
        # Each cell is expected to hold the textual form of a token list;
        # literal_eval parses it without executing file content (eval would).
        token_lists = df.loc[df.group == group, "original_prep"]
        items = [" ".join(ast.literal_eval(tokens)) for tokens in token_lists]
        group_n_itens.append(f"{group} ({len(items)})")
        description_count.update(items)

    # When no item matched, most_common() is empty and the division never runs.
    total = sum(description_count.values())
    top_n_descriptions = [
        f"{fmt(description, max_length)} ({100 * count/total:>6.2f}%)"
        for description, count in description_count.most_common(n)
    ]

    return group_n_itens, top_n_descriptions

def get_units_of_measure(groups):
    """Return the distinct units of measure found in the given original clusters.

    Reads the notebook-level ``df`` (columns ``group`` and ``unidades_medida``).

    Args:
        groups: iterable of cluster names, compared against ``df.group``.

    Returns:
        A list of unique unit tokens, sorted so the result is the same on every run
        (a bare ``list(set(...))`` changes order between sessions).
    """
    units_of_measure = set()
    for group in groups:
        for raw_units in df.loc[df.group == group, "unidades_medida"]:
            # Cells are expected to hold the textual form of a list of units;
            # literal_eval parses it without executing file content (eval would).
            units_of_measure.update(ast.literal_eval(raw_units))

    return sorted(units_of_measure)

def get_numbers(groups):
    """Return the distinct all-digit tokens found in the given original clusters.

    Reads the notebook-level ``df`` (columns ``group`` and ``original_prep``).

    Args:
        groups: iterable of cluster names, compared against ``df.group``.

    Returns:
        A list of unique digit-only tokens (as strings), ordered by numeric value so
        the result is the same on every run.
    """
    pattern = re.compile("^[0-9]+$")
    numbers = set()
    for group in groups:
        for raw_tokens in df.loc[df.group == group, "original_prep"]:
            # Cells are expected to hold the textual form of a token list;
            # literal_eval parses it without executing file content (eval would).
            numbers.update(
                token for token in ast.literal_eval(raw_tokens) if pattern.match(token)
            )

    # The token itself breaks ties such as "5" vs "05", which share a numeric value.
    return sorted(numbers, key=lambda number: (int(number), number))
    
def fmt(string, n):
    """Wrap ``string`` in double quotes, shortening it when it exceeds ``n`` characters.

    A string longer than ``n`` is cut to its first ``n - 3`` characters followed by
    ``...``; a shorter or equal one is quoted unchanged.
    """
    shown = string if len(string) <= n else f"{string[:n-3]}..."
    return f"\"{shown}\""
    
def contains_number_and_unit_of_measure(canonical_description):
    """Tell whether a description holds BOTH a number and a unit of measure.

    The description is split on whitespace. A token counts as a number when it
    starts with a digit, and as a unit when it appears in the notebook-level
    ``unidades_medidas`` list.

    Returns:
        True only when at least one token of each kind is present.
    """
    leading_digits = re.compile("[0-9]+")
    words = canonical_description.split()
    has_number = any(leading_digits.match(word) for word in words)
    has_unit = any(word in unidades_medidas for word in words)
    return has_number and has_unit

def plot_hist(data: Counter, *, xlabel, filename, persist=True):
    """Draw a bar chart with one bar per key of ``data`` and its count as height.

    Args:
        data: mapping from a value to its number of occurrences.
        xlabel: label of the x axis.
        filename: path the figure is written to when ``persist`` is true.
        persist: whether to save the figure (300 dpi) before showing it.

    Returns:
        None. With empty ``data`` nothing is drawn or saved and a message is printed.
    """
    if not data:
        # An empty frame has no columns 0/1 for seaborn to plot.
        print(f"Sem dados para plotar: {xlabel}")
        return

    _df = pd.DataFrame(list(data.items()))
    sns.barplot(data=_df, x=0, y=1, color="blue")
    plt.xlabel(xlabel)
    plt.ylabel("# ocorrências")
    # One xticks call is enough: an earlier fontsize=10 call was overridden by this one.
    plt.xticks(fontsize=8, rotation=90)
    if persist:
        plt.savefig(filename, dpi=300)
    # plt.show() returns None, so wrapping it in display() only rendered a stray "None".
    plt.show()

### Geral

In [6]:
   
# Overview table: for every regrouping configuration, show the ten merged groups that
# absorbed the most original clusters, with their canonical description and the most
# frequent item descriptions.
html_str = "<table>"
html_str += "<tr><th>Métrica</th><th>Grupos originais</th><th>Descrição canônica</th><th>Descrições frequentes</th></tr>"
for metric in ["median", "sum"]:
    for n in [5]:
        print(f"{metric}_{n}")
        # Kept under its own name: rebinding `groups` would clobber the clusters loaded
        # from results.pkl and make the earlier cells impossible to re-run.
        ranked_groups = sorted(
            groups_[f"{metric}_{n}"].items(),
            key=lambda entry: len(entry[1]["groups"]),
            reverse=True,
        )
        for new_group, regrouped in ranked_groups[:10]:
            canonical_description = regrouped["description"]
            previous_groups = regrouped["groups"]
            previous_groups_with_n_items, frequent_descriptions = get_description_and_n_items(previous_groups)
            previous_groups_str = "<br/>".join(previous_groups_with_n_items)
            # <li> is the valid item element of a <ul>.
            top_n_frequent_descriptions = "<ul>" + "".join(f"<li>{x}</li>" for x in frequent_descriptions) + "</ul>"

            html_str += "<tr>"
            html_str += f"<td>{metric}_{n}</td>"
            html_str += f"<td>{previous_groups_str}</td><td>{canonical_description}</td><td>{top_n_frequent_descriptions}</td>"
            html_str += "</tr>"

html_str += "</table>"
display(HTML(html_str))

median_5
sum_5


0,1,2,3
median_5,outro_0 (59) outro_1 (213) outro_2 (48) outro_3 (38) outro_5 (137) outro_6 (34) outro_7 (87) outro_8 (40) outro_9 (38) outro_10 (280) outro_11 (70) outro_12 (297),juridico outro pessoa servico terceiro,"""outro servico terceiro pessoa juridico promocao evento interesse pu..."" ( 15.88%)""outro servico terceiro pessoa juridico"" ( 7.01%)""outro servico terceiro pessoa juridico manutencao conservacao maqui..."" ( 5.44%)""outro servico terceiro pessoa juridico grafico"" ( 4.10%)""outro servico terceiro pessoa juridico apoio evento interesse publico"" ( 2.54%)""outro servico terceiro pessoa juridico locacao maquina equipamento"" ( 2.31%)""outro servico terceiro pessoa juridico manutencao conservacao bem i..."" ( 1.86%)""outro servico terceiro pessoa juridico confeccao geral"" ( 1.57%)""outro servico terceiro pessoa juridico locacao veiculo"" ( 1.27%)""outro servico terceiro pessoa juridico medico hospitalar odontologi..."" ( 0.67%)""outro servico terceiro pessoa juridico publicidade propaganda"" ( 0.60%)""outro servico terceiro pessoa juridico tecnologia informacao"" ( 0.60%)""outro servico terceiro pessoa juridico assentamento acomodacao remo..."" ( 0.60%)""outro servico terceiro pessoa juridico tecnologia informacao invest..."" ( 0.60%)""outro despesa pessoal decorrente contrato terceirizacao administrativo"" ( 0.52%)""outro"" ( 0.52%)""outro daqui mil ano"" ( 0.52%)""outro servico terceiro pessoa juridico telecomunicacao"" ( 0.45%)""outro servico pessoa juridico"" ( 0.45%)""outro servico terceiro pessoa juridico seguro geral"" ( 0.37%)"
median_5,dipirona_5 (633) dipirona_6 (396) dipirona_9 (219) dipirona_10 (526) dipirona_15 (170) dipirona_16 (159) dipirona_19 (86) dipirona_20 (238) dipirona_21 (248) dipirona_25 (566),500 dipirona mg ml sodico,"""dipirona sodico 500 mg ml"" ( 6.97%)""dipirona sodico 500 mg ml solucao oral"" ( 6.14%)""dipirona 500 mg ml"" ( 6.02%)""dipirona sodico 500 mg ml solucao injetavel"" ( 2.65%)""dipirona 500 mg ml injetavel"" ( 2.19%)""dipirona sodico 500 mg ml 2"" ( 1.76%)""dipirona sodico 500 mg ml solucao injetavel ampola 2"" ( 1.57%)""dipirona sodico 500 mg ml ampola 2"" ( 1.36%)""dipirona sodico 500 mg ml 10"" ( 1.17%)""dipirona sodico 500 mg ml injetavel ampola 2"" ( 1.14%)""dipirona 500 mg ml ampola 2"" ( 0.99%)""dipirona 500 mg ml solucao oral"" ( 0.96%)""dipirona sodico 500 mg 2 ml"" ( 0.96%)""dipirona 500 mg ml 10"" ( 0.93%)""dipirona sodico 500 mg ml solucao oral gota"" ( 0.89%)""dipirona sodico 500 mg ml solucao injetavel ver endovenoso intramus..."" ( 0.77%)""dipirona 500 mg ml 2"" ( 0.77%)""dipirona sodico 500 mg ml injetavel"" ( 0.71%)""dipirona sodico 500 mg ml solucao oral 10"" ( 0.71%)""dipirona 500 mg ml gts"" ( 0.68%)"
median_5,calcado_1 (134) calcado_4 (315) calcado_12 (64) calcado_16 (65) calcado_17 (76) calcado_18 (52) calcado_19 (113) calcado_24 (47) calcado_26 (53),botina calcado ocupacional seguranca tipo,"""calcado seguranca tipo botina cor preto n 39"" ( 1.09%)""calcado seguranca tipo botina cor preto n 43"" ( 0.98%)""calcado seguranca tipo botina cor preto n 40"" ( 0.98%)""calcado seguranca tipo botina cor preto n 42"" ( 0.98%)""calcado seguranca tipo botina cor preto n 41"" ( 0.87%)""calcado seguranca protecao pe perna contra umidade proveniente oper..."" ( 0.76%)""calcado seguranca tipo botina cor preto n 44"" ( 0.76%)""calcado seguranca tipo botina n 38"" ( 0.65%)""calcado seguranca uso profissional tipo bota impermeavel preto sem ..."" ( 0.65%)""calcado seguranca com biqueiro aco uso profissional tipo botina mod..."" ( 0.65%)""calcado para trabalho cozinha ocupacional tipo sapato fechamento el..."" ( 0.65%)""calcado seguranca modelo bota cano medio cabedal pvc cor branco ent..."" ( 0.65%)""calcado ocupar tipo bota n 38"" ( 0.54%)""calcado seguranca tipo botina 42"" ( 0.54%)""calcado seguranca tamanho 38 tipo sapato feminino modelo blatt resi..."" ( 0.44%)""calcado tipo sapato mod blatt cor preto n 38"" ( 0.44%)""calcado seguranca tipo botina nobuck n 38"" ( 0.44%)""calcado seguranca sem biqueiro aco uso profissional tipo botina mod..."" ( 0.44%)""calcado seguranca uso profissional tipo botina"" ( 0.44%)""calcado ocupacional profissional tipo botina n 43"" ( 0.44%)"
median_5,pincel_11 (1570) pincel_14 (2332) pincel_20 (2064) pincel_27 (649) pincel_30 (3505) pincel_40 (1211) pincel_41 (694) pincel_49 (405) pincel_51 (907),chato n para pincel pintura,"""pincel para pintura n 8"" ( 1.28%)""pincel para pintura n 12"" ( 1.13%)""pincel para pintura n 10"" ( 1.11%)""pincel n 10"" ( 0.87%)""pincel para pintura n 6"" ( 0.76%)""pincel chato n 10"" ( 0.55%)""pincel n 0"" ( 0.49%)""pincel chato n 6"" ( 0.48%)""pincel para pintura n 0"" ( 0.45%)""pincel p pintura n 10"" ( 0.44%)""pincel chato n 12"" ( 0.40%)""pincel 10"" ( 0.37%)""pincel p pintura n 6"" ( 0.36%)""pincel p pintura n 12"" ( 0.36%)""pincel para pintura n 18"" ( 0.36%)""pincel para pintura n 16"" ( 0.35%)""pincel redondo n 10"" ( 0.34%)""pincel para pintura tecido n 6"" ( 0.34%)""pincel para pintura n 14"" ( 0.34%)""pincel para pintura 8"" ( 0.32%)"
median_5,fralda_0 (313) fralda_2 (214) fralda_4 (1316) fralda_9 (81) fralda_11 (1477) fralda_12 (123) fralda_15 (6357) fralda_16 (7902) fralda_20 (424),descartavel fralda geriatrico infantil tamanho,"""fralda geriatrico g"" ( 1.20%)""fralda geriatrico m"" ( 1.15%)""fralda geriatrico tamanho g"" ( 0.91%)""fralda geriatrico tamanho m"" ( 0.86%)""fralda geriatrico p"" ( 0.77%)""fralda descartavel infantil m"" ( 0.71%)""fralda descartavel m"" ( 0.68%)""fralda descartavel tamanho g"" ( 0.67%)""fralda descartavel infantil g"" ( 0.65%)""fralda descartavel g"" ( 0.62%)""fralda geriatrico"" ( 0.60%)""fralda descartavel tamanho m"" ( 0.58%)""fralda descartavel infantil p"" ( 0.58%)""fralda descartavel infantil tamanho m"" ( 0.57%)""fralda descartavel"" ( 0.55%)""fralda geriatrico tam g"" ( 0.54%)""fralda geriatrico tam m"" ( 0.52%)""fralda descartavel infantil tamanho g"" ( 0.48%)""fralda geriatrico tamanho p"" ( 0.48%)""fralda descartavel geriatrico tamanho g"" ( 0.47%)"
median_5,clonazepam_0 (268) clonazepam_1 (175) clonazepam_3 (227) clonazepam_6 (296) clonazepam_7 (183) clonazepam_8 (51) clonazepam_9 (166) clonazepam_19 (435),2 5 clonazepam mg ml,"""clonazepam 2 5 mg ml"" ( 20.32%)""clonazepam 2 5 mg ml solucao oral"" ( 13.27%)""clonazepam 2 5 mg ml gota"" ( 6.83%)""clonazepam gota 2 5 mg ml"" ( 3.55%)""clonazepam 2 5 mg ml solucao oral frasco 20"" ( 3.28%)""clonazepam 2 5 mg ml frasco 20"" ( 2.67%)""clonazepam 2 5 mg ml sol oral"" ( 2.50%)""clonazepam solucao oral 2 5 mg ml"" ( 2.39%)""clonazepam 2 5 mg ml gota 20"" ( 2.28%)""clonazepam 2 5 mg ml 20"" ( 2.22%)""clonazepam 2 5 mg ml solucao"" ( 1.83%)""clonazepam 2 5 mg gota"" ( 1.39%)""clonazepam 2 5 mg ml 20 gota"" ( 1.33%)""clonazepam 2 5 mg ml frasco com 20"" ( 1.33%)""clonazepam 2 5 mg ml gts"" ( 1.11%)""clonazepam 2 5 mg ml solucao oral frasco com 20"" ( 1.00%)""clonazepam 2 5 mg ml solucao oral 20"" ( 0.94%)""clonazepam 2 5 mg ml gota frasco com 20"" ( 0.83%)""clonazepam 2 5 mg ml gota frasco"" ( 0.78%)""clonazepam 2 5 mg ml solucao oral frasco gotejador 20"" ( 0.72%)"
median_5,miconazol_0 (70) miconazol_6 (145) miconazol_7 (50) miconazol_13 (97) miconazol_14 (36) miconazol_16 (72) miconazol_23 (37) miconazol_24 (430),20 g mg miconazol nitrato,"""miconazol nitrato 20 mg g creme vaginal"" ( 23.69%)""miconazol nitrato 20 mg g creme"" ( 14.62%)""miconazol nitrato 20 mg g"" ( 8.75%)""miconazol nitrato 20 mg g locao"" ( 6.30%)""miconazol nitrato 20 mg g creme dermatologico"" ( 3.09%)""miconazol nitrato"" ( 2.88%)""miconazol nitrato 20 mg creme vaginal"" ( 2.03%)""miconazol nitrato 20 mg g creme vaginal bisnaga 80 aplicador"" ( 1.81%)""miconazol nitrato 20 mg g creme dermatologico bisnaga 28"" ( 1.81%)""miconazol nitrato 20 mg g creme vaginal 80"" ( 1.49%)""miconazol nitrato 20 mg g creme vaginal bisnaga com 80 aplicador"" ( 1.07%)""miconazol nitrato 20 mg g 80"" ( 0.96%)""miconazol nitrato 20 mg g creme vaginal tubo com 80"" ( 0.75%)""miconazol nitrato 20 mg g crer"" ( 0.64%)""miconazol nitrato 20 mg creme bisnaga 28 grs"" ( 0.64%)""miconazol nitrato 20 mg g creme locao frasco 30 grama"" ( 0.64%)""miconazol nitrato 20 mg g creme vaginal frasco contender 80 grama"" ( 0.64%)""miconazol nitrato 20 mgg creme vaginal"" ( 0.64%)""miconazol nitrato 20 mg g creme vaginal bisnaga 80"" ( 0.64%)""miconazol nitrato 20 mg g creme dermatologico tubo 28 grama"" ( 0.64%)"
median_5,pistola_5 (216) pistola_12 (179) pistola_19 (558) pistola_20 (120) pistola_21 (114) pistola_22 (146) pistola_25 (59) pistola_27 (80),aplicador cola para pistola quente,"""pistola aplicador cola quente"" ( 2.58%)""pistola cola quente 40 w"" ( 1.83%)""pistola cola quente para bastao grosso"" ( 1.36%)""pistola aplicador cola quente grande"" ( 1.22%)""pistola aplicador para cola quente"" ( 1.15%)""pistola cola quente hk 60 w"" ( 1.09%)""pistola para aplicacao cola quente grande 40 w bivolt silicone refi..."" ( 1.02%)""pistola cola quente para bastao fino"" ( 0.88%)""pistola aplicador cola quente pequeno"" ( 0.82%)""pistola para aplicacao cola quente pequeno 40 w bivolt silicone ref..."" ( 0.75%)""pistola cola quente para tubo fino 7 mm"" ( 0.68%)""pistola aplicador cola"" ( 0.61%)""pistola aplicador cola peq"" ( 0.61%)""pistola para cola quente refil fino 5 16"" ( 0.61%)""pistola para aplicacao cola quente refil resina termoplastico fino ..."" ( 0.61%)""pistola para cola quente 7 mm"" ( 0.61%)""pistola aplicador tensao alimentacao 110 v aplicacao colagem caract..."" ( 0.61%)""pistola cola quente pequeno para refil 0 75 cm diametro"" ( 0.61%)""pistola aplicador p cola quente grande"" ( 0.54%)""pistola aplicador p cola quente pequeno"" ( 0.54%)"
median_5,benzilpenicilina_5 (86) benzilpenicilina_6 (391) benzilpenicilina_9 (104) benzilpenicilina_17 (98) benzilpenicilina_21 (119) benzilpenicilina_22 (75) benzilpenicilina_33 (51),000 1 200 benzatina benzilpenicilina,"""benzilpenicilina benzatina 1 200 000 ui"" ( 25.11%)""benzilpenicilina benzatina 1 200 000 ui suspensao injetavel"" ( 4.87%)""benzilpenicilina benzatina 1 200 000 ui po para suspensao injetavel"" ( 4.22%)""benzilpenicilina benzatina 1 200 000"" ( 3.35%)""benzilpenicilina benzatina 1 200 000 ui injetavel"" ( 3.35%)""benzilpenicilina benzatina 1 200 000 ui frasco ampola"" ( 2.49%)""benzilpenicilina benzatina 1 200 000 ui po injetavel"" ( 1.84%)""benzilpenicilina benzatina po para suspensao injetavel 1 200 000 ui"" ( 1.84%)""benzilpenicilina benzatina 1 200 000 u i"" ( 1.41%)""benzilpenicilina benzatina 1 200 000 ui po"" ( 1.41%)""benzilpenicilina benzatina 1 200 000 ui po liofilizar injetavel fra..."" ( 1.41%)""benzilpenicilina benzatina 1 200 000 ui po para solucao injetavel"" ( 1.30%)""benzilpenicilina benzatina po injetavel 1 200 000 ui"" ( 1.19%)""benzilpenicilina benzatina 1 200 000 u i suspensao injetavel frasco..."" ( 1.19%)""benzilpenicilina benzatina 1 2000 000 ui frasco ampola po para susp..."" ( 1.08%)""benzilpenicilina benzatina 1 200 000 ui susp injetavel"" ( 0.97%)""benzilpenicilina benzatina 1 200 000 ui injetavel frasco ampola"" ( 0.97%)""benzilpenicilina benzatina 1 200 000 ui po p suspensao injetavel"" ( 0.97%)""benzilpenicilina benzatina 1 200 000 injetavel"" ( 0.87%)""benzilpenicilina benzatina 1 200 000 diluente"" ( 0.87%)"
median_5,sulfadiazina_5 (147) sulfadiazina_6 (173) sulfadiazina_14 (121) sulfadiazina_15 (81) sulfadiazina_21 (70) sulfadiazina_22 (148) sulfadiazina_30 (36),10 g mg prata sulfadiazina,"""sulfadiazina prata 10 mg g creme"" ( 13.27%)""sulfadiazina prata 10 mg g"" ( 12.50%)""sulfadiazina prata 10 mg"" ( 3.61%)""sulfadiazina prata 10 mg g creme dermatologico"" ( 2.32%)""sulfadiazina prata 10 mg g pote 400"" ( 1.55%)""sulfadiazina prata 10 mg g creme dermatologico pote com 400"" ( 1.55%)""sulfadiazina prata 10 mg g bisnaga 50"" ( 1.29%)""sulfadiazina prata 10 mg g creme 50"" ( 1.29%)""sulfadiazina prata 10 mg g 400"" ( 1.16%)""sulfadiazina prata creme dermatologico 10 mg g"" ( 1.03%)""sulfadiazina prata creme 10 mg g"" ( 1.03%)""sulfadiazina prata 10 mg com 400 gr"" ( 1.03%)""sulfadiazina prata 10 mg g creme bisnaga"" ( 0.90%)""sulfadiazina prata 1 10 mg g"" ( 0.90%)""sulfadiazina prata 10 mg g 30"" ( 0.90%)""sulfadiazina prata 10 mg g bisnaga 30"" ( 0.90%)""sulfadiazina prata 10 mg g 400 gr"" ( 0.90%)""sulfadiazina prata 10 mg g creme dermatologico 50"" ( 0.90%)""sulfadiazina prata 10 mg g creme dermatologico bisnaga 30"" ( 0.90%)""sulfadiazina prata 10 1556"" ( 0.77%)"


### Contém número e unidade de medida

In [7]:
# Same table as the overview, restricted to merged groups whose canonical description
# contains both a number and a unit of measure; at most `max_rows` rows per metric.
max_rows = 20
html_str = "<table>"
html_str += "<tr><th>Métrica</th><th>Grupos originais</th><th>Descrição canônica</th><th>Descrições frequentes</th></tr>"
for metric in ["median", "sum"]:
    n = 5
    print(metric)
    # Kept under its own name: rebinding `groups` would clobber the clusters loaded
    # from results.pkl and make the earlier cells impossible to re-run.
    ranked_groups = sorted(
        groups_[f"{metric}_{n}"].items(),
        key=lambda entry: len(entry[1]["groups"]),
        reverse=True,
    )

    selected_groups_count = 0
    for new_group, regrouped in ranked_groups:
        # `>=` stops at exactly max_rows rows; a `> 20` test lets a 21st row through.
        if selected_groups_count >= max_rows:
            break

        canonical_description = regrouped["description"]
        if not contains_number_and_unit_of_measure(canonical_description):
            continue

        previous_groups = regrouped["groups"]
        previous_groups_with_n_items, frequent_descriptions = get_description_and_n_items(previous_groups)
        previous_groups_str = "<br/>".join(previous_groups_with_n_items)
        # <li> is the valid item element of a <ul>.
        top_n_frequent_descriptions = "<ul>" + "".join(f"<li>{x}</li>" for x in frequent_descriptions) + "</ul>"

        selected_groups_count += 1
        html_str += "<tr>"
        html_str += f"<td>{metric}_{n}</td>"
        html_str += f"<td>{previous_groups_str}</td><td>{canonical_description}</td><td>{top_n_frequent_descriptions}</td>"
        html_str += "</tr>"

html_str += "</table>"
display(HTML(html_str))

median
sum


0,1,2,3
median_5,dipirona_5 (633) dipirona_6 (396) dipirona_9 (219) dipirona_10 (526) dipirona_15 (170) dipirona_16 (159) dipirona_19 (86) dipirona_20 (238) dipirona_21 (248) dipirona_25 (566),500 dipirona mg ml sodico,"""dipirona sodico 500 mg ml"" ( 6.97%)""dipirona sodico 500 mg ml solucao oral"" ( 6.14%)""dipirona 500 mg ml"" ( 6.02%)""dipirona sodico 500 mg ml solucao injetavel"" ( 2.65%)""dipirona 500 mg ml injetavel"" ( 2.19%)""dipirona sodico 500 mg ml 2"" ( 1.76%)""dipirona sodico 500 mg ml solucao injetavel ampola 2"" ( 1.57%)""dipirona sodico 500 mg ml ampola 2"" ( 1.36%)""dipirona sodico 500 mg ml 10"" ( 1.17%)""dipirona sodico 500 mg ml injetavel ampola 2"" ( 1.14%)""dipirona 500 mg ml ampola 2"" ( 0.99%)""dipirona 500 mg ml solucao oral"" ( 0.96%)""dipirona sodico 500 mg 2 ml"" ( 0.96%)""dipirona 500 mg ml 10"" ( 0.93%)""dipirona sodico 500 mg ml solucao oral gota"" ( 0.89%)""dipirona sodico 500 mg ml solucao injetavel ver endovenoso intramus..."" ( 0.77%)""dipirona 500 mg ml 2"" ( 0.77%)""dipirona sodico 500 mg ml injetavel"" ( 0.71%)""dipirona sodico 500 mg ml solucao oral 10"" ( 0.71%)""dipirona 500 mg ml gts"" ( 0.68%)"
median_5,clonazepam_0 (268) clonazepam_1 (175) clonazepam_3 (227) clonazepam_6 (296) clonazepam_7 (183) clonazepam_8 (51) clonazepam_9 (166) clonazepam_19 (435),2 5 clonazepam mg ml,"""clonazepam 2 5 mg ml"" ( 20.32%)""clonazepam 2 5 mg ml solucao oral"" ( 13.27%)""clonazepam 2 5 mg ml gota"" ( 6.83%)""clonazepam gota 2 5 mg ml"" ( 3.55%)""clonazepam 2 5 mg ml solucao oral frasco 20"" ( 3.28%)""clonazepam 2 5 mg ml frasco 20"" ( 2.67%)""clonazepam 2 5 mg ml sol oral"" ( 2.50%)""clonazepam solucao oral 2 5 mg ml"" ( 2.39%)""clonazepam 2 5 mg ml gota 20"" ( 2.28%)""clonazepam 2 5 mg ml 20"" ( 2.22%)""clonazepam 2 5 mg ml solucao"" ( 1.83%)""clonazepam 2 5 mg gota"" ( 1.39%)""clonazepam 2 5 mg ml 20 gota"" ( 1.33%)""clonazepam 2 5 mg ml frasco com 20"" ( 1.33%)""clonazepam 2 5 mg ml gts"" ( 1.11%)""clonazepam 2 5 mg ml solucao oral frasco com 20"" ( 1.00%)""clonazepam 2 5 mg ml solucao oral 20"" ( 0.94%)""clonazepam 2 5 mg ml gota frasco com 20"" ( 0.83%)""clonazepam 2 5 mg ml gota frasco"" ( 0.78%)""clonazepam 2 5 mg ml solucao oral frasco gotejador 20"" ( 0.72%)"
median_5,miconazol_0 (70) miconazol_6 (145) miconazol_7 (50) miconazol_13 (97) miconazol_14 (36) miconazol_16 (72) miconazol_23 (37) miconazol_24 (430),20 g mg miconazol nitrato,"""miconazol nitrato 20 mg g creme vaginal"" ( 23.69%)""miconazol nitrato 20 mg g creme"" ( 14.62%)""miconazol nitrato 20 mg g"" ( 8.75%)""miconazol nitrato 20 mg g locao"" ( 6.30%)""miconazol nitrato 20 mg g creme dermatologico"" ( 3.09%)""miconazol nitrato"" ( 2.88%)""miconazol nitrato 20 mg creme vaginal"" ( 2.03%)""miconazol nitrato 20 mg g creme vaginal bisnaga 80 aplicador"" ( 1.81%)""miconazol nitrato 20 mg g creme dermatologico bisnaga 28"" ( 1.81%)""miconazol nitrato 20 mg g creme vaginal 80"" ( 1.49%)""miconazol nitrato 20 mg g creme vaginal bisnaga com 80 aplicador"" ( 1.07%)""miconazol nitrato 20 mg g 80"" ( 0.96%)""miconazol nitrato 20 mg g creme vaginal tubo com 80"" ( 0.75%)""miconazol nitrato 20 mg g crer"" ( 0.64%)""miconazol nitrato 20 mg creme bisnaga 28 grs"" ( 0.64%)""miconazol nitrato 20 mg g creme locao frasco 30 grama"" ( 0.64%)""miconazol nitrato 20 mg g creme vaginal frasco contender 80 grama"" ( 0.64%)""miconazol nitrato 20 mgg creme vaginal"" ( 0.64%)""miconazol nitrato 20 mg g creme vaginal bisnaga 80"" ( 0.64%)""miconazol nitrato 20 mg g creme dermatologico tubo 28 grama"" ( 0.64%)"
median_5,sulfadiazina_5 (147) sulfadiazina_6 (173) sulfadiazina_14 (121) sulfadiazina_15 (81) sulfadiazina_21 (70) sulfadiazina_22 (148) sulfadiazina_30 (36),10 g mg prata sulfadiazina,"""sulfadiazina prata 10 mg g creme"" ( 13.27%)""sulfadiazina prata 10 mg g"" ( 12.50%)""sulfadiazina prata 10 mg"" ( 3.61%)""sulfadiazina prata 10 mg g creme dermatologico"" ( 2.32%)""sulfadiazina prata 10 mg g pote 400"" ( 1.55%)""sulfadiazina prata 10 mg g creme dermatologico pote com 400"" ( 1.55%)""sulfadiazina prata 10 mg g bisnaga 50"" ( 1.29%)""sulfadiazina prata 10 mg g creme 50"" ( 1.29%)""sulfadiazina prata 10 mg g 400"" ( 1.16%)""sulfadiazina prata creme dermatologico 10 mg g"" ( 1.03%)""sulfadiazina prata creme 10 mg g"" ( 1.03%)""sulfadiazina prata 10 mg com 400 gr"" ( 1.03%)""sulfadiazina prata 10 mg g creme bisnaga"" ( 0.90%)""sulfadiazina prata 1 10 mg g"" ( 0.90%)""sulfadiazina prata 10 mg g 30"" ( 0.90%)""sulfadiazina prata 10 mg g bisnaga 30"" ( 0.90%)""sulfadiazina prata 10 mg g 400 gr"" ( 0.90%)""sulfadiazina prata 10 mg g creme dermatologico 50"" ( 0.90%)""sulfadiazina prata 10 mg g creme dermatologico bisnaga 30"" ( 0.90%)""sulfadiazina prata 10 1556"" ( 0.77%)"
median_5,brometo_5 (143) brometo_11 (182) brometo_19 (356) brometo_20 (105) brometo_22 (225) brometo_27 (113),0 25 brometo ipratropio mg,"""brometo ipratropio 0 25 mg ml"" ( 10.68%)""brometo ipratropio 0 25 mg ml 20"" ( 3.29%)""brometo ipratropio 0 25 mg ml frasco 20"" ( 3.20%)""brometo ipratropio 0 25 mg 20 ml"" ( 2.40%)""brometo ipratropio 0 25 mg ml solucao para inalacao"" ( 1.96%)""brometo ipratropio 0 250 mg ml solucao para inalacao"" ( 1.78%)""brometo ipratropio 0 25 mg"" ( 1.51%)""brometo ipratropio 0 25 mg gota"" ( 1.16%)""brometo ipratropio atrovent 20 ml solucao para inalacao"" ( 1.16%)""brometo ipratropio 0 25 mg frasco com 20 ml"" ( 1.16%)""brometo ipratropio 0 25 gota frasco 20 ml"" ( 0.98%)""brometo ipratropio 0 25 mg ml solucao para inalacao frasco 20"" ( 0.98%)""brometo ipratropio 0 25 mg ml gota"" ( 0.80%)""brometo ipratropio 0 25 gota"" ( 0.80%)""brometo ipratropio 0 25 solucao inalacao 20 ml"" ( 0.80%)""brometo ipratropio 0 25 solucao para inalacao"" ( 0.80%)""brometo ipratropio 0 250 mg ml gota"" ( 0.71%)""brometo ipratropio gota 25 gml frasco 20 ml 231"" ( 0.71%)""brometo ipratropio 250 mg ml gota 20"" ( 0.71%)""brometo ipratropio 0 2 mg ml gota"" ( 0.71%)"
median_5,heparina_6 (48) heparina_8 (201) heparina_9 (193) heparina_21 (101) heparina_22 (50) heparina_24 (37),000 5 heparina sodico ui,"""heparina sodico 5 000 ui ml"" ( 6.19%)""heparina 5 000 ui ml"" ( 3.81%)""heparina 5 000 ui 0 25 ml"" ( 3.02%)""heparina sodico 5 000 ui 0 25 ml"" ( 2.86%)""heparina sodico 5 000 ui ml frasco ampola"" ( 2.38%)""heparina sodico 5 000 ui ml injetavel"" ( 2.38%)""heparina sodico 5 000 ui 0 25 ml ampola"" ( 1.90%)""heparina 5 000 ui 0 25 ml ampola solucao injetavel esteril"" ( 1.75%)""heparina sodico 5 000 ui 0 25 ml solucao injetavel"" ( 1.75%)""heparina sodico 5 000 ui ml ev injetavel frasco ampola"" ( 1.59%)""heparina sub 5 000 ui ml"" ( 1.43%)""heparina sodico 5 000 ui 0 25 ml subcutaneo"" ( 1.43%)""heparina sodico 5 000 ui 0 25 ml sc injetavel ampola"" ( 1.43%)""heparina 5 000 ui ml sc"" ( 1.27%)""heparina sodico 5 000 ui ml endovenoso"" ( 1.11%)""heparina sodico solucao injetavel 5 000 ui ml"" ( 1.11%)""heparina 5 000 ui hsb"" ( 0.95%)""heparina 5 000 ui amp"" ( 0.95%)""heparina 5 000 ui ml endovenoso"" ( 0.95%)""heparina 5 000 ui c 0 25 ml subcutaneo 3590"" ( 0.95%)"
median_5,butilbrometo_12 (236) butilbrometo_13 (40) butilbrometo_14 (235) butilbrometo_17 (53) butilbrometo_27 (90) butilbrometo_28 (83),20 butilbrometo escopolamina mg ml,"""butilbrometo escopolamina 20 mg ml"" ( 16.15%)""butilbrometo escopolamina 20 mg ml injetavel"" ( 6.78%)""butilbrometo escopolamina 20 mg ml 1"" ( 5.02%)""butilbrometo escopolamina 20 mg ml ampola 1"" ( 4.21%)""butilbrometo escopolamina 20 mg ml solucao injetavel"" ( 3.53%)""butilbrometo escopolamina 20 mg ml injetavel ampola 1"" ( 2.58%)""butilbrometo escopolamina 20 mg ml inj"" ( 1.49%)""butilbrometo escopolamina 20 mg ml solucao injetavel ampola 1"" ( 1.49%)""butilbrometo escopolamina 20 mg injetavel 1 ml"" ( 1.36%)""butilbrometo escopolamina 20 mg 1 ml"" ( 1.22%)""butilbrometo escopolamina 20 ml"" ( 1.09%)""butilbrometo escopolamina 1 ml hioscina"" ( 1.09%)""butilbrometo escopolamina 20 mg ampola 1 ml"" ( 1.09%)""butilbrometo escopolamina 20 mg ml ampola"" ( 0.95%)""butilbrometo escopolamina 20 mg ml 4778"" ( 0.95%)""butilbrometo escopolamina solucao injetavel 20 mg ml"" ( 0.95%)""butilbrometo escopolamina 20 mg ampola ml"" ( 0.81%)""butilbrometo escopolamina 20 mg ml 1 145"" ( 0.81%)""butilbrometo escopolamina 20 mg ml 1 ampola"" ( 0.81%)""butilbrometo escopolamina 1"" ( 0.81%)"
median_5,fenobarbital_1 (322) fenobarbital_9 (277) fenobarbital_12 (161) fenobarbital_18 (176) fenobarbital_20 (214) fenobarbital_21 (90),40 fenobarbital mg ml sodico,"""fenobarbital 40 mg ml"" ( 16.85%)""fenobarbital sodico 40 mg ml solucao oral"" ( 12.90%)""fenobarbital sodico 40 mg ml"" ( 5.24%)""fenobarbital 40 mg ml solucao oral frasco 20"" ( 4.44%)""fenobarbital 40 mg ml 20"" ( 3.47%)""fenobarbital 40 mg ml solucao oral"" ( 3.23%)""fenobarbital 40 mg ml sol oral"" ( 2.42%)""fenobarbital solucao oral 40 mg ml"" ( 2.26%)""fenobarbital 40 mg ml frasco 20"" ( 1.29%)""fenobarbital solucao oral gota 40 mg ml fr 20"" ( 1.29%)""fenobarbital sodico 40 mg ml sol oral"" ( 1.29%)""fenobarbital apresentacao solucao oral gota dosagem 40 mg ml"" ( 1.21%)""fenobarbital 40 mg ml solucao oral gota"" ( 1.13%)""fenobarbital sodico 40 mg ml solucao oral frasco 20"" ( 1.05%)""fenobarbital 40 mg ml suspensao oral"" ( 0.97%)""fenobarbital sodico 40 mg ml solucao oral frasco gotejador 20"" ( 0.97%)""fenobarbital sodico 40 mg ml solucao oral gota frasco 20"" ( 0.89%)""fenobarbital 40 mg ml solucao oral frasco com 20"" ( 0.89%)""fenobarbital 40 mg ml solucao oral 20"" ( 0.89%)""fenobarbital 40 mg ml sol oral frasco 20"" ( 0.89%)"
median_5,bromidrato_5 (199) bromidrato_8 (178) bromidrato_11 (90) bromidrato_13 (61) bromidrato_14 (35) bromidrato_16 (66),5 bromidrato fenoterol mg ml,"""bromidrato fenoterol 5 mg ml"" ( 29.09%)""bromidrato fenoterol 5 mg ml 20"" ( 10.49%)""bromidrato fenoterol 5 mg ml frasco 20"" ( 3.50%)""bromidrato fenoterol 5 mg ml gota 20"" ( 3.34%)""bromidrato fenoterol 5 mg ml gota"" ( 3.34%)""bromidrato fenoterol 5 mg ml solucao oral"" ( 2.70%)""bromidrato fenoterol 5 mg ml fr 20"" ( 2.23%)""bromidrato fenoterol 5 mg gota 20 ml"" ( 2.07%)""bromidrato fenoterol solucao 0 5 frasco 20 ml"" ( 1.75%)""bromidrato fenoterol 5 mg ml solucao frasco com 20"" ( 1.59%)""bromidrato fenoterol 20 ml 5 mg frs gota"" ( 1.27%)""bromidrato fenoterol 5 mg 20 ml"" ( 0.95%)""bromidrato fenoterol 20 ml 5 mg"" ( 0.95%)""bromidrato fenoterol 5 mg ml faso 20"" ( 0.95%)""bromidrato fenoterol 5 mg ml frasco com 20"" ( 0.95%)""bromidrato fenoterol 5 mg ml solucao oral fr c 20"" ( 0.95%)""bromidrato fenoterol 5 mg ml sol oral fr 20 s c gen"" ( 0.95%)""bromidrato fenoterol gota 5 mg ml"" ( 0.95%)""bromidrato fenoterol 5 mg ml solucao inalatoria 7623"" ( 0.95%)""bromidrato fenoterol gts 5 mg ml"" ( 0.79%)"
median_5,fenitoina_3 (406) fenitoina_7 (169) fenitoina_10 (159) fenitoina_11 (120) fenitoina_14 (85) fenitoina_17 (402),50 fenitoina mg ml sodico,"""fenitoina 50 mg ml"" ( 8.95%)""fenitoina sodico 50 mg ml"" ( 7.83%)""fenitoina 50 mg ml injetavel"" ( 5.37%)""fenitoina sodico 50 mg ml solucao injetavel"" ( 4.03%)""fenitoina 50 mg ml ampola 5"" ( 3.28%)""fenitoina 50 mg ml 5"" ( 3.13%)""fenitoina 50 mg ml injetavel ampola 5"" ( 2.76%)""fenitoina sodico 50 mg ml 5"" ( 2.09%)""fenitoina 50 mg ml inj"" ( 1.94%)""fenitoina sodico 50 mg ml solucao injetavel ver endovenoso intramus..."" ( 1.86%)""fenitoina sodico 50 mg ml ampola 5"" ( 1.79%)""fenitoina 50 mg 5 ml"" ( 1.72%)""fenitoina sodico 50 mg ml injetavel"" ( 1.57%)""fenitoina sodico 50 mg ml solucao injetavel ampola 5"" ( 1.34%)""fenitoina 50 mg ml solucao injetavel"" ( 1.19%)""fenitoina 50 mg ml inj 5"" ( 1.19%)""fenitoina 50 mg injetavel 5 ml"" ( 1.12%)""fenitoina 50 mg ml ampola"" ( 0.97%)""fenitoina inj 50 mg 5 ml"" ( 0.89%)""fenitoina 50 mg ml injetavel 5"" ( 0.82%)"


### Distribuição de números e unidades de medida

In [None]:
# For merged groups (built from more than one original cluster) whose canonical
# description contains a number and a unit of measure:
#   - count, per metric, how many distinct units / distinct numbers each group has and
#     plot both distributions (every matching group is counted);
#   - list the first `max_rows` groups per metric in an HTML table.
max_rows = 20
html_str = "<table>"
html_str += "<tr><th>Métrica</th><th>Novo Grupo</th><th>Grupos originais</th><th>Un. de medidas únicas</th><th>Números unicos</th></tr>"
for metric in ["median", "sum"]:
    n = 5
    print(metric)
    # Kept under its own name: rebinding `groups` would clobber the clusters loaded
    # from results.pkl and make the earlier cells impossible to re-run.
    merged_groups = [
        entry for entry in groups_[f"{metric}_{n}"].items() if len(entry[1]["groups"]) > 1
    ]
    merged_groups.sort(key=lambda entry: len(entry[1]["groups"]), reverse=True)

    hist_unique_units_of_measure, hist_unique_numbers = Counter(), Counter()
    selected_groups_count = 0
    for new_group, regrouped in merged_groups:
        canonical_description = regrouped["description"]
        if not contains_number_and_unit_of_measure(canonical_description):
            continue

        previous_groups = regrouped["groups"]
        # Only the "<group> (<n items>)" labels are used here.
        previous_groups_with_n_items, _frequent_descriptions = get_description_and_n_items(previous_groups)
        previous_groups_str = ", ".join(previous_groups_with_n_items)

        unique_units_of_measure = get_units_of_measure(previous_groups)
        hist_unique_units_of_measure[len(unique_units_of_measure)] += 1
        unique_units_of_measure_str = ", ".join(unique_units_of_measure)

        unique_numbers = get_numbers(previous_groups)
        hist_unique_numbers[len(unique_numbers)] += 1
        unique_numbers_str = ", ".join(unique_numbers)

        # The histograms above cover every group; only the table is capped.
        # `>=` caps it at exactly max_rows rows (a `> 20` test lets a 21st row through).
        if selected_groups_count >= max_rows:
            continue

        selected_groups_count += 1
        html_str += "<tr>"
        html_str += f"<td>{metric}_{n}</td>"
        html_str += f"<td>{new_group}</td><td>{previous_groups_str}</td><td>{unique_units_of_measure_str}</td><td>{unique_numbers_str}</td>"
        html_str += "</tr>"

    print(metric, n)
    plot_hist(hist_unique_units_of_measure, xlabel="unidades de medidas únicas", filename=f"avaliacao_reagrupamento_unidade_medida_{metric}_5.jpg")
    plot_hist(hist_unique_numbers, xlabel="números únicos", filename=f"avaliacao_reagrupamento_numeros_{metric}_5.jpg")

html_str += "</table>"
display(HTML(html_str))

# Histograma dos tamanhos dos grupos após o reagrupamento

In [None]:
# For every regrouping configuration, plot how many merged groups have each size.
# The size is taken as the number of "_" in the merged group's key; keys with fewer
# than two underscores are skipped.
# NOTE(review): presumably the key concatenates the original cluster names - confirm
# that the underscore count really tracks the number of merged clusters.
for metric, regrouped in groups_.items():
    underscore_counts = (key.count("_") for key in regrouped)
    counter = Counter(count for count in underscore_counts if count >= 2)
    if not counter:
        print("ups!")
        continue

    # Explicit column names instead of the implicit "index" / 0 pair.
    _df = pd.DataFrame(sorted(counter.items()), columns=["tamanho", "grupos"])
    sns.barplot(data=_df, x="tamanho", y="grupos", color="blue")
    plt.xlabel("tamanho dos grupos")
    plt.ylabel("quantidade de grupos")
    print(metric)
    # plt.show() returns None, so wrapping it in display() only rendered a stray "None".
    plt.show()

# Histograma da quantidade de itens em cada grupo após o reagrupamento

In [None]:
# For every regrouping configuration, plot how many merged groups fall into each
# item-count bucket. Buckets are `step` items wide; groups with 1000 items or more
# share a single open-ended bucket, stored under the sentinel key 1000.
step = 30
for metric in groups_:
    buckets = Counter()
    for key, regrouped in groups_[metric].items():
        # Same filter as the group-size histogram: keys with fewer than two "_" are skipped.
        if key.count("_") < 2:
            continue

        num_itens = int(df.group.isin(regrouped["groups"]).sum())
        bucket = num_itens // step if num_itens < 1000 else 1000
        buckets[bucket] += 1

    if not buckets:
        print("ups!")
        continue

    rows = []
    for bucket in sorted(buckets):
        if bucket == 1000:
            label = "[1.000, +∞)"
        else:
            # Integer division puts k in [bucket * step, (bucket + 1) * step): closed on
            # the left, open on the right. The last regular bucket ends at 1000, where
            # the open-ended bucket starts.
            lower = bucket * step
            upper = min(lower + step, 1000)
            label = f"[{lower}, {upper})"
        rows.append({"bucket": bucket, "# de itens": label, "# grupos": buckets[bucket]})

    _df = pd.DataFrame.from_records(rows)
    sns.barplot(data=_df, x="# de itens", y="# grupos", color="blue")
    plt.xticks(rotation=90)
    print(f"{metric}")
    # plt.show() returns None, so wrapping it in display() only rendered a stray "None".
    plt.show()

In [None]:
# Scratch check of the bar-plot styling on dummy data.
# NOTE(review): looks like a leftover experiment; nothing else in the visible notebook
# reads `d` or `__df`, so this cell can probably be removed.
d = {"a": 10, "b": 20, "c": 30}
__df = pd.DataFrame(list(d.items()))
sns.barplot(data=__df, x=0, y=1, color="blue")
plt.xlabel("Foo")
plt.ylabel("Bar")
# One xticks call is enough: an earlier fontsize=10 call was overridden by this one.
plt.xticks(fontsize=8, rotation=90)
# plt.show() returns None, so wrapping it in display() only rendered a stray "None".
plt.show()

In [None]:
# Show the item table, including the `group` and `real_cluster` columns added earlier.
df