In [2]:
import pandas as pd
import numpy as np
import collections
import copy
import random
import pickle
import re
import matplotlib.pyplot as plt
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)
from nlp.text_statistics import (
    count_tokens,
    unique_tokens
)
from utils.read_files import (
    get_items)
from item.item_list import (
    ItemList,
    Item
)
from nlp.pos_tagging import (
    get_tokens_tags
)
from nlp.named_entity_extraction import (
    get_tokens_categories
)
from item.utils import get_tokens_set

# Load items

In [3]:
# Create the project's ItemList container and load the preprocessed training
# items from a zip-compressed CSV (path is relative to the notebook).
itemlist = ItemList()
# Alternative input kept for reference (presumably a smaller sample - TODO confirm):
# itemlist.load_items_from_file('../dados/sample_dataset.csv.zip')
itemlist.load_items_from_file('../dados/items_preprocessed_v3_complete_train.csv.zip')

In [4]:
# Number of rows in the loaded items DataFrame.
len(itemlist.items_df)

5821282

In [5]:
# Count the items whose `quantidade` column is not the literal string '[]'.
quantity_present = itemlist.items_df['quantidade'] != '[]'
int(quantity_present.sum())

880524

In [6]:
# Column names, dtypes and memory footprint of the items DataFrame.
itemlist.items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5821282 entries, 0 to 5821281
Data columns (total 15 columns):
 #   Column              Dtype  
---  ------              -----  
 0   palavras            object 
 1   unidades_medida     object 
 2   numeros             object 
 3   cores               object 
 4   materiais           object 
 5   tamanho             object 
 6   quantidade          object 
 7   preco               float64
 8   dsc_unidade_medida  object 
 9   original            object 
 10  licitacao           int64  
 11  original_prep       object 
 12  funcao              object 
 13  ano                 int64  
 14  item_id             int64  
dtypes: float64(1), int64(3), object(11)
memory usage: 666.2+ MB


In [7]:
# Preview the first rows of the items DataFrame.
itemlist.items_df.head()

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,item_id
0,"['gasolina', 'comum']",[],[],[],[],[],[],3.89,litro,GASOLINA COMUM,714515,"['gasolina', 'comum']",Legislativa,2016,76
1,['etanol'],[],[],[],[],[],[],2.99,litro,ETANOL,714515,['etanol'],Legislativa,2016,77
2,"['oleo', 'para', 'motor', 'a50', 'gasol']",[],['20'],[],[],[],[],15.0,litro,OLEO PARA MOTOR 20 W50-M.GASOL,714515,"['oleo', 'para', 'motor', '20', 'a50', 'm', 'g...",Legislativa,2016,78
3,"['oleo', 'sintetico', 'para', 'motor']",[],[],[],[],[],[],32.0,litro,OLEO SINTETICO PARA MOTOR,714515,"['oleo', 'sintetico', 'para', 'motor']",Legislativa,2016,79
4,"['tela', 'soldado', 'malha', 'altura']","['mm', 'cm', 'mt']","['2', '40', '5', '15', '3']",[],[],[],[],31.73,metro,"TELA SOLDADA 2,40MM, MALHA 5 X 15CM, ALTURA 2,...",714521,"['tela', 'soldado', '2', '40', 'mm', 'malha', ...",Educação | Saúde,2016,80


In [8]:
# Summarise the items for which unit, size, material and quantity are all
# present, i.e. none of those columns holds the literal string '[]'.
items_df = itemlist.items_df
all_attributes_present = (
    (items_df.unidades_medida != '[]')
    & (items_df.tamanho != '[]')
    & (items_df.materiais != '[]')
    & (items_df.quantidade != '[]')
)
items_df[all_attributes_present].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7102 entries, 2783 to 5820724
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   palavras            7102 non-null   object 
 1   unidades_medida     7102 non-null   object 
 2   numeros             7102 non-null   object 
 3   cores               7102 non-null   object 
 4   materiais           7102 non-null   object 
 5   tamanho             7102 non-null   object 
 6   quantidade          7102 non-null   object 
 7   preco               7102 non-null   float64
 8   dsc_unidade_medida  7102 non-null   object 
 9   original            7102 non-null   object 
 10  licitacao           7102 non-null   int64  
 11  original_prep       7102 non-null   object 
 12  funcao              7102 non-null   object 
 13  ano                 7102 non-null   int64  
 14  item_id             7102 non-null   int64  
dtypes: float64(1), int64(3), object(11)
memory usage:

In [9]:
# Disable pandas' display truncation (rows, columns and cell width). These are
# global options, so they also affect every table displayed after this cell.
for option_name in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(option_name, None)

# Show the last 50 items that have a unit, a colour and a material
# (columns different from the literal string '[]').
items_df = itemlist.items_df
has_unit_color_material = (
    (items_df.unidades_medida != '[]')
    & (items_df.cores != '[]')
    & (items_df.materiais != '[]')
)
items_df[has_unit_color_material].tail(50)

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,item_id
5817453,"['monofilamento', 'cuticular', 'agulha', 'triangular']",['cm'],"['2', '0', '45', '3', '8', '4']",['preto'],['nylon'],[],[],27.86,caixa,"NYLON MONOFILAMENTO PRETO 2-0 CUTICULAR 45CM C/ AGULHA 3/8 TRIAGULAR DE 4,0CM",1192817,"['nylon', 'monofilamento', 'preto', '2', '0', 'cuticular', '45', 'cm', 'c', 'agulha', '3', '8', 'triangular', '4']",Vazio,2018,7544927
5817615,"['fio', 'algodao', 'com', 'torcer', 'sem', 'agulha']",['cm'],"['0', '15', '45', '24', '1']",['azul'],['poliester'],[],[],70.91,caixa,"FIO ALGODÃO AZUL COM POLIESTER TORCIDO Nº 0, 15X45CM, SEM AGULHA 24X1",1192817,"['fio', 'algodao', 'azul', 'com', 'poliester', 'torcer', 'n', '0', '15', 'x', '45', 'cm', 'sem', 'agulha', '24', '1']",Vazio,2018,7545090
5818870,"['tubo', 'soldavel', 'predial', 'classe']",['mm'],['60'],['marrom'],['pvc'],[],[],60.0,unid,Tubo PVC marrom soldável predial classe A 60mm,1192846,"['tubo', 'pvc', 'marrom', 'soldavel', 'predial', 'classe', '60', 'mm']",Vazio,2018,7546698
5818871,"['tubo', 'soldavel', 'predial', 'classe']",['mm'],['50'],['marrom'],['pvc'],[],[],54.82,unid,Tubo PVC marrom soldável predial classe A 50mm,1192846,"['tubo', 'pvc', 'marrom', 'soldavel', 'predial', 'classe', '50', 'mm']",Vazio,2018,7546699
5818872,"['tubo', 'soldavel', 'predial', 'classe']",['mm'],['40'],['marrom'],['pvc'],[],[],35.0,unid,Tubo PVC marrom soldável predial classe A 40mm,1192846,"['tubo', 'pvc', 'marrom', 'soldavel', 'predial', 'classe', '40', 'mm']",Vazio,2018,7546700
5818873,"['tubo', 'soldavel', 'predial', 'classe']",['mm'],['32'],['marrom'],['pvc'],[],[],20.73,unid,Tubo PVC marrom soldável predial classe A 32mm,1192846,"['tubo', 'pvc', 'marrom', 'soldavel', 'predial', 'classe', '32', 'mm']",Vazio,2018,7546701
5818874,"['tubo', 'soldavel', 'predial', 'classe']",['mm'],['25'],['marrom'],['pvc'],[],[],16.8,unid,Tubo PVC marrom soldável predial classe A 25mm,1192846,"['tubo', 'pvc', 'marrom', 'soldavel', 'predial', 'classe', '25', 'mm']",Vazio,2018,7546702
5818875,"['tubo', 'soldavel', 'predial', 'classe']",['mm'],['20'],['marrom'],['pvc'],[],[],11.89,unid,Tubo PVC marrom soldável predial classe A 20mm,1192846,"['tubo', 'pvc', 'marrom', 'soldavel', 'predial', 'classe', '20', 'mm']",Vazio,2018,7546703
5820088,['saco'],['kg'],['10'],['transparente'],['plastico'],[],[],14.5,kg,SACO PLASTICO TRANSPARENTE 10KG,1192862,"['saco', 'plastico', 'transparente', '10', 'kg']",Vazio,2018,7548166
5820089,['saco'],['gr'],['500'],['transparente'],['plastico'],[],[],14.8,kg,SACO PLASTICO TRANSPARENTE 500GR,1192862,"['saco', 'plastico', 'transparente', '500', 'gr']",Vazio,2018,7548167


# Building dataset

In [10]:
# Read the per-item cluster assignments of the training split from a
# ';'-separated, zip-compressed CSV.
clusters_train_path = '../dados/precificacao/fasttext_skip100/complete/baseline+embeddings/SUB+MED+unit+num_concat_umap_hdbscan_euclidean/items_clusters_train.csv.zip'
clusters_train = pd.read_csv(clusters_train_path, sep=';')

In [11]:
# Split each cluster name on '_' once and reuse the pieces:
#   first_token - text before the first '_'
#   label_id    - second piece (missing when the name contains no '_')
#   divided     - True when the cluster name differs from its first token
cluster_parts = clusters_train['cluster'].str.split('_')
clusters_train['first_token'] = cluster_parts.str[0]
clusters_train['label_id'] = cluster_parts.str[1]
clusters_train['divided'] = clusters_train['cluster'].ne(clusters_train['first_token'])

In [12]:
# Number of items per cluster as a two-column frame: `cluster`, `cluster_size`
# (ordered by size, largest first).
#
# The column names are set explicitly instead of renaming whatever
# `reset_index()` produces: pandas >= 2.0 names the counts column "count" and
# the index after the source column, so the former
# rename({"index": "cluster", "cluster": "cluster_size"}) would turn the label
# column into `cluster_size` and break the later merge on `cluster`. The
# top-level `pd.value_counts` helper is also deprecated in favour of the
# Series method.
clusters_count = (
    clusters_train['cluster']
    .value_counts()
    .rename_axis('cluster')
    .reset_index(name='cluster_size')
)

In [13]:
# Attach the per-cluster item count to every item row (inner join on the
# cluster name).
clusters_train = clusters_train.merge(clusters_count, on='cluster')

In [14]:
# Column names, dtypes and memory footprint after adding the derived columns.
clusters_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5524503 entries, 0 to 5524502
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   item_id             int64  
 1   seq_dim_licitacao   int64  
 2   outlier             int64  
 3   cluster             object 
 4   dsc_unidade_medida  object 
 5   description         object 
 6   original            object 
 7   areas               object 
 8   price               float64
 9   first_token         object 
 10  label_id            object 
 11  divided             bool   
 12  cluster_size        int64  
dtypes: bool(1), float64(1), int64(4), object(7)
memory usage: 553.2+ MB


In [15]:
# Inspect the first 50 clustered items.
clusters_train.head(50)

Unnamed: 0,item_id,seq_dim_licitacao,outlier,cluster,dsc_unidade_medida,description,original,areas,price,first_token,label_id,divided,cluster_size
0,899879,874710,1,trimetoprima,s.o,mg trimetoprima 40 5 ml frasco 50 sulfametoxazol200,SULFAMETOXAZOL200 MG TRIMETROPRIMA 40 MG/5ML FRASCO DE 50 ML -,Vazio,1.4,trimetoprima,,False,8
1,3543667,1076591,1,trimetoprima,frasco,trimetoprima,TRIMETOPRIMA,Administração,8.25,trimetoprima,,False,8
2,5685964,1186568,1,trimetoprima,s.o,mg trimetoprima 40 5 ml frasco 50 sulfametoxazol200,SULFAMETOXAZOL200 MG TRIMETROPRIMA 40 MG/5ML FRASCO DE 50 ML -,Vazio,1.4,trimetoprima,,False,8
3,5815235,1192790,1,trimetoprima,comprimido,trimetoprima sulfametoxazol 400 mg 80,TRIMETOPRIMA + SULFAMETOXAZOL 400 MG + 80 MG,Saúde,0.12,trimetoprima,,False,8
4,5819991,1192860,1,trimetoprima,comprimido,trimetoprima sulfametoxazol 400 mg 80,TRIMETOPRIMA + SULFAMETOXAZOL 400 MG + 80 MG,Vazio,0.13,trimetoprima,,False,8
5,5819992,1192860,1,trimetoprima,comprimido,trimetoprima sulfametoxazol 400 mg 80,TRIMETOPRIMA + SULFAMETOXAZOL 400 MG + 80 MG,Vazio,0.11,trimetoprima,,False,8
6,5820032,1192860,1,trimetoprima,frasco,trimetoprima sulfametoxazol 400 mg 80,TRIMETOPRIMA - SULFAMETOXAZOL 400 MG - 80 MG,Vazio,1.82,trimetoprima,,False,8
7,5820033,1192860,1,trimetoprima,frasco,trimetoprima sulfametoxazol 400 mg 80,TRIMETOPRIMA - SULFAMETOXAZOL 400 MG - 80 MG,Vazio,1.37,trimetoprima,,False,8
8,404718,781542,1,recarregavel_-1,unidade,po recarregavel para impressor hp laser jet 1010 1020,PO RECARREGAVEL PARA IMPRESSORA HP LASER JET 1010/1020,Vazio,3.0,recarregavel,-1.0,True,44
9,404719,781542,1,recarregavel_-1,unidade,po recarregavel para impressor hp laser jet 1010 1020,PO RECARREGAVEL PARA IMPRESSORA HP LASER JET 1010/1020,Vazio,3.0,recarregavel,-1.0,True,44


In [16]:
# Keep only the items whose cluster starts with the token "papel" and whose
# `outlier` flag is 0.
starts_with_papel = clusters_train['first_token'] == "papel"
outlier_flag_is_zero = clusters_train['outlier'] == 0
clusters_train = clusters_train[starts_with_papel & outlier_flag_is_zero]

In [17]:
# Drop the clusters that contain 100 items or fewer.
clusters_train = clusters_train.loc[clusters_train['cluster_size'] > 100]

In [18]:
# Preview the rows that survived the filters.
clusters_train.head()

Unnamed: 0,item_id,seq_dim_licitacao,outlier,cluster,dsc_unidade_medida,description,original,areas,price,first_token,label_id,divided,cluster_size
1219264,388,714563,0,papel_23,peca,papel higienico,PAPEL HIGIÊNICO (170056270004070),Administração | Assistência Social | Educação | Gestão Ambiental | Saúde | Urbanismo,4.17,papel,23,True,12490
1219265,1120,714614,0,papel_23,fd,papel higienico,PAPEL HIGIÊNICO (180013002080089),Administração | Assistência Social | Educação | Saúde | Urbanismo,42.55,papel,23,True,12490
1219266,1121,714614,0,papel_23,fd,papel higienico,PAPEL HIGIÊNICO (180013002080060),Administração | Assistência Social | Educação | Saúde | Urbanismo,42.55,papel,23,True,12490
1219267,1561,714632,0,papel_23,caixa,papel a4,PAPEL A4 (180047002248263),Administração | Assistência Social | Educação | Saúde | Urbanismo,152.0,papel,23,True,12490
1219268,1562,714632,0,papel_23,caixa,papel a4,PAPEL A4 (180047002248133),Administração | Assistência Social | Educação | Saúde | Urbanismo,152.25,papel,23,True,12490


In [19]:
# Number of items left after filtering.
len(clusters_train)

53316

In [20]:
# Number of distinct `label_id` values among the remaining items.
len(set(clusters_train['label_id']))

133

In [21]:
# Build the token -> category lookup. It is used below as a mapping
# (`word in word_category`, `word_category[word]`, `.values()`); what the
# helper reads to build it is not visible in this notebook.
word_category = get_tokens_categories()

In [22]:
# Build the token -> tag lookup for the vocabulary in `itemlist.unique_words`.
# It is used below as a mapping (`word in word_class`, `word_class[word]`).
# NOTE(review): the effect of `medications=False` is defined in
# nlp.pos_tagging and is not visible here - confirm before changing it.
word_class = get_tokens_tags(itemlist.unique_words, medications=False)

In [31]:
# Build one example per clustered item, shaped as
#     ((word_ids, tags, categories), label_id)
# where the three lists are aligned token by token. Tokens with no entry in
# `itemlist.word_id` are left out of all three lists and counted in `errors`.
dataset = []
errors = 0

# Only the two needed columns are iterated; this avoids building a Series per
# row as DataFrame.iterrows() does.
for description, raw_label in zip(clusters_train['description'], clusters_train['label_id']):
    label_id = int(raw_label)
    item_word_ids = []
    item_tags = []
    item_categories = []
    for word in description.split(' '):
        # Catch only the "unknown token" case. The previous bare `except:` also
        # swallowed unrelated failures (including KeyboardInterrupt) and
        # counted them as vocabulary misses.
        try:
            token_id = itemlist.word_id[word]
        except KeyError:
            errors += 1
            continue
        item_word_ids.append(token_id)
        # '-' marks a token without a tag; 'WORD' is the fallback category.
        item_tags.append(word_class[word] if word in word_class else '-')
        item_categories.append(word_category[word] if word in word_category else 'WORD')

    dataset.append(((item_word_ids, item_tags, item_categories), label_id))

In [32]:
# Number of tokens skipped because they had no id in `itemlist.word_id`.
errors

3

In [33]:
# First ten examples: ((word_ids, tags, categories), label_id).
dataset[:10]

[(([88458, 92482], ['N', 'A'], ['WORD', 'WORD']), 23),
 (([88458, 92482], ['N', 'A'], ['WORD', 'WORD']), 23),
 (([88458, 92482], ['N', 'A'], ['WORD', 'WORD']), 23),
 (([88458, 75479], ['N', '-'], ['WORD', 'SIZE']), 23),
 (([88458, 75479], ['N', '-'], ['WORD', 'SIZE']), 23),
 (([88458, 56212], ['N', 'N'], ['WORD', 'MATERIAL']), 23),
 (([88458, 51868], ['N', '-'], ['WORD', 'WORD']), 23),
 (([88458, 23008], ['N', 'A'], ['WORD', 'WORD']), 23),
 (([88458, 23008, 91834], ['N', 'A', 'N'], ['WORD', 'WORD', 'COLOR']), 23),
 (([88458, 23008, 103620], ['N', 'A', 'N'], ['WORD', 'WORD', 'COLOR']), 23)]

In [34]:
# Tag vocabulary: every distinct tag found in `word_class`, followed by the
# '-' placeholder used for tokens without a tag.
tags = [*set(word_class.values()), '-']

In [35]:
# Display the tag vocabulary.
tags

['PREP', 'DET', 'N', 'PRO', 'CONJ', 'PF', 'INTERJ', 'ADV', 'A', 'V', '-']

In [36]:
# Category vocabulary: every distinct category found in `word_category`,
# followed by the 'WORD' fallback used for tokens without a category.
categories = [*set(word_category.values()), 'WORD']

In [37]:
# Display the category vocabulary.
categories

['UNIT_METRIC',
 'NUMBER',
 'QUALIFIER',
 'COLOR',
 'MATERIAL',
 'SIZE',
 'QUANTITY',
 'WORD']

In [38]:
# split train-dev-test sets
random.shuffle(dataset)
size = len(dataset)
train_size = int(0.7*size)
dev_size = int(0.1*size)

train_dataset, dev_dataset, test_dataset = dataset[0:train_size], dataset[train_size:dev_size], dataset[dev_size:]

In [39]:
# Bundle the three splits under the keys used when the dataset is stored.
paper_dataset = {
    "train": train_dataset,
    "dev": dev_dataset,
    "test": test_dataset,
}

In [40]:
# Store dataset: pickle the `paper_dataset` dict with the highest available
# protocol. Opening in 'wb' mode overwrites any existing file at this path.
with open('../dados/sample_dataset/paper_dataset.pickle', 'wb') as handle:
    pickle.dump(paper_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
# Store word_id: pickle the token -> id lookup that produced the `word_ids`
# in the dataset, so the ids can be interpreted later.
with open('../dados/sample_dataset/word2id.pickle', 'wb') as handle:
    pickle.dump(itemlist.word_id, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
# Store tags: pickle the current value of `tags` (the tag vocabulary list).
with open('../dados/sample_dataset/tags.pickle', 'wb') as handle:
    pickle.dump(tags, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# Store categories: pickle the current value of `categories` (the category
# vocabulary list).
with open('../dados/sample_dataset/categories.pickle', 'wb') as handle:
    pickle.dump(categories, handle, protocol=pickle.HIGHEST_PROTOCOL)

# One-hot encoding

In [44]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [49]:
def get_onehot_encoding(words):
    """Map each label in ``words`` to its one-hot vector.

    The distinct labels are sorted and label ``k`` in that order is encoded as
    a float vector of length ``n_distinct`` with a single 1.0 at position
    ``k``. This is the same layout the previous LabelEncoder + OneHotEncoder
    pipeline produced.

    The implementation relies on numpy only. The earlier
    ``OneHotEncoder(sparse=False)`` call fails on scikit-learn >= 1.4, where
    the ``sparse`` argument (renamed ``sparse_output`` in 1.2) was removed.

    Args:
        words: sequence of hashable labels (e.g. strings). Duplicates are
            allowed and share one vector.

    Returns:
        dict mapping each label to a 1-D ``numpy.ndarray`` of floats. Keys
        follow the order of first appearance in ``words``. An empty input
        yields an empty dict.
    """
    # `classes` holds the sorted distinct labels; `positions[i]` is the index
    # of words[i] within `classes`.
    classes, positions = np.unique(words, return_inverse=True)

    # Row k of the identity matrix is the one-hot vector of class k, so
    # indexing it with `positions` gives one row per input label.
    onehot_rows = np.eye(len(classes))[positions]

    return {word: row for word, row in zip(words, onehot_rows)}

In [46]:
def convert_to_onehot_encoding(sentence, onehot_encoding):
    """Translate a sequence of labels into their one-hot vectors.

    Args:
        sentence: iterable of labels to encode.
        onehot_encoding: mapping from label to its one-hot vector.

    Returns:
        list with one vector per label, in the order of ``sentence``.

    Raises:
        KeyError: if a label has no entry in ``onehot_encoding``.
    """
    encoded_sentence = []
    for token in sentence:
        encoded_sentence.append(onehot_encoding[token])
    return encoded_sentence

In [47]:
# Hard-coded tag vocabulary. This rebinds `tags`, replacing the list that was
# derived from `word_class` earlier in the notebook.
tags = ['PREP', 'CONJ', 'A', 'DET', 'N', 'PF', 'ADV', 'V', 'PRO', 'INTERJ', '-']

In [48]:
# Hard-coded category vocabulary. This rebinds `categories`, replacing the
# list that was derived from `word_category` earlier in the notebook.
categories = ['UNIT_METRIC', 'QUANTITY', 'NUMBER', 'COLOR', 'MATERIAL', 'SIZE', 'QUALIFIER', 'WORD']

In [50]:
# Tag -> one-hot vector lookup.
tags_encoding = get_onehot_encoding(tags)

In [51]:
# Display the tag -> one-hot vector lookup.
tags_encoding

{'PREP': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 'CONJ': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'A': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'DET': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 'N': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 'PF': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 'ADV': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'V': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
 'PRO': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 'INTERJ': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 '-': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])}

In [54]:
# Category -> one-hot vector lookup.
categories_encoding = get_onehot_encoding(categories)

In [53]:
# Example: encode a short sequence of tags.
convert_to_onehot_encoding(['PREP', 'ADV', 'CONJ'], tags_encoding)

[array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])]

In [55]:
# Example: encode a short sequence of categories.
convert_to_onehot_encoding(['NUMBER', 'SIZE', 'WORD'], categories_encoding)

[array([0., 0., 1., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 1., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 1.])]

In [56]:
# Size of the tag vocabulary (the length of each tag one-hot vector).
len(tags)

11

In [57]:
# Size of the category vocabulary (the length of each category one-hot vector).
len(categories)

8