In [1]:
import pandas as pd
import numpy as np
import collections
import copy
import random
import matplotlib.pyplot as plt
import time
import multiprocessing
import nltk
import pickle
import json
from item.item_list import (
    ItemList
)
from nlp.word_embeddings import (
    load_word_embeddings,
    get_items_embeddings
)
from nlp.pos_tagging import (
    get_tokens_tags
)
from item.clustering.item_representation import (
    get_group_embeddings_matrix,
    get_group_embeddings_from_dict,
    save_items_embeddings,
    load_items_embeddings,
    normalize
)
from item.utils import (
    get_tokens_set,
    translate_id_to_descriptions
)
from item.clustering.utils import (
    save_clustering_results_pickle,
    save_models_pickle
)
from item.clustering.clustering import run_baseline_clustering

In [2]:
# Load the pre-processed item descriptions from the zipped CSV into an ItemList.
items_file = '../data/output/druid_fasttext/f03_items.csv.zip'
itemlist = ItemList()
itemlist.load_items_from_file(items_file)

In [3]:
# Preview the first ten rows of the loaded items table.
itemlist.items_df.head(n=10)

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,mes,data,municipio,orgao
0,"['torval', 'cr', 'com']",['mg'],"['500', '30']",[],[],[],['comprimido'],58.65,caixa,TORVAL CR 500MG COM 30 COMPRIMIDOS,297107,"['torval', 'cr', '500', 'mg', 'com', '30', 'co...",,2014,11,2014-09-19,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA
1,['complexo'],['ml'],['30'],[],[],[],[],3.415,frasco,COMPLEXO B 30 ML,297107,"['complexo', 'b', '30', 'ml']",,2014,11,2014-09-19,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA
2,"['estante', 'chapa', 'com', 'altura', 'mts']",[],"['22', '1', '98']",[],['aco'],[],[],196.0,und,"ESTANTE DE ACO CHAPA 22 COM ALTURA DE 1,98 MTS",297109,"['estante', 'aco', 'chapa', '22', 'com', 'altu...",,2014,11,2014-10-23,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA
3,"['broca', 'diamantado', 'cilindrico', 'plano',...",[],['3'],[],[],[],[],3.3333,und,3 BROCA DIAMANTADA CILINDRICA PLANA - ALTA,297110,"['broca', 'diamantado', 'cilindrico', 'plano',...",,2014,11,2014-09-02,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA
4,"['lustrar', 'mov', 'bas', 'silic', 'perf', 'sa...",[],[],[],[],[],[],4.0133,unid.,"LUSTRA, MOV, BAS SILIC, PERF SUAV, AC SEC RAPIDA",297112,"['lustrar', 'mov', 'bas', 'silic', 'perf', 'sa...",,2014,12,2014-12-02,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA
5,['termometro'],[],[],[],[],[],[],44.0,unidade,TERMOMETRO,297247,['termometro'],,2014,2,2014-02-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE
6,['cilindro'],[],[],[],[],[],[],649.075,unidade,CILINDRO.,297248,['cilindro'],,2014,2,2014-02-03,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE
7,"['sonda', 'nsg', 'longo']",[],['014'],[],[],[],[],0.53,unidade,SONDA NSG LONGA N14,297252,"['sonda', 'nsg', 'longo', '014']",,2014,2,2014-01-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE
8,"['atadura', 'gessar', 'sobre', 'algodao']",['cm'],"['20', '100']",[],"['gesso', 'tecido']",[],"['caixa', 'unid']",58.8,caixa,ATADURA GESSADA DE 20CM C/GESSO SOBRE TECIDO 1...,297252,"['atadura', 'gessar', '20', 'cm', 'c', 'gesso'...",,2014,2,2014-01-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE
9,"['luva', 'procedimento', 'extra']",[],['100'],[],[],[],"['caixa', 'unid']",12.69,caixa,"LUVA DE PROCEDIMENTO EXTRA P, CAIXA C/ 100 UNID.",297252,"['luva', 'procedimento', 'extra', 'p', 'caixa'...",,2014,2,2014-01-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE


In [4]:
# Print the column dtypes, row count and memory footprint of the items table.
itemlist.items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11482941 entries, 0 to 11482940
Data columns (total 18 columns):
 #   Column              Dtype  
---  ------              -----  
 0   palavras            object 
 1   unidades_medida     object 
 2   numeros             object 
 3   cores               object 
 4   materiais           object 
 5   tamanho             object 
 6   quantidade          object 
 7   preco               float64
 8   dsc_unidade_medida  object 
 9   original            object 
 10  licitacao           int64  
 11  original_prep       object 
 12  funcao              float64
 13  ano                 int64  
 14  mes                 int64  
 15  data                object 
 16  municipio           object 
 17  orgao               object 
dtypes: float64(2), int64(3), object(13)
memory usage: 1.5+ GB


In [None]:
# Total number of item rows (first dimension of the DataFrame).
itemlist.items_df.shape[0]

In [None]:
# Distinct values present in the 'ano' (year) column.
# Series.unique() de-duplicates in vectorized code, so only the few distinct
# values are converted to Python ints; calling set() on the Series directly
# iterates every row in Python. The resulting set is the same.
set(itemlist.items_df['ano'].unique().tolist())

In [None]:
# Path to the word-embeddings text file; each line holds one word embedding.
# NOTE(review): the items file is loaded from '../data/...' but this path is
# under '../dados/...' — confirm both directory roots are intended.
word_embeddings_file = '../dados/embeddings/fasttext/skip_s100.txt'

In [None]:
# Load the embeddings from the file into a map, passing the item list's
# unique words along to the loader.
word_embeddings = load_word_embeddings(
    word_embeddings_file,
    itemlist.unique_words,
)

In [None]:
# Tag the unique tokens found in the item descriptions.
unique_tokens = itemlist.unique_words
word_class = get_tokens_tags(unique_tokens)

In [None]:
# Run the baseline clustering with HDBSCAN over the items. The call returns
# the clusters, the outliers, the item vectors, and the clustering and reducer
# models; the following cells persist these results to disk.
(
    clusters,
    outliers,
    items_vec,
    clustering_model,
    reducer_model,
) = run_baseline_clustering(
    itemlist,
    word_embeddings,
    word_class,
    algorithm='hdbscan',
    categories=['unidades_medida', 'numeros'],
    embedding_type=['N', 'MED'],
    operation='concatenate',
    n_threads=6,
)

In [None]:
# Save the clusters and outliers (pickle helper) into the results directory.
# NOTE(review): the directory name says 'SUB+MED', while the clustering call
# passes embedding_type=['N', 'MED'] — confirm the folder name matches the
# configuration that was actually run.
save_clustering_results_pickle(
    clusters,
    outliers,
    '../dados/precificacao/fasttext_skip100/v3/baseline+embeddings/SUB+MED+unit+num_concat_pca_hdbscan_euclidean_test/',
)

In [None]:
# Save the clustering model and the reducer model (pickle helper) into the
# results directory.
save_models_pickle(
    clustering_model,
    reducer_model,
    '../dados/precificacao/fasttext_skip100/v3/baseline+embeddings/SUB+MED+unit+num_concat_pca_hdbscan_euclidean_test/',
)

In [None]:
# Write the item vectors returned by the clustering run to embeddings.json
# inside the results directory.
save_items_embeddings(
    items_vec,
    '../dados/precificacao/fasttext_skip100/v3/baseline+embeddings/SUB+MED+unit+num_concat_pca_hdbscan_euclidean_test/embeddings.json',
)