In [1]:
import pandas as pd
import numpy as np
import collections
import copy
import random
import matplotlib.pyplot as plt
import time
import multiprocessing
import nltk
import pickle
import json
from item.item_list import (
    ItemList
)
from nlp.word_embeddings import (
    load_word_embeddings,
    get_items_embeddings
)
from nlp.pos_tagging import (
    get_tokens_tags
)
from item.clustering.item_representation import (
    get_group_embeddings_matrix,
    get_group_embeddings_from_dict,
    save_items_embeddings,
    load_items_embeddings,
    normalize
)
from item.utils import (
    get_tokens_set,
    translate_id_to_descriptions
)
from item.clustering.utils import (
    save_clustering_results_pickle,
    save_models_pickle
)
from item.clustering.clustering import run_baseline_clustering

In [2]:
# Create the ItemList container and load the preprocessed item descriptions
# from the archive below; the loaded rows are accessed later as `itemlist.items_df`.
itemlist = ItemList()
itemlist.load_items_from_file('items_preprocessed_complete_test_druid.csv.zip')

In [3]:
# Preview the first ten loaded rows.
itemlist.items_df.iloc[:10]

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,item_id
0,"['microhbrida', 'a30', 'reposicao']",[],[],[],['resina'],[],[],69.8,unid,RESINA MICROHBRIDA COR A30 REPOSICAO,297110,"['resina', 'microhbrida', 'cor', 'a30', 'repos...",,2014,0
1,"['broca', 'diamantado', 'cilindrico', 'plano']",[],['10'],[],[],[],[],3.3333,und,10 BROCA DIAMANTADA CILINDRICA PLANA:,297110,"['broca', 'diamantado', 'cilindrico', 'plano',...",,2014,1
2,"['sonda', 'nsg', 'curtir']",[],['014'],[],[],[],[],0.35,unidade,SONDA NSG CURTA N14,297252,"['sonda', 'nsg', 'curtir', '014']",,2014,2
3,"['agulha', 'peridural']",[],['18'],[],[],[],[],2.94,unidade,AGULHA PERIDURAL N 18 G,297252,"['agulha', 'peridural', 'n', '18', 'g']",,2014,3
4,"['descarte', 'perfurar', 'cortante']",['lts'],"['5', '10']",[],[],[],"['caixa', 'unid']",19.8,caixa,CAIXA P/DESCARTE DE MATERIAL PERFURO-CORTANTE ...,297252,"['caixa', 'p', 'descarte', 'material', 'perfur...",,2014,4
5,"['detergente', 'desincrustantes', 'enzimatico'...",[],['4'],[],[],[],[],123.2,galao,DETERGENTE DESINCRUSTANTE ENZIMATICO PARA LIMP...,297252,"['detergente', 'desincrustantes', 'enzimatico'...",,2014,5
6,['batata'],[],[],[],[],[],[],1.99,unidade,BATATINHA.,297279,['batata'],,2014,6
7,"['refrigerante', 'guarana']",['lts'],['2'],[],[],[],[],17.445,fardo,"REFRIGERANTE DE GUARANA, 2 LTS",297279,"['refrigerante', 'guarana', '2', 'lts']",,2014,7
8,['suporte'],[],[],[],[],[],[],3.1,unidade,SUPORTE,297279,['suporte'],,2014,8
9,['refrigerador'],[],[],[],[],[],[],1890.0,unidade,REFRIGERADOR,297279,['refrigerador'],,2014,9


In [4]:
# Number of rows in the items frame.
itemlist.items_df.shape[0]

2217968

In [5]:
# Keep a 1000-row random subset so the remaining cells run quickly.
# random_state pins the draw: without it every Restart & Run All selects
# different rows, so the resulting clusters cannot be reproduced.
itemlist.items_df = itemlist.items_df.sample(1000, random_state=42)

In [6]:
# Row count after subsampling.
itemlist.items_df.shape[0]

1000

In [7]:
# Preview the first five rows of the sampled frame.
itemlist.items_df.iloc[:5]

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,item_id
223493,['nile'],[],['2'],[],['pvc'],[],[],4.05,und,NIPLE PVC 2,152935,"['nile', 'pvc', '2']",,2014,223493
919471,"['pente', 'grosso', 'para', 'cabelo']",[],[],[],['plastico'],[],[],300.5,peca,"PENTE GROSSO, DE PLASTICO, PARA CABELO",382463,"['pente', 'grosso', 'plastico', 'para', 'cabelo']",,2016,919471
595345,"['prego', 'aspiral']",[],"['17', '21']",[],[],[],[],10.2,kilo,PREGO 17X21 ASPIRAL,233002,"['prego', '17', 'x', '21', 'aspiral']",,2015,595345
1589094,"['chocolate', 'granulado']",['gr'],['130'],[],[],[],[],1.0,un,CHOCOLATE GRANULADO 130 GR,443342,"['chocolate', 'granulado', '130', 'gr']",,2018,1589094
131371,"['biscoito', 'doce', 'rosca', 'cocar']",['kg'],"['1', '5']",[],[],[],['cx'],14.8667,caixa,"BISCOITO DOCE TIPO ROSQUINHA DE COCO CX C/ 1,5KG",36718,"['biscoito', 'doce', 'tipo', 'rosca', 'cocar',...",,2014,131371


In [8]:
# Distinct values of the 'ano' column in the current frame.
{year for year in itemlist.items_df['ano']}

{2014, 2015, 2016, 2017, 2018, 2019, 2020}

In [9]:
# Path to the word-embeddings text file; each line contains one word embedding.
# NOTE(review): the path suggests fastText skip-gram vectors of size 100 — confirm.
word_embeddings_file = '../dados/embeddings/fasttext/skip_s100.txt'

In [10]:
# Read the word embeddings from the file and store them in a map.
# Only `itemlist.unique_words` is passed as the vocabulary argument.
# NOTE(review): `items_df` was subsampled in an earlier cell; confirm whether
# `unique_words` reflects the sample or still the full dataset.
word_embeddings = load_word_embeddings(word_embeddings_file, itemlist.unique_words)

In [11]:
# Get the tags of the description tokens in `itemlist.unique_words`
# (helper comes from nlp.pos_tagging, so presumably part-of-speech tags — confirm).
word_class = get_tokens_tags(itemlist.unique_words)

In [12]:
# Renumber the sampled rows 0..n-1. drop=True discards the old index directly
# instead of materialising it as an 'index' column and dropping it in a second
# step (the two-step form raises KeyError if the index ever carries a name).
itemlist.items_df = itemlist.items_df.reset_index(drop=True)

In [13]:
# Preview the first five rows after the index reset.
itemlist.items_df.iloc[:5]

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,item_id
0,['nile'],[],['2'],[],['pvc'],[],[],4.05,und,NIPLE PVC 2,152935,"['nile', 'pvc', '2']",,2014,223493
1,"['pente', 'grosso', 'para', 'cabelo']",[],[],[],['plastico'],[],[],300.5,peca,"PENTE GROSSO, DE PLASTICO, PARA CABELO",382463,"['pente', 'grosso', 'plastico', 'para', 'cabelo']",,2016,919471
2,"['prego', 'aspiral']",[],"['17', '21']",[],[],[],[],10.2,kilo,PREGO 17X21 ASPIRAL,233002,"['prego', '17', 'x', '21', 'aspiral']",,2015,595345
3,"['chocolate', 'granulado']",['gr'],['130'],[],[],[],[],1.0,un,CHOCOLATE GRANULADO 130 GR,443342,"['chocolate', 'granulado', '130', 'gr']",,2018,1589094
4,"['biscoito', 'doce', 'rosca', 'cocar']",['kg'],"['1', '5']",[],[],[],['cx'],14.8667,caixa,"BISCOITO DOCE TIPO ROSQUINHA DE COCO CX C/ 1,5KG",36718,"['biscoito', 'doce', 'tipo', 'rosca', 'cocar',...",,2014,131371


In [14]:
# Row count should be unchanged by the index reset.
itemlist.items_df.shape[0]

1000

In [None]:
# Run the baseline clustering on the sampled items and unpack its five results:
# clusters, outliers, item vectors, the clustering model and the reducer model.
(
    clusters,
    outliers,
    items_vec,
    clustering_model,
    reducer_model,
) = run_baseline_clustering(
    itemlist,
    word_embeddings,
    word_class,
    algorithm='hdbscan',
    categories=['unidades_medida', 'numeros'],
    embedding_type=['N', 'MED'],
    operation='concatenate',
    n_process=6,
)

In [15]:
import mlflow

In [16]:
# Settings for the clustering experiment; they are gathered into `params`
# in the next cell. The values mirror the keyword arguments given to
# run_baseline_clustering elsewhere in this notebook.
algorithm = 'hdbscan'
categories = ['unidades_medida', 'numeros']
embedding_type = ['N', 'MED']
operation = 'concatenate'
n_process = 6

In [17]:
# Bundle the run settings into one mapping so they can be logged together.
params = dict(
    algorithm=algorithm,
    categories=categories,
    embedding_type=embedding_type,
    operation=operation,
    n_process=n_process,
)

In [None]:
with mlflow.start_run(run_name="test-experiment"):

    # Record the configuration of this run.
    mlflow.log_params(params)

    # Feed the very same `params` mapping into the clustering call so the
    # logged configuration and the one actually executed cannot drift apart
    # (previously every argument was hard-coded a second time here).
    (clusters, outliers, items_vec,
     clustering_model, reducer_model) = run_baseline_clustering(
        itemlist, word_embeddings, word_class, **params
    )

    # log metrics
    # NOTE(review): these are constant placeholder values, not scores computed
    # from the clustering above — replace them with real metrics before
    # relying on the tracked results.
    mlflow.log_metrics({"davies-bouldin": 0.0, "calinski": 1.0})

In [None]:
# Hand the clusters and outliers to the results-saving helper, targeting the
# experiment's output directory.
save_clustering_results_pickle(
    clusters,
    outliers,
    '../dados/precificacao/fasttext_skip100/v3/baseline+embeddings/SUB+MED+unit+num_concat_pca_hdbscan_euclidean_test/',
)

In [None]:
# Hand the fitted clustering model and reducer model to the model-saving
# helper, targeting the same experiment output directory.
save_models_pickle(
    clustering_model,
    reducer_model,
    '../dados/precificacao/fasttext_skip100/v3/baseline+embeddings/SUB+MED+unit+num_concat_pca_hdbscan_euclidean_test/',
)

In [None]:
# Hand the item vectors to the embeddings-saving helper, targeting
# embeddings.json inside the experiment output directory.
save_items_embeddings(
    items_vec,
    '../dados/precificacao/fasttext_skip100/v3/baseline+embeddings/SUB+MED+unit+num_concat_pca_hdbscan_euclidean_test/embeddings.json',
)