In [1]:
from sklearn.datasets import fetch_20newsgroups
import collections
import time
from item.topic_modeling.top2vec import get_cluster_words
from item.item_list import (
    ItemList,
    Item
)
from item.clustering.utils import (
    load_clustering_results_pickle,
    load_clustering_results,
    load_models_pickle
)
from item.clustering.item_representation import (
    load_items_embeddings
)
from nlp.word_embeddings import (
    load_word_embeddings
)

# Load Items

In [2]:
# Load the pre-processed item descriptions from the zipped CSV into an ItemList.
items_file = '../data/output/druid_fasttext/f03_items.csv.zip'

itemlist = ItemList()
itemlist.load_items_from_file(items_file)

In [3]:
# Summarise the loaded items table: row count, columns, dtypes and memory usage.
itemlist.items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11482941 entries, 0 to 11482940
Data columns (total 19 columns):
 #   Column              Dtype  
---  ------              -----  
 0   palavras            object 
 1   unidades_medida     object 
 2   numeros             object 
 3   cores               object 
 4   materiais           object 
 5   tamanho             object 
 6   quantidade          object 
 7   preco               float64
 8   dsc_unidade_medida  object 
 9   original            object 
 10  licitacao           int64  
 11  original_prep       object 
 12  funcao              float64
 13  ano                 int64  
 14  mes                 int64  
 15  data                object 
 16  municipio           object 
 17  orgao               object 
 18  item_id             int64  
dtypes: float64(2), int64(4), object(13)
memory usage: 1.6+ GB


In [4]:
# Preview the first 50 rows of the items table to sanity-check the parsed columns.
itemlist.items_df.head(50)

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano,mes,data,municipio,orgao,item_id
0,"['torval', 'cr', 'com']",['mg'],"['500', '30']",[],[],[],['comprimido'],58.65,caixa,TORVAL CR 500MG COM 30 COMPRIMIDOS,297107,"['torval', 'cr', '500', 'mg', 'com', '30', 'co...",,2014,11,2014-09-19,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA,0
1,['complexo'],['ml'],['30'],[],[],[],[],3.415,frasco,COMPLEXO B 30 ML,297107,"['complexo', 'b', '30', 'ml']",,2014,11,2014-09-19,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA,1
2,"['estante', 'chapa', 'com', 'altura', 'mts']",[],"['22', '1', '98']",[],['aco'],[],[],196.0,unidade,"ESTANTE DE ACO CHAPA 22 COM ALTURA DE 1,98 MTS",297109,"['estante', 'aco', 'chapa', '22', 'com', 'altu...",,2014,11,2014-10-23,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA,2
3,"['broca', 'diamantado', 'cilindrico', 'plano',...",[],['3'],[],[],[],[],3.3333,unidade,3 BROCA DIAMANTADA CILINDRICA PLANA - ALTA,297110,"['broca', 'diamantado', 'cilindrico', 'plano',...",,2014,11,2014-09-02,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA,3
4,"['lustrar', 'mov', 'bas', 'silic', 'perf', 'sa...",[],[],[],[],[],[],4.0133,unidade,"LUSTRA, MOV, BAS SILIC, PERF SUAV, AC SEC RAPIDA",297112,"['lustrar', 'mov', 'bas', 'silic', 'perf', 'sa...",,2014,12,2014-12-02,VARZELANDIA,PREFEITURA MUNICIPAL DE VARZELANDIA,4
5,['termometro'],[],[],[],[],[],[],44.0,unidade,TERMOMETRO,297247,['termometro'],,2014,2,2014-02-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE,5
6,['cilindro'],[],[],[],[],[],[],649.075,unidade,CILINDRO.,297248,['cilindro'],,2014,2,2014-02-03,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE,6
7,"['sonda', 'nsg', 'longo']",[],['014'],[],[],[],[],0.53,unidade,SONDA NSG LONGA N14,297252,"['sonda', 'nsg', 'longo', '014']",,2014,2,2014-01-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE,7
8,"['atadura', 'gessar', 'sobre', 'algodao']",['cm'],"['20', '100']",[],"['gesso', 'tecido']",[],"['caixa', 'unid']",58.8,caixa,ATADURA GESSADA DE 20CM C/GESSO SOBRE TECIDO 1...,297252,"['atadura', 'gessar', '20', 'cm', 'c', 'gesso'...",,2014,2,2014-01-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE,8
9,"['luva', 'procedimento', 'extra']",[],['100'],[],[],[],"['caixa', 'unid']",12.69,caixa,"LUVA DE PROCEDIMENTO EXTRA P, CAIXA C/ 100 UNID.",297252,"['luva', 'procedimento', 'extra', 'p', 'caixa'...",,2014,2,2014-01-06,VAZANTE,PREFEITURA MUNICIPAL DE VAZANTE,9


In [5]:
# Load the stored clustering output from the experiment directory. The helper
# returns two objects, bound here to `results` and `outliers`.
# NOTE(review): the helper name indicates pickle files are read — only point it
# at trusted directories, since unpickling can execute arbitrary code.
clustering_results_dir = '../data/output/druid_fasttext/'

results, outliers = load_clustering_results_pickle(clustering_results_dir)

In [6]:
# Load the item embeddings stored alongside the clustering output.
items_embeddings_file = '../data/output/druid_fasttext/items_vec.json'

embeddings = load_items_embeddings(items_embeddings_file)

In [7]:
# Path to the word-embeddings file; each line of the file contains one word embedding.
word_embeddings_file = '../data/embeddings/models/fasttext/sg/druid/items_embeddings.vec'

# Read the word embeddings from the file and store them in a map.
# Alternative call kept for reference: it additionally passes the items' unique words
# (presumably to restrict loading to that vocabulary — TODO confirm):
# word_embeddings = load_word_embeddings(word_embeddings_file, itemlist.unique_words)
word_embeddings = load_word_embeddings(word_embeddings_file)

In [8]:
# Dimensionality of the word vectors: the length of the first embedding in the map.
# next(iter(...)) fetches a single value instead of first copying a reference to
# every vector into a throwaway list, which matters for a large vocabulary.
len(next(iter(word_embeddings.values())))

100

In [9]:
# Total number of entries (groups) in the clustering results.
len(results)

79224

In [10]:
# Count the groups whose name contains an underscore but not the substring "-1".
# NOTE(review): presumably "_" separates a base group from its cluster id and "-1"
# marks the outlier cluster — confirm. The "-1" test is a plain substring match.
count = sum(
    1
    for group_name, _ in results.items()
    if '_' in group_name and "-1" not in group_name
)

In [11]:
# Display how many groups passed the name filter.
count

39613

# Get cluster words

In [None]:
# Extract the representative words of each group and time the whole call.
# perf_counter() is a monotonic clock meant for measuring elapsed time; unlike
# time.time() it cannot jump if the system clock is adjusted during this long run.
# `start` and `end` are in seconds and are only meaningful as a difference.
start = time.perf_counter()

# Settings used for this run: cosine distance, 15 words per group, no reducer
# model, a single process (meanings as suggested by the keyword names).
cluster_words = get_cluster_words(itemlist, results, word_embeddings, embeddings, reducer_model=None,
                                  distance='cosine', num_words=15, n_process=1)

end = time.perf_counter()

Read ranges
([0], [39612])
0


In [None]:
# Elapsed time of the cluster-word extraction, converted from seconds to minutes.
(end - start)/60

In [None]:
# Number of entries (groups) for which cluster words were returned.
len(cluster_words)

In [None]:
# Inspect the words obtained for one example group, "pneu_5".
cluster_words["pneu_5"]

In [None]:
# Build one "canonical description" per group: take the first five tokens of the
# group's cluster words, sort them alphabetically and join them with spaces.
# canon_desc keeps one description per group (in iteration order);
# canon_desc_freq counts how many groups share each description.
canon_desc = []
canon_desc_freq = collections.defaultdict(int)

for scored_tokens in cluster_words.values():
    # An entry may be a {token: score} mapping; flatten it to (token, score) pairs.
    pairs = list(scored_tokens.items()) if isinstance(scored_tokens, dict) else scored_tokens
    canonical = " ".join(sorted(token for token, _ in pairs[:5]))
    canon_desc.append(canonical)
    canon_desc_freq[canonical] += 1

In [None]:
# Peek at the first ten canonical descriptions.
canon_desc[:10]

In [None]:
# Number of distinct canonical descriptions (keys of the frequency table).
len(canon_desc_freq)