In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import multiprocessing
import collections
from item.item_list import (
    ItemList,
    Item
)
from nlp.grouping import (
    groups_frequency_sort
)
from nlp.utils import (
    read_json_file,
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics)
from nlp.pos_tagging import *
from nlp.word_embeddings import *
from item.clustering.evaluate import *
from item.clustering.utils import *
from item.clustering.item_representation import *
from item.clustering.clustering import *
from item.pricing.utils import *
from item.pricing.pricing import *

# Find clusters of the items in the test set

In [2]:
# Load the clustering and dimensionality-reduction models produced by the training run.
# Both objects are indexed by item group further down in this notebook.
# NOTE(review): the helper name suggests pickle deserialization — only load files from a trusted source.
clustering_model, reducer_model = load_models_pickle('../dados/output/druid/')

In [3]:
# Load the clustering results of the training set. `results_train` maps a cluster
# name to a list of integers (presumably item indices — confirm), as the cells below show.
# NOTE(review): contents of `outliers_train` and `prices_train` are not visible here.
results_train, outliers_train, prices_train = load_clustering_results_pickle('../dados/output/druid/')

In [5]:
# `count`: items that belong to clusters whose name contains no '_';
# `total`: items over every cluster in the training results.
count = sum(len(members) for name, members in results_train.items() if '_' not in name)
total = sum(len(members) for members in results_train.values())

count

151196

In [6]:
total

10560031

In [7]:
results_train['gasolina_0']

[9921,
 10145,
 12083,
 20450,
 27651,
 37792,
 38699,
 44163,
 55552,
 59643,
 64519,
 73311,
 93311,
 117562,
 127135,
 131367,
 132475,
 132528,
 148630,
 149417,
 151100,
 151624,
 156226,
 164812,
 171722,
 183383,
 183605,
 187010,
 189309,
 215122,
 217981,
 230843,
 237090,
 247546,
 263692,
 276495,
 288246,
 289775,
 297358,
 310325,
 327562,
 330337,
 332907,
 339014,
 339095,
 342948,
 353348,
 373436,
 391209,
 394576,
 395394,
 418293,
 420845,
 422346,
 429619,
 435221,
 439070,
 449430,
 453183,
 458681,
 462503,
 471220,
 488890,
 497146,
 508985,
 517835,
 525079,
 527591,
 532451,
 535772,
 539712,
 550669,
 559849,
 572381,
 583307,
 591880,
 596488,
 596578,
 605125,
 619273,
 620314,
 628851,
 630226,
 638936,
 639800,
 645437,
 648902,
 659547,
 660157,
 662863,
 671029,
 674020,
 675422,
 679081,
 707069,
 718560,
 722731,
 734639,
 740526,
 752067,
 754717,
 756024,
 759495,
 763915,
 786468,
 795671,
 810713,
 813862,
 818257,
 824740,
 845412,
 849787,
 86780

In [None]:
# It gets the descriptions processed:
itemlist = ItemList()
itemlist.load_items_from_file('items_preprocessed_complete.csv.zip')

In [None]:
# Get the tag of every unique token found in the item descriptions
# (presumably part-of-speech tags, given the nlp.pos_tagging import — confirm).
word_class = get_tokens_tags(itemlist.unique_words)

In [5]:
# Path to the word-embeddings text file; each line holds one word embedding.
word_embeddings_file = '../dados/embeddings/fasttext/skip_s100.txt'
# Alternative embeddings (disabled):
# word_embeddings_file = '../dados/embeddings/word2vec/cbow_s50.txt'

In [6]:
# Read the word embeddings from the file and store them in a map.
# Disabled variant that also passes the item vocabulary:
# word_embeddings = load_word_embeddings(word_embeddings_file, itemlist.unique_words)
word_embeddings = load_word_embeddings(word_embeddings_file)

In [None]:
len(itemlist.items_df)

In [None]:
# NOTE(review): the return value is only displayed, never assigned; if this helper is
# meant to normalize the unit-of-measure column it must do so in place — confirm.
group_dsc_unidade_medida(itemlist.items_df)

In [None]:
# Predict a cluster for every item of `itemlist` with the previously trained models.
results = predict_items_clusters(
    itemlist,
    word_embeddings,
    word_class,
    reducer_model,
    clustering_model,
    categories=['unidades_medida', 'numeros'],
    embedding_type=['N', 'MED'],
    operation='concatenate',
    n_process=3,
)

In [None]:
results

In [None]:
len(results)

In [None]:
len(itemlist.items_df) - len(results)

In [None]:
# Number of predicted items whose cluster label is the string '-2'.
count = sum(1 for prediction in results if prediction['cluster'] == '-2')

count

In [None]:
# Express `count` as a percentage of all predicted items.
100*(count/len(results))

In [None]:
# Collect the 'cluster_prob' score of every predicted item, in result order.
clusters_probs = [prediction['cluster_prob'] for prediction in results]

In [None]:
# Free the objects that are only needed for the batch prediction above.
# word_embeddings, reducer_model and clustering_model are intentionally kept:
# the single-item pricing cells at the end of this notebook still pass them to
# pricing_item(), so deleting them here makes a top-to-bottom run fail with NameError.
# NOTE(review): if memory is too tight to keep them, reload them right before the
# single-item pricing section instead.
del word_class
del itemlist

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF
from matplotlib.pyplot import yticks

# Fit an empirical CDF to the cluster scores collected in `clusters_probs`.
ecdf = ECDF(clusters_probs)

fig, axis1 = plt.subplots(figsize=(10, 8))
x_label = 'Cluster score'
y_label = 'Nº de itens (%)'

axis1.plot(ecdf.x, ecdf.y)

# The ECDF values are fractions in [0, 1]; label the y ticks as percentages.
# A single yticks() call sets positions, labels and font size on the current axes.
tick_positions = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
tick_labels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
yticks(tick_positions, tick_labels, fontsize=14)
plt.xticks(fontsize=14)

axis1.set_xlabel(x_label, fontsize=20, weight='bold')
axis1.set_ylabel(y_label, fontsize=20, weight='bold')

axis1.grid(axis='both', linestyle=':', linewidth=1.0)

plt.show()
plt.clf()

In [None]:
# Summary statistics of the cluster scores plotted above.
print('mean: %s' % np.mean(clusters_probs))
print('median: %s' % np.median(clusters_probs))
print('std: %s' % np.std(clusters_probs))

# PRICING: get the statistics for each cluster found in the train set

In [None]:
# Load the preprocessed item descriptions of the TRAINING set into an ItemList.
itemlist_train = ItemList()
itemlist_train.load_items_from_file('items_preprocessed_complete_druid.csv.zip')

In [None]:
len(itemlist_train.items_df)

In [None]:
# Gather the prices of the training items for each cluster in `results_train`
# (presumably a cluster -> prices mapping; confirm against get_clusters_prices).
cluster_prices = get_clusters_prices(itemlist_train, results_train)

In [None]:
# Compute the price statistics per cluster (overall and per year) and the
# item/cluster table returned alongside them.
(cluster_prices_statistics,
 cluster_prices_statistics_year,
 items_clusters_wo_outliers) = pricing(
    itemlist_train,
    results_train,
    cluster_prices,
    remove_outliers=True,
    threshold=0.5,
    dsc_unidade=True,
    year=True,
)

In [16]:
cluster_prices_statistics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329539 entries, 0 to 329538
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   cluster             329539 non-null  object 
 1   dsc_unidade_medida  329539 non-null  object 
 2   mean                329539 non-null  float64
 3   count               329539 non-null  int64  
 4   max                 329539 non-null  float64
 5   min                 329539 non-null  float64
 6   median              329539 non-null  float64
 7   std                 231514 non-null  float64
 8   var                 231514 non-null  float64
 9   quantile_1          329539 non-null  float64
 10  quantile_3          329539 non-null  float64
 11  first_token         329539 non-null  object 
 12  outlier             329539 non-null  int64  
dtypes: float64(8), int64(2), object(3)
memory usage: 35.2+ MB


In [None]:
# Persist the per-cluster price statistics as a ';'-separated, zip-compressed CSV (index dropped).
cluster_prices_statistics.to_csv("../dados/output/druid/cluster_prices_statistics.csv.zip", sep=';', index=False, compression='zip')

In [None]:
# Persist the per-year price statistics as a ';'-separated, zip-compressed CSV (index dropped).
cluster_prices_statistics_year.to_csv("../dados/output/druid/cluster_prices_statistics_year.csv.zip", sep=';', index=False, compression='zip')

In [None]:
# Persist the third table returned by pricing() (run with remove_outliers=True) as a ';'-separated zipped CSV.
items_clusters_wo_outliers.to_csv("../dados/output/druid/items_clusters_train_wo_out.csv.zip", sep=';', index=False, compression='zip')

In [None]:
# Build one DataFrame combining the training items with their clusters
# (column layout is shown by the read-back cell further down).
items_clusters_df = get_items_dataframe(itemlist_train, results_train)

In [None]:
items_clusters_df.head(20)

In [None]:
items_clusters_df.info()

In [None]:
# Persist the clustered training items as a ';'-separated, zip-compressed CSV (index dropped).
items_clusters_df.to_csv("../dados/output/druid/items_clusters_train.csv.zip", sep=';', index=False, compression='zip')

In [17]:
# Reload the clustered training items from the CSV written by the previous cell, so this
# section can start without recomputing them. low_memory=False makes pandas parse the
# file in one pass instead of inferring dtypes chunk by chunk.
items_clusters_df = pd.read_csv("../dados/output/druid/items_clusters_train.csv.zip", sep=';', low_memory=False)

In [19]:
items_clusters_df.head()

Unnamed: 0,item_id,seq_dim_licitacao,outlier,cluster,dsc_unidade_medida,ano,description,original,areas,price,first_token
0,171782,242872,1,fosa,unidade,2014,fosa septico alvenaria tijolo ceramico macico ...,FOSA SEPTICA EM ALVENARIA DE TIJOLO CERAMICO M...,,1021.87,fosa
1,64082,189529,1,carlota_-1,unidade,2014,carlota lote5,LOTE5-CARLOTA,,25.0,carlota
2,121806,189529,1,carlota_-1,unidade,2014,carlota lote7,LOTE7-CARLOTA,,25.0,carlota
3,189177,189550,1,carlota_-1,unidade,2014,carlota lote6,LOTE6-CARLOTA,,30.0,carlota
4,208588,189529,1,carlota_-1,unidade,2014,carlota lote6,LOTE6-CARLOTA,,25.0,carlota


In [20]:
items_clusters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10560288 entries, 0 to 10560287
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   item_id             int64  
 1   seq_dim_licitacao   object 
 2   outlier             int64  
 3   cluster             object 
 4   dsc_unidade_medida  object 
 5   ano                 int64  
 6   description         object 
 7   original            object 
 8   areas               float64
 9   price               float64
 10  first_token         object 
dtypes: float64(2), int64(3), object(6)
memory usage: 886.3+ MB


# Get reference prices for the items in the test set

In [None]:
# Look up reference prices for the predicted test items from the per-cluster training
# statistics (presumably matched on cluster and unit of measure, given dsc_unidade=True — confirm).
items_test_df = get_reference_prices(results, cluster_prices_statistics, dsc_unidade=True)

In [None]:
items_test_df.head(10)

In [None]:
len(items_test_df)

In [None]:
len(items_test_df[items_test_df['cluster'] == '-2'])

In [None]:
# Persist the priced test items as a ';'-separated, zip-compressed CSV (index dropped).
items_test_df.to_csv("../dados/precificacao/fasttext_skip100/complete/baseline+embeddings/SUB+MED+unit+num_concat_umap_hdbscan_euclidean/items_clusters_test.csv.zip", sep=';', index=False, compression='zip')

# Get reference price for an arbitrary item

In [8]:
from nlp.preprocessing import PreprocessingText

In [9]:
# Text preprocessor; pricing_item() calls its preprocess_document() on the raw description.
preprocessing = PreprocessingText()

In [11]:
def get_item_vec(_item, word_embeddings, word_class, categories=None, embedding_type=None,
                 norm=True, operation='mean'):
    """Build the embedding vector of a single item.

    Args:
        _item: object exposing ``get_item_dict()``; the returned dict is handed
            to the selected embedding helper.
        word_embeddings: word-embedding lookup forwarded to the helper.
        word_class: token-tag lookup forwarded to the helper.
        categories: forwarded unchanged as the helper's ``categories`` argument.
        embedding_type: forwarded unchanged as the helper's ``embedding_type`` argument.
        norm: when true, the vector is reshaped to a single row and passed
            through ``normalize`` before being returned.
        operation: selects the helper: ``'mean'`` -> ``get_item_embedding``,
            ``'weighted'`` -> ``get_item_embedding_weighted``,
            ``'concatenate'`` -> ``get_words_plus_categories_embeddings``.

    Returns:
        The item vector; with ``norm=True`` it is whatever ``normalize`` returns
        for the 1-row input (presumably a (1, dim) array — confirm).

    Raises:
        ValueError: if ``operation`` is not one of the supported values.
    """
    # Validate first: previously an unknown operation left `item_vec` unbound
    # and surfaced later as a confusing UnboundLocalError.
    if operation == 'mean':
        embed = get_item_embedding
    elif operation == 'weighted':
        embed = get_item_embedding_weighted
    elif operation == 'concatenate':
        embed = get_words_plus_categories_embeddings
    else:
        raise ValueError(
            "unsupported operation %r; expected 'mean', 'weighted' or 'concatenate'"
            % (operation,))

    item_vec = embed(_item.get_item_dict(), word_embeddings, word_class,
                     categories=categories, embedding_type=embedding_type)

    if norm:
        # normalize() works on 2-D input, hence the single-row reshape.
        item_vec = normalize(item_vec.reshape(1, -1))

    return item_vec

In [14]:
from item.pricing.pricing import get_prices_statistics_df

def pricing_item(description, word_embeddings, word_class, preprocessing, reducer_model,
                 clustering_model, items_clusters_df, categories=[], embedding_type=['N', 'MED'],
                 operation='mean', dsc_unidade=None, year=None):
    """Compute reference price statistics for one free-text item description.

    The description is preprocessed, turned into an item embedding, reduced with
    the reducer of its group and assigned to a cluster of that group. Statistics
    are then computed over the rows of ``items_clusters_df`` that belong to that
    cluster (optionally restricted by unit of measure and/or year).

    Args:
        description: raw item description.
        word_embeddings: word-embedding lookup forwarded to ``get_item_vec``.
        word_class: token-tag lookup forwarded to ``get_item_vec``.
        preprocessing: object exposing ``preprocess_document(description)``.
        reducer_model: mapping group -> fitted reducer exposing ``transform``.
        clustering_model: mapping group -> fitted clusterer accepted by
            ``approximate_predict``.
        items_clusters_df: DataFrame with at least the columns ``cluster``,
            ``dsc_unidade_medida`` and ``ano``. It is not modified.
        categories: forwarded to ``get_item_vec``.
        embedding_type: forwarded to ``get_item_vec``.
        operation: forwarded to ``get_item_vec``.
        dsc_unidade: if not None, keep only rows with this unit of measure.
        year: if not None, keep only rows with this year.

    Returns:
        dict: the first row of ``get_prices_statistics_df`` for the selected
        rows, or ``{'cluster': <name>, 'mean': -1, 'median': -1, 'var': -1,
        'std': -1}`` when no row matches.

    NOTE(review): the list defaults of ``categories`` / ``embedding_type`` are
    shared between calls; they are only passed through here, never mutated.
    """
    doc = preprocessing.preprocess_document(description)

    item = Item()
    itemslist = ItemList()
    item.extract_entities(doc, None, None, None, None, description, None, None, itemslist.set_unit_metrics,
                          itemslist.set_colors, itemslist.set_materials, itemslist.set_sizes,
                          itemslist.set_quantities, itemslist.set_qualifiers, itemslist.set_numbers)

    item_emb = get_item_vec(item, word_embeddings, word_class, categories=categories,
                            embedding_type=embedding_type, operation=operation)

    # The first word of the processed description selects the group-specific models.
    group = item.get_item_dict()['palavras'][0]

    # Reduce the item vector, then predict its cluster inside the group.
    item_emb_red = reducer_model[group].transform(item_emb)
    prediction = approximate_predict(clustering_model[group], item_emb_red)
    # prediction[0][0] is taken as the label of the single input row
    # (presumably the (labels, strengths) pair of hdbscan — confirm).
    cluster = group + '_' + str(prediction[0][0])

    # Build the row filter incrementally instead of duplicating it per combination.
    filter_by_unit = dsc_unidade is not None
    filter_by_year = year is not None

    mask = items_clusters_df.cluster == cluster
    if filter_by_unit:
        mask = mask & (items_clusters_df.dsc_unidade_medida == dsc_unidade)
    if filter_by_year:
        mask = mask & (items_clusters_df.ano == year)
    cluster_items = items_clusters_df[mask]

    if len(cluster_items) == 0:
        # No training item matches: return the sentinel statistics.
        return {
            'cluster': cluster,
            'mean': -1,
            'median': -1,
            'var': -1,
            'std': -1
        }

    cluster_statistics = get_prices_statistics_df(cluster_items, filter_by_unit, filter_by_year)
    return cluster_statistics.iloc[0].to_dict()

In [29]:
# Example query: raw description, unit of measure and year used to restrict the price lookup.
item = "MASCARA DE NEBULIZACAO ADULTO COMPLETO"
dsc_unidade_medida = 'unidade'
year_obj = 2018

In [30]:
# Price the example item. Requires word_embeddings, reducer_model, clustering_model and
# items_clusters_df to be present in the kernel namespace from the cells above.
# NOTE(review): the token tags come from `preprocessing.word_class` here, not from the
# `word_class` computed with get_tokens_tags() — confirm both are equivalent.
pricing_item(item, word_embeddings, preprocessing.word_class, preprocessing, reducer_model,
             clustering_model, items_clusters_df, categories=['unidades_medida', 'numeros'],
             embedding_type=['N', 'MED'], operation='concatenate', dsc_unidade=dsc_unidade_medida,
             year=year_obj)

{'cluster': 'mascara_28',
 'dsc_unidade_medida': 'unidade',
 'ano': 2018,
 'mean': 102.06266363636364,
 'count': 22,
 'max': 862.0,
 'min': 0.01,
 'median': 13.34,
 'std': 248.38243824140866,
 'var': 61693.83562674719,
 'quantile_1': 8.102500000000001,
 'quantile_3': 45.55}

In [None]:
# Inspect the training items assigned to cluster 'dipirona_20'.
# NOTE(review): relies on items_df having a `cluster` column; no cell visible here adds it — confirm.
itemlist_train.items_df[itemlist_train.items_df.cluster == 'dipirona_20']