In [None]:
import ast
import collections
import copy
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)
from nlp.text_statistics import (
    count_tokens,
    unique_tokens
)
from utils.read_files import (
    get_items)
from item.item_list import (
    ItemList,
    Item
)
from item.utils import get_tokens_set
from item.clustering.utils import *

# Load price statistics

In [None]:
prices = pd.read_csv('../data/output/druid_fasttext/cluster_prices_statistics.csv.zip', sep=';')

In [None]:
prices.info()

In [None]:
prices[prices.grupo == 'canula_0'].head(200)

In [None]:
pd.options.display.float_format = '{:,.2f}'.format
prices[prices.grupo == 'mascara_28'].set_index(['grupo']).drop(['primeiro_termo'], axis=1)

In [None]:
len(set(prices[prices.primeiro_termo == 'abobora']['grupo']))

In [None]:
len(set(prices["cluster"]))

# Load train set

In [None]:
train_prices = pd.read_csv('../data/output/druid_fasttext/items_clusters_train_wo_out.csv.zip', sep=';', low_memory=False)

In [None]:
train_prices.info()

In [None]:
len(train_prices[train_prices.grupo == 'gasolina_1'])

In [None]:
len(train_prices[(train_prices.grupo == 'canula_1')])

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# train_prices[(train_prices.grupo == 'mascara_28') & (train_prices.dsc_unidade_medida == 'kit')]
train_prices[(train_prices.primeiro_termo == 'gasolina') & (train_prices.grupo_ruido == 0)].head(1000)

In [None]:
train_prices['divided'] = (train_prices['cluster'] != train_prices['first_token'])

In [None]:
len(set(train_prices[(train_prices.first_token == "papel") & (train_prices.outlier == 0)]['cluster']))

In [None]:
train_prices.head()

## Groups

In [None]:
groups_train = train_prices[['cluster', 'first_token', 'divided', 'outlier']]

In [None]:
groups_train.head()

In [None]:
groups_train = groups_train.drop_duplicates()

In [None]:
groups_train.info()

In [None]:
groups_train.head()

In [None]:
groups_train['count'] = groups_train['first_token'].map(groups_train['first_token'].value_counts())

In [None]:
groups_train['label'] = "Subdividido"

In [None]:
groups_train.loc[groups_train['outlier'] == 1, 'label'] = 'Ruído'

In [None]:
groups_train.loc[groups_train['divided'] == False, 'label'] = 'Não Subdividido'

In [None]:
groups_train.loc[(groups_train['divided'] == True) & (groups_train['count'] == 1), 'label'] = 'Apenas Ruído'

In [None]:
groups_train.head()

In [None]:
# Bar chart with the number of rows of `groups_train` per label; every bar is
# annotated with its absolute count and its share of all rows.
sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

sns.countplot(
    x="label",
    data=groups_train,
    color='dodgerblue',
    order=['Subdividido', 'Não Subdividido', 'Ruído', 'Apenas Ruído'],
    ax=axis1,  # draw on the axes created above rather than the implicit current axes
)

axis1.set_xlabel("Grupo", fontsize=20, weight='bold')
axis1.set_ylabel("Nº de grupos", fontsize=20, weight='bold')
axis1.grid(False)

total = len(groups_train)
for p in axis1.patches:
    height = p.get_height()
    # A label listed in `order` but absent from the data can yield a bar with
    # NaN height; '%d' % nan raises ValueError, so skip such bars.
    if np.isnan(height):
        continue
    # Guard the percentage against an empty frame (total == 0).
    share = 100 * float(height) / total if total else 0.0
    axis1.text(p.get_x() + 0.07, height + 1, '(%d)' % (height), fontsize=15)
    axis1.text(p.get_x() + 0.45, height + 1, '%.2f%%' % (share), fontsize=15)

axis1.tick_params(axis='both', labelsize=16)

plt.show()
# Release the figure explicitly; plt.clf() after show() would only create and
# clear a fresh empty figure.
plt.close(fig)

In [None]:
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)

In [None]:
subgroups = list(groups_train['first_token'].value_counts())
plot_histogram(subgroups, 80, 'Nº de grupos', 'Nº de subgrupos (log)', log=True)

In [None]:
groups_train[['first_token', 'count']].drop_duplicates().sort_values(['count'], ascending=False).head()

In [None]:
len(groups_train[['first_token', 'label']][groups_train.label == "Apenas Ruído"].drop_duplicates())

In [None]:
len(groups_train[['first_token', 'label']].drop_duplicates())

## Get group descriptions

In [None]:
results_train, outliers_train, prices_train = load_clustering_results_pickle('../data/output/druid/')

In [None]:
itemlist_train = ItemList()
itemlist_train.load_items_from_file('../data/output/druid_fasttext/f03_items.csv.zip')

In [None]:
items_train = itemlist_train.items_df.copy()

In [None]:
items_train.head()

In [None]:
items_train_clusters = pd.merge(left=train_prices[['item_id', 'grupo', 'primeiro_termo', 'grupo_ruido']],
                                right=items_train[['item_id', 'original', 'palavras', 'unidades_medida', 'numeros', 'cores', 'materiais', 'tamanho', 'quantidade', 'preco']],
                                left_on='item_id', right_on='item_id')

In [None]:
items_train_clusters[(items_train_clusters.primeiro_termo == 'oleo') & (items_train_clusters.numeros != '[]') & (items_train_clusters.grupo_ruido == 0)].head(1000)

In [None]:
results_train['canula_1']

In [None]:
# Build a plain-text description per item: each `original_prep` cell is parsed
# as a Python literal and its elements are joined with single spaces.
# ast.literal_eval accepts literals only, so a malformed or hostile cell raises
# an error instead of executing code, which eval() would have allowed.
itemlist_train.items_df['description'] = [
    ' '.join(ast.literal_eval(description))
    for description in itemlist_train.items_df['original_prep']
]

In [None]:
itemlist_train.items_df.head()

In [None]:
len(results_train['canula_36'])

In [None]:
group_name = 'mascara_17'
items = itemlist_train.items_df.iloc[results_train[group_name], :]

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
items

# Load test set

In [None]:
itemlist = ItemList()
itemlist.load_items_from_file('../dados/items_preprocessed_v3_complete_test.csv.zip')

In [None]:
items_test = itemlist.items_df.copy()

In [None]:
test_prices = pd.read_csv('../dados/precificacao/fasttext_skip100/complete/baseline+embeddings/SUB+MED+unit+num_concat_umap_hdbscan_euclidean/items_clusters_test.csv.zip', sep=';')

In [None]:
test_prices.info()

In [None]:
items_test.info()

In [None]:
len(test_prices)

In [None]:
test_prices.head()

In [None]:
100*(float(len(test_prices[test_prices.cluster == '-2']))/len(test_prices))

In [None]:
100*(float(len(test_prices[(test_prices.outlier == 1) & (test_prices.cluster != '-2')]))/len(test_prices))

In [None]:
100*(float(len(test_prices[(test_prices.outlier == 0) & (test_prices['count'] == -1.00)]))/len(test_prices))

In [None]:
100*(float(len(test_prices[(test_prices['count'] != -1.00) & (test_prices['count'] < 20)]))/len(test_prices))

In [None]:
100*(float(len(test_prices_wonan[(test_prices_wonan.alert == True) & (test_prices_wonan['count'] >= 20)]))/len(test_prices))

In [None]:
items_test['id'] = range(len(items_test))
items_test = items_test[items_test.palavras != '[]']

In [None]:
test_prices = pd.merge(left=test_prices, right=items_test[['id', 'original']], left_on='item_id', right_on='id')

In [None]:
test_prices.head()

In [None]:
test_prices_wonan = test_prices.dropna(axis=0, subset=['std'])

In [None]:
test_prices_wonan['alert'] = np.where((test_prices_wonan['price'] > test_prices_wonan['mean'] + 2*test_prices_wonan['std']) | (test_prices_wonan['price'] < test_prices_wonan['mean'] - 2*test_prices_wonan['std']), True, False)

In [None]:
test_prices[test_prices.cluster == 'gasolina_4'].head(200)

In [None]:
pd.set_option("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None)
test_prices[test_prices.cluster == 'dea_0'].set_index(['cluster']).drop(['item_id', 'id', 'seq_dim_licitacao', 'outlier', 'cluster_prob', 'areas', 'mean', 'count', 'max', 'min', 'median', 'std', 'var', 'quantile_1', 'quantile_3'], axis=1)

In [None]:
pd.set_option("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None)
test_prices_wonan[(test_prices_wonan.cluster == 'pneu_1') & (test_prices_wonan.alert == True)].set_index(['cluster']).drop(['id', 'seq_dim_licitacao', 'outlier', 'cluster_prob', 'areas', 'count', 'max', 'min', 'median', 'var', 'quantile_1', 'quantile_3'], axis=1)

In [None]:
len(test_prices[test_prices.cluster == 'pneu_1'])

In [None]:
len(test_prices_wonan[(test_prices_wonan.cluster == 'pneu_1') & (test_prices_wonan.alert == True)])

In [None]:
pd.set_option("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None)
test_prices_wonan[(test_prices_wonan.cluster == 'pneu_1') & (test_prices_wonan.alert == False)].set_index(['cluster']).drop(['id', 'seq_dim_licitacao', 'outlier', 'cluster_prob', 'areas', 'count', 'max', 'min', 'median', 'var', 'quantile_1', 'quantile_3'], axis=1)