In [1]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import multiprocessing
import collections
from item.item_list import (
    ItemList,
    Item
)
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)
from item.clustering.evaluate import (
    get_score_pickle,
    evaluate_results_pickle,
    evaluate_results,
    number_of_outliers_dict,
    get_score_baseline_pickle
)
from item.clustering.utils import (
    load_clustering_results_pickle,
    load_clustering_results
)
from item.clustering.item_representation import (
    load_items_embeddings
)

# Evaluate results

In [None]:
# Load the preprocessed item descriptions from the zipped CSV into an ItemList.
# NOTE(review): this cell has no execution count although later cells do;
# `itemlist` is required by the "Examples" section, so run it before those cells.
itemlist = ItemList()
itemlist.load_items_from_file('items_preprocessed_complete_druid.csv.zip')

In [None]:
# Number of rows in the loaded items table.
len(itemlist.items_df)

In [None]:
# Groups as returned by ItemList.get_first_token_groups()
# (presumably keyed by the first token of each description -- confirm).
# NOTE(review): `groups` is rebound to a different dict in the "Group sizes" section.
groups = itemlist.get_first_token_groups()

In [None]:
# Number of first-token groups.
len(groups)

## Load results and embeddings

In [2]:
# Load the pickled clustering output from the Druid output directory.
# Later cells use `results` and `outliers` as mappings indexed by group name;
# `prices` is not referenced again in the cells of this notebook.
results, outliers, prices = load_clustering_results_pickle('../dados/output/druid/')

In [3]:
# Load the item embeddings that the metric cells pass to the scoring helpers.
embeddings = load_items_embeddings('../dados/output/druid/embeddings.json')

In [4]:
# Number of groups (entries) in the clustering results.
len(results)

78566

## Number of outliers

In [5]:
# Outlier statistics. Judging by the variable names these are: the number of
# outlier items, the number of groups with outliers, and the total item count
# -- TODO confirm against number_of_outliers_dict.
# NOTE(review): `total` is reassigned by several later cells.
outliers_items, outliers_groups, total = number_of_outliers_dict(results, outliers, baseline=True, total_cov=True)

In [6]:
# Second value returned by number_of_outliers_dict
# (presumably the number of groups that contain outliers -- confirm).
outliers_groups

38934

In [7]:
# Third value returned by number_of_outliers_dict; used as the denominator
# of the outlier percentage below.
total

11099038

In [8]:
# First value returned by number_of_outliers_dict
# (presumably the number of items flagged as outliers -- confirm).
outliers_items

2168110

In [9]:
# `outliers_items` expressed as a percentage of `total`.
100*(outliers_items/total)

19.534215487864802

## Metrics

In [None]:
# Clustering score over the results, selected with score='calinski'
# (presumably the Calinski-Harabasz index -- confirm in get_score_pickle).
# Disabled alternative: silhouette score with the cosine metric.
# get_score_pickle(results, embeddings, score='silhouette', metric='cosine', baseline=True, norm=True)
get_score_pickle(results, embeddings, score='calinski', baseline=True, norm=True)

In [17]:
# Baseline scores with score='calinski', no sampling (sample_size=None) and
# norm=False; the next cell averages the returned values.
# Disabled alternative: silhouette score with the cosine metric.
# scores = get_score_baseline_pickle(results, embeddings, score='silhouette', metric='cosine', sample_size=None, norm=False)
scores = get_score_baseline_pickle(results, embeddings, score='calinski', sample_size=None, norm=False)

In [18]:
# Mean of the baseline scores.
np.mean(scores)

137309.03471932435

In [None]:
# Per-group evaluation of the clustering; the next cell reads the 'mean' entry
# of every group's value.
# NOTE(review): n_threads=32 is hard-coded -- adjust to the machine running this.
intracluster_distance = evaluate_results_pickle(results, embeddings, n_threads=32)

In [None]:
# Collect the 'mean' entry of every group's evaluation, in dict order.
distances = [stats['mean'] for stats in intracluster_distance.values()]

In [None]:
# Average of the per-group 'mean' values collected above.
np.mean(distances)

# Group sizes

In [None]:
# Number of result groups per first token (the text before the first '_').
num_subgroups = collections.defaultdict(int)
for cluster_name in results:
    num_subgroups[cluster_name.partition('_')[0]] += 1

# Every group's clustered items followed by that group's outliers.
groups = {
    cluster_name: members + outliers[cluster_name]
    for cluster_name, members in results.items()
}

# Group sizes count the clustered items only; outliers are not included.
groups_sizes_list = [len(members) for members in results.values()]

In [None]:
# Number of groups that contain exactly one item.
groups_sizes_list.count(1)

## Top-10 groups

In [None]:
# Names of the groups that hold exactly one item, in `results` order.
group_sample = [
    cluster_name
    for cluster_name, members in results.items()
    if len(members) == 1
]

group_sample[:10]

In [None]:
# When the notebook is run top to bottom, `groups` here is the dict built in
# the "Group sizes" cell (clustered items plus outliers per group), not the
# first-token groups loaded earlier.
# Presumably returns the groups ordered by size, per the helper's name -- confirm.
groups_names_size = groups_frequency_sort(groups)

In [None]:
# First ten entries of the sorted groups.
groups_names_size[:10]

## ECDF

In [None]:
# One entry per item, holding the size of the group the item belongs to
# (a group of size k contributes k copies of k).
items_group_size = [
    group_size
    for group_size in groups_sizes_list
    for _ in range(group_size)
]

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

# Empirical CDF of the per-item group sizes.
ecdf = ECDF(items_group_size)

x_label = 'Tamanho de grupo'
y_label = 'Nº de itens (%)'

fig, (axis1) = plt.subplots(figsize=(10,8))
axis1.plot(ecdf.x, ecdf.y)

# Ticks every 0.1 on the y axis; 30 plus multiples of 250 on the x axis.
axis1.set_yticks([tick / 10 for tick in range(11)])
axis1.set_xticks([30] + list(range(250, 2001, 250)))
axis1.set_xlim(0, 2000)

# Dashed red marker at group size 30.
axis1.axvline(x=30, linestyle='--', color='r')

axis1.set_xlabel(x_label, fontsize=20, weight='bold')
axis1.set_ylabel(y_label, fontsize=20, weight='bold')

axis1.grid(axis='both', linestyle=':', linewidth=1.0)

plt.yticks(fontsize=14)
plt.xticks(fontsize=14)

plt.show()
plt.clf()

In [None]:
# Percentage of items that belong to groups with at most 30 items.
# An ECDF object is a callable step function: ecdf(30) is the fraction of
# observations <= 30. The previous manual scan printed ecdf.y at the first
# x strictly greater than 30, which already counts that larger observation
# (off by one sample), and printed nothing when no value exceeded 30.
print(100 * ecdf(30))

In [None]:
# Size (number of clustered items) of every group, in `results` order.
groups_size = [len(members) for members in results.values()]

In [None]:
# Total number of items that live in groups of at most 30 items.
count = sum(n_items for n_items in groups_size if n_items <= 30)

count

In [None]:
# Summary statistics of the group sizes (output format defined by print_statistics).
print_statistics(groups_sizes_list)

## Distribution of group sizes

In [None]:
# Histogram of group sizes with log=True; 500 is presumably the number of
# bins -- confirm in plot_histogram.
plot_histogram(groups_sizes_list, 500, 'Nº de objetos', 'Nº de grupos', log=True)

In [None]:
# Zero-initialised counters, one per group-size interval, in display order.
count_interval = dict.fromkeys(
    [
        '1',
        '(1,5]',
        '(5,10]',
        '(10,100]',
        '(100,1000]',
        '(1000,5000]',
        '(5000,10000]',
        '>10000',
    ],
    0,
)

In [None]:
# (lower, upper, label): a size with lower < size <= upper is counted under label.
interval_bounds = [
    (1, 5, '(1,5]'),
    (5, 10, '(5,10]'),
    (10, 100, '(10,100]'),
    (100, 1000, '(100,1000]'),
    (1000, 5000, '(1000,5000]'),
    (5000, 10000, '(5000,10000]'),
]

# NOTE: this cell adds to the existing counters, so re-run the initialisation
# cell first if it has to be executed again.
for group_size in groups_sizes_list:
    if group_size == 1:
        bucket = '1'
    else:
        # A size that matches no bounded interval (this includes sizes
        # below 1) is counted under '>10000'.
        bucket = next(
            (
                label
                for lower, upper, label in interval_bounds
                if lower < group_size <= upper
            ),
            '>10000',
        )
    count_interval[bucket] += 1

In [None]:
# Print every interval counter and expand it into one single-column row per
# counted group, so the rows can be turned into a DataFrame.
total = 0
lines = []
for interval, value in count_interval.items():
    print(interval, ':', value)
    lines.extend([interval] for _ in range(value))
    total += value

# Number of rows produced; not read by any other cell in this notebook.
aux = len(lines)

In [None]:
# One row per group; the single "size" column holds the group's interval label.
intervals_df = pd.DataFrame(lines, columns=["size"])
intervals_df.info()

In [None]:
import seaborn as sns
sns.set_style("white")

fig, (axis1) = plt.subplots(figsize=(12,8))

# Horizontal bars: one bar per size interval, bar length = number of groups.
sns.countplot(y="size", data=intervals_df, color='dodgerblue', ax=axis1)

axis1.set_xlabel("Nº de grupos", fontsize=20, weight='bold')
axis1.set_ylabel("Nº de objetos", fontsize=20, weight='bold')
axis1.grid(False)

# Annotate each bar with its absolute count and its share of all groups.
total = len(intervals_df)
for bar in axis1.patches:
    bar_length = bar.get_width()
    bar_top = bar.get_y()
    axis1.text(bar_length, bar_top + 0.7, f'({int(bar_length)})', fontsize=15)
    axis1.text(bar_length, bar_top + 0.4, f'{100 * float(bar_length) / total:.2f}%', fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

## Number of subgroups

In [None]:
# Number of result groups per first token, in insertion order.
subgroups = list(num_subgroups.values())
plot_histogram(subgroups, 80, 'Nº de grupos', 'Nº de subgrupos', log=True)

## Examples

In [None]:
# Print the name of every group that holds exactly `size` items.
size = 100

same_size_groups = [
    cluster_name
    for cluster_name, members in results.items()
    if len(members) == size
]
for cluster_name in same_size_groups:
    print(cluster_name)

In [None]:
# Keep only the groups whose name starts with the token 'sabao'
# (the text before the first '_').
groups_sample = {
    cluster_name: members
    for cluster_name, members in results.items()
    if cluster_name.split('_')[0] == 'sabao'
}

In [None]:
# Number of groups whose first token is 'sabao'.
len(groups_sample)

In [None]:
# For every selected group, print its name, the number of descriptions, the
# number of distinct descriptions, and up to `sample_size` distinct
# descriptions. `total` accumulates the number of items over all groups.
total = 0
sample_size = 5

for group, items in groups_sample.items():
    total += len(items)
    # NOTE(review): eval() executes whatever expression is stored in the
    # 'original_prep' column. If the CSV is not fully trusted, parse the value
    # with ast.literal_eval instead (assuming the column holds a plain list
    # literal -- confirm).
    descs = [
        ' '.join(eval(itemlist.items_df.iloc[id_]['original_prep']))
        for id_ in items
    ]
    print('*********************')
    print(group)
    print(len(descs))
    descs = list(set(descs))
    print(len(descs))
    if len(descs) <= sample_size:
        sample = descs
    else:
        # Unseeded draw: the printed sample changes between runs.
        sample = random.sample(descs, sample_size)
    # Print the sample. The loop used to iterate `descs`, which ignored the
    # sample and printed every distinct description.
    for d in sample:
        print(d)

In [None]:
# Total number of items across the 'sabao' groups (accumulated in the previous cell).
total

In [None]:
# Number of result groups whose first token is 'pneu'. `num_subgroups` is a
# defaultdict(int), so a missing token yields 0 and inserts the key.
num_subgroups['pneu']

In [None]:
# Look up the items of group 'pneu_0' through the ItemList helper
# (raises KeyError if that group is absent from `results`).
itemlist.get_group_items(results['pneu_0'])