In [42]:
import ast
import pandas as pd
import json
import numpy as np
from collections import defaultdict
from item.item_list import (ItemList, Item)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from nlp.preprocessing import (get_canonical_words, get_stopwords, preprocess_document, spellcheck_document)
from IPython.display import display

### Files reading

In [2]:
jfile =  open('../dados/palavras/right_words_nilc.json', "r")
right_word = json.load(jfile)
jfile.close()

In [3]:
jfile =  open('../dados/palavras/right_words_nilc.json', "r")
right_word = json.load(jfile)
jfile.close()

itemList = ItemList()
itemList.load_items_from_file('items_preprocessed_complete_druid.csv.zip')
itemsDf = itemList.items_df

### Useful data

In [4]:
categories = ['palavras', 'unidades_medida', 'numeros', 'cores', 'materiais', 'tamanho', 'quantidade']
canonicalForm = get_canonical_words()
stopwords_ = get_stopwords()
relevantStopwords = {'para', 'com', 'nao', 'mais', 'muito', 'so', 'sem', \
                          'mesmo', 'mesma', 'ha', 'haja', 'hajam', 'houver', \
                          'houvera', 'seja', 'sejam', 'fosse', 'fossem', 'forem', \
                          'sera', 'serao', 'seria', 'seriam', 'tem', 'tinha', \
                          'teve', 'tinham', 'tenha', 'tiver', 'tiverem', 'tera', \
                          'terao', 'teria', 'teriam', 'uma', 'mais', 'entre', \
                          'te'}
stopwords_ = stopwords_ - relevantStopwords

### Analysis Methods

In [5]:
def word_cloud2(frequencies, title, prefer_horizontal=0.90):
    wordcloud = WordCloud(prefer_horizontal=prefer_horizontal, collocations=False, width=1600, height=800).generate_from_frequencies(frequencies)
#     fig, ax = plt.subplots(figsize=(16,8))            
#     ax.imshow(wordcloud)
#     ax.set_axis_off()
#     plt.imshow(wordcloud)
#     plt.title(title)
    return wordcloud

In [6]:
def word_cloud(frequencies, title, prefer_horizontal=0.90):
    wordcloud = WordCloud(prefer_horizontal=prefer_horizontal, collocations=False, width=1600, height=800).generate_from_frequencies(frequencies)
    fig, ax = plt.subplots(figsize=(16,8))            
    ax.imshow(wordcloud)
    ax.set_axis_off()
    plt.imshow(wordcloud)
    plt.title(title)
    return wordcloud

In [7]:
def get_items_without_first_token(itemsDf):
    # Filtering items without first token
    noWordsTokensFreq = defaultdict(int)
    tokensDescriptions = defaultdict(list)
    firstTokenFreq = defaultdict(int)
    secondTokenFreq = defaultdict(int)
    thirdTokenFreq = defaultdict(int)
    noWords = itemsDf.loc[itemsDf.palavras == '[]']
    noWordsFreq = round((len(noWords)/itemsDf.size)*100, 2)
    
    print(str(f'Quantidade de items sem palavras: {len(noWords)} ({noWordsFreq}%)'))
    
    for item in noWords['original_prep']:
        tokens = ast.literal_eval(item)
        if len(tokens) >= 3:
            firstTokenFreq[tokens[0]] += 1
            secondTokenFreq[tokens[1]] += 1
            thirdTokenFreq[tokens[2]] += 1
        elif len(tokens) >= 2:
            firstTokenFreq[tokens[0]] += 1
            secondTokenFreq[tokens[1]] += 1
        elif len(tokens) == 1:
            firstTokenFreq[tokens[0]] += 1
        else:
            print('*****', tokens)
        for token in tokens:
            tokensDescriptions[token].append(' '.join(tokens))
            noWordsTokensFreq[token] += 1
    
    noWordsTokensFreq = dict(sorted(noWordsTokensFreq.items(), key=lambda x: x[1], reverse=True))
    firstTokenFreq = dict(sorted(firstTokenFreq.items(), key=lambda x: x[1], reverse=True))
    secondTokenFreq = dict(sorted(secondTokenFreq.items(), key=lambda x: x[1], reverse=True))
    thirdTokenFreq = dict(sorted(thirdTokenFreq.items(), key=lambda x: x[1], reverse=True))
    
    return noWords, noWordsTokensFreq, tokensDescriptions, firstTokenFreq, secondTokenFreq, thirdTokenFreq

In [8]:
def proprocess_dataframe(itemsDf):
    removedTokens = defaultdict(int)
    changedTokens = defaultdict(int)
    corrections = defaultdict(str)
    correctionPairs = defaultdict(int)
    for index,row in itemsDf.iterrows():
        original = row['original'].lower().strip()
        preprocessed = preprocess_document(row['original'], remove_numbers=False, stopwords=stopwords_).strip()
        if(original != preprocessed):
            print('** ORIGINAL DESCRIPTION:', original)
            print('** PREPROCESSED DESCRIPTION:', preprocessed)
            removed = list(set(original).difference(set(preprocessed)))
            for token in removed:
                removedTokens[token] += 1
        corrected = ' '.join(spellcheck_document(preprocessed.split(' '), right_word)).strip()
        if(preprocessed != corrected):
            print('** CORRECTED DESCRIPTION:', preprocessed)
            preprocessed = preprocessed.split(' ')
            corrected = corrected.split(' ')
            for i in range(len(preprocessed)):
                if preprocessed[i] != corrected[i]:
                    pair = preprocessed[i] + '(' + corrected[i] + ')'
                    corrections[preprocessed[i]] = corrected[i]
                    correctionPairs[pair] += 1
                    changedTokens[preprocessed[i]] += 1
        print('-'*40)
    return [removedTokens, changedTokens, corrections, correctionPairs]

In [9]:
def get_category_tokens_frequency(df, category):
    category_tokens = defaultdict(int)
    for label, row in df.iterrows():
        for token in ast.literal_eval(row[category]):
            category_tokens[token] += 1
    return dict(sorted(category_tokens.items(), key=lambda x: x[1], reverse=True))

In [10]:
def get_desc_category_frequency(df, category):
    desc_category_freq = defaultdict(int)
    for label, row in df.iterrows():
        tokens = ast.literal_eval(row[category])
        desc_category_freq[len(tokens)] += 1
    return dict(sorted(desc_category_freq.items(), key=lambda x: x[1], reverse=True))

In [11]:
def get_description_lenght_frequency(df):
    original = defaultdict(int)
    prep = defaultdict(int)
    for label, row in df.iterrows():
        original_lenght = len(row['original'].split(' '))
        prep_lenght = len(ast.literal_eval(row['original_prep']))
        original[original_lenght] += 1
        prep[prep_lenght] += 1
    return [original, prep]

In [12]:
def plot_hist_from_dict(data, title, text, customInterval=False):
    interval = np.arange(0, 10, 1)
    labels, values = zip(*data.items())
    plt.rcParams["figure.figsize"] = (16,8)
    plt.title(title)
    plt.figtext(0.7,0.5,text)
    if(max(data.keys()) > 10 and customInterval):
        interval = np.arange(0, max(data.keys()), 5)
        plt.xticks(interval)
    plt.xlim(0, max(data.keys()))
    plt.bar(labels, values)
    plt.show()

### Analysis

In [13]:
itemsDf.head()

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano
0,['medroxiprogesterona'],"['mg', 'ml']","['150', '1']",[],[],[],[],25.635,ampola,MEDROXIPROGESTERONA 150 MG/ML 1 ML,297107,"['medroxiprogesterona', '150', 'mg', 'ml', '1']",,2014
1,"['amiodarona', 'com']",['mg'],['200'],[],[],[],"['cx', 'comprimido']",74.0,cx,AMIODARONA 200 MG CX COM 200 COMPRIMIDOS,297107,"['amiodarona', '200', 'mg', 'cx', 'com', 'comp...",,2014
2,"['aerolin', 'spray']",[],[],[],[],[],[],22.7225,frasco,AEROLIN SPRAY,297107,"['aerolin', 'spray']",,2014
3,"['microhbrida', 'a30', 'reposicao']",[],[],[],['resina'],[],[],69.8,unid,RESINA MICROHBRIDA COR A30 REPOSICAO,297110,"['resina', 'microhbrida', 'cor', 'a30', 'repos...",,2014
4,"['broca', 'diamantado', 'cilindrico', 'plano']",[],['10'],[],[],[],[],3.3333,und,10 BROCA DIAMANTADA CILINDRICA PLANA:,297110,"['broca', 'diamantado', 'cilindrico', 'plano',...",,2014


In [14]:
noWordsDf, noWordsTokensFreq, tokensDescriptions, firstTokenFreq, secondTokenFreq, thirdTokenFreq = get_items_without_first_token(itemsDf)

Quantidade de items sem palavras: 32163 (0.02%)


In [17]:
noWordsDf.head()

Unnamed: 0,palavras,unidades_medida,numeros,cores,materiais,tamanho,quantidade,preco,dsc_unidade_medida,original,licitacao,original_prep,funcao,ano
299,[],['cm'],"['8', '16', '23']",[],['espuma'],[],['bloco'],6.2125,unidade,ESPUMA EM BLOCO 8 X 16X23CM,35800,"['espuma', 'bloco', '8', 'x', '16', '23', 'cm']",,2014
319,[],['mm'],"['8', '0', '5', '16', '635']",[],['aco'],[],[],19.5,unidade,00000635 ACO 8.0MM (5/16),347067,"['aco', '8', '0', 'mm', '5', '16', '635']",,2014
913,[],['mm'],"['42', '60', '2']",['amarelo'],[],[],[],0.9,unidade,E.V.A. AMARELO-42X60X2MM,38375,"['v', 'amarelo', '42', 'x', '60', '2', 'mm']",,2014
1113,[],['mt'],"['4', '100']",['preto'],['lona'],[],[],0.0,m,LONA PRETA 4X100 MT,378463,"['lona', 'preto', '4', 'x', '100', 'mt']",,2014
1330,[],[],"['7', '8']",[],['ferro'],[],[],18.775,kg,FERRO T 7/8,40069,"['ferro', 't', '7', '8']",,2014


In [20]:
noWordsDf[noWordsDf['original_prep'].str.contains("x")][['palavras']]

pandas.core.frame.DataFrame

In [87]:
tokens_category_df = pd.DataFrame(columns=["unidades_medida","numeros","cores","materiais", "tamanho","quantidade", "original_prep"])

In [91]:
tokens_category_df.to_csv('../dados/categoria_tokens_sem_palavras.csv')

In [89]:
for token in tokens_desc.keys():
    print('token', token)
    for desc in tokens_desc[token].keys():
        tokenDf = noWordsDf[noWordsDf['original_prep'] == ( str(desc.split(' ')))][["unidades_medida","numeros","cores","materiais", "tamanho","quantidade", "original_prep"]]
#         display(tokenDf)
        tokens_category_df = pd.concat([tokens_category_df,tokenDf[:1]])

token x
token cm
token rolo
token 1
token espuma
token lona
token 4
token 5
token preto
token aco


In [None]:
word_cloud(firstTokenFreq, 'Primeiro token das descrições sem token')

In [None]:
word_cloud(secondTokenFreq, 'Segundo token das descrições sem token')

In [None]:
word_cloud(thirdTokenFreq, 'Terceiro token das descrições sem token')

In [22]:
tokens_desc = defaultdict(lambda: defaultdict(int))

for token in list(noWordsTokensFreq.keys())[:10]:
#     print(f'#### {token} ####')
    for desc in tokensDescriptions[token]:
        tokens_desc[token][desc] +=1
#         print(f' - {desc}')
    tokens_desc[token] = dict(sorted(tokens_desc[token].items(), key=lambda x: x[1], reverse=True)[:10])

In [None]:
for token in tokens_desc:
    print(f'# 10 descrições mais frequentes para o token {token}')
    for desc in tokens_desc[token]:
        print(f'- {desc}')

In [None]:
removedTokens, changedTokens, corrections, correctionPairs = proprocess_dataframe(noWordsDf)

In [None]:
for i in range(len(categories)):
    freq = []
    freq = get_category_tokens_frequency(noWordsDf, categories[i])
    if len(freq) > 0:
        word_cloud(freq, str(f'Tokens de descrições sem tokens em {categories[i].upper()}'))
    else:
        print(f'Categoria {categories[i].upper()} não possui tokens')

In [None]:
for i in range(len(categories)):
    freq = []
    freq = get_desc_category_frequency(noWordsDf, categories[i])
    title = str(f'Distribuição de frequência dos tokens em {categories[i].upper()}')
    text = str(f'Média: {str(round(sum([k*v for k,v in freq.items()])/sum(v for v in freq.values()), 2))}')
    if len(freq) > 0:
        plot_hist_from_dict(freq, title, text, customInterval=True)
    else:
        print(f'Categoria {categories[i].upper()} não possui tokens')

In [None]:
original, prep = get_description_lenght_frequency(noWordsDf)

In [None]:
title = str('Quantidade de tokens do texto original')
text = str(f'Média: {str(round(sum([k*v for k,v in original.items()])/sum(v for v in original.values()), 2))}')
plot_hist_from_dict(original, title, text)

In [None]:
title = str('Quantidade de tokens do texto original')
text = str(f'Média: {str(round(sum([k*v for k,v in prep.items()])/sum(v for v in prep.values()), 2))}')
plot_hist_from_dict(prep, title, text)

### Anslysis Result

In [None]:
word_cloud(noWordsTokensFreq, 'Tokens de descrições sem primeiro token')
## Tokens de descrições sem primeiro token excluindo tokens classificados

In [None]:
word_cloud(removedTokens, 'Tokens removidos no pré-processamento', prefer_horizontal=1)

In [None]:
word_cloud(correctionPairs, 'Correções mais frequentes')

In [None]:
word_cloud(changedTokens, 'Palavras mais corrigidas')

### Old

In [None]:
fig = plt.figure()
for i in range(len(categories[:2])):
    freq = []
    freq = get_category_tokens_frequency(itemsDf, categories[i])
    ax = fig.add_subplot(1,2,i+1)
    wordcloud = word_cloud2(freq,  categories[i])
    ax.imshow(wordcloud)
    ax.axis('off')

In [None]:
noWordsFrequentTokens = dict(sorted(noWordsTokensFreq.items(), key=lambda x: x[1], reverse=True)[:50])
for index, row in noWords[['original', 'original_prep']].iterrows():
    tokens = ast.literal_eval(row['original_prep'])
    for token in tokens:
        if token in list(noWordsFrequentTokens.keys()):
            print('original:', row['original'])  