In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
import json
import multiprocessing
import json
from item.item_list import (
    ItemList,
    Item
)
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort,
    read_json_file,
    get_tokens_set
)
from nlp.grouping import (
    get_groups,
    get_groups_size,
    get_unigram_groups,
    get_two_tokens_groups,
    get_first_token_groups,
    get_bigram_groups,
    get_first_two_groups,
    groups_frequency_sort
)
from nlp.pos_tagging import (
    get_tokens_tags
)
from nlp.word_embeddings import (
    load_word_embeddings,
    get_item_embedding,
    get_items_embeddings,
    get_items_similarities
)
from nlp.spellcheckeropt import SpellcheckerOpt

# Loading word embeddings

In [None]:
# Path to the word embeddings file; each line of the file contains one embedding.
word_embeddings_file = '../../../embeddings/cbow_s50.txt'

In [None]:
# Read the word embeddings from the file and keep them in a map.
# Later cells test `token in word_embeddings` and call `.keys()` on it.
word_embeddings = load_word_embeddings(word_embeddings_file)

In [None]:
# Load the preprocessed item descriptions into an ItemList (just_words=True).
itemlist = ItemList()
itemlist.load_items_from_file('../dados/items_preprocessed.zip', just_words=True)

In [None]:
# Get the tags of the description tokens. Later cells use `word_class` as a
# token -> tag lookup (`word_class[tok]`) and compare the tags with 'N' and 'MED'.
word_class = get_tokens_tags()

In [None]:
# Count the items that have at least one token which is tagged 'N' or 'MED'
# and is also present in the word embeddings. Items with no such token are
# printed and counted separately in `empty`.
count = 0
empty = 0

for doc in itemlist.items_list:
    has_embedding = any(
        tok in word_class
        and word_class[tok] in {'N', 'MED'}
        and tok in word_embeddings
        for tok in doc
    )

    if not has_embedding:
        print(doc)
        empty += 1
    else:
        count += 1

print(count)
print(empty)

In [None]:
# Total number of loaded items.
len(itemlist.items_list)

In [None]:
# Build the vector representation of every item from the word embeddings.
# NOTE(review): embedding_type=['N', 'MED'] presumably restricts the tokens used
# to those tagged 'N' or 'MED' — confirm in nlp.word_embeddings.
items_embeddings = get_items_embeddings(itemlist.items_list, word_embeddings, word_class, embedding_type=['N', 'MED'])

In [None]:
# Number of item embeddings built (presumably one per item — TODO confirm).
len(items_embeddings)

In [None]:
# Collect the indices of the items whose embedding is all zeros, i.e. every
# component of the vector equals 0.
items_woembedding = {
    idx
    for idx in range(len(items_embeddings))
    if (np.asarray(items_embeddings[idx]) == 0).all()
}

# Number of items with an all-zero embedding.
count = len(items_woembedding)

count

In [None]:
# Group the items with ItemList.get_first_token_groups. Later cells iterate the
# result as a mapping from group key to a collection of items, which is
# intersected with the item indices in `items_woembedding`.
# NOTE(review): presumably keyed by the first token of each description — confirm.
first_token_groups = itemlist.get_first_token_groups(just_words=True)

In [None]:
# Number of groups.
len(first_token_groups)

In [None]:
# Load a previously pickled result from disk.
# NOTE(review): judging by the file name this is presumably an x-means
# clustering result over the 50-dimensional embeddings — confirm.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
file = './results/baseline+embeddings/embeddings50_SUB+MED_xmeans.pkl'

# The context manager closes the file handle even if unpickling fails
# (the handle used to stay open for the lifetime of the kernel).
with open(file, "rb") as a_file:
    output = pickle.load(a_file)

In [None]:
# Flag the groups in which at most one distinct item is absent from
# `items_woembedding` (the set of items with an all-zero embedding).
groups_woembedding = {
    group
    for group, items in first_token_groups.items()
    if len(set(items) - items_woembedding) <= 1
}

# Number of flagged groups.
count = len(groups_woembedding)

count

In [None]:
# Total number of items that belong to the groups in `groups_woembedding`.
count = sum(
    len(items)
    for group, items in first_token_groups.items()
    if group in groups_woembedding
)

count

# Spellchecker

In [None]:
# Words of the loaded items as exposed by ItemList.unique_words
# (presumably deduplicated — TODO confirm).
unique_words = itemlist.unique_words

In [None]:
# Size of the items' vocabulary.
len(unique_words)

In [None]:
# JSON file holding the reference word list.
words_set_file = '../dados/palavras/words_nilc_preprocessed.json'

In [None]:
# Load the reference words and store them as a set for membership tests.
words_set = set(read_json_file(words_set_file))

In [None]:
# Tokens read from the medications word file, stored as a set.
medical = set(get_tokens_set('../dados/palavras/medications.txt'))

In [None]:
# Tokens that are unknown everywhere: not in the word embeddings, not in the
# reference word list and not in the medications list.
unknown_tokens = [
    token
    for token in unique_words
    if not (token in word_embeddings or token in words_set or token in medical)
]
tokens_woembedding = set(unknown_tokens)

# Counted once per iterated token, so it matches the size of the set only when
# `unique_words` has no duplicates.
count = len(unknown_tokens)

count

In [None]:
# Count the tokens that have a word embedding but appear in neither the
# reference word list nor the medications list.
count = sum(
    1
    for token in unique_words
    if token in word_embeddings and not (token in words_set or token in medical)
)

count

In [None]:
# Merge the reference words, the medications tokens and the embedding
# vocabulary into a single list (duplicates are kept).
# NOTE(review): this rebinds `words_set` from a set to a list, so re-running this
# cell concatenates `medical` and the embedding keys again, and earlier cells
# that test `token in words_set` see a list if they are re-run afterwards.
words_set = [*words_set, *medical, *word_embeddings.keys()]

In [None]:
# Create the spellchecker and load the combined word list into it.
spellchecker = SpellcheckerOpt()
spellchecker.load_words(words_set)

In [None]:
# For every token without an embedding, ask the spellchecker for candidates and
# keep the first field of the best-ranked one as the replacement.
token_woembedding_similar = {}
words_checked = 0

# Second argument of spellchecker.search (presumably the maximum edit
# distance — TODO confirm).
distance = 2
# When True, print a progress line every 1000 tokens.
verbose = True


def _candidate_rank(candidate):
    """Sort key: order by the second field, then by the first field.

    NOTE(review): candidates are presumably (word, distance) pairs — confirm.
    """
    return (candidate[1], candidate[0])


for words_checked, token in enumerate(tokens_woembedding, start=1):
    candidates = spellchecker.search(token, distance)
    if candidates:
        candidates.sort(key=_candidate_rank)
        token_woembedding_similar[token] = candidates[0][0]
    if verbose and words_checked % 1000 == 0:
        print('%d words checked' % words_checked)

In [None]:
# Number of tokens for which the spellchecker returned at least one candidate.
len(token_woembedding_similar)

In [None]:
# Inspect the token -> replacement mapping.
# NOTE(review): this displays the whole dict; consider showing only a sample if it is large.
token_woembedding_similar

In [None]:
# Count the replacements that are themselves present in the word embeddings.
count = sum(
    1
    for similar in token_woembedding_similar.values()
    if similar in word_embeddings
)

count

In [None]:
# Persist the token -> replacement mapping as JSON.
with open("../dados/palavras/right_words_nilc.json", "w") as json_out:
    json.dump(token_woembedding_similar, json_out)

# Build word embeddings from a set of public procurements

In [None]:
from gensim.models import FastText

In [None]:
# Load the preprocessed procurement items. Later cells use the result as a
# mapping whose values are collections of items, each item being an iterable of
# tokens (presumably keyed by procurement id — TODO confirm).
licitacao_items = read_json_file("../dados/licitacao_items_preprocessed.json")

In [None]:
# Values of the mapping as a list, for inspection.
sample = list(licitacao_items.values())

In [None]:
# Peek at the first 10 entries.
sample[:10]

In [None]:
# Build one flat token list per procurement by concatenating the tokens of all
# of its items, in order.
items_licitacao = [
    [token for item in items for token in item]
    for items in licitacao_items.values()
]

In [None]:
# Number of flattened procurement token lists.
len(items_licitacao)

In [None]:
# Peek at the first 10 flattened token lists.
items_licitacao[:10]

In [None]:
# Train a FastText model on the procurement token lists and save it to disk.
# NOTE(review): only the first 100 token lists are used for training — confirm
# this is intended and not a leftover from a quick test run.
# NOTE(review): `size` and `iter` are the keyword names of gensim < 4.0 (renamed
# to `vector_size` and `epochs` in 4.0) — verify against the installed version.
model = FastText(
    items_licitacao[:100],
    size=300,
    window=10,
    batch_words=1000,
    sg=1,
    workers=3,
    iter=20,
    min_count=0,
    word_ngrams=1,
)
model.save("fasttext_s300.model")