In [18]:
import re
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import time
import json
from nlp.preprocessing import (
    clean_text,
    preprocess,
    tokenize,
    preprocess_document,
    tokenize_document,
    get_stopwords,
    lemmatization)
from nlp.text_statistics import (
    number_tokens,
    tokens_length,
    unique_tokens,
    count_numbers,
    number_stopwords,
    print_statistics,
    count_tokens
)
from nlp.grouping import (
    get_groups,
    get_groups_size,
    get_first_token_groups
)
from nlp.utils import (
    read_dictionary
)
from nlp.spellcheckeropt import SpellcheckerOpt
from gensim.parsing.preprocessing import (
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_punctuation2,
    strip_short)

In [None]:
# Load the semicolon-separated CSV at `file_recurso` into a DataFrame.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
file_recurso = '/Users/Pedro/Desktop/projeto-mp/dados/licitacao_vlr_recurso_funcao.csv'
data_recurso = pd.read_csv(file_recurso, sep=';')

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data_recurso.info()

In [None]:
# Keep the rows whose 'nom_funcao' is 'Saúde' and whose 'proporcao_vlr' is at
# least 5.0. The two conditions are built separately and combined row-wise.
is_saude = data_recurso['nom_funcao'] == 'Saúde'
meets_min_share = data_recurso['proporcao_vlr'] >= 5.0
licitacoes_saude = data_recurso.loc[is_saude & meets_min_share]

In [None]:
# Materialise the 'seq_dim_licitacao' column of the filtered rows as a plain list.
seq_dim_licitacao_list = list(licitacoes_saude['seq_dim_licitacao'])

In [None]:
# Number of distinct 'seq_dim_licitacao' values among the filtered rows.
len(set(seq_dim_licitacao_list))

In [None]:
# Load the semicolon-separated items CSV and print its column summary.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
file = '/Users/Pedro/Desktop/projeto-mp/dados/itens_pregao_pitem_saude.csv'
data = pd.read_csv(file, sep=';')

data.info()

In [None]:
# Keep only the item rows whose 'seq_dim_licitacao' appears in the id list built
# from the filtered resource table. Note that this rebinds `data`, so the
# unfiltered frame is no longer reachable after this cell.
data = data.loc[data['seq_dim_licitacao'].isin(seq_dim_licitacao_list)]

In [None]:
# Number of distinct 'seq_dim_licitacao' values left after the filter.
len(set(list(data['seq_dim_licitacao'])))

In [None]:
# Pull the 'nom_item' column out of the filtered frame as a list and show its length.
items = list(data['nom_item'])
len(items)

In [None]:
# Run the project's `preprocess` over the raw item names
# (see nlp.preprocessing for the exact steps applied).
items_descriptions = preprocess(items)

In [None]:
# Number of preprocessed descriptions.
len(items_descriptions)

In [22]:
# Read the word list through the project's `read_dictionary` helper.
# It returns two values here: `portuguese_words`, which later cells iterate as
# a mapping of letter -> word list, and `all_words_nilc`, a flat collection.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
words_file = '/Users/Pedro/Desktop/projeto-mp/dados/palavras/words_nilc.txt'
portuguese_words, all_words_nilc = read_dictionary(words_file, preprocess=True)

In [23]:
# Drop duplicate words. Going through set() does not preserve the original order.
all_words_nilc = list(set(all_words_nilc))

In [24]:
# Persist the de-duplicated word list as a JSON array.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
words_json_path = "/Users/Pedro/Desktop/projeto-mp/dados/palavras/words_nilc_preprocessed.json"
with open(words_json_path, "w") as json_handle:
    json.dump(all_words_nilc, json_handle)

In [None]:
# Report how many dictionary words are stored under each key.
for letter, dictionary_words in portuguese_words.items():
    print(f"{letter!s} : {len(dictionary_words)}")

In [None]:
# Number of keys in `portuguese_words`.
len(portuguese_words)

In [None]:
# NOTE(review): `SpellChecker` is not imported in the imports cell (only
# `SpellcheckerOpt` is), so this raises NameError on a fresh kernel unless the
# name is defined somewhere not visible here. `spell2` is not referenced by any
# other visible cell, and language='en' looks at odds with the Portuguese word
# lists loaded above — confirm whether this cell is still needed.
spell2 = SpellChecker(language='en', case_sensitive=False)

In [None]:
# Collect the tokens returned by the project's `unique_tokens` helper for the
# preprocessed descriptions (presumably the distinct tokens — confirm in
# nlp.text_statistics).
unique = unique_tokens(items_descriptions)

In [None]:
# Bucket every unique token under its first character.
unique_words = collections.defaultdict(list)

for token in unique:
    initial = token[0]
    bucket = unique_words[initial]
    bucket.append(token)

In [None]:
# Report how many unique tokens start with each character.
for letter, tokens_for_letter in unique_words.items():
    print(f"{letter!s} : {len(tokens_for_letter)}")

In [15]:
# Read the preprocessed word list, one entry per line.
# The file is opened in a `with` block so the handle is closed as soon as the
# lines are read (the bare open(...).readlines() left it to the garbage collector).
# readlines() keeps the trailing newline on each entry.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
with open('/Users/Pedro/Desktop/projeto-mp/dados/palavras/words_nilc_preprocess.txt', "r") as words_handle:
    all_words = words_handle.readlines()

In [16]:
# De-duplicate the lines read from the file. Going through set() does not preserve order.
all_words = list(set(all_words))

In [17]:
# Peek at the first ten entries (each still carries the trailing newline from readlines()).
all_words[:10]

['ccomcex\n',
 'paroxistica\n',
 'arreganhar\n',
 'interrogativo\n',
 'pulmonar\n',
 'versas\n',
 'cancele\n',
 'dumaine\n',
 'papando\n',
 'smart\n']

In [None]:
# Number of distinct lines read from the word file.
len(all_words)

In [None]:
# De-duplicate `all_words_nilc`. This repeats an earlier cell and is a no-op
# (apart from ordering) if that cell already ran in this kernel.
all_words_nilc = list(set(all_words_nilc))

In [None]:
# Number of distinct dictionary words.
len(all_words_nilc)

In [None]:
# Derive the first-token data from the preprocessed descriptions via the
# project's `get_first_token_groups` helper (see nlp.grouping for the exact output).
first_tokens = get_first_token_groups(items_descriptions)

In [None]:
# Group the first tokens. Later cells use the result as a mapping of
# group name -> count (iterated with .items() and compared against 1).
first_tokens_groups = get_groups(first_tokens)

In [None]:
# Number of first-token groups.
len(first_tokens_groups)

In [None]:
# Build a spellchecker whose vocabulary is the list of first-token group names.
spellchecker = SpellcheckerOpt()
group_names = list(first_tokens_groups)
spellchecker.load_words(group_names)

In [None]:
# For every first-token group, store the spellchecker's search results for
# that group name. The second argument (2) is passed straight to `search` —
# presumably a maximum edit distance; confirm in nlp.spellcheckeropt.
groups_words = {}
i = 0

for i, group in enumerate(first_tokens_groups, start=1):
    groups_words[group] = spellchecker.search(group, 2)
    # Lightweight progress report every 1000 groups.
    if i % 1000 == 0:
        print(i)

In [None]:
# Sort each group's candidates in place by (second field, first field), record
# how many candidates each group has, and print every group with its candidates.
list_words_sizes = []

for group, candidates in groups_words.items():
    candidates.sort(key=lambda pair: (pair[1], pair[0]))
    list_words_sizes.append(len(candidates))
    print(group, ':', candidates)

# Number of groups that received more than one candidate.
count = sum(1 for size in list_words_sizes if size > 1)

count

In [None]:
# Number of groups whose candidate list has more than one entry.
count = sum(1 for candidates in groups_words.values() if len(candidates) > 1)

count

In [None]:
# Number of groups that have a count of exactly 1 in `first_tokens_groups`
# and still received more than one candidate from the spellchecker.
count = sum(
    1
    for group, candidates in groups_words.items()
    if first_tokens_groups[group] == 1 and len(candidates) > 1
)

count

In [None]:
# Pass the per-group candidate counts to the project's `print_statistics` helper.
print_statistics(list_words_sizes)

In [None]:
# Merge each first-token group into a single target name and accumulate the
# item counts per target in `new_groups`.
new_groups = collections.defaultdict(int)

# Consume counts from a copy instead of `first_tokens_groups` itself. The
# zeroing below marks counts as "already merged"; doing that on the original
# mapping left every entry at 0 after this cell, which made the cell
# non-re-runnable and starved any later cell that reads the original counts.
# (Assumes the mapping returned by get_groups supports .copy(), as dict and
# its collections subclasses do.)
remaining_counts = first_tokens_groups.copy()

for group, words_list in groups_words.items():

    # Default target is the group itself; switch to the candidate with the
    # largest second field, but only if that value exceeds 1.
    # NOTE(review): the meaning of word[1] depends on SpellcheckerOpt.search — confirm.
    maxi = 1
    op = group
    for word in words_list:
        if word[1] > maxi:
            maxi = word[1]
            op = word[0]

    # Move the target's own count and the group's count under the target,
    # zeroing each so it cannot be counted twice.
    new_groups[op] += remaining_counts[op]
    remaining_counts[op] = 0
    new_groups[op] += remaining_counts[group]
    remaining_counts[group] = 0

In [None]:
# Number of target names after merging.
len(new_groups)

In [None]:
# Sizes of the merged groups, as returned by the project's `get_groups_size` helper.
new_groups_sizes = get_groups_size(new_groups)

In [None]:
# How many merged groups have size exactly 1.
new_groups_sizes.count(1)

In [None]:
# Rebuild the spellchecker, this time loading every unique token as vocabulary.
# This rebinds `spellchecker`, replacing the instance loaded with group names.
spellchecker = SpellcheckerOpt()
spellchecker.load_words(unique)

In [None]:
# Time the spellchecker search over the first 100 unique tokens.
start = time.time()

i = 0
search_words = {}

for i, word in enumerate(unique[:100], start=1):
    search_words[word] = spellchecker.search(word, 2)
    # Progress report every 1000 words (never reached with a 100-word sample).
    if i % 1000 == 0:
        print(i)

end = time.time()

In [None]:
# Elapsed wall-clock seconds for the timed search loop.
end - start

In [None]:
# Print each sampled word with the candidates the spellchecker returned for it.
# Uses dict.items(); the previous `.item()` call raised AttributeError because
# dicts have no such method.
for word, words_list in search_words.items():
    print(word, ':', words_list)

In [None]:
# Alternative merge: a group whose count is exactly 1 absorbs the counts of all
# its candidates; any other group keeps only its own count.
# Counts are consumed from `first_tokens_groups` in place (set to 0 once used),
# so this cell needs that mapping to still hold its original counts when run.
new_groups2 = collections.defaultdict(int)

for group, words_list in groups_words.items():
    own_size = first_tokens_groups[group]
    new_groups2[group] += own_size
    first_tokens_groups[group] = 0

    if own_size == 1:
        for candidate in words_list:
            candidate_name = candidate[0]
            new_groups2[group] += first_tokens_groups[candidate_name]
            first_tokens_groups[candidate_name] = 0
In [None]:
# Keep only the groups from `new_groups2` that ended up with a non-zero size,
# and count how many were dropped for being empty.
new_groups3 = collections.defaultdict(int)
new_groups3.update(
    (group, size) for group, size in new_groups2.items() if size != 0
)

# Keys are unique, so the number dropped is the difference in lengths.
count = len(new_groups2) - len(new_groups3)

count

In [None]:
# Number of non-empty groups kept.
len(new_groups3)

In [None]:
# Number of kept groups whose size is exactly 1.
count = sum(1 for size in new_groups3.values() if size == 1)

count

In [None]:
# Number of entries in `first_tokens_groups` that still hold a count of at least 1.
count = sum(1 for size in first_tokens_groups.values() if size >= 1)

count