In [1]:
import pandas as pd
import numpy as np
import collections
import copy
import random
import matplotlib.pyplot as plt
from nlp.preprocessing import (
    clean_text,
    preprocess,
    tokenize,
    preprocess_document,
    tokenize_document,
    get_stopwords, 
    lemmatization_document,
    get_canonical_words)
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)
from nlp.text_statistics import (
    count_tokens,
    unique_tokens,
    sort_frequency_tokens
)
from nlp.grouping import (
    get_groups,
    get_groups_size,
    get_unigram_groups,
    get_two_tokens_groups,
    get_first_token_groups,
    get_bigram_groups,
    get_first_two_groups,
    groups_frequency_sort
)
from utils.read_files import (
    get_items)
from item.item_list import (
    ItemList,
    Item
)
from item.spellcheckeropt import SpellcheckerOpt
from item.utils import get_tokens_set
from textpp_ptbr.preprocessing import TextPreProcessing as tpp
from gensim.parsing.preprocessing import (
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_punctuation2,
    strip_short)

# Load the medication table; this CSV uses '_' as its field delimiter.
medicamentos_file = pd.read_csv(
    '../dados/medicamentos.csv',
    delimiter='_',
    encoding='utf-8',
)

medicamentos_file.info()

stopwords_ = get_stopwords()
medicamentos_set = set()

# Substance names first, then product names, both in file order.
substancias = list(medicamentos_file['SUBSTÂNCIA'])
produtos = list(medicamentos_file['PRODUTO'])
medicamentos_list = substancias + produtos

# Preprocess and tokenize every name, collecting the distinct tokens.
for med in medicamentos_list:
    doc = tokenize_document(
        preprocess_document(med, remove_numbers=False),
        stopwords_,
    )
    print(doc)
    medicamentos_set.update(doc)

len(medicamentos_set)

count = 0
for med in medicamentos_set:
    # NOTE(review): len(med) >= 0 is always true, so every token is printed
    # and counted; confirm whether a minimum token length was intended.
    if len(med) >= 0:
        print(med)
        count += 1

In [2]:
itemlist = ItemList()
itemlist.load_items_from_file('../dados/items_preprocessed.zip')

In [3]:
medicamentos = get_tokens_set('../dados/palavras/medications.txt')

In [4]:
canonical_form, word_class = get_canonical_words()

In [None]:
len(word_class)

In [None]:
# Distinct word-class tags present in the lexicon.
tags = set(word_class.values())

In [None]:
len(tags)

In [None]:
tags

In [None]:
items_list = itemlist.items_list

In [None]:
# One entry per item: the value stored under 'palavras' in the item dict.
items_words = [item.get_item_dict()['palavras'] for item in items_list]

In [None]:
# Total number of tokens across all item descriptions.
count = sum(len(doc) for doc in items_words)

count

In [None]:
# Tag every token occurrence: lexicon class first, then the medication list,
# otherwise 'UNTAGGED'. tag_count only tracks tagged occurrences; untagged
# ones are tallied separately in not_tagged.
word_tags = []
tag_count = collections.defaultdict(int)
not_tagged = 0

for doc in items_words:
    for tok in doc:
        if tok not in word_class and tok not in medicamentos:
            word_tags.append((tok, 'UNTAGGED'))
            not_tagged += 1
            continue

        label = word_class[tok] if tok in word_class else 'MED'
        word_tags.append((tok, label))
        tag_count[label] += 1

not_tagged

In [None]:
# Number of item descriptions containing at least one token tagged 'N'.
count = sum(
    1
    for doc in items_words
    if any(tok in word_class and word_class[tok] == 'N' for tok in doc)
)

count

In [None]:
len(items_words)

In [None]:
len(word_tags)

In [None]:
tag_name_count = sort_frequency_tokens(tag_count)

In [None]:
tag_name_count[:10]

In [None]:
dataframe = pd.DataFrame(word_tags, columns=['word', 'tag'])
dataframe.info()

In [None]:
import seaborn as sns

sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# Horizontal bars, one per tag, ordered from most to least frequent.
tag_order = dataframe['tag'].value_counts().index
sns.countplot(y="tag", data=dataframe, color='dodgerblue', order=tag_order, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tag", fontsize=20, weight='bold')
axis1.grid(False)
axis1.set_xscale('log')

# Label each bar with its share of all rows in the dataframe.
total = len(dataframe)
for bar in axis1.patches:
    bar_width = bar.get_width()
    share = 100 * float(bar_width) / total
    axis1.text(bar_width, bar.get_y() + 0.5, '{:.2f}%'.format(share), fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [None]:
# Distinct tokens over all item descriptions, as a list.
words = list({tok for doc in items_words for tok in doc})

In [None]:
len(words)

In [None]:
# Tag each distinct token: lexicon class first, then the medication list,
# otherwise 'UNTAGGED'.
unique_word_tags = []
not_tagged = 0

for tok in words:
    in_lexicon = tok in word_class
    if in_lexicon or tok in medicamentos:
        unique_word_tags.append((tok, word_class[tok] if in_lexicon else 'MED'))
    else:
        unique_word_tags.append((tok, 'UNTAGGED'))
        not_tagged += 1

not_tagged

In [None]:
len(unique_word_tags)

In [None]:
dataframe = pd.DataFrame(unique_word_tags, columns=['word', 'tag'])
dataframe.info()

In [None]:
import seaborn as sns

sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# Horizontal bars, one per tag, ordered from most to least frequent.
tag_order = dataframe['tag'].value_counts().index
sns.countplot(y="tag", data=dataframe, color='dodgerblue', order=tag_order, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tag", fontsize=20, weight='bold')
axis1.grid(False)
axis1.set_xscale('log')

# Label each bar with its share of all rows in the dataframe.
total = len(dataframe)
for bar in axis1.patches:
    bar_width = bar.get_width()
    share = 100 * float(bar_width) / total
    axis1.text(bar_width, bar.get_y() + 0.5, '{:.2f}%'.format(share), fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [5]:
first_token_groups = itemlist.get_first_token_groups()

In [6]:
groups = list(first_token_groups.keys())

In [7]:
groups[:10]

['colagenase',
 'campo',
 'kit',
 'tinta',
 'termometro',
 'diclofenaco',
 'panfleto',
 'bota',
 'ciprofloxacino',
 'dea']

In [8]:
len(groups)

18035

In [9]:
firstt_groups_size = itemlist.get_groups_size(first_token_groups)

In [10]:
# Replace each group's item collection with its size.
# A new dict is built instead of overwriting values in place so that
# (a) the mapping returned by itemlist is left untouched and
# (b) the cell is idempotent: entries that are already counts are kept
#     rather than failing with "object of type 'int' has no len()" on re-run.
first_token_groups = {
    group: items if isinstance(items, int) else len(items)
    for group, items in first_token_groups.items()
}

In [11]:
firstt_groups_size.count(1)

6337

In [12]:
firstt_groups_size.sort(reverse=True)

In [13]:
firstt_groups_size[:10]

[26250, 23035, 21414, 21261, 20958, 20285, 16153, 14528, 14484, 12557]

In [14]:
firstt_groups_names_size = groups_frequency_sort(first_token_groups)

In [15]:
firstt_groups_names_size[:10]

[('papel', 26250),
 ('broca', 23035),
 ('pneu', 21414),
 ('luva', 21261),
 ('sonda', 20958),
 ('filtro', 20285),
 ('oleo', 16153),
 ('fita', 14528),
 ('tubo', 14484),
 ('fio', 12557)]

In [16]:
def group_size(size):
    """Map a group size to the label of the interval it falls in.

    Args:
        size: number of items in the group; must be >= 1.

    Returns:
        One of '1', '(1,5]', '(5,10]', '(10,100]', '(100,1000]',
        '(1000,5000]', '(5000,10000]' or '>10000'.

    Raises:
        ValueError: if size is smaller than 1 (or is NaN). Such values used
            to fall through every branch and were mislabelled '>10000'.
    """
    # Written as "not >=" so that NaN is rejected as well.
    if not size >= 1:
        raise ValueError('group size must be >= 1, got %r' % (size,))

    # (inclusive upper bound, label), in ascending order.
    intervals = (
        (1, '1'),
        (5, '(1,5]'),
        (10, '(5,10]'),
        (100, '(10,100]'),
        (1000, '(100,1000]'),
        (5000, '(1000,5000]'),
        (10000, '(5000,10000]'),
    )

    for upper, interval in intervals:
        if size <= upper:
            return interval

    return '>10000'

In [17]:
# Tag each first-token group and record its size interval. For untagged
# groups, also accumulate how many groups and how many items they cover.
group_tags = []
not_tagged = 0
items_in_untagged = 0

for group in groups:
    n_items = first_token_groups[group]
    interval = group_size(n_items)

    if group in word_class:
        group_tags.append((group, word_class[group], interval))
    elif group in medicamentos:
        group_tags.append((group, 'MED', interval))
    else:
        group_tags.append((group, 'UNTAGGED', interval))
        not_tagged += 1
        items_in_untagged += n_items

not_tagged

11391

In [18]:
items_in_untagged

116287

In [20]:
len(itemlist.items_list)

1508992

In [22]:
100*(items_in_untagged/len(itemlist.items_list))

7.706270145898719

In [None]:
# Keep only the groups whose tag is one of the classes of interest.
# The interval is bound to `size_interval` inside a comprehension: the
# previous loop variable was named `group_size`, which overwrote the
# group_size() function with a string and broke any re-run of the cell
# that calls it.
# NOTE(review): the following cell builds its dataframe from `group_tags`,
# not from this sample; confirm which one the plot is meant to use.
group_tags_sample = [
    (word, tag, size_interval)
    for word, tag, size_interval in group_tags
    if tag in {'UNTAGGED', 'N', 'V', 'A', 'MED'}
]

In [None]:
dataframe = pd.DataFrame(group_tags, columns=['word', 'tag', 'group_size'])
dataframe.info()

In [None]:
import seaborn as sns

sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# One bar cluster per size interval (most frequent first), split by tag.
size_order = dataframe['group_size'].value_counts().index
sns.countplot(y="group_size", hue="tag", data=dataframe, order=size_order, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tamanho do grupo", fontsize=20, weight='bold')
axis1.grid(False)
axis1.set_xscale('log')
axis1.legend(loc='lower right', title='Classe', fontsize='large', title_fontsize='x-large')

# Per-bar percentage labels are disabled for this chart.

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [None]:
import seaborn as sns

sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# Horizontal bars, one per tag, ordered from most to least frequent.
tag_order = dataframe['tag'].value_counts().index
sns.countplot(y="tag", data=dataframe, color='dodgerblue', order=tag_order, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tag", fontsize=20, weight='bold')
axis1.grid(False)
axis1.set_xscale('log')

# Label each bar with its share of all rows in the dataframe.
total = len(dataframe)
for bar in axis1.patches:
    bar_width = bar.get_width()
    share = 100 * float(bar_width) / total
    axis1.text(bar_width, bar.get_y() + 0.5, '{:.2f}%'.format(share), fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [None]:
token_count = count_tokens(items_words)

In [None]:
token_name_count = sort_frequency_tokens(token_count)

In [None]:
# Keep only the token text of the first 1000 (token, count) pairs.
top1000_tokens = [tok for tok, _ in token_name_count[:1000]]

In [None]:
# Tag each of the top tokens: lexicon class first, then the medication
# list, otherwise 'UNTAGGED'.
top_tags = []
not_tagged = 0

for token in top1000_tokens:
    in_lexicon = token in word_class
    if in_lexicon or token in medicamentos:
        top_tags.append((token, word_class[token] if in_lexicon else 'MED'))
    else:
        top_tags.append((token, 'UNTAGGED'))
        not_tagged += 1

not_tagged

In [None]:
dataframe = pd.DataFrame(top_tags, columns=['word', 'tag'])
dataframe.info()

In [None]:
import seaborn as sns

sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# Horizontal bars, one per tag, ordered from most to least frequent.
tag_order = dataframe['tag'].value_counts().index
sns.countplot(y="tag", data=dataframe, color='dodgerblue', order=tag_order, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tag", fontsize=20, weight='bold')
axis1.grid(False)
axis1.set_xscale('log')

# Label each bar with its share of all rows in the dataframe.
total = len(dataframe)
for bar in axis1.patches:
    bar_width = bar.get_width()
    share = 100 * float(bar_width) / total
    axis1.text(bar_width, bar.get_y() + 0.5, '{:.2f}%'.format(share), fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [None]:
# Path relative to the notebook, like the other '../dados/...' inputs, instead
# of an absolute path that only exists on one machine.
# NOTE(review): assumes '../dados' is the same 'projeto-mp/dados' directory the
# absolute path pointed to; confirm before relying on it.
word_embeddings_file = '../dados/word embeddings/glove_s50.txt'

In [None]:
def load_word_embeddings(file):
    """Load word vectors from a text file into a dict.

    The first line of the file is always skipped (presumably the
    "<vocabulary size> <dimension>" header; TODO confirm for this file).
    Every following line is expected to be "<token> <v1> <v2> ...", with the
    token separated from the vector by the first space.

    Tokens are lower-cased and passed through tpp.remove_accents before being
    used as keys, so tokens that normalise to the same key overwrite each
    other (the last one in the file wins).

    Args:
        file: path of the embeddings text file.

    Returns:
        dict mapping the normalised token to its vector as a list of floats.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_embeddings = {}

    # Explicit UTF-8: tokens carry accented characters and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(file, 'r', encoding='utf-8') as data:

        data.readline()  # skip the first line

        # Stream the file instead of materialising every line with readlines().
        for line in data:
            parts = line.strip('\n').split(' ', maxsplit=1)

            # Blank lines or lines without a vector would otherwise raise.
            if len(parts) < 2:
                continue

            token, values = parts
            # split() without arguments tolerates trailing or repeated spaces,
            # which used to produce float('') errors.
            embedding = [float(num) for num in values.split()]
            if not embedding:
                continue

            token_preprocess = tpp.remove_accents(token.lower())
            word_embeddings[token_preprocess] = embedding

    return word_embeddings

In [None]:
word_embeddings = load_word_embeddings(word_embeddings_file)

In [None]:
words_set = set(word_embeddings.keys())

In [None]:
len(words_set)

In [None]:
# Tag each embedding vocabulary entry: lexicon class first, then the
# medication list, otherwise 'UNTAGGED'.
word_embedding_tags = []
not_tagged = 0

for token in words_set:
    in_lexicon = token in word_class
    if in_lexicon or token in medicamentos:
        word_embedding_tags.append((token, word_class[token] if in_lexicon else 'MED'))
    else:
        word_embedding_tags.append((token, 'UNTAGGED'))
        not_tagged += 1

not_tagged

In [None]:
dataframe = pd.DataFrame(word_embedding_tags, columns=['word', 'tag'])
dataframe.info()

In [None]:
import seaborn as sns

sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# Horizontal bars, one per tag, ordered from most to least frequent.
tag_order = dataframe['tag'].value_counts().index
sns.countplot(y="tag", data=dataframe, color='dodgerblue', order=tag_order, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tag", fontsize=20, weight='bold')
axis1.grid(False)
axis1.set_xscale('log')

# Label each bar with its share of all rows in the dataframe.
total = len(dataframe)
for bar in axis1.patches:
    bar_width = bar.get_width()
    share = 100 * float(bar_width) / total
    axis1.text(bar_width, bar.get_y() + 0.5, '{:.2f}%'.format(share), fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [None]:
# Extend each (token, tag) pair with 'Sim'/'Não' depending on whether the
# token is in the embedding vocabulary.
word_tag_embedding = [
    (token, tag, 'Sim' if token in words_set else 'Não')
    for token, tag in word_tags
]

In [None]:
dataframe = pd.DataFrame(word_tag_embedding, columns=['word', 'tag', 'word embedding'])
dataframe.info()

In [None]:
import seaborn as sns
sns.set_style("white")

fig, axis1 = plt.subplots(figsize=(12, 8))

# One pair of bars per tag (most frequent tag first), split by whether the
# token has a word embedding.
sns.countplot(y="tag", data=dataframe, hue='word embedding',
              order=dataframe['tag'].value_counts().index, ax=axis1)

axis1.set_xlabel("Nº de tokens", fontsize=20, weight='bold')
axis1.set_ylabel("Tag", fontsize=20, weight='bold')
plt.grid(False)
plt.xscale('log')
plt.legend(loc='lower right', title='word embedding', fontsize='x-large', title_fontsize='x-large')

# Label each bar with its share of all token occurrences.
total = len(dataframe)
for p in axis1.patches:
    width = p.get_width()
    # With hue, a tag/hue combination that has no rows (or a legend proxy
    # patch) has a zero or NaN width. Such a patch cannot be placed on the
    # log axis and would only add a meaningless "0.00%"/"nan%" label, so it
    # is skipped. "not width > 0" is also true for NaN.
    if not width > 0:
        continue
    axis1.text(width, p.get_y()+0.35, '%.2f%%'%(100*float(width)/total), fontsize=15)

plt.yticks(fontsize=16)
plt.xticks(fontsize=16)

plt.show()
plt.clf()

In [None]:
# Number of token occurrences that are in the embedding vocabulary.
count = sum(1 for doc in items_words for tok in doc if tok in words_set)

count

In [None]:
# For each class, count the item descriptions that contain at least one
# token of that class ('N', 'V', 'A' from the lexicon, or a medication).
num_sub = num_verb = num_adj = num_med = 0

for doc in items_words:
    sub = verb = adj = med = False

    for tok in doc:
        tag = word_class[tok] if tok in word_class else None

        if tag == 'N':
            sub = True
        elif tag == 'V':
            verb = True
        elif tag == 'A':
            adj = True
        elif tok in medicamentos:
            # Reached by tokens outside the lexicon and by lexicon tokens
            # whose tag is not N/V/A.
            med = True

    num_sub += int(sub)
    num_verb += int(verb)
    num_adj += int(adj)
    num_med += int(med)

In [None]:
print(num_sub)
print(num_verb)
print(num_adj)
print(num_med)

In [None]:
len(items_words)

In [None]:
# Split the untagged token occurrences by whether the token is in the
# embedding vocabulary.
words_untagged = []
words_untagged_woembedding = []

for word, tag in word_tags:
    if tag != 'UNTAGGED':
        continue

    target = words_untagged if word in words_set else words_untagged_woembedding
    target.append(word)

In [None]:
len(words_untagged)

In [None]:
len(words_untagged_woembedding)

In [None]:
random.sample(words_untagged, 20)

In [None]:
random.sample(words_untagged_woembedding, 20)