In [1]:
import pandas as pd
import numpy as np
import collections
import copy
import random
import matplotlib.pyplot as plt
from nlp.preprocessing import (
    clean_text,
    preprocess,
    tokenize,
    preprocess_document,
    tokenize_document,
    get_stopwords, 
    lemmatization_document,
    get_canonical_words)
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)
from nlp.text_statistics import (
    count_tokens,
    unique_tokens
)
from nlp.grouping import (
    get_groups,
    get_groups_size,
    get_unigram_groups,
    get_two_tokens_groups,
    get_first_token_groups,
    get_bigram_groups,
    get_first_two_groups,
    groups_frequency_sort
)
from utils.read_files import (
    get_items)
from item.item_list import (
    ItemList,
    Item
)
from item.spellcheckeropt import SpellcheckerOpt
from item.utils import get_tokens_set
from textpp_ptbr.preprocessing import TextPreProcessing as tpp
from gensim.parsing.preprocessing import (
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_punctuation2,
    strip_short)

#Import xmeans through pyclustering library:
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
from pyclustering.cluster.xmeans import xmeans

In [2]:
# It gets the descpitons processed:
itemlist = ItemList()
itemlist.load_items_from_file('../dados/items_preprocessed.zip')

In [3]:
#It gets the list of preprocessed descriptions:
items_list = itemlist.items_list

In [4]:
# It gets the first tokens of each description and groups
# based on this approach:
first_token_groups = itemlist.get_first_token_groups()

In [5]:
# total number of groups:
len(first_token_groups)

18035

In [6]:
#It creates a list of the the keys of these groups:
groups = list(first_token_groups.keys())

In [7]:
# It gets the values of each group (i.e., the id of the descriptions into that group):
group_descriptions = list(first_token_groups.values())

In [8]:
groups[0]
group_descriptions[0]

[0,
 5397,
 6121,
 9458,
 15702,
 19193,
 19735,
 20034,
 21082,
 22880,
 25711,
 28490,
 32120,
 32182,
 32944,
 36888,
 38819,
 41181,
 41762,
 43768,
 45599,
 49124,
 51157,
 56107,
 56897,
 59936,
 59959,
 60560,
 67022,
 67167,
 70748,
 74153,
 76287,
 76855,
 78235,
 82736,
 83447,
 85959,
 88728,
 90844,
 91238,
 94270,
 95632,
 96928,
 98906,
 100731,
 102871,
 107503,
 108234,
 114893,
 118373,
 119705,
 122732,
 123123,
 123816,
 127104,
 131248,
 142926,
 146718,
 153824,
 157309,
 162036,
 162417,
 162958,
 163312,
 168818,
 175315,
 176437,
 177094,
 181874,
 182344,
 183951,
 184123,
 184784,
 188475,
 188843,
 189925,
 195327,
 198283,
 199351,
 199677,
 200842,
 205269,
 208983,
 209647,
 214597,
 221900,
 222282,
 223146,
 225666,
 225894,
 226972,
 227348,
 227692,
 232618,
 232920,
 239366,
 240420,
 244013,
 250111,
 251733,
 252427,
 253254,
 254081,
 254262,
 254580,
 254934,
 256490,
 257845,
 257975,
 258586,
 258846,
 260102,
 260592,
 264120,
 269070,
 274308,

In [9]:
# Get the list of words (medicines and nouns) from the list of descriptions
# in a specific group:
def get_list_of_words(group_desc):
    list_words = list()
    medical= get_tokens_set('../dados/palavras/medications.txt')
    canonical_form, word_class = get_canonical_words()
    
    for desc_id in group_desc:
        words = itemlist.items_list[desc_id].get_item_dict()['palavras']
        for p in words:
            if((p not in list_words)): 
                if ((p in medical) or ((p in word_class) and (word_class[p] == 'N'))):
                    list_words.append(p)
                
    list_words.sort()
    
    return list_words

In [10]:
# Define a zero matrix based on the size of the number 
# of descriptions in that group (row) and the number of 
# words (only medicines and nouns) from all descriptions
# in that group:
def define_zero_matrix(group_desc):
    list_words = get_list_of_words(group_desc)
    rows = len(group_desc)
    columns = len(list_words)
    print(str(rows) + ',' + print(columns))
    matrix_bow = np.zeros((rows, columns))
    
    return matrix_bow, list_words

In [11]:
# Get the number of rows and columns generated by the Bag-of-Words
def get_size_rows_columns(group_desc):
    list_words = get_list_of_words(group_desc)
    rows = len(group_desc)
    columns = len(list_words)
    
    return rows, columns

In [12]:
# Define the bag-of-words matrix:
def define_description_bow(group_desc):
    matrix_list = define_zero_matrix(group_desc)
    zeros = matrix_list[0]
    list_words = matrix_list[1]    
    i = 0
    for desc_id in group_desc:
        words = itemlist.items_list[desc_id].get_item_dict()['palavras']           
        for w in words:
            if(w in list_words):
                k = list_words.index(w)
                zeros[i, k]  = 1.0
        i = i + 1
    return zeros

In [23]:
def analyze_on_first_token_groups_bow(first_token_groups, itemlist, it_thread, lower, upper, Return_Rows, Return_Cols):
   
    # It creates a list of the the keys of these groups:
    groups = list(first_token_groups.keys())
    # It gets the values of each group (i.e., the id of the descriptions into that group):
    group_descriptions = list(first_token_groups.values())
    # It defines the dictionary that will have the clustering with first token
    # together with x-means considering a bag-of-words of the descriptions 
    # grouped by the first token approach:
    first_token_plus_bow_xmeans = {}
    # Iterator of the first token groups:
    ft_it = lower
    arr_cols = []
    arr_rows = []

    while ft_it <= upper:
        if(len(group_descriptions[ft_it]) >= 30):
            print(str(it_thread) + ': ' + str(ft_it) + '/' + str(upper))
            # Bag of words for the group 0:
            bow = get_size_rows_columns(group_descriptions[ft_it])
            arr_rows.append(bow[0])
            arr_cols.append(bow[1])
            
        ft_it = ft_it + 1
        
    Return_Rows[it_thread] = arr_rows
    Return_Cols[it_thread] = arr_cols
    

In [24]:
def get_ranges(group_len, n_threads):
    total_len = group_len
    num_threads = n_threads
    lower = []
    upper = []
    step = int(total_len/num_threads)

    for k in range(num_threads):
        lower.append(0)
        upper.append(0)

    lower[0] = 0
    upper[0] = step
  
    i = 1
    j = 0
    while (i < num_threads):    
        upper[i]  = upper[j] + step
        lower[i]  = upper[j] +  1
        if(i%2 != 0):
            upper[i] = upper[i] + 1
        
        i = i + 1
        j = j + 1
        
    upper[n_threads - 1] = upper[n_threads - 1] - 1
    return lower, upper

In [50]:
import multiprocessing
manager = multiprocessing.Manager()
Return_Rows = manager.dict()
Return_Cols = manager.dict()
jobs = []
n_threads = 7
# It gets the first tokens of each description and groups
# based on this approach:
first_token_groups = itemlist.get_first_token_groups()
group_len = len(first_token_groups)
first_token_groups_new = {}
keys_ft = list(first_token_groups.keys())

random.shuffle(keys_ft)
for k in keys_ft:
    first_token_groups_new[k] = first_token_groups[k]
    
# It defines the ranges (of the groups) the threads will work on:
thread_ranges = get_ranges(group_len, n_threads)
print('Read ranges')
print(thread_ranges) 




Read ranges
([0, 2577, 5154, 7730, 10307, 12883, 15460], [2576, 5153, 7729, 10306, 12882, 15459, 18034])


In [36]:
import multiprocessing
manager = multiprocessing.Manager()
Return_Rows = manager.dict()
Return_Cols = manager.dict()
jobs = []
n_threads = 14
# It gets the first tokens of each description and groups
# based on this approach:
first_token_groups = itemlist.get_first_token_groups()
group_len = 14
first_token_groups_new = {}
keys_ft = list(first_token_groups.keys())

random.shuffle(keys_ft)
for k in keys_ft:
    first_token_groups_new[k] = first_token_groups[k]
    
# It defines the ranges (of the groups) the threads will work on:
thread_ranges = get_ranges(group_len, n_threads)
print('Read ranges')
print(thread_ranges) 

Read ranges
([0, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20], [1, 3, 4, 6, 7, 9, 10, 12, 13, 15, 16, 18, 19, 20])


In [37]:
for i in range(n_threads):
    p = multiprocessing.Process(target=analyze_on_first_token_groups_bow, args=(first_token_groups_new, itemlist, i, thread_ranges[0][i], thread_ranges[1][i], Return_Rows, Return_Cols))
    jobs.append(p)
    p.start()

for proc in jobs:
    proc.join()

3: 6/6
13: 20/20


In [38]:
cols_res = []
rows_res = []

for i in range(n_threads):
    cols_res = cols_res + Return_Cols[i]
    rows_res = rows_res + Return_Rows[i]

In [49]:
import statistics 
print('min cols: ' + str(min(cols_res)))
print('max cols: ' + str(max(cols_res)))
print('min rows: ' + str(min(rows_res)))
print('max rows: ' + str(max(rows_res)))

print('avg cols: ' + str(statistics.mean(cols_res)))
print('stdev cols: ' + str(statistics.stdev(cols_res)))
print('avg rows: ' + str(statistics.mean(rows_res)))
print('stdev rows: ' + str(statistics.stdev(rows_res)))

min cols: 15
max cols: 84
min rows: 71
max rows: 71
avg cols: 49.5
stdev cols: 48.79036790187178
avg rows: 71
stdev rows: 0.0


In [40]:
rows_res

[71, 71]

In [None]:
arr_col = []
arr_row = []
len_gd = len(group_descriptions)
i = 1
for gd in group_descriptions:
    print(str(i) + '/' + str(len_gd))
    
    if(len(gd) > 30):
        r, c = get_size_rows_columns(gd)
        arr_row.append(r)
        arr_col.append(c)
    i = i + 1

1/18035
2/18035
3/18035
4/18035
5/18035
6/18035
7/18035
8/18035
9/18035
10/18035
11/18035
12/18035
13/18035
14/18035
15/18035
16/18035
17/18035
18/18035
19/18035
20/18035
21/18035
22/18035
23/18035
24/18035
25/18035
26/18035
27/18035
28/18035
29/18035
30/18035
31/18035
32/18035
33/18035
34/18035
35/18035
36/18035
37/18035
38/18035
39/18035
40/18035
41/18035
42/18035
43/18035
44/18035
45/18035
46/18035
47/18035
48/18035
49/18035
50/18035
51/18035
52/18035
53/18035
54/18035
55/18035
56/18035
57/18035
58/18035
59/18035
60/18035
61/18035
62/18035
63/18035
64/18035
65/18035
66/18035
67/18035
68/18035
69/18035
70/18035
71/18035
72/18035
73/18035
74/18035
75/18035
76/18035
77/18035
78/18035
79/18035
80/18035
81/18035


In [None]:
print('min col: ' + str(min(arr_col)))
print('min row: ' + str(min(arr_row)))
print('max col: ' + str(max(arr_col)))
print('max row: ' + str(max(arr_row)))
print('avg row: ' + str((sum(arr_row)/len(arr_row))))
print('avg col: ' + str((sum(arr_col)/len(arr_col))))

In [None]:
avg_value = 0 if len(somelist) == 0 else sum(somelist)/len(somelist)

In [None]:
#It saves the bag of words into a file:
np.savetxt("bow.csv", bow, delimiter=";")

In [None]:
# It gets the list of words from the group description[0]
list_words = get_list_of_words(group_descriptions[0])
list_words

In [None]:
for desc_id in group_descriptions[0]:
    words = itemlist.items_list[desc_id].get_item_dict()['palavras']
    print(words)

In [None]:
#It runs xmeans on the bag of words and returns clusters:
def cluster_by_xmeans(bow):
    xmeans_instance = xmeans(bow, ccore=False)
    xmeans_instance.process();
    clusters = xmeans_instance.get_clusters();
    
    return clusters

In [None]:
def translate_id_to_descriptions(ids, descriptions_ids):
    arr = []
    
    for i in ids:
        arr.append(descriptions_ids[i])
    return arr

In [None]:
def cluster_on_first_token_groups_bow(items_list):
    # It emplys the first token approach to group the descriptions:
    first_token_groups = itemlist.get_first_token_groups()
    # It creates a list of the the keys of these groups:
    groups = list(first_token_groups.keys())
    # It gets the values of each group (i.e., the id of the descriptions into that group):
    group_descriptions = list(first_token_groups.values())
    # It defines the dictionary that will have the clustering with first token
    # together with x-means considering a bag-of-words of the descriptions 
    # grouped by the first token approach:
    first_token_plus_bow_xmeans = {}
    # Iterator of the first token groups:
    ft_it = 0
    
    while ft_it < len(groups):
        if(len(group_descriptions[ft_it]) > 30):
            # Bag of words for the group 0:
            bow = define_description_bow(group_descriptions[ft_it])
    
            #It applies the clusters on the bow of the descriptions - group 0:
            clusters_bow = cluster_by_xmeans(bow)
            it = 0
            for c in clusters_bow:
                # It translates ids from x-means to actual descriptions (new groups):
                desc_ids = translate_id_to_descriptions(c, group_descriptions[ft_it])
                # It defines the key of the map:
                new_key = groups[ft_it] + '_' + str(it)
                # It sets the maps:
                first_token_plus_bow_xmeans[new_key] = desc_ids
                it = it + 1
        else:
            first_token_plus_bow_xmeans[groups[ft_it]] = group_descriptions[ft_it]
        
        ft_it = ft_it + 1
        
    return first_token_plus_bow_xmeans