In [1]:
#Importing common libraries
import pandas as pd
import numpy as np
import collections
import copy
import random
import matplotlib.pyplot as plt

#Importing text preprocessing methods:
from nlp.preprocessing import (
    clean_text,
    preprocess,
    tokenize,
    preprocess_document,
    tokenize_document,
    get_stopwords, 
    lemmatization_document,
    get_canonical_words)
from textpp_ptbr.preprocessing import TextPreProcessing as tpp
from gensim.parsing.preprocessing import (
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_punctuation2,
    strip_short)

#Importing libraries to check spelling:
from item.spellcheckeropt import SpellcheckerOpt
from item.utils import get_tokens_set


#Importing text analysis:
from nlp.utils import (
    plot_histogram,
    get_completetext,
    plot_wordcloud,
    print_statistics,
    groups_frequency_sort)

#Importing text statistics:
from nlp.text_statistics import (
    count_tokens,
    unique_tokens
)

#Importing baseline approaches for clustering:
from nlp.grouping import (
    get_groups,
    get_groups_size,
    get_unigram_groups,
    get_two_tokens_groups,
    get_first_token_groups,
    get_bigram_groups,
    get_first_two_groups,
    groups_frequency_sort
)

#Importing the structure of the descriptions:
from utils.read_files import (
    get_items)
from item.item_list import (
    ItemList,
    Item
)

#Importing xmeans through pyclustering library:
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;
from pyclustering.cluster.xmeans import xmeans

#Importing the HDBSCAN stand-alone method:
import hdbscan


#Importing the multiprocessing library:
import multiprocessing

#Importing the libraries used to save the final results and load them back later:
import nltk
import pickle


def get_list_of_words(group_desc, itemlist, medicines, canonical_form, word_class):
    """Collect the sorted vocabulary (medicines and nouns) of a group of descriptions.

    Parameters
    ----------
    group_desc : iterable
        Ids of the descriptions in the group; each id is used to index
        ``itemlist.items_list``.
    itemlist : object
        Container whose ``items_list[desc_id].get_item_dict()['palavras']``
        yields the words of one description.
    medicines : container
        Words that are always accepted, whatever their class.
    canonical_form : any
        Not used by this function; kept so the signature matches the other
        bag-of-words helpers.
    word_class : mapping
        Maps a word to its class tag; words tagged ``'N'`` (nouns) are accepted.

    Returns
    -------
    list
        Every accepted word exactly once, in ascending sorted order.
    """
    # A set gives constant-time duplicate checks; the original scanned a list
    # for every word, which is quadratic in the vocabulary size.
    accepted = set()

    for desc_id in group_desc:
        words = itemlist.items_list[desc_id].get_item_dict()['palavras']
        for word in words:
            if word in accepted:
                continue
            if (word in medicines) or ((word in word_class) and (word_class[word] == 'N')):
                accepted.add(word)

    return sorted(accepted)

def define_zero_matrix(group_desc, itemlist, medicines, canonical_form, word_class):
    """Allocate an all-zero bag-of-words matrix for a group of descriptions.

    The matrix has one row per description id in ``group_desc`` and one column
    per word returned by ``get_list_of_words`` (medicines and nouns found in
    the group's descriptions).

    Returns
    -------
    tuple
        ``(matrix, vocabulary, n_rows, n_columns)`` where ``matrix`` is the
        NumPy zero matrix and ``vocabulary`` is the list of column words.
    """
    vocabulary = get_list_of_words(group_desc, itemlist, medicines, canonical_form, word_class)
    n_rows, n_columns = len(group_desc), len(vocabulary)

    return np.zeros((n_rows, n_columns)), vocabulary, n_rows, n_columns

def define_description_bow(group_desc, itemlist, medicines, canonical_form, word_class):
    """Build the binary bag-of-words matrix of a group of descriptions.

    Row ``i`` corresponds to the ``i``-th id in ``group_desc``; column ``k``
    corresponds to the ``k``-th word of the vocabulary produced by
    ``define_zero_matrix``. A cell is set to ``1.0`` when the description
    contains that word and stays ``0.0`` otherwise.

    Returns
    -------
    tuple
        ``(bow, rows, columns)``: the filled matrix and its two dimensions.
    """
    bow, vocabulary, rows, columns = define_zero_matrix(
        group_desc, itemlist, medicines, canonical_form, word_class)

    # Precompute word -> column once. The original did a membership scan and
    # an ``index`` scan over the vocabulary list for every single word.
    # ``setdefault`` keeps the first position, exactly like ``list.index``.
    column_of = {}
    for column, word in enumerate(vocabulary):
        column_of.setdefault(word, column)

    for row, desc_id in enumerate(group_desc):
        words = itemlist.items_list[desc_id].get_item_dict()['palavras']
        for word in words:
            column = column_of.get(word)
            if column is not None:
                bow[row, column] = 1.0

    return bow, rows, columns

def cluster_by_xmeans(bow, number_of_descriptions):
    """Cluster the rows of a bag-of-words matrix with pyclustering's X-Means.

    ``kmax`` is set to ``round(number_of_descriptions / 30)``, i.e. roughly one
    allowed cluster per 30 descriptions.

    Returns whatever ``xmeans.get_clusters()`` yields after processing
    (presumably one list of row indices per cluster - confirm against the
    pyclustering documentation).
    """
    model = xmeans(bow, kmax=round(number_of_descriptions / 30), ccore=False)
    model.process()

    return model.get_clusters()

def transform_sklearn_to_pyclustering(output):
    """Convert a per-sample label sequence into lists of sample indices.

    sklearn-style clusterers return one label per sample, whereas the
    pyclustering-style result used in this notebook is a list of clusters,
    each holding the indices of its samples.

    Parameters
    ----------
    output : sequence
        ``output[i]`` is the (hashable) label of sample ``i``.

    Returns
    -------
    list of list of int
        One index list per distinct label, ordered by the first appearance of
        the label; indices inside each list are ascending.
    """
    members_by_label = {}

    for position in range(len(output)):
        members_by_label.setdefault(output[position], []).append(position)

    return list(members_by_label.values())


def cluster_by_hdbscan(bow, employed_metric, groups_ft, min_cluster_size=30):
    """Cluster the rows of a bag-of-words matrix with HDBSCAN.

    Parameters
    ----------
    bow : array-like
        Bag-of-words matrix, one row per description.
    employed_metric : str
        Distance metric name forwarded to ``hdbscan.HDBSCAN``.
    groups_ft : str
        Key of the first-token group being clustered. Noise points (label
        ``-1``) are relabelled ``groups_ft + '_-1'`` so that they are tied to
        the group they came from.
    min_cluster_size : int, optional
        Forwarded to ``hdbscan.HDBSCAN``; defaults to the previously
        hard-coded value of 30.

    Returns
    -------
    list of list of int
        One list of row indices per distinct label, as produced by
        ``transform_sklearn_to_pyclustering``.
    """
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric=employed_metric)
    cluster_labels = clusterer.fit_predict(bow)

    print('before changing the default output')
    print(cluster_labels)

    # Build a separate Python list instead of writing into the label array.
    # The original aliased the integer NumPy array and assigned a string into
    # it, which raises ValueError as soon as one noise point exists.
    cluster_labels_post = []
    for label in cluster_labels:
        if label == -1:
            cluster_labels_post.append(groups_ft + '_' + str(label))
        else:
            cluster_labels_post.append(int(label))

    print('after changing the default output')
    print(cluster_labels_post)
    clusters = transform_sklearn_to_pyclustering(cluster_labels_post)

    print('building the final output')
    print(clusters)

    return clusters


def general_clustering(bow, groups_ft, number_of_descriptions, cluster_option):
    """Dispatch to the clustering method selected by ``cluster_option``.

    * ``cluster_option == 1``: HDBSCAN with the ``'l2'`` metric.
    * ``cluster_option == 2``: HDBSCAN with the ``'hamming'`` metric.
    * any other value (including ``0``): X-Means.

    ``groups_ft`` is only used by the HDBSCAN variants, which use it to label
    outliers with the first-token group they belong to.
    ``number_of_descriptions`` is only used by X-Means.
    """
    if cluster_option == 1:
        return cluster_by_hdbscan(bow, 'l2', groups_ft)

    if cluster_option == 2:
        return cluster_by_hdbscan(bow, 'hamming', groups_ft)

    # Option 0 and every unrecognised option share the X-Means path.
    return cluster_by_xmeans(bow, number_of_descriptions)


def translate_id_to_descriptions(ids, descriptions_ids):
    """Map positions produced by a clustering method back to description ids.

    Each element of ``ids`` is used as an index into ``descriptions_ids``; the
    looked-up values are returned as a new list in the same order.
    """
    return [descriptions_ids[position] for position in ids]


def cluster_on_first_token_groups_bow(first_token_groups, itemlist, it_thread, lower, upper, medicines, canonical_form, word_class, Return_dict, cluster_option):
    """Re-cluster a slice of the first-token groups using a bag-of-words model.

    Intended to run as one worker process. Groups are addressed by their
    position in ``first_token_groups``; this worker handles positions
    ``lower`` to ``upper``, both inclusive.

    Parameters
    ----------
    first_token_groups : dict
        Group key (str) -> list of description ids.
    itemlist : object
        Description container forwarded to ``define_description_bow``.
    it_thread : int
        Worker identifier; used in the progress output and as the key under
        which the result is stored in ``Return_dict``.
    lower, upper : int
        Inclusive range of group positions to process. ``upper`` is clamped to
        the last valid position, and ``lower > upper`` means "nothing to do".
    medicines, canonical_form, word_class
        Vocabulary filters forwarded to ``define_description_bow``.
    Return_dict : dict-like
        Receives ``Return_dict[it_thread] = result``.
    cluster_option : int
        Method selector forwarded to ``general_clustering``.

    Notes
    -----
    A group is split only when it has at least 30 descriptions and its
    bag-of-words matrix is non-empty. Sub-clusters are stored under
    ``<group key>_<n>``; every other group is copied unchanged under its
    original key.
    """
    print(it_thread)
    # Keys and values are materialised so that groups can be addressed by position.
    groups = list(first_token_groups.keys())
    group_descriptions = list(first_token_groups.values())
    refined_groups = {}

    # Clamp the inclusive upper bound: an out-of-range value used to raise
    # IndexError, killing the worker before it stored any result.
    last_position = min(upper, len(groups) - 1)

    for ft_it in range(lower, last_position + 1):
        print(str(it_thread) + ': ' + str(ft_it) + '/' + str(last_position))
        group_key = groups[ft_it]
        description_ids = group_descriptions[ft_it]

        # Small groups are kept as they are.
        if len(description_ids) < 30:
            refined_groups[group_key] = description_ids
            continue

        bow, rows, columns = define_description_bow(
            description_ids, itemlist, medicines, canonical_form, word_class)

        # Without rows or vocabulary columns there is nothing to cluster on.
        if not (rows > 0 and columns > 0):
            refined_groups[group_key] = description_ids
            continue

        clusters_bow = general_clustering(bow, group_key, len(description_ids), cluster_option)
        for it, cluster in enumerate(clusters_bow):
            # Cluster members are row positions; convert them back to description ids.
            refined_groups[group_key + '_' + str(it)] = translate_id_to_descriptions(cluster, description_ids)

    Return_dict[it_thread] = refined_groups


def get_ranges(group_len, n_threads):
    """Split ``group_len`` group positions into ``n_threads`` contiguous ranges.

    Parameters
    ----------
    group_len : int
        Number of groups to distribute (positions ``0 .. group_len - 1``).
    n_threads : int
        Number of worker processes; must be at least 1.

    Returns
    -------
    tuple of (list of int, list of int)
        ``(lower, upper)`` where worker ``k`` handles positions ``lower[k]``
        to ``upper[k]``, both inclusive. The ranges are contiguous, do not
        overlap and together cover every position exactly once; their sizes
        differ by at most one. When there are fewer groups than workers the
        surplus workers get an empty range (``upper[k] == lower[k] - 1``).

    Raises
    ------
    ValueError
        If ``n_threads`` is smaller than 1.

    Notes
    -----
    Two lists are returned for every ``n_threads``, including 1, because the
    caller indexes the result as ``ranges[0][i]`` / ``ranges[1][i]``. The
    previous version returned two bare ints in that case and, for several
    workers, relied on a hand-tuned correction that overshot the last valid
    position for many inputs.
    """
    if n_threads < 1:
        raise ValueError('n_threads must be at least 1')

    # The first ``extra`` workers take one additional group each.
    base_size, extra = divmod(group_len, n_threads)

    lower = []
    upper = []
    start = 0
    for k in range(n_threads):
        size = base_size + (1 if k < extra else 0)
        lower.append(start)
        upper.append(start + size - 1)
        start += size

    return lower, upper

In [3]:
# Shared state for the worker processes: each worker stores its result in
# Return_dict under its own index, and `jobs` keeps the Process handles so a
# later cell can join them.
manager = multiprocessing.Manager()
Return_dict = manager.dict()
jobs = []
# Number of worker processes to launch.
n_threads = 10

# Load the medical terms (medicines, drugs, etc.):
medicines = get_tokens_set('../dados/palavras/medications.txt')
# Load the canonical forms of the words and their classes:
canonical_form, word_class = get_canonical_words()

# Load the preprocessed descriptions:
itemlist = ItemList()
itemlist.load_items_from_file('../dados/items_preprocessed.zip')
# From here on `itemlist` holds the preprocessed descriptions.

print('Read data preprocessed')
# Group the descriptions by their first token; these groups are the input
# that the workers refine further.
first_token_groups = itemlist.get_first_token_groups()
group_len = len(first_token_groups)
# Filled by a later cell with the same groups in shuffled key order.
first_token_groups_new = {}

# Shuffle the group keys so that the contiguous ranges handed to the workers
# contain a random mix of groups.
# NOTE(review): no random seed is set, so the order differs on every run, and
# the second shuffle adds nothing over the first - confirm both are intended.
keys_ft = list(first_token_groups.keys())
random.shuffle(keys_ft)
random.shuffle(keys_ft)

Read data preprocessed


In [5]:
# Clustering method used by every worker (see general_clustering):
# 1 -> HDBSCAN with 'l2', 2 -> HDBSCAN with 'hamming', anything else -> X-Means.
cluster_option = 2

# Copy the first-token groups into the new dictionary following the shuffled key order:
first_token_groups_new.update((key, first_token_groups[key]) for key in keys_ft)

# Compute the inclusive range of group positions each process works on:
thread_ranges = get_ranges(group_len, n_threads)
print('Read ranges')
print(thread_ranges)

# Start one process per range (the load is balanced by the shuffled key order):
for i in range(n_threads):
    p = multiprocessing.Process(
        target=cluster_on_first_token_groups_bow,
        args=(
            first_token_groups_new,
            itemlist,
            i,
            thread_ranges[0][i],
            thread_ranges[1][i],
            medicines,
            canonical_form,
            word_class,
            Return_dict,
            cluster_option,
        ),
    )
    jobs.append(p)
    p.start()

In [None]:
# Wait for every worker process to finish.
for proc in jobs:
    proc.join()

# A worker that crashed never writes its entry into Return_dict. Report that
# explicitly instead of failing below with a bare KeyError.
failed_workers = [i for i in range(n_threads) if i not in Return_dict]
if failed_workers:
    exit_codes = [proc.exitcode for proc in jobs]
    raise RuntimeError(
        'Workers ' + str(failed_workers) + ' produced no result '
        '(process exit codes: ' + str(exit_codes) + '); '
        'check the worker output above for the traceback.'
    )

# Merge the per-worker dictionaries into a single mapping of cluster key -> description ids.
dictionary_clusters = {}
for i in range(n_threads):
    dictionary_clusters.update(Return_dict[i])