In [22]:
import nltk
import re
import numpy as np
from gensim import corpora, models, similarities
from openpyxl import load_workbook
from openpyxl import Workbook
from pprint import pprint
from sklearn.cluster import KMeans

#only need to do the following once
#nltk.download('punkt')
#nltk.download('stopwords')

In [29]:
'''
Approach #1: topic modeling: each doc belongs to the topic with biggest probability. Output saved in OUTPUT_FILE.
Approach #2: k-means based on topic distribution vector. Output saved in OUTPUT_FILE_K.

@param
NUM_OF_TOPICS: number of topics, used in Approach #1 topic modeling
NUM_OF_CLUSTERS: number of clusters, used in Approach #2 k-means
FILE: 'healthcare' or 'security'
'''

NUM_OF_TOPICS = 5
NUM_OF_CLUSTERS = 5
FILE = 'healthcare'
INPUT_FILE = FILE + '.xlsx'
OUTPUT_FILE = FILE + '_output_tm.xlsx'
OUTPUT_FILE_K = FILE + '_output_kmeans.xlsx'

# workbook and first sheet
#@param: workbook name: healthcare.xlsx, or security.xlsx
wb = load_workbook(INPUT_FILE)
ws = wb[wb.get_sheet_names()[0]]
# convert to list of tuples, each tuple is a row of cells in the excel
ws_list = list(ws.rows)
#ignore the first row (header)
del ws_list[0]

documents = []
for row in ws_list:
    tmp = ""
    if(row[3].value != None):
        tmp = row[3].value + " "
    if(row[4].value != None):
        tmp = tmp + row[4].value
    documents.append(tmp)
#print(documents[0:1])
#print(len(documents))

In [18]:
# substitute with real document later, need to tokenize based on space, comma, period, question mark, and quotes.
'''
# test data
documents = ["Human machine. interface for lab abc computer applications",
             "A survey of,user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",              
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]
'''

# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# a tokenizer and stemmer, convert to lower case
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
    #return filtered_tokens

# tokenize and stemming
tokenized_text = [tokenize_and_stem(doc) for doc in documents]

# remove stopwords; texts is a list of list of tokenized words
stopwords = nltk.corpus.stopwords.words('english')
texts = [[word for word in text if word not in stopwords] 
         for text in tokenized_text]
#pprint(texts)


# create a Gensim dictionary
dictionary = corpora.Dictionary(texts)
#print(dictionary)
# remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)
#print(dictionary)

# show words and their ids
#print(dictionary.token2id)

# convert documents to vectors, e.g. [[(0, 1), (1, 1)],[...]]
corpus = [dictionary.doc2bow(text) for text in texts]
#print("Corpus:")
#pprint(corpus)

#Latent Dirichlet Allocation(LDA) topic modeling
#lda = models.LdaModel(corpus, num_topics=NUM_OF_TOPICS,id2word=dictionary,update_every=5, chunksize=10000, passes=5)
#lda.show_topics()

lda = models.LdaModel(corpus, num_topics = NUM_OF_TOPICS,id2word = dictionary)
topics_matrix = lda.show_topics(formatted=False, num_words=20)
#print(topics_matrix[0])

#topics is a list of list of topics, e.g.: [['peopl', 'healthcar', 'help'],...]
topics = []
for topic in topics_matrix:
    topics.append([words for (words,freq) in topic[1]])
print(topics[0])

['peopl', 'help', 'healthcar', 'improv', 'villag', 'doctor', 'provid', 'would', 'basic', 'sauri', 'educ', 'livelihood', 'could', 'clean', 'free', 'fund', 'hospit', 'health', 'awar', 'diseas']


In [71]:
# clustering
doc_lda = []
for doc_bow in corpus:
    #doc_lda.append(lda[doc_bow]) #works too.
    doc_lda.append(lda.get_document_topics(doc_bow))
print(doc_lda[0:5])

[[(0, 0.011346224051494751), (1, 0.011313701854032174), (2, 0.95463117822926513), (3, 0.011356962119535859), (4, 0.011351933745672005)], [(0, 0.015592579010758247), (1, 0.015649729840445183), (2, 0.93760785978776673), (3, 0.01565136467616204), (4, 0.015498466684867797)], [(0, 0.18733965528971688), (4, 0.78471461274136123)], [(0, 0.012766312963106706), (1, 0.012630910931382544), (2, 0.012678852502441945), (3, 0.94919250030058733), (4, 0.012731423302481396)], [(4, 0.98227955820529067)]]


In [82]:
# Approach #1: cluster a document to the cluster with the highest topic score
clusters = [[] for i in range(0,NUM_OF_TOPICS)] # each element is a cluster, [[doc_id, ...]...]
doc_topic_prob = [] #[[topic_id, prob]...], each element corresponds to a doc

for topic_distr in doc_lda:
    score = max(topic_distr,key=lambda x:x[1])
    doc_topic_prob.append(score)
#print(doc_topic_prob[0:2])

for (index,item) in enumerate(doc_topic_prob):
    clusters[item[0]].append(index)
print(clusters[0:5])

[[5, 7, 17, 22, 27, 38, 46, 48, 58, 63, 64, 65, 68, 71, 72, 81, 82, 84, 97, 100, 101, 102, 103, 104, 105, 106, 109, 115, 119, 124, 126, 128, 129, 131, 135, 143, 151, 158, 163, 165, 168, 169, 176, 180, 181, 186, 189, 199, 205], [15, 23, 24, 29, 35, 36, 37, 42, 47, 54, 74, 80, 99, 112, 120, 122, 130, 134, 138, 141, 145, 148, 149, 153, 161, 192, 200, 209], [0, 1, 11, 13, 16, 18, 26, 28, 31, 33, 40, 51, 66, 70, 78, 89, 92, 96, 111, 113, 121, 123, 127, 133, 137, 144, 147, 150, 159, 171, 172, 173, 182, 190, 194, 198, 201, 202, 204, 207], [3, 6, 8, 10, 12, 14, 19, 20, 21, 25, 41, 44, 45, 49, 50, 53, 55, 56, 60, 61, 62, 67, 73, 75, 83, 85, 87, 88, 90, 94, 95, 98, 107, 132, 136, 139, 142, 146, 152, 156, 157, 160, 162, 164, 166, 167, 170, 175, 177, 179, 183, 184, 193, 195, 196, 203, 208], [2, 4, 9, 30, 32, 34, 39, 43, 52, 57, 59, 69, 76, 77, 79, 86, 91, 93, 108, 110, 114, 116, 117, 118, 125, 140, 154, 155, 174, 178, 185, 187, 188, 191, 197, 206]]


In [83]:
#put original data values to a list, prepare for write
#print(ws_list[0:2])
ws_list_o = []
for row in ws_list:
    row_o = []
    for cell in row:
        row_o.append(cell.value)
    ws_list_o.append(row_o)
#print(ws_list_o[0:2])

# Approach #1: write clusters in the OUTPUT_FILE. Clusters seperated by an empty line.

def write_output(clusters, output_file, topics=None):
    wb_o = Workbook()
    ws_o = wb_o.active  #the first spreadsheet
    for index, cluster in enumerate(clusters):
        if topics != None:
            ws_o.append(topics[index])
        for document_id in cluster:
            ws_o.append(ws_list_o[document_id])
        ws_o.append([])
    wb_o.save(output_file)
    
write_output(clusters, OUTPUT_FILE, topics)

In [81]:
# Approach #2: k-means clustering, using topic distribution vector as input vector
km_input = []

for doc in doc_lda:
    vec = []
    # some len(doc) < NUM_OF_TOPICS, need to patch them with 0s. -> needed in km.fix() function.
    if len(doc) == NUM_OF_TOPICS:
        vec = [y for (x,y) in doc]
    else:        
        for i in range(0, NUM_OF_TOPICS):
            tmp = [x for x, y in enumerate(doc) if y[0] == i]
            if tmp == []:
                vec.append(0)
            else:
                vec.append(doc[tmp[0]][1])
    km_input.append(vec)
#print(km_input[0:5])

#to check whether all elements in km_input has the same length
#print(np.unique(list(map(len, km_input))))
X = np.array(km_input)
km = KMeans(n_clusters=NUM_OF_CLUSTERS, random_state=0).fit(X)

kmeans_labels = km.labels_.tolist()
print(kmeans_labels)

clusters_kmeans = [[] for i in range(0, NUM_OF_CLUSTERS)] # each element is a cluster, [[doc_id, ...]...]
for index, cluster in enumerate(kmeans_labels):
    clusters_kmeans[cluster].append(index)
print(clusters_kmeans[0:5])

[0, 0, 3, 2, 3, 1, 2, 1, 2, 3, 2, 0, 2, 0, 2, 4, 0, 1, 0, 2, 2, 2, 1, 4, 4, 2, 0, 1, 0, 4, 3, 0, 3, 0, 3, 4, 4, 4, 1, 3, 0, 2, 4, 3, 2, 2, 1, 4, 1, 2, 2, 0, 3, 2, 4, 2, 2, 3, 1, 3, 2, 2, 2, 1, 1, 1, 0, 2, 1, 3, 0, 1, 1, 2, 4, 2, 3, 3, 0, 3, 4, 1, 1, 2, 1, 2, 3, 2, 2, 0, 2, 3, 0, 3, 2, 2, 0, 1, 2, 4, 1, 1, 3, 1, 1, 1, 1, 2, 3, 1, 3, 0, 4, 0, 3, 1, 3, 3, 3, 1, 4, 0, 4, 0, 1, 3, 1, 0, 1, 1, 4, 1, 2, 0, 4, 1, 2, 0, 4, 2, 3, 4, 2, 1, 0, 4, 2, 0, 4, 4, 0, 1, 2, 4, 3, 3, 2, 2, 1, 0, 2, 4, 2, 1, 2, 1, 2, 2, 1, 1, 2, 0, 0, 0, 3, 2, 1, 2, 3, 2, 1, 1, 0, 2, 2, 3, 1, 3, 3, 1, 0, 3, 4, 2, 0, 2, 2, 3, 0, 1, 4, 0, 0, 2, 0, 1, 3, 0, 2, 4]
[[0, 1, 11, 13, 16, 18, 26, 28, 31, 33, 40, 51, 66, 70, 78, 89, 92, 96, 111, 113, 121, 123, 127, 133, 137, 144, 147, 150, 159, 171, 172, 173, 182, 190, 194, 198, 201, 202, 204, 207], [5, 7, 17, 22, 27, 38, 46, 48, 58, 63, 64, 65, 68, 71, 72, 81, 82, 84, 97, 100, 101, 103, 104, 105, 106, 109, 115, 119, 124, 126, 128, 129, 131, 135, 143, 151, 158, 163, 165, 168, 169, 1

In [84]:
# Approach #2: write output in OUTPUT_FILE_K.
write_output(clusters_kmeans, OUTPUT_FILE_K)