In [1]:
import gensim
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
import numpy as np
import pandas as pd
from collections import OrderedDict
from math import floor,ceil
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full

function to load the files and do basic pre-processing

In [2]:
def loadAndProcess(filename): 
    f=open(filename)
    text=f.read()
    sents=sent_tokenize(text) #break the document into smaller sentences
    
    #remove sentences that have less than 5 words in total
    #i=0
    #while(i<len(sents)):
     #   sent=nltk.word_tokenize(sents[i])
      #  if len(sent)<5:
       #     del sents[i]
       # i=i+1
        
    #remove stop words
    for i in range(len(sents)):
        words=nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sents[i])
        words=[w.lower() for w in words if w not in stopwords.words('english')]
        words=" ".join(words)
        sents[i]=words
        
    return(sents)

function to load the files without any pre-processing, i.e. original sentences

In [3]:
def loadOnly(filename):
    f=open(filename)
    text=f.read()
    sents=sent_tokenize(text) #break the document into smaller sentences
    
    #remove sentences that have less than 5 words in total
    #i=0
    #while(i<len(sents)):
     #   sent=nltk.word_tokenize(sents[i])
      #  if len(sent)<5:
       #     del sents[i]
        #i=i+1
    return(sents)        

prepare a dictionary and corpus for lda

In [4]:
def prepareCorpus(sentences):
    text=list()
    for i in range(len(sentences)):
        words=nltk.word_tokenize(sentences[i])
        text.append(words)
    dictionary = corpora.Dictionary(text)
    corpus = [dictionary.doc2bow(t) for t in text]
    return(dictionary,corpus)

Functions that we will use later

In [5]:
def generateSimilarityMatrix():
    similarityMatrix=np.zeros((numOfTopics,numOfTopics))
    for i in range(numOfTopics):
        ti_dist=sorted(ldamodel.get_topic_terms(i,topn=ldamodel.num_terms))
        for j in range(numOfTopics):
            tj_dist=sorted(ldamodel.get_topic_terms(j,topn=ldamodel.num_terms))
            sim=1-hellinger(ti_dist,tj_dist)
            similarityMatrix[i,j]=sim
    return(similarityMatrix)

In [6]:
def greaterThanThreshhold(matrix,thresh):
    m,n=matrix.shape
    greaterThan=False
    for i in range(m):
        for j in range(n):
            if i==j:
                continue
            else:
                if matrix[i,j]>thresh:
                    greaterThan=True
    return(greaterThan) 

load and run lda

In [7]:
original=loadOnly("data")  #original sentences without any pre processing
sents=loadAndProcess("data") #processed sentences
dic,corp=prepareCorpus(sents) #dictionary and corpus

start with an arbitrary number of topics

In [8]:
numOfTopics=10          #change as per need

In [10]:
i=0
while(True):
    print(i,"iteration")
    ldamodel = gensim.models.ldamodel.LdaModel(corpus=corp, num_topics=numOfTopics, id2word = dic, passes=20)    
    sm=generateSimilarityMatrix()
    greaterThan=greaterThanThreshhold(sm,0.55)
    
    if(greaterThan):
        numOfTopics=numOfTopics-1
    else:
        print(numOfTopics)
        break
 
    i=i+1

0 iteration
1 iteration
2 iteration
3 iteration
7


create a sentences-topic matrix which is stored in a numpy array

In [11]:
docsTopicMatrix=np.zeros((len(sents),numOfTopics))
for i in range(len(sents)):
    topicsList=ldamodel.get_document_topics(corp[i],minimum_probability=0)
    for j in range(len(topicsList)):
        docsTopicMatrix[i,j]=topicsList[j][1]  

For each topic we pick out the best (2\*n/3k). This gives us a reduced set of sentences for each topic. These sentence numbers are stored in a numpy array.  

In [12]:
numOfSents=docsTopicMatrix.shape[0]
reducedNumOfSents=floor((2*numOfSents)/(3*numOfTopics))
reduced_sent_matrix=np.zeros((reducedNumOfSents,numOfTopics))

Loop through each column of the docs-Topic matrix and pick out the top 2n/3k sentence numbers for each topic. 

In [13]:
for j in range(numOfTopics):
    #for each column, store sentence number and weight pairs in a matrix
    temp_dict={}
    for i in range(numOfSents):
        temp_dict[i]=docsTopicMatrix[i,j]
    
    #choose the top 2n/3k sentences by finding the max element and deleting it from the dictionary 2n/3k times     
    for i in range(reducedNumOfSents):
        k=max(temp_dict, key=temp_dict.get)
        v=temp_dict.pop(k, None)
        reduced_sent_matrix[i,j]=k

In [14]:
reduced_sent_matrix

array([[ 42.,  99.,  92.,   3.,  87.,  45.,  60.],
       [ 54.,  56.,  74.,  61.,  95.,  84.,  14.],
       [ 40.,  80.,   1.,  51.,  43.,  15.,  35.],
       [ 88.,  98.,  46.,  66.,  34.,  67.,  83.],
       [ 16.,  75.,  38.,  78.,  10.,  65.,  63.],
       [ 47.,  62.,  86.,  21.,  41.,  27.,  44.],
       [ 59.,  71.,  49.,  77.,  90.,  55.,  89.],
       [ 69.,   8.,  94.,  76.,  23.,  13.,  64.],
       [ 29.,  96.,  57.,  91.,   5.,   2.,  48.]])