In [1]:
import gensim
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
import numpy as np
import pandas as pd
from collections import OrderedDict
from math import floor,ceil
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
import os

function to load individual document from a folder

In [2]:
def loadBook(folder,filename):
    filepath=folder + "/" + filename
    f=open(filepath)
    raw=f.read()
    return(raw)

function to load all the books from a folder, break them into sentences and remove stopwords from each sentence. 

In [11]:
def loadAndProcess(folder): 
    docs=list()
    for file in os.listdir(folder):
        #print(file)
        book=loadBook(folder,file)
        docs.append(book)
    
    text=" ".join(docs)
    sents=sent_tokenize(text) #break the document into smaller sentences
    
    #remove stop words
    for i in range(len(sents)):
        words=nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sents[i])
        words=[w.lower() for w in words if w not in stopwords.words('english')]
        words=" ".join(words)
        sents[i]=words
        
    return(sents)

function to load the docs without any pre-processing, i.e. original sentences

In [12]:
def loadOnly(folder):
    docs=list()
    for file in os.listdir(folder):
        #print(file)
        book=loadBook(folder,file)
        docs.append(book)
    
    text=" ".join(docs)
    sents=sent_tokenize(text) #break the document into smaller sentences

    return(sents)        

prepare a dictionary and corpus for lda

In [5]:
def prepareCorpus(sentences):
    text=list()
    for i in range(len(sentences)):
        words=nltk.word_tokenize(sentences[i])
        text.append(words)
    dictionary = corpora.Dictionary(text)
    corpus = [dictionary.doc2bow(t) for t in text]
    return(dictionary,corpus)

Functions that we will use later

In [6]:
def generateSimilarityMatrix():
    similarityMatrix=np.zeros((numOfTopics,numOfTopics))
    for i in range(numOfTopics):
        ti_dist=sorted(ldamodel.get_topic_terms(i,topn=ldamodel.num_terms))
        for j in range(numOfTopics):
            tj_dist=sorted(ldamodel.get_topic_terms(j,topn=ldamodel.num_terms))
            sim=1-hellinger(ti_dist,tj_dist)
            similarityMatrix[i,j]=sim
    return(similarityMatrix)

In [7]:
def greaterThanThreshhold(matrix,thresh):
    m,n=matrix.shape
    greaterThan=False
    for i in range(m):
        for j in range(n):
            if i==j:
                continue
            else:
                if matrix[i,j]>thresh:
                    greaterThan=True
    return(greaterThan) 

load and run lda

In [13]:
original=loadOnly("docs")
#original sentences without any pre processing
sents=loadAndProcess("docs") #processed sentences
dic,corp=prepareCorpus(sents) #dictionary and corpus

set number of topics to 2 times the number of documents

In [23]:
numOfDocs=len(os.listdir("docs"))   

In [24]:
numOfTopics=2*numOfDocs         #change as per need

In [25]:
i=0
while(True):
    print(i,"iteration")
    ldamodel = gensim.models.ldamodel.LdaModel(corpus=corp, num_topics=numOfTopics, id2word = dic, passes=20)    
    sm=generateSimilarityMatrix()
    greaterThan=greaterThanThreshhold(sm,0.55)
    
    if(greaterThan):
        numOfTopics=numOfTopics-1
    else:
        print(numOfTopics)
        break
 
    i=i+1

0 iteration
22


create a sentences-topic matrix which is stored in a numpy array

In [26]:
docsTopicMatrix=np.zeros((len(sents),numOfTopics))
for i in range(len(sents)):
    topicsList=ldamodel.get_document_topics(corp[i],minimum_probability=0)
    for j in range(len(topicsList)):
        docsTopicMatrix[i,j]=topicsList[j][1]  

For each topic we pick out the best (2\*n/3k). This gives us a reduced set of sentences for each topic. These sentence numbers are stored in a numpy array.  

In [27]:
numOfSents=docsTopicMatrix.shape[0]
reducedNumOfSents=floor((2*numOfSents)/(3*numOfTopics))
reduced_sent_matrix=np.zeros((reducedNumOfSents,numOfTopics))

Loop through each column of the docs-Topic matrix and pick out the top 2n/3k sentence numbers for each topic. 

In [28]:
for j in range(numOfTopics):
    #for each column, store sentence number and weight pairs in a matrix
    temp_dict={}
    for i in range(numOfSents):
        temp_dict[i]=docsTopicMatrix[i,j]
    
    #choose the top 2n/3k sentences by finding the max element and deleting it from the dictionary 2n/3k times     
    for i in range(reducedNumOfSents):
        k=max(temp_dict, key=temp_dict.get)
        v=temp_dict.pop(k, None)
        reduced_sent_matrix[i,j]=k

In [29]:
reduced_sent_matrix

array([[  49.,  225.,  222.,  184.,  128.,   54.,   99.,  316.,  365.,
         416.,  329.,  422.,   84.,  101.,  233.,  361.,  358.,  197.,
         227.,  406.,    0.,  130.],
       [ 183.,  298.,  295.,  394.,  205.,  119.,  145.,  307.,  260.,
         347.,  344.,  336.,  443.,  408.,  309.,   42.,  265.,  273.,
         350.,  352.,  423.,   86.],
       [ 123.,   17.,  291.,  148.,  396.,   89.,  440.,  231.,    1.,
         311.,    4.,  181.,  417.,  409.,   61.,   23.,  229.,  133.,
         261.,  211.,  402.,   70.],
       [ 228.,   95.,  155.,   30.,  131.,  274.,   81.,    2.,  302.,
         346.,  112.,   43.,  370.,  413.,   12.,  430.,  305.,  444.,
          26.,  339.,  244.,  196.],
       [ 304.,  257.,  403.,  117.,  162.,  154.,  193.,   96.,  399.,
         191.,  364.,  103.,  323.,  437.,  425.,  219.,   32.,  264.,
         327.,  404.,  120.,  362.],
       [  20.,    6.,  212.,   60.,  100.,  386.,   18.,   33.,  151.,
         224.,  267.,  202.,  359.