In [1]:
import gensim
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
import numpy as np
import pandas as pd
from collections import OrderedDict
from math import floor,ceil
from gensim.matutils import kullback_leibler, jaccard, hellinger, cossim, jensen_shannon
import os
from gensim.models.doc2vec import TaggedDocument
from nltk.probability import FreqDist
from scipy import spatial

function to load individual document from a folder

In [2]:
def loadBook(folder,filename):
    filepath=folder + "/" + filename
    f=open(filepath)
    raw=f.read()
    return(raw)

function to load all the books from a folder, break them into sentences and remove stopwords from each sentence. 

In [3]:
def loadAndProcess(folder): 
    docs=list()
    for file in os.listdir(folder):
        #print(file)
        book=loadBook(folder,file)
        docs.append(book)
    
    text=" ".join(docs)
    sents=sent_tokenize(text) #break the document into smaller sentences
    
    #remove stop words
    for i in range(len(sents)):
        words=nltk.tokenize.RegexpTokenizer(r'\w+').tokenize(sents[i])
        words=[w.lower() for w in words if w.lower() not in stopwords.words('english')]
        words=" ".join(words)
        sents[i]=words
        
    return(sents)

function to load the docs without any pre-processing, i.e. original sentences

In [4]:
def loadOnly(folder):
    docs=list()
    for file in os.listdir(folder):
        #print(file)
        book=loadBook(folder,file)
        docs.append(book)
    
    text=" ".join(docs)
    sents=sent_tokenize(text) #break the document into smaller sentences

    return(sents)        

prepare a dictionary and corpus for lda

In [6]:
def prepareCorpus(sentences):
    text=list()
    for i in range(len(sentences)):
        words=nltk.word_tokenize(sentences[i])
        text.append(words)
    dictionary = corpora.Dictionary(text)
    corpus = [dictionary.doc2bow(t) for t in text]
    return(dictionary,corpus)

Functions that we will use later

In [7]:
def generateSimilarityMatrix():
    similarityMatrix=np.zeros((numOfTopics,numOfTopics))
    for i in range(numOfTopics):
        ti_dist=sorted(ldamodel.get_topic_terms(i,topn=ldamodel.num_terms))
        for j in range(numOfTopics):
            tj_dist=sorted(ldamodel.get_topic_terms(j,topn=ldamodel.num_terms))
            sim1=1-hellinger(ti_dist,tj_dist)
            sim2=1-jensen_shannon(tj_dist,ti_dist)
            sim=(sim1+sim2)/2
            similarityMatrix[i,j]=sim
    return(similarityMatrix)

In [8]:
def greaterThanThreshhold(matrix,thresh):
    m,n=matrix.shape
    greaterThan=False
    for i in range(m):
        for j in range(n):
            if i==j:
                continue
            else:
                if matrix[i,j]>thresh:
                    greaterThan=True
    return(greaterThan) 

load and run lda

In [9]:
original=loadOnly("docs")
#original sentences without any pre processing
sents=loadAndProcess("docs") #processed sentences
dic,corp=prepareCorpus(sents) #dictionary and corpus

set number of topics to 2 times the number of documents

In [9]:
numOfDocs=len(os.listdir("docs"))   

In [10]:
numOfTopics=2*numOfDocs         #change as per need

In [18]:
i=0
while(True):
    print(i,"iteration")
    ldamodel = gensim.models.ldamodel.LdaModel(corpus=corp, num_topics=numOfTopics, id2word = dic, passes=20)    
    sm=generateSimilarityMatrix()
    greaterThan=greaterThanThreshhold(sm,0.55)
    
    if(greaterThan):
        numOfTopics=numOfTopics-1
    else:
        print(numOfTopics)
        break
 
    i=i+1

0 iteration
22


create a sentences-topic matrix which is stored in a numpy array

In [12]:
docsTopicMatrix=np.zeros((len(sents),numOfTopics))
for i in range(len(sents)):
    topicsList=ldamodel.get_document_topics(corp[i],minimum_probability=0)
    for j in range(len(topicsList)):
        docsTopicMatrix[i,j]=topicsList[j][1]  

For each topic we pick out the best (2\*n/3k). This gives us a reduced set of sentences for each topic. These sentence numbers are stored in a numpy array.  

In [13]:
numOfSents=docsTopicMatrix.shape[0]
reducedNumOfSents=floor((2*numOfSents)/(3*numOfTopics))
reduced_sent_matrix=np.zeros((reducedNumOfSents,numOfTopics))

Loop through each column of the docs-Topic matrix and pick out the top 2n/3k sentence numbers for each topic. 

In [14]:
for j in range(numOfTopics):
    #for each column, store sentence number and weight pairs in a matrix
    temp_dict={}
    for i in range(numOfSents):
        temp_dict[i]=docsTopicMatrix[i,j]
    
    #choose the top 2n/3k sentences by finding the max element and deleting it from the dictionary 2n/3k times     
    for i in range(reducedNumOfSents):
        k=max(temp_dict, key=temp_dict.get)
        v=temp_dict.pop(k, None)
        reduced_sent_matrix[i,j]=k

In [15]:
reduced_sent_matrix

array([[  30.,  337.,   99.,  350.,  365.,  270.,    0.,  390.,   95.,
         422.,  130.,  127.,  347.,  282.,  101.,   26.,   66.,  228.,
         117.,   84.,  133.,  124.,  274.,  225.,  386.,  358.,  423.,
         231.,  265.,  233.,   89.,  197.,  346.,  119.,  408.,   21.,
         136.,  394.,  148.,  203.,  246.,  316.,  261.],
       [ 443.,  177.,   20.,  396.,  290.,  205.,  128.,  260.,   63.,
         333.,  399.,  109.,  361.,    2.,  336.,  214.,  204.,  292.,
         264.,   36.,  140.,  191.,  150.,  298.,  161.,  404.,  416.,
         307.,  421.,  309.,  348.,   87.,  403.,   74.,  339.,  209.,
         243.,  210.,  406.,  155.,  279.,   54.,  121.],
       [ 344.,  278.,   17.,  362.,   86.,  430.,  248.,  412.,   24.,
         212.,  141.,    9.,  257.,  213.,  219.,  284.,   23.,  163.,
         413.,   19.,   70.,  398.,  185.,  144.,   18.,  387.,  300.,
         166.,   32.,  374.,  349.,  354.,   33.,   96.,  323.,  440.,
         444.,  437.,  221.,  24

doc2vec feature

In [19]:
taggeddoc=[]
text=[]

In [20]:
for i in range(len(sents)):
    t=nltk.word_tokenize(sents[i])
    text.append(t)
    td = TaggedDocument(gensim.utils.to_unicode(str.encode(' '.join(t))).split(),tags=[u'Sent_{:d}'.format(i)])
    taggeddoc.append(td)

In [21]:
model = gensim.models.Doc2Vec(taggeddoc,alpha=0.025, size= 200, min_alpha=0.025, min_count=0)

In [22]:
for epoch in range(20):
    if(epoch%5==0):
        print('Now training epoch %s'%epoch)
    model.train(taggeddoc,total_examples=model.corpus_count,epochs=model.iter)
    model.alpha -= 0.002 
    model.min_alpha = model.alpha

Now training epoch 0
Now training epoch 5
Now training epoch 10
Now training epoch 15


In [23]:
features=np.zeros((len(sents),200))
for i in range(len(sents)):
    features[i]=model.docvecs[u'Sent_{:d}'.format(i)]

In [46]:
def get_query_sent_doc2vec(sentences):
    #get length of query: average number of non stop words per sentence
    #most frequent words
    temp_text=" ".join(sentences)
    temp_words=nltk.word_tokenize(temp_text)
    length=floor((len(temp_words)/len(sentences)))
    fdist = FreqDist(temp_words)
    importantWords=fdist.most_common(length)
    temp=[]
    for i in importantWords:
        temp.append(i[0])
    return(temp)

In [47]:
query=get_query_sent_doc2vec(sents)

In [48]:
query

['sununu',
 'bush',
 'mr',
 'president',
 'house',
 'staff',
 'said',
 'white',
 'new',
 'chief',
 'political',
 'washington',
 'hampshire',
 'adams',
 'one',
 'john']

In [49]:
query_vector=model.infer_vector(query,alpha=0.025,steps=20)

In [50]:
for i in range(len(sents)):
    result = 1 - spatial.distance.cosine(features[i], query_vector)
    print(result)

0.0602877448641
0.0273545885373
0.0590498724867
0.0512722804145
0.0524075454053
0.0435306677154
0.0391607759014
0.0463311798779
0.0470840224827
-0.0803105710284
0.0709154963809
0.0487430510317
0.0417586031362
0.0504012609099
0.0502347227474
0.0651553221329
0.0828852702038
0.0586229969776
0.047382763338
0.0451092341595
0.0589195813205
0.0518746466255
0.0344467229317
0.0431434178049
0.0473675764026
0.0511344228504
0.0437612638737
0.0466482748658
0.0451318474414
0.0484651658786
0.0536791538711
0.0469064519355
0.0485201562877
0.0305496596124
0.0605561496712
0.0460347270379
0.0448030689406
0.0558040054603
-0.0276911293263
0.0693175362998
0.064419781054
0.0550341708228
0.0475347529404
0.0477032787794
0.0530631341174
0.0460704069569
0.0538938666693
0.0497952110688
0.0433285084845
0.0726313696606
0.0375919350224
0.0581750665516
0.0566075461857
0.0507675484794
0.0528136699033
0.0498606043202
0.0573921677367
0.044165379311
0.0440429157232
0.0482566639128
0.0532533630337
0.0547147534916
0.0410220