In [3]:
import os
import re
import string
import time
from collections import Counter

import nltk
import numpy as np
import pandas as pd

In [4]:
# Read every file in a directory into a pandas Series keyed by file name

def readFilesFromDirectory(directoryName):
    """Load every regular file in `directoryName` into a pandas Series.

    Each file is read as bytes and decoded as ISO-8859-1. Newlines are
    replaced by a single space and carriage returns are removed, so each
    document becomes one line of text.

    Parameters
    ----------
    directoryName : str
        Path of the directory to scan (not recursive).

    Returns
    -------
    pandas.Series
        Object-dtype Series mapping file name -> flattened document text.
        Empty if the directory contains no regular files.
    """
    contents = {}

    with os.scandir(directoryName) as entries:
        for entry in entries:
            # Sub-directories and other non-file entries cannot be opened as documents.
            if not entry.is_file():
                continue
            # The context manager closes the handle even if read/decode fails.
            with open(entry.path, "rb") as f:
                text = f.read().decode('ISO-8859-1')
            contents[entry.name] = text.replace("\n", " ").replace("\r", "")

    # Build the Series once instead of enlarging it label by label.
    return pd.Series(contents, dtype=object)

In [26]:
# Time how long it takes to load the whole corpus from disk.
start = time.time()

# fileSeries is a pandas Series (file name -> document text); later cells read the documents from it.
fileSeries = readFilesFromDirectory("./ACL txt")

end = time.time()

print("Execution Time:", end - start) 
print("Total files read:", len(fileSeries))

Execution Time: 31.85650610923767
Total files read: 21941


In [28]:
# Stopword sets keyed by language, filled on first use so the list is built
# once rather than once per document.
_STOPWORDS_BY_LANGUAGE = {}


def _stopwordSet(language='english'):
    """Return NLTK's stopword list for `language` as a frozenset, cached after the first call."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(nltk.corpus.stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def removePunc(doc):
    """Tokenise `doc` and keep only the lower-cased content words.

    Despite its name this does more than strip punctuation. After tokenising
    with NLTK's WordPunctTokenizer, a token is kept only if it

    * is longer than two characters,
    * is purely alphabetic (`str.isalpha`), and
    * is not an English stopword once lower-cased.

    Parameters
    ----------
    doc : str
        Raw document or query text.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order
        (duplicates are kept).
    """
    stopset = _stopwordSet()
    clean = []
    for token in nltk.WordPunctTokenizer().tokenize(doc):
        lowered = token.lower()  # computed once, used for both the test and the result
        if len(token) > 2 and token.isalpha() and lowered not in stopset:
            clean.append(lowered)
    return clean

In [29]:
def termFrequencyInDoc(wordList):
    """Count how many times each distinct word occurs in `wordList`.

    Parameters
    ----------
    wordList : iterable of hashable
        Typically the token list returned by `removePunc`.

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences; empty input
        gives an empty dict.
    """
    # Counter tallies in a single pass; calling list.count() once per distinct
    # word would rescan the whole list each time.
    return dict(Counter(wordList))

In [30]:
# Build the termFrequencyDocuments dictionary: one term-frequency dictionary per document.

# Stopwords and punctuation are removed from each document, then its term frequencies are counted, giving a dictionary of dictionaries.

# The termFrequencyDocuments dictionary is used for the rest of the tasks.
start = time.time()

# One {term: count} dictionary per document, keyed by the document's file name.
termFrequencyDocuments = {
    docName: termFrequencyInDoc(removePunc(docText))
    for docName, docText in fileSeries.items()
}

end = time.time()

print("Execution Time:", end - start)

Execution Time: 786.435583114624


In [31]:
# Compute the document frequency of each word (the number of documents that contain it)

def wordDocFre(termFreqDictionary):
    """Compute the document frequency of every word.

    Parameters
    ----------
    termFreqDictionary : dict
        Maps a document id to that document's {word: count} dictionary.

    Returns
    -------
    dict
        Maps each word to the number of documents whose dictionary contains
        it. The counts stored per document are not used, only the keys.
    """
    documentFrequencies = {}

    for docTermFreq in termFreqDictionary.values():
        for word in docTermFreq:
            # dict.get supplies the initial 0 explicitly, so no exception
            # handling is needed and unrelated errors are not swallowed.
            documentFrequencies[word] = documentFrequencies.get(word, 0) + 1

    return documentFrequencies

In [32]:
# Document frequency of every term: the number of per-document dictionaries that contain it.
documentFrequencies = wordDocFre(termFrequencyDocuments)

In [33]:
def inverseDocFre(wordDocFreqDictionary, numDocs=None):
    """Compute the inverse document frequency of every word.

    The weight of a word is ``log2((M + 1) / df)``, where ``M`` is the number
    of documents and ``df`` is the word's document frequency.

    Parameters
    ----------
    wordDocFreqDictionary : dict
        Maps each word to its document frequency.
    numDocs : int, optional
        Number of documents ``M``. When omitted, the size of the
        notebook-level `termFrequencyDocuments` dictionary is used.

    Returns
    -------
    dict
        Maps each word to its IDF weight.
    """
    if numDocs is None:
        # Fall back to the corpus built earlier in the notebook.
        numDocs = len(termFrequencyDocuments)
    M = numDocs
    return {word: np.log2((M + 1) / docFreq) for word, docFreq in wordDocFreqDictionary.items()}

In [34]:
# IDF weight of every term, derived from its document frequency.
idfs= inverseDocFre(documentFrequencies)

In [35]:
# Vocabulary: every distinct term that occurs somewhere in the corpus.
vocabulary = list(set(documentFrequencies.keys()))

# Debug check: confirm that the token "lda" is part of the corpus vocabulary.
if "lda" in documentFrequencies:
    print("yas")

# Map each vocabulary term to its position, so that document and query
# vectors can be filled in at the right index.
vocabIndexDictionary = {term: position for position, term in enumerate(vocabulary)}

yas


In [36]:
def tfIdf(termFreq, inverseDocFreq):
    """Build the dense TF-IDF vector of one document.

    Parameters
    ----------
    termFreq : dict
        The document's {term: count} dictionary.
    inverseDocFreq : dict
        Maps each term to its IDF weight.

    Returns
    -------
    numpy.ndarray
        Vector with one slot per entry of the notebook-level `vocabulary`.
        The slot given by `vocabIndexDictionary` holds ``count * idf`` for
        each term of the document; every other slot is zero.
    """
    weights = np.zeros(len(vocabulary))
    for term, count in termFreq.items():
        weights[vocabIndexDictionary[term]] = count * inverseDocFreq[term]
    return weights

In [37]:
# Create the term-frequency vector for a query
def getQueryVector(query):
    """Build the dense term-frequency vector of a query.

    The query goes through the same preprocessing as the documents
    (`removePunc`) and its terms are counted with `termFrequencyInDoc`.

    Parameters
    ----------
    query : str
        Free-text query.

    Returns
    -------
    numpy.ndarray
        Vector with one slot per entry of the notebook-level `vocabulary`,
        holding the query's count for each term. Query terms that are not
        in the vocabulary are ignored, so a query made only of such terms
        gives an all-zero vector.
    """
    queryFreqVector = np.zeros(len(vocabulary))

    queryTermFreq = termFrequencyInDoc(removePunc(query))

    for token, freq in queryTermFreq.items():
        index = vocabIndexDictionary.get(token)
        if index is None:
            # The term never occurs in the corpus, so it has no slot in the
            # vector and cannot match any document.
            continue
        queryFreqVector[index] = freq

    return queryFreqVector

In [38]:
def VSM(query):
    """Score every document against `query` with a TF-IDF dot product.

    A document's score is the dot product of the query's term-frequency
    vector with the document's TF-IDF vector. Only terms present in the
    query can contribute to that product, so the sum is taken over those
    terms directly instead of building a vocabulary-sized vector for each
    document. The result is the same dot product, up to floating-point
    rounding in the order of summation.

    Parameters
    ----------
    query : str
        Free-text query; it is preprocessed by `getQueryVector`.

    Returns
    -------
    dict
        Maps every key of the notebook-level `termFrequencyDocuments` to
        its score (numpy.float64). Scores are raw dot products and are not
        normalised by document length.
    """
    queryVector = getQueryVector(query)

    # Recover the (term, query frequency) pairs behind the non-zero entries;
    # `vocabulary` is the inverse of `vocabIndexDictionary`.
    queryTerms = [(vocabulary[position], queryVector[position])
                  for position in np.flatnonzero(queryVector)]

    docSimilarityScore = {}

    for docId, docTermFreq in termFrequencyDocuments.items():
        score = 0.0
        for term, queryFreq in queryTerms:
            termCount = docTermFreq.get(term)
            if termCount:
                # Same grouping as the dense version: query tf * (doc tf * idf).
                score += queryFreq * (termCount * idfs[term])
        docSimilarityScore[docId] = np.float64(score)

    return docSimilarityScore

In [39]:
import operator
def sortByScore(scoreDict):
    """Return the (key, score) pairs of `scoreDict`, highest score first.

    Pairs with equal scores keep the order they have in the dictionary.
    """
    ranked = list(scoreDict.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


In [40]:
# Query 1: score all documents against "LDA" and print the five highest-ranked (file name, score) pairs.
query1Result = VSM("LDA")

sortedByRank = sortByScore(query1Result)


print(sortedByRank[:5])


[('D11-1050.pdf.txt', 752.3030201019424), ('D13-1172.pdf.txt', 676.1204357878216), ('D09-1026.pdf.txt', 628.5063205914962), ('Q15-1022.pdf.txt', 538.0395017184778), ('W11-2506.pdf.txt', 538.0395017184778)]


In [42]:
# Query 2: score all documents against "Topic modelling" and print the five highest-ranked (file name, score) pairs.
query2Result = VSM("Topic modelling")

sortedByRank = sortByScore(query2Result)

print(sortedByRank[:5])


[('P12-1079.pdf.txt', 450.811570284626), ('J14-2003.pdf.txt', 406.44035273692657), ('N15-1074.pdf.txt', 315.9230689396198), ('Q15-1004.pdf.txt', 312.37337153580387), ('Q15-1022.pdf.txt', 289.30033841100015)]


In [43]:
# Query 3: score all documents against "Generative models" and print the five highest-ranked (file name, score) pairs.
query3Result = VSM("Generative models")

sortedByRank = sortByScore(query3Result)

print(sortedByRank[:5])


[('W06-1668.pdf.txt', 187.19916366688074), ('W11-0100.pdf.txt', 182.22356277111322), ('J03-4003.pdf.txt', 151.16826455117942), ('D09-1111.pdf.txt', 135.30130968802501), ('D09-1058.pdf.txt', 131.9145646377607)]


In [44]:
# Query 4: score all documents against "Semantic relationships between terms" and print the five highest-ranked (file name, score) pairs.
query4Result = VSM("Semantic relationships between terms")

sortedByRank = sortByScore(query4Result)

print(sortedByRank[:5])


[('W11-0100.pdf.txt', 1000.0019722646931), ('J08-2004.pdf.txt', 196.68702411982164), ('W15-3808.pdf.txt', 182.405180869523), ('W04-1801.pdf.txt', 171.76074940901188), ('J09-2003.pdf.txt', 161.91553812211853)]


In [128]:
# Query 5: score all documents against "Natural Language Processing" and print the five highest-ranked (file name, score) pairs.
query5Result = VSM("Natural Language Processing")

sortedByRank = sortByScore(query5Result)

print(sortedByRank[:5])


[('W11-0100.pdf.txt', 155.38103787162342), ('J14-1005.pdf.txt', 88.99281941873132), ('J87-1020.pdf.txt', 87.51531375973897), ('W14-55.x.pdf.txt', 60.85516369287274), ('J87-3010.pdf.txt', 53.78284302032024)]


In [129]:
# Query 6: score all documents against "Text Mining" and print the five highest-ranked (file name, score) pairs.
query6Result = VSM("Text Mining")

sortedByRank = sortByScore(query6Result)

print(sortedByRank[:5])


[('P06-1062.pdf.txt', 161.61688577674002), ('D09-1162.pdf.txt', 161.36684200961096), ('P12-1062.pdf.txt', 122.68305517578517), ('W09-2609.pdf.txt', 119.45461191336705), ('W11-0100.pdf.txt', 114.51126888597759)]


In [130]:
# Query 7: score all documents against "Translation model" and print the five highest-ranked (file name, score) pairs.
query7Result = VSM("Translation model")

sortedByRank = sortByScore(query7Result)

print(sortedByRank[:5])


[('J85-2006.pdf.txt', 447.89495301256284), ('J03-3003.pdf.txt', 421.06976616764905), ('J06-4004.pdf.txt', 352.2254777771003), ('J03-3004.pdf.txt', 329.58272945986806), ('W14-3302.pdf.txt', 327.6765620156608)]


In [47]:
# Query 8: score all documents against "Learning procedures for the lexicon" and print the five highest-ranked (file name, score) pairs.
query8Result = VSM("Learning procedures for the lexicon")

sortedByRank = sortByScore(query8Result)

print(sortedByRank[:5])


[('W11-0100.pdf.txt', 242.5921091987321), ('J87-3007.pdf.txt', 211.77949355101745), ('W06-1647.pdf.txt', 197.3033530148868), ('W10-2505.pdf.txt', 196.50598280569395), ('W99-0630.pdf.txt', 190.92134755129433)]


In [46]:
# Query 9: score all documents against "Semantic evaluations" and print the five highest-ranked (file name, score) pairs.
query9Result = VSM("Semantic evaluations")

sortedByRank = sortByScore(query9Result)

print(sortedByRank[:5])


[('W11-0100.pdf.txt', 867.6438855111824), ('J09-4008.pdf.txt', 196.78731141091293), ('J08-2004.pdf.txt', 195.88572195636618), ('J09-2003.pdf.txt', 158.7103294682967), ('J14-1002.pdf.txt', 135.32721096794992)]


In [45]:
# Query 10: score all documents against "System results and combination" and print the five highest-ranked (file name, score) pairs.
query10Result = VSM("System results and combination")

sortedByRank = sortByScore(query10Result)

print(sortedByRank[:5])


[('W11-0100.pdf.txt', 218.2504424692338), ('N10-1141.pdf.txt', 154.05405347061145), ('J01-2002.pdf.txt', 153.8651240339329), ('P11-1127.pdf.txt', 131.17937259694367), ('J11-3003.pdf.txt', 129.13058664334983)]
