In [1]:
import glob 
import os
from nltk.corpus import stopwords     # NLTK library is used to remove stop words and punctuations marks
from nltk.tokenize import word_tokenize 
from collections import Counter
import numpy as np
from collections import OrderedDict

In [16]:
'''Overall in this block of code we read every file in the dataset folder and by using nested loop
each line in a text file is appended into the headers dictionary along with keys'''
def Initializedata(filepaths):
    
    headers = {}   #A dictionary which stores all words along with keys
    count = 0
     # iterate for each file path in the list
    for i in filepaths :
        #Open the file in read mode
        with open(i, 'r') as f:
            #words = list(f) #help to read file line by line
            while True:
                theline = f.read()             # try to read next line
                if len(theline) == 0:               # if there are no more lines
                    break
                headers[count] = theline
        count = count + 1            
    return headers

In [17]:
def stringconversion(headers):
    
    word_list = [] #An empty list which is used to store strings
    for doc in headers.values(): #go through each line in a dictionary
        for word in doc.split(): #go through each word in the current line
            word_list.append(word) #this will give list of strings in all 20k text files
                
    return word_list

In [18]:
'''As we have to remove stopwords and punctuation marks so this block of code simply select each line 
in a dictionary and by using splitting method it split into single word and by appending those 
single words in a list we make a long list of all strings in all documents'''

def stopwordandsymbols(word_list):
    
    new_text = []
    updated_list = []  #empty lists to store the finalize list which has no stopword and punctuation marks

    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    stop_words = set(stopwords.words('english')) #NLTK library function
    for word in word_list:
        #this loop will remove all symbols from the list of strings we have
        if word not in symbols:
            new_text.append(word)
    for word in new_text:
        #this loop work on the list with removed punctuation marks and it will remove stopwords now
        if word not in stop_words:
            updated_list.append(word) #here is the refined list of strings
            
    return updated_list

In [19]:
#count = 0
def vacabulary(updated_list):
    
    corpus = []  #this corpus contain the vacabulary, it adds all the words in the list we have above and exclude all repeating words
    for word in updated_list: #go through each word in the current doc
        #print(count)    
        if not word in corpus: 
                corpus.append(word)
        #count = count + 1
        
    return corpus

In [25]:
'''To find the document frequency here is the piece of code which works and store all document frequency
and how many documents the word repeats''' 
def documentfreq(headers, corpus):
    
    df_corpus = {} #document frequency for every word in corpus

    for word in corpus:
        k = 0 #initial document frequency set to 0
        for doc in headers.values(): #iterate through documents
            if word in doc.split(): #check if word in doc
                k+=1 
        df_corpus[word] = k
    
    return df_corpus


In [32]:
def inversedocumentfreq(headers, corpus, df_corpus):
    M = len(headers) #number of documents in the collection
    idf_corpus = {} #inverse_document frequency for every word in corpus
    for word in corpus:
        idf_corpus[word] = np.log2((M+1) / df_corpus[word]) #log_2 ((M+1)/k) i.e inverse document frequency
    
    return idf_corpus

In [27]:
def termfrequency(headers, corpus):
    
    tf_corpus = {} #empty dictionary
    
    for doc_id in headers.keys(): #iterate through doc# (d1,d2,...,d5)
        tf_corpus[doc_id] = {} #create empty dictionary for each doc# key
    for word in corpus: #iterate through words in the corpus
        for doc_id,doc in headers.items(): #iterate through documents dictionary
            tf_corpus[doc_id][word] = doc.count(word) #store term-frequency for the word in each doc
    
    return tf_corpus
            

In [38]:
def tfidf_corpus(corpus, tf_corpus, idf_corpus, headers):
    tf_idf = {} #will store tf_idf scores for document words
    for doc_id in headers.keys():
        tf_idf[doc_id] = {} #initialize empty dictionary for each doc_id
    for word in corpus:
        for doc_id,doc in headers.items(): #iterate through key,value pairs where key = doc_id and value = doc content
            tf_idf[doc_id][word] = tf_corpus[doc_id][word] * idf_corpus[word] #C(W_i,doc) * IDF(W_i)
    return tf_idf

In [46]:
def vectorSpaceModel(query, headers, tf_idf):
    query_vocab = [] # will store the unique words that occur in the query
    for word in query.split():
        if word not in query_vocab:
            query_vocab.append(word)

    query_dic = {} # a dictionary to store count of a word in the query (i.e x_i according to lecture slides terminology)
    for word in query_vocab:
        query_dic[word] = query.split().count(word)
    
    relevance_scores = {} # a dictionary that will store the relevance score for each doc
                            # doc_id will be the key and relevance score the value for this dictionary
    for doc_id in headers.keys():
        score = 0 #initialze the score for the doc to 0 at the start
        for word in query_vocab:
            score += query_dic[word] * tf_idf[doc_id][word] # count of word in query * term_freq of the word
        relevance_scores[doc_id] = score
    sorted_value = OrderedDict(sorted(relevance_scores.items(), key=lambda x: x[1], reverse = True))
    top_5 = {k: sorted_value[k] for k in list(sorted_value)[:5]}
    return top_5

In [11]:
#Folder contain the path of our ACL text folder located in your system
folder = r"C:\Users\Mudasser Afzal\Desktop\IR Assignment\ACL txt"  #

# Get filepaths for all files which end with ".txt" :
filepaths = glob.glob(os.path.join(folder, '*.txt'))


d = Initializedata(filepaths)  #give the dataset in form of list of strings in files
s = stringconversion(d)
s   #display strings

['Association',
 'for',
 'Computational',
 'Linguistics',
 '6',
 'th',
 'Applied',
 'Natural',
 'Language',
 'Processing',
 'Conference',
 'Proceedings',
 'of',
 'the',
 'Conference',
 'April',
 '29--May',
 '4,',
 '2000',
 'Seattle,',
 'Washington,',
 'USA',
 'ANLP',
 '2000-PREFACE',
 '131',
 'papers',
 'were',
 'submitted',
 'to',
 'ANLP-2000.',
 '46',
 'were',
 'accepted',
 'for',
 'presentation',
 'at',
 'the',
 'conference.',
 'Papers',
 'came',
 'from',
 '24',
 'countries:',
 'fifty',
 'eight',
 'from',
 'the',
 'United',
 'States',
 'of',
 'America,',
 'eleven',
 'each',
 'from',
 'Germany',
 'and',
 'United',
 'Kingdom,',
 'nine',
 'from',
 'Canada,',
 'eight',
 'from',
 'Japan,',
 'four',
 'each',
 'from',
 'Italy',
 'and',
 'Spain,',
 'three',
 'ach',
 'from',
 'France,',
 'Korea',
 'and',
 'Switzerland,',
 'two',
 'each',
 'from',
 'Australia,',
 'China,',
 'The',
 'Netherlands',
 'and',
 'Sweden',
 'and',
 'one',
 'each',
 'from',
 'Czech',
 'Republic,',
 'Denmark,',
 'Finla

In [12]:
z = stopwordandsymbols(s)  #remove stop words and punctuation marks from list of strings
z    #display list

['Association',
 'Computational',
 'Linguistics',
 '6',
 'th',
 'Applied',
 'Natural',
 'Language',
 'Processing',
 'Conference',
 'Proceedings',
 'Conference',
 'April',
 '29--May',
 '4,',
 '2000',
 'Seattle,',
 'Washington,',
 'USA',
 'ANLP',
 '2000-PREFACE',
 '131',
 'papers',
 'submitted',
 'ANLP-2000.',
 '46',
 'accepted',
 'presentation',
 'conference.',
 'Papers',
 'came',
 '24',
 'countries:',
 'fifty',
 'eight',
 'United',
 'States',
 'America,',
 'eleven',
 'Germany',
 'United',
 'Kingdom,',
 'nine',
 'Canada,',
 'eight',
 'Japan,',
 'four',
 'Italy',
 'Spain,',
 'three',
 'ach',
 'France,',
 'Korea',
 'Switzerland,',
 'two',
 'Australia,',
 'China,',
 'The',
 'Netherlands',
 'Sweden',
 'one',
 'Czech',
 'Republic,',
 'Denmark,',
 'Finland,',
 'Greece,',
 'India,',
 'Hong',
 'Kong,',
 'Malaysia,',
 'Norway,',
 'Russia',
 'Taiwan.',
 '40',
 'papers',
 'submitted',
 'industry.',
 '85',
 'papers',
 'came',
 'academia.',
 '2',
 'papers',
 'submitted',
 'government',
 'organizatio

In [13]:
v = vacabulary(z)  #it makes a vacabulary and avoid repeating same string
v  #dictionary display

['Association',
 'Computational',
 'Linguistics',
 '6',
 'th',
 'Applied',
 'Natural',
 'Language',
 'Processing',
 'Conference',
 'Proceedings',
 'April',
 '29--May',
 '4,',
 '2000',
 'Seattle,',
 'Washington,',
 'USA',
 'ANLP',
 '2000-PREFACE',
 '131',
 'papers',
 'submitted',
 'ANLP-2000.',
 '46',
 'accepted',
 'presentation',
 'conference.',
 'Papers',
 'came',
 '24',
 'countries:',
 'fifty',
 'eight',
 'United',
 'States',
 'America,',
 'eleven',
 'Germany',
 'Kingdom,',
 'nine',
 'Canada,',
 'Japan,',
 'four',
 'Italy',
 'Spain,',
 'three',
 'ach',
 'France,',
 'Korea',
 'Switzerland,',
 'two',
 'Australia,',
 'China,',
 'The',
 'Netherlands',
 'Sweden',
 'one',
 'Czech',
 'Republic,',
 'Denmark,',
 'Finland,',
 'Greece,',
 'India,',
 'Hong',
 'Kong,',
 'Malaysia,',
 'Norway,',
 'Russia',
 'Taiwan.',
 '40',
 'industry.',
 '85',
 'academia.',
 '2',
 'government',
 'organizations',
 'submissions',
 'combined.',
 'reviewing',
 'process',
 'supported',
 'web-based',
 'reviewer',
 'in

In [29]:
df = documentfreq(d, v) #document frequency of corpus by using dictionary and vacabulary
df

{'Association': 22,
 'Computational': 35,
 'Linguistics': 18,
 '6': 38,
 'th': 11,
 'Applied': 20,
 'Natural': 40,
 'Language': 43,
 'Processing': 24,
 'Conference': 44,
 'Proceedings': 47,
 'April': 2,
 '29--May': 2,
 '4,': 17,
 '2000': 7,
 'Seattle,': 4,
 'Washington,': 15,
 'USA': 10,
 'ANLP': 4,
 '2000-PREFACE': 2,
 '131': 3,
 'papers': 2,
 'submitted': 7,
 'ANLP-2000.': 1,
 '46': 3,
 'accepted': 9,
 'presentation': 6,
 'conference.': 3,
 'Papers': 1,
 'came': 5,
 '24': 7,
 'countries:': 1,
 'fifty': 3,
 'eight': 10,
 'United': 9,
 'States': 6,
 'America,': 1,
 'eleven': 2,
 'Germany': 4,
 'Kingdom,': 1,
 'nine': 4,
 'Canada,': 10,
 'Japan,': 2,
 'four': 25,
 'Italy': 1,
 'Spain,': 1,
 'three': 40,
 'ach': 4,
 'France,': 3,
 'Korea': 1,
 'Switzerland,': 1,
 'two': 50,
 'Australia,': 2,
 'China,': 1,
 'The': 50,
 'Netherlands': 1,
 'Sweden': 1,
 'one': 49,
 'Czech': 3,
 'Republic,': 2,
 'Denmark,': 3,
 'Finland,': 2,
 'Greece,': 1,
 'India,': 1,
 'Hong': 3,
 'Kong,': 1,
 'Malaysia,'

In [35]:
i = inversedocumentfreq(d, v, df) #inverse document frequency for documents by using dictionary and vacabulary
i

{'Association': 1.2129937233341985,
 'Computational': 0.543142325026529,
 'Linguistics': 1.5025003405291832,
 '6': 0.42449782852791007,
 'th': 2.2129937233341983,
 'Applied': 1.3504972470841332,
 'Natural': 0.35049724708413316,
 'Language': 0.2461605872693978,
 'Processing': 1.0874628412503393,
 'Conference': 0.21299372333419844,
 'Proceedings': 0.11783649029385809,
 'April': 4.672425341971495,
 '29--May': 4.672425341971495,
 '4,': 1.584962500721156,
 '2000': 2.8650704199138914,
 'Seattle,': 3.6724253419714956,
 'Washington,': 1.765534746362977,
 'USA': 2.350497247084133,
 'ANLP': 3.6724253419714956,
 '2000-PREFACE': 4.672425341971495,
 '131': 4.087462841250339,
 'papers': 4.672425341971495,
 'submitted': 2.8650704199138914,
 'ANLP-2000.': 5.672425341971495,
 '46': 4.087462841250339,
 'accepted': 2.5025003405291835,
 'presentation': 3.0874628412503395,
 'conference.': 4.087462841250339,
 'Papers': 5.672425341971495,
 'came': 3.350497247084133,
 '24': 2.8650704199138914,
 'countries:': 

In [36]:
tf = termfrequency(d, v)  #term frequency by using dictionary and vacabulary
tf

{0: {'Association': 1,
  'Computational': 1,
  'Linguistics': 1,
  '6': 16,
  'th': 25,
  'Applied': 2,
  'Natural': 5,
  'Language': 9,
  'Processing': 5,
  'Conference': 3,
  'Proceedings': 1,
  'April': 1,
  '29--May': 1,
  '4,': 1,
  '2000': 4,
  'Seattle,': 1,
  'Washington,': 1,
  'USA': 1,
  'ANLP': 11,
  '2000-PREFACE': 1,
  '131': 1,
  'papers': 5,
  'submitted': 3,
  'ANLP-2000.': 1,
  '46': 1,
  'accepted': 1,
  'presentation': 2,
  'conference.': 1,
  'Papers': 1,
  'came': 2,
  '24': 3,
  'countries:': 1,
  'fifty': 1,
  'eight': 2,
  'United': 2,
  'States': 1,
  'America,': 1,
  'eleven': 1,
  'Germany': 1,
  'Kingdom,': 1,
  'nine': 2,
  'Canada,': 1,
  'Japan,': 1,
  'four': 2,
  'Italy': 1,
  'Spain,': 1,
  'three': 1,
  'ach': 8,
  'France,': 1,
  'Korea': 1,
  'Switzerland,': 1,
  'two': 1,
  'Australia,': 1,
  'China,': 1,
  'The': 5,
  'Netherlands': 1,
  'Sweden': 1,
  'one': 3,
  'Czech': 1,
  'Republic,': 1,
  'Denmark,': 1,
  'Finland,': 1,
  'Greece,': 1,
  '

In [42]:
x = tfidf_corpus(v, tf, i, d)   #Tf-Idf find by the formula as given on internet
x

{0: {'Association': 1.2129937233341985,
  'Computational': 0.543142325026529,
  'Linguistics': 1.5025003405291832,
  '6': 6.791965256446561,
  'th': 55.324843083354956,
  'Applied': 2.7009944941682664,
  'Natural': 1.7524862354206658,
  'Language': 2.2154452854245803,
  'Processing': 5.437314206251696,
  'Conference': 0.6389811700025954,
  'Proceedings': 0.11783649029385809,
  'April': 4.672425341971495,
  '29--May': 4.672425341971495,
  '4,': 1.584962500721156,
  '2000': 11.460281679655566,
  'Seattle,': 3.6724253419714956,
  'Washington,': 1.765534746362977,
  'USA': 2.350497247084133,
  'ANLP': 40.39667876168645,
  '2000-PREFACE': 4.672425341971495,
  '131': 4.087462841250339,
  'papers': 23.362126709857478,
  'submitted': 8.595211259741674,
  'ANLP-2000.': 5.672425341971495,
  '46': 4.087462841250339,
  'accepted': 2.5025003405291835,
  'presentation': 6.174925682500679,
  'conference.': 4.087462841250339,
  'Papers': 5.672425341971495,
  'came': 6.700994494168266,
  '24': 8.595211

In [49]:
query = 'language processing conference'
vectorSpaceModel(query, d, x)

{9: 19.938755137230917,
 32: 16.99569807717766,
 2: 16.015111347965455,
 22: 12.997561846318792,
 33: 11.943543177037352}