In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
import os
from collections import Counter
import string

<h3 style = 'color:purple;'>Vector Space Model (TF-IDF Weightage Model)</h3>

$$ f(q,d) = sim(q,d) =  \sum_{i=1}^n x_iy_i $$ 
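$q = (x_1, \dots, x_n)$ <br>
$d = (y_1, \dots, y_n)$ <br>
$x_i$ = count of word $W_i$ in the query. <br>
$y_i$ = TF-IDF weight of word $W_i$ in the document, i.e. $$ y_i = C(W_i, doc) \cdot \log_2 \frac{M+1}{k} $$
$C(W_i, doc)$ = count of word $W_i$ in the document <br>
$M$ = number of documents in the collection <br>
$k$ = document frequency of $W_i$, i.e. the number of documents in the collection that contain $W_i$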


**M = 5 as it is assumed that the collection has 5 docs only.**

In [2]:

## Small in-memory sample collection (document id -> raw text) that keeps the code easy to follow.
documents = {
    "d1" : "news about",
    "d2" : "news about organic food campaign",
    "d3" : "news of presidential campaign",
    "d4" : "news of presidential campaign presidential candidate",
    "d5" : "news of organic food campaign campaign campaign campaign"
}


### To work on a larger corpus, replace the sample above with the loader below.


## Loads every file of the "Data-updated" folder into `documents` (file name -> file contents).
## NOTE(review): the handle returned by open() is never closed in this snippet;
## prefer a `with open(...)` block if it is re-enabled.

# documents = {}
# for filename in os.listdir("Data-updated"):
#     file = open("Data-updated/"+filename, "r",errors = "ignore")
#     file = file.read()
#     documents[filename] = file


In [3]:
# Global vocabulary: the distinct (cleaned) words of the collection; filled in by tfidf().

vocab = []

Write a function named wordList(doc) that takes the text of a document
as input argument and returns the list of words in that document.

In [4]:
def wordList(doc):
    """Split the text of a document into a list of words.

    Args:
        doc: Text of the document as a single string.

    Returns:
        List of the words of ``doc`` in their original order. The text is split
        on any run of whitespace (spaces, tabs, newlines), so multi-line text
        never yields empty strings or two words glued together by a line break.
        An empty or whitespace-only document yields an empty list.
    """
    return doc.split()

Write a function named removePuncs(wordList) that takes a list of
words and iterates through it, processing each word in turn.
The function should strip punctuation marks as well as \n from each
word, and then check whether the word is a stopword: if it is a
stopword, it is not appended to the resulting list.

In [5]:
def removePuncs(wordList):
    """Strip punctuation from each word and drop stopwords and empty words.

    Args:
        wordList: List of raw word strings.

    Returns:
        New list with the cleaned words in their original order. Every
        ``string.punctuation`` character and any surrounding whitespace (such
        as a trailing newline) is removed from each word; words that end up
        empty or that are English stopwords (compared case-insensitively) are
        left out. The casing of the kept words is not changed.
    """
    # Built once per call: a set gives constant-time stopword lookups, and the
    # translation table does not have to be rebuilt for every word.
    stopword_set = set(stopwords.words("english"))
    punctuation_table = str.maketrans('', '', string.punctuation)

    update_wordList = []
    for word in wordList:
        trimmed_word = word.translate(punctuation_table).strip()
        # Compare on the lower-cased word so that capitalised stopwords
        # ("The", "And") are filtered out as well.
        if trimmed_word and trimmed_word.lower() not in stopword_set:
            update_wordList.append(trimmed_word)

    return update_wordList

Write a function named termFrequencyInDoc(wordList) which
should take a list of words as input argument and output a dictionary
in which each word that appears in the document is a key and
its value is that word's term frequency.

In [6]:
def termFrequencyInDoc(wordsList):
    """Count how many times each word occurs in a document.

    Args:
        wordsList: List of the words of one document.

    Returns:
        Plain ``dict`` mapping every distinct word of ``wordsList`` to its
        number of occurrences; an empty list yields an empty dict.
    """
    # Counter tallies in a single pass instead of re-scanning the whole list
    # once per distinct word; dict() keeps the return type a plain dictionary.
    return dict(Counter(wordsList))


Write a function named wordDocFre(dicList) that takes a collection of
dictionaries as input argument; each dictionary has the words
that appear in one document as keys and the number of times each
word appears as values. This function should construct a dictionary
which has all the words that appear in the corpus as keys and the number of
documents that contain each word as values.

In [7]:
def wordDocFre(dicList):
    """Compute the document frequency of every word in the corpus.

    Args:
        dicList: Per-document term-frequency dictionaries (word -> count),
            given either as a dictionary of document id -> term dictionary or
            as a plain list of term dictionaries.

    Returns:
        Dictionary mapping each word that is a key of at least one term
        dictionary to the number of term dictionaries that contain it.
    """
    # Accept both a plain list of dictionaries and a doc_id -> dictionary mapping.
    term_dicts = dicList.values() if isinstance(dicList, dict) else dicList

    df_corpus = {}
    for dic in term_dicts:
        # Iterating over the keys counts each word once per document,
        # however often it occurs inside that document.
        for word in dic:
            df_corpus[word] = df_corpus.get(word, 0) + 1

    return df_corpus


Construct a function named inverseDocFre(dicList,base) that takes the
dictionary returned by the wordDocFre function above and the base of the
logarithm, and outputs the inverse document frequency of each word.

In [8]:

def inverseDocFre(dicList, base, num_docs=None):
    """Compute the inverse document frequency (IDF) of every word.

    IDF(w) = log_base((M + 1) / k), where M is the number of documents in the
    collection and k is the document frequency of w.

    Args:
        dicList: Dictionary mapping each word to its document frequency k.
        base: Base of the logarithm.
        num_docs: Number of documents M in the collection. When omitted, the
            size of the global ``documents`` collection is used.

    Returns:
        Dictionary mapping each word of ``dicList`` to its IDF. A word whose
        document frequency is 0 gets an IDF of 0.0 instead of raising a
        division-by-zero error.
    """
    M = len(documents) if num_docs is None else num_docs  # number of documents in the collection
    log_base = np.log(base)  # loop-invariant denominator of the change-of-base formula

    idf_corpus = {}  # inverse document frequency for every word in the corpus
    for word, k in dicList.items():
        if k <= 0:
            # The word occurs in no document, so its tf-idf is 0 whatever its IDF is.
            idf_corpus[word] = 0.0
            continue
        idf_corpus[word] = np.log((M + 1) / k) / log_base  # log_base((M+1)/k)
    return idf_corpus

The function named tfidf(docList) takes the collection of documents. For
each document it calls wordList() to split the document into a list of words,
removes stopwords and punctuation marks from that list with removePuncs(),
and then calls termFrequencyInDoc(). It uses these outputs to build the
vocabulary and, through wordDocFre(), the document frequencies; it then
calls inverseDocFre(). It outputs one dictionary per document,
in which the words are the keys and
the tf-idf scores are the values.

In [9]:
def tfidf(docList):
    """Compute the TF-IDF weight of every vocabulary word in every document.

    Each document is tokenised with wordList(), cleaned with removePuncs() and
    counted with termFrequencyInDoc(); wordDocFre() and inverseDocFre() then
    supply the IDF (log base 2) of every word.

    Side effect: the global ``vocab`` list is rebuilt in place so that it holds
    exactly the distinct words of ``docList`` in first-seen order. Rebuilding
    it (rather than only appending) keeps repeated calls from accumulating the
    words of earlier collections, whose document frequency would be 0.

    Args:
        docList: Dictionary mapping document id -> raw document text.
            NOTE(review): inverseDocFre() takes the collection size from the
            global ``documents``, so ``docList`` is expected to be that
            collection.

    Returns:
        Dictionary mapping document id -> {word: tf-idf score}, with an entry
        for every vocabulary word (the score is 0 when the word does not occur
        in the document).
    """
    vocab.clear()
    seen_words = set()  # constant-time membership test; `vocab` keeps the order

    tf_dictionary = {}
    for key, value in docList.items():
        words = removePuncs(wordList(value))

        for word in words:
            if word not in seen_words:
                seen_words.add(word)
                vocab.append(word)

        tf_dictionary[key] = termFrequencyInDoc(words)

    idf_corpus = inverseDocFre(wordDocFre(tf_dictionary), 2)

    # C(W_i, doc) * IDF(W_i) for every vocabulary word; a word that is absent
    # from a document has a count of 0 there.
    tf_idf_docs = {}
    for doc_id, doc in tf_dictionary.items():
        tf_idf_docs[doc_id] = {word: doc.get(word, 0) * idf_corpus[word] for word in vocab}

    return tf_idf_docs
    


Write the code for the VSM. The function header
should look like this: vectorSpaceModel(query, tf_idf_docs)

In [10]:
def vectorSpaceModel(query , tf_idf_docs):
    """Score every document against a query with the dot-product similarity.

    sim(q, d) = sum_i x_i * y_i, where x_i is the count of word W_i in the
    cleaned query and y_i is the TF-IDF weight of W_i in the document.

    Args:
        query: Query text.
        tf_idf_docs: Dictionary mapping document id -> {word: tf-idf score}.

    Returns:
        Dictionary mapping each document id of ``tf_idf_docs`` to its relevance
        score (0 when no query word occurs in the document's dictionary).
    """
    # x_i: count the *cleaned* query words, so that a word written with
    # punctuation in the query (e.g. "food,") is still counted.
    query_wc = Counter(removePuncs(wordList(query)))

    relevance_scores = {}  # doc_id -> relevance score
    for doc_id, doc_weights in tf_idf_docs.items():
        score = 0  # stays the integer 0 when nothing matches
        # Iterate over the distinct query words: the count already accounts for
        # repetitions, so a repeated word must not be added once per occurrence.
        for word, count in query_wc.items():
            if word in doc_weights:
                score += count * doc_weights[word]
        relevance_scores[doc_id] = score

    return relevance_scores

    

Queries on which to run the VSM

In [11]:
# Free-text queries that are scored against the collection, one per line.
queries = [
    "LDA",
    "Topic modelling",
    "Generative models",
    "Semantic relationships between terms",
    "Natural Language Processing",
    "Text Mining",
    "Translation model",
    "Learning procedures for the lexicon",
    "Semantic evaluations",
    "System results and combination",
]


Main Function

In [12]:
# Weight the collection once, then score and rank the documents for every query.
tf_idf_docs = (tfidf(documents))
for query in queries:
    relevance = vectorSpaceModel(query , tf_idf_docs)  # doc_id -> relevance score
    # Wrapped in a Counter only for most_common(), which orders the doc ids by descending score.
    relevance_scores = Counter(relevance)
    
    # Finding 5 highest values
    highest = relevance_scores.most_common(5)
    print(f"Query : {query} \nResult : {highest}")

Query : LDA 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Topic modelling 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Generative models 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Semantic relationships between terms 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Natural Language Processing 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Text Mining 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Translation model 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Learning procedures for the lexicon 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : Semantic evaluations 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
Query : System results and combination 
Result : [('d1', 0), ('d2', 0), ('d3', 0), ('d4', 0), ('d5', 0)]
