1.Parse the document to extract the data in the XML's < raw > tag

In [15]:
import os
from collections import defaultdict
from KafNafParserPy import KafNafParser

# Directory containing the NAF files
directory = './WES-Dataset/docs/'
documents = defaultdict(lambda: defaultdict(list))

# Loop over all files in the directory
for filename in os.listdir(directory):
    # Check if the file is a NAF file
    if filename.endswith('.naf'):
        # Full path to the NAF file
        filepath = os.path.join(directory, filename)

        # Open and parse the NAF file
        with open(filepath, 'r') as file:
            naf_obj = KafNafParser(file)

            # find the <raw> tag and extract its text content
            raw_data = naf_obj.get_raw()
            documents[filename] = raw_data



2. Tokenise the documents’ content

In [16]:
import nltk
nltk.download('punkt')
for doc in documents:
    raw_data = documents[doc]
    if(raw_data is not None):
        tokens = nltk.word_tokenize(raw_data)
        documents[doc] = tokens

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/malcolmborg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


3. Perform case-folding, stop-word removal and stemming 

In [17]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

def stem(token):
    if token.endswith('s') or token.endswith('es'):
        return token[:-1]
    elif token.endswith('ed'):
        return token[:-2]
    elif token.endswith('ing'):
        return token[:-3]
    return token
for doc in documents:
    tokens = documents[doc]    
    if(tokens is not None and len(tokens) > 0):
        processed_tokens = [stem(token.lower()) for token in tokens if token.lower() not in stopwords]
        documents[doc] = processed_tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malcolmborg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


4.Build the term by document matrix containing the TF.IDF weight
for each term within each document

In [18]:

import math

term_frequencies = defaultdict(lambda: defaultdict(int))
document_frequencies = defaultdict(int)

# looping over documents and tokens to get the frequencies
for doc in documents:
    tokens = documents[doc]
    for token in tokens:
        term_frequencies[doc][token] += 1
        document_frequencies[token] += 1

tfidf_matrix = defaultdict(lambda: defaultdict(float))
for document in term_frequencies:
    for token in term_frequencies[document]:
        tf = term_frequencies[document][token]
        df = document_frequencies[token]
        tfidf_matrix[document][token] = tf * math.log(len(documents) / df)


Get a user query – note that it can be set within the notebook directly.
into a variable named query


In [19]:
user_input = input("Search for something: ")


Preprocess the user query (tokenisation, case-folding, stop-word re-
moval and stemming)

In [20]:
input_tokens = nltk.word_tokenize(user_input)
processed_input = [stem(token.lower()) for token in input_tokens if token.lower() not in stopwords]

Use cosine similarity to calculate the similarity between the query and
each document

In [21]:
# calculate query vector
query_vector = [tfidf_matrix[term][tok] for tok in input_tokens for term in tfidf_matrix]
cosine_similarities = {}

for document in documents:
    # calculate document vector for each token in document
    doc_vector = [tfidf_matrix[term][doc_tok] for term in tfidf_matrix for doc_tok in documents[document]]

    dot_product = sum(query_v * doc_v for query_v, doc_v in zip(query_vector, doc_vector))
    query_norm = math.sqrt(sum(q ** 2 for q in query_vector))
    doc_norm = math.sqrt(sum(d ** 2 for d in doc_vector))
    if query_norm != 0 and doc_norm != 0:
        cosine_similarity = dot_product / (query_norm * doc_norm)
        cosine_similarities[document] = cosine_similarity

for sim in cosine_similarities:
    print(sim, cosine_similarities[sim])

<generator object <genexpr> at 0x10d489080>
wes2015.d038.naf -0.00482161822085204
wes2015.d004.naf -0.007748624752264572
wes2015.d010.naf 0.0
wes2015.d206.naf 0.0
wes2015.d212.naf -0.001016514629595005
wes2015.d158.naf 0.0
wes2015.d170.naf 3.888832091169908e-05
wes2015.d164.naf -1.724870163278905e-05
wes2015.d165.naf 0.0
wes2015.d171.naf -0.0006725084605294279
wes2015.d159.naf 0.0
wes2015.d213.naf -0.0016945737967813993
wes2015.d207.naf -0.0024213325544128568
wes2015.d011.naf 0.0
wes2015.d005.naf 2.4819027253925947e-05
wes2015.d039.naf -0.0013848017481472117
wes2015.d013.naf -1.6397467362733867e-05
wes2015.d007.naf -0.0013593449541020079
wes2015.d239.naf 0.0
wes2015.d211.naf -2.6234457870197883e-06
wes2015.d205.naf -1.0579769971076862e-05
wes2015.d167.naf 0.0
wes2015.d173.naf 0.0
wes2015.d198.naf 0.0
wes2015.d199.naf -0.0007254936665240031
wes2015.d172.naf -2.6531398827803605e-06
wes2015.d166.naf -0.0006635246810432776
wes2015.d204.naf 2.726526021724209e-06
wes2015.d210.naf -0.00058867