# Importing necessary packages to perform document indexing and querying tasks.

In [1]:
import os
import nltk
import json
import math # for custom TF.IDF measure
import random # to choose a random example query (if needed)
import numpy as np # to create cosine similarity measure
import xml.etree.ElementTree as ET # to extract content from specific XML tag

nltk.download ("punkt")
nltk.download ("stopwords")
nltk.download ("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mvass\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mvass\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mvass\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Document Indexing Part

Getting a list of all the documents that are found under the "dataset" subfolder.

In [2]:
# Setting the file path for all documents
docs_file_path = "dataset/docs-raw-texts"

# Getting the documents found under the path set above
docs = [f for f in os.listdir(docs_file_path)]

# Printing the filenames of the documents (for testing)
docs

['wes2015.d001.naf',
 'wes2015.d002.naf',
 'wes2015.d003.naf',
 'wes2015.d004.naf',
 'wes2015.d005.naf',
 'wes2015.d006.naf',
 'wes2015.d007.naf',
 'wes2015.d008.naf',
 'wes2015.d009.naf',
 'wes2015.d010.naf',
 'wes2015.d011.naf',
 'wes2015.d012.naf',
 'wes2015.d013.naf',
 'wes2015.d014.naf',
 'wes2015.d015.naf',
 'wes2015.d016.naf',
 'wes2015.d017.naf',
 'wes2015.d018.naf',
 'wes2015.d019.naf',
 'wes2015.d020.naf',
 'wes2015.d021.naf',
 'wes2015.d022.naf',
 'wes2015.d023.naf',
 'wes2015.d024.naf',
 'wes2015.d025.naf',
 'wes2015.d026.naf',
 'wes2015.d027.naf',
 'wes2015.d028.naf',
 'wes2015.d029.naf',
 'wes2015.d030.naf',
 'wes2015.d031.naf',
 'wes2015.d032.naf',
 'wes2015.d033.naf',
 'wes2015.d034.naf',
 'wes2015.d035.naf',
 'wes2015.d036.naf',
 'wes2015.d037.naf',
 'wes2015.d038.naf',
 'wes2015.d039.naf',
 'wes2015.d040.naf',
 'wes2015.d041.naf',
 'wes2015.d042.naf',
 'wes2015.d043.naf',
 'wes2015.d044.naf',
 'wes2015.d045.naf',
 'wes2015.d046.naf',
 'wes2015.d047.naf',
 'wes2015.d04

## Step 1 - Declaring a function to parse a document and extract the data from its < raw > tag

Creating a function to parse the XML from a document and extract the content from < raw > tag.

In [3]:
def parse_doc_xml(xml_file_path):
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Getting the content from the <raw> tag in the XML document
    raw_data = root.find(".//raw").text

    return raw_data

Parsing all the documents found in the dataset subfolder. The content of the < raw > tag in each document is stored in a list.

In [4]:
# Creating a list to store the content of the < raw > tag of each document
docs_xml = []

# Looping through the list of documents
for doc in docs:
    # Setting the file path of each document
    xml_file_path = docs_file_path+ '/' +str(doc)
    
    # Extracting the xml data using the parse_doc_xml() function declared earlier
    raw_data = parse_doc_xml(xml_file_path)
    
    # Adding the extracted content to the list
    docs_xml.append(raw_data)

Printing the contents of the raw tag - for testing purposes

In [5]:
print(docs_xml[6])

Bernard Mandeville and the Fable of the Bees.

Bernard Mandeville’s – The Fable of Bees.  On November 15, 1670, Dutch philosopher, political economist and satirist Bernard Mandeville was born. He became famous for The Fable of the Bees, a satire that suggests many key principles of economic thought, including division of labor and the “invisible hand“, seventy years before these concepts were more thoroughly elucidated by Adam Smith. Not very much is known about the life of Bernard Mandeville. He probably grew up in Rotterdam, Netherlands and was the son of a physician. He enrolled at Leiden University and produced his thesis De brutorum operationibus in 1689. In it, Mandeville advocated the Cartesian theory of automatism among animals. He received his degree in medicine in 1691 and his disputation was titled De chylosi vitiata. He became a well known and respected physician and produced several literary works as well, which were considered just as successful. The Grumbling Hive was pr

## Step 2 - Tokenising the documents' content (from the < raw > tag)

In [6]:
# Creating a list to store the tokens from each document
xml_docs_tokens = []

# Looping through each item in the docs_xml list
for xml_part in docs_xml:
    # Extracting the different tokens from the text within each element in the list
    data_tokens = nltk.tokenize.word_tokenize(xml_part)
    
    # Adding the tokens to the new list
    xml_docs_tokens.append(data_tokens)

# Printing the tokens that got extracted for a specific document (for testing purposes)
print(xml_docs_tokens[6])

['Bernard', 'Mandeville', 'and', 'the', 'Fable', 'of', 'the', 'Bees', '.', 'Bernard', 'Mandeville', '’', 's', '–', 'The', 'Fable', 'of', 'Bees', '.', 'On', 'November', '15', ',', '1670', ',', 'Dutch', 'philosopher', ',', 'political', 'economist', 'and', 'satirist', 'Bernard', 'Mandeville', 'was', 'born', '.', 'He', 'became', 'famous', 'for', 'The', 'Fable', 'of', 'the', 'Bees', ',', 'a', 'satire', 'that', 'suggests', 'many', 'key', 'principles', 'of', 'economic', 'thought', ',', 'including', 'division', 'of', 'labor', 'and', 'the', '“', 'invisible', 'hand', '“', ',', 'seventy', 'years', 'before', 'these', 'concepts', 'were', 'more', 'thoroughly', 'elucidated', 'by', 'Adam', 'Smith', '.', 'Not', 'very', 'much', 'is', 'known', 'about', 'the', 'life', 'of', 'Bernard', 'Mandeville', '.', 'He', 'probably', 'grew', 'up', 'in', 'Rotterdam', ',', 'Netherlands', 'and', 'was', 'the', 'son', 'of', 'a', 'physician', '.', 'He', 'enrolled', 'at', 'Leiden', 'University', 'and', 'produced', 'his', 'th

## Step 3 - Extraction of document index terms

### Step 3.1 - Performing case folding

In [7]:
# Looping through each item in the xml_docs_tokens_list
for i in range(0, len(xml_docs_tokens)):
    # Converting each of the data tokens to lowercase
    lowercase_data_tokens = [dt.lower() for dt in xml_docs_tokens[i]]
    xml_docs_tokens[i] = lowercase_data_tokens

# Printing the new extracted tokens from a single document (for testing purposes)
print(xml_docs_tokens[6])

['bernard', 'mandeville', 'and', 'the', 'fable', 'of', 'the', 'bees', '.', 'bernard', 'mandeville', '’', 's', '–', 'the', 'fable', 'of', 'bees', '.', 'on', 'november', '15', ',', '1670', ',', 'dutch', 'philosopher', ',', 'political', 'economist', 'and', 'satirist', 'bernard', 'mandeville', 'was', 'born', '.', 'he', 'became', 'famous', 'for', 'the', 'fable', 'of', 'the', 'bees', ',', 'a', 'satire', 'that', 'suggests', 'many', 'key', 'principles', 'of', 'economic', 'thought', ',', 'including', 'division', 'of', 'labor', 'and', 'the', '“', 'invisible', 'hand', '“', ',', 'seventy', 'years', 'before', 'these', 'concepts', 'were', 'more', 'thoroughly', 'elucidated', 'by', 'adam', 'smith', '.', 'not', 'very', 'much', 'is', 'known', 'about', 'the', 'life', 'of', 'bernard', 'mandeville', '.', 'he', 'probably', 'grew', 'up', 'in', 'rotterdam', ',', 'netherlands', 'and', 'was', 'the', 'son', 'of', 'a', 'physician', '.', 'he', 'enrolled', 'at', 'leiden', 'university', 'and', 'produced', 'his', 'th

Filtering the tokens to only include alphabetical tokens

In [8]:
# Removing any non-alphabetical data tokens from each element in the list
for i in range(0, len(xml_docs_tokens)):
    alpha_data_tokens = [t for t in xml_docs_tokens[i] if t.isalpha()]
    xml_docs_tokens[i] = alpha_data_tokens

# Printing the new extracted tokens from a single document (for testing purposes)
print(xml_docs_tokens[6])

['bernard', 'mandeville', 'and', 'the', 'fable', 'of', 'the', 'bees', 'bernard', 'mandeville', 's', 'the', 'fable', 'of', 'bees', 'on', 'november', 'dutch', 'philosopher', 'political', 'economist', 'and', 'satirist', 'bernard', 'mandeville', 'was', 'born', 'he', 'became', 'famous', 'for', 'the', 'fable', 'of', 'the', 'bees', 'a', 'satire', 'that', 'suggests', 'many', 'key', 'principles', 'of', 'economic', 'thought', 'including', 'division', 'of', 'labor', 'and', 'the', 'invisible', 'hand', 'seventy', 'years', 'before', 'these', 'concepts', 'were', 'more', 'thoroughly', 'elucidated', 'by', 'adam', 'smith', 'not', 'very', 'much', 'is', 'known', 'about', 'the', 'life', 'of', 'bernard', 'mandeville', 'he', 'probably', 'grew', 'up', 'in', 'rotterdam', 'netherlands', 'and', 'was', 'the', 'son', 'of', 'a', 'physician', 'he', 'enrolled', 'at', 'leiden', 'university', 'and', 'produced', 'his', 'thesis', 'de', 'brutorum', 'operationibus', 'in', 'in', 'it', 'mandeville', 'advocated', 'the', 'cart

### Step 3.2 - Performing stop-word removal

In [9]:
# Getting a list of English stop-words
stopwords = nltk.corpus.stopwords.words("english")

# Removing the stop-words from our tokens from each element in the list
for i in range(0, len(xml_docs_tokens)):
    filt_data_tokens = [t for t in xml_docs_tokens[i] if (not t in stopwords)]
    xml_docs_tokens[i] = filt_data_tokens

# Printing the filtered tokens (for testing purposes)
print(xml_docs_tokens[6])

['bernard', 'mandeville', 'fable', 'bees', 'bernard', 'mandeville', 'fable', 'bees', 'november', 'dutch', 'philosopher', 'political', 'economist', 'satirist', 'bernard', 'mandeville', 'born', 'became', 'famous', 'fable', 'bees', 'satire', 'suggests', 'many', 'key', 'principles', 'economic', 'thought', 'including', 'division', 'labor', 'invisible', 'hand', 'seventy', 'years', 'concepts', 'thoroughly', 'elucidated', 'adam', 'smith', 'much', 'known', 'life', 'bernard', 'mandeville', 'probably', 'grew', 'rotterdam', 'netherlands', 'son', 'physician', 'enrolled', 'leiden', 'university', 'produced', 'thesis', 'de', 'brutorum', 'operationibus', 'mandeville', 'advocated', 'cartesian', 'theory', 'automatism', 'among', 'animals', 'received', 'degree', 'medicine', 'disputation', 'titled', 'de', 'chylosi', 'vitiata', 'became', 'well', 'known', 'respected', 'physician', 'produced', 'several', 'literary', 'works', 'well', 'considered', 'successful', 'grumbling', 'hive', 'probably', 'published', 'pub

### Step 3.3 - Performing stemming

In [10]:
# Declaring an initialising a stemmer
stemmer = nltk.stem.PorterStemmer()

#Stemming each token found in the document
for i in range(0, len(xml_docs_tokens)):
    stemmed_tokens = [stemmer.stem(fdt) for fdt in xml_docs_tokens[i]]
    xml_docs_tokens[i] = stemmed_tokens

# Printing the stemmed tokens from a single document (for testing purposes)
print(xml_docs_tokens[6])

['bernard', 'mandevil', 'fabl', 'bee', 'bernard', 'mandevil', 'fabl', 'bee', 'novemb', 'dutch', 'philosoph', 'polit', 'economist', 'satirist', 'bernard', 'mandevil', 'born', 'becam', 'famou', 'fabl', 'bee', 'satir', 'suggest', 'mani', 'key', 'principl', 'econom', 'thought', 'includ', 'divis', 'labor', 'invis', 'hand', 'seventi', 'year', 'concept', 'thoroughli', 'elucid', 'adam', 'smith', 'much', 'known', 'life', 'bernard', 'mandevil', 'probabl', 'grew', 'rotterdam', 'netherland', 'son', 'physician', 'enrol', 'leiden', 'univers', 'produc', 'thesi', 'de', 'brutorum', 'operationibu', 'mandevil', 'advoc', 'cartesian', 'theori', 'automat', 'among', 'anim', 'receiv', 'degre', 'medicin', 'disput', 'titl', 'de', 'chylosi', 'vitiata', 'becam', 'well', 'known', 'respect', 'physician', 'produc', 'sever', 'literari', 'work', 'well', 'consid', 'success', 'grumbl', 'hive', 'probabl', 'publish', 'publish', 'famou', 'name', 'fabl', 'bee', 'privat', 'vice', 'public', 'benefit', 'next', 'mention', 'poem

## Step 4 - Build the term by document matrix containing the TF.IDF weight for each term within each document

### Step 4.1 - Calculating TF (term frequency) of each term

In [11]:
# Creating an empty lists of dictionaries to calculate TF (term frequency)
tf_dict = []

# Calculate the frequency of each token in every document
for i in range(0, len(xml_docs_tokens)):
    doc_dict = dict()
    for token in xml_docs_tokens[i]:  # a new token encountered
        if token not in doc_dict:
            doc_dict.update({token: 1})
        else:  # existing token
            count = doc_dict.get(token)
            new_count = count + 1
            doc_dict.update({token: new_count})
    tf_dict.append(doc_dict)


# Printing the contents of the dictionary (for testing purposes)
print(tf_dict[6])

{'bernard': 4, 'mandevil': 10, 'fabl': 4, 'bee': 6, 'novemb': 1, 'dutch': 1, 'philosoph': 1, 'polit': 1, 'economist': 1, 'satirist': 1, 'born': 1, 'becam': 2, 'famou': 3, 'satir': 1, 'suggest': 1, 'mani': 1, 'key': 2, 'principl': 2, 'econom': 3, 'thought': 3, 'includ': 2, 'divis': 2, 'labor': 1, 'invis': 3, 'hand': 2, 'seventi': 1, 'year': 1, 'concept': 1, 'thoroughli': 1, 'elucid': 1, 'adam': 3, 'smith': 4, 'much': 1, 'known': 2, 'life': 1, 'probabl': 2, 'grew': 1, 'rotterdam': 1, 'netherland': 1, 'son': 1, 'physician': 2, 'enrol': 1, 'leiden': 1, 'univers': 1, 'produc': 2, 'thesi': 1, 'de': 2, 'brutorum': 1, 'operationibu': 1, 'advoc': 1, 'cartesian': 1, 'theori': 1, 'automat': 1, 'among': 1, 'anim': 1, 'receiv': 1, 'degre': 1, 'medicin': 1, 'disput': 1, 'titl': 1, 'chylosi': 1, 'vitiata': 1, 'well': 3, 'respect': 1, 'sever': 1, 'literari': 1, 'work': 2, 'consid': 1, 'success': 1, 'grumbl': 1, 'hive': 1, 'publish': 2, 'name': 1, 'privat': 1, 'vice': 2, 'public': 3, 'benefit': 5, 'nex

### Step 4.2 - Calculating IDF (inverse document frequency) for each document per term

#### Step 4.2.1 - Calculating DF (number of documents containing a specific term)

In [12]:
# Setting the num_of_docs variable to be equal to the number of documents found in the docs list
num_of_docs = len(docs)

# Creating a new dictionary to store the number of times each token appears in its respective document
df_dict = dict()

# Variable used for every new token encountered
previous_i = 0

for i in range(0, len(xml_docs_tokens)):
    doc_tokens = tf_dict[i].keys()
    for token in doc_tokens:
        if token not in df_dict:
            previous_i = i
            df_dict.update({token: 1})
        elif token in df_dict and previous_i != i:
            count = df_dict.get(token)
            new_count = count + 1
            df_dict.update({token: new_count})

# Printing the number of documents to make sure that it is accurate
print(num_of_docs)

331


#### Step 4.2.2 - Calculating N/DF(x), where N stands for the number of documents and x stands for the term for which the process is being done.

In [13]:
ndf_dict = dict()

for x in df_dict:
    token_df = df_dict[x]
    df_new = num_of_docs/token_df
    ndf_dict[x] = df_new

    
# Printing the contents from the dictionary (for testing purposes)
#print(ndf_dict)

#### Step 4.2.3 - Calculating log of the answer for each term calculated previosuly to get IDF

In [14]:
idf_dict = dict()

for x in ndf_dict:
    token_ndf = ndf_dict[x]
    token_idf = math.log10(token_ndf)
    idf_dict[x] = token_idf

    
# Printing the contents from the dictionary (for testing purposes)
#print(idf_dict)

### Step 4.3 - Mutliply TF and IDF together to get weighting for each term

In [15]:
# Creating an list of dictionaries to calculate weighting
weighting_dict = tf_dict.copy()

for i in range(0, len(weighting_dict)):
    for token in weighting_dict[i]:
        token_tf = weighting_dict[i].get(token)
        token_idf = idf_dict.get(token)
        token_weighting = token_tf * token_idf
        weighting_dict[i][token] = token_weighting

Printing the data from any specific document (for testing purposes only) - done in a seperate cell to avoid modifying any values when the above is mistakenly run more than once.

In [16]:
print(weighting_dict[6])

{'bernard': 7.671072009791025, 'mandevil': 25.19827993775719, 'fabl': 10.079311975102875, 'bee': 15.118967962654313, 'novemb': 2.519827993775719, 'dutch': 2.519827993775719, 'philosoph': 1.9177680024477564, 'polit': 2.0427067390560563, 'economist': 2.519827993775719, 'satirist': 2.519827993775719, 'born': 1.2410743928228898, 'becam': 3.6417159788794, 'famou': 5.753304007343269, 'satir': 2.519827993775719, 'suggest': 2.2187979981117376, 'mani': 2.519827993775719, 'key': 5.039655987551438, 'principl': 5.039655987551438, 'econom': 7.559483981327157, 'thought': 7.559483981327157, 'includ': 5.039655987551438, 'divis': 5.039655987551438, 'labor': 2.519827993775719, 'invis': 7.559483981327157, 'hand': 4.437595996223475, 'seventi': 2.519827993775719, 'year': 1.9177680024477564, 'concept': 2.519827993775719, 'thoroughli': 2.519827993775719, 'elucid': 2.519827993775719, 'adam': 7.559483981327157, 'smith': 10.079311975102875, 'much': 2.519827993775719, 'known': 3.1311709686727878, 'life': 1.74167

# Querying Part

In [17]:
# Setting the file path for the set of queries in case that the program is set to get a random query from the path set
queries_file_path = "dataset/queries-raw-texts"

# Getting the example queries found under the path set above
queries = [f for f in os.listdir(queries_file_path)]

# Printing the filenames of the example queries (for testing)
queries

['wes2015.q01.naf',
 'wes2015.q02.naf',
 'wes2015.q03.naf',
 'wes2015.q04.naf',
 'wes2015.q06.naf',
 'wes2015.q07.naf',
 'wes2015.q08.naf',
 'wes2015.q09.naf',
 'wes2015.q10.naf',
 'wes2015.q12.naf',
 'wes2015.q13.naf',
 'wes2015.q14.naf',
 'wes2015.q16.naf',
 'wes2015.q17.naf',
 'wes2015.q18.naf',
 'wes2015.q19.naf',
 'wes2015.q22.naf',
 'wes2015.q23.naf',
 'wes2015.q24.naf',
 'wes2015.q25.naf',
 'wes2015.q26.naf',
 'wes2015.q27.naf',
 'wes2015.q28.naf',
 'wes2015.q29.naf',
 'wes2015.q32.naf',
 'wes2015.q34.naf',
 'wes2015.q36.naf',
 'wes2015.q37.naf',
 'wes2015.q38.naf',
 'wes2015.q40.naf',
 'wes2015.q41.naf',
 'wes2015.q42.naf',
 'wes2015.q44.naf',
 'wes2015.q45.naf',
 'wes2015.q46.naf']

## Step 1 - Get a user query.

Creating a function to parse the XML from a query and extract the content from < raw > tag - used only to get example query

In [18]:
def parse_query_xml(xml_file_path):
    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Find the user query within the XML structure
    raw_element = root.find('.//raw')

    # Check if the raw element is found
    if raw_element is not None:
        # Get the text content of the raw element
        query = raw_element.text
        return query
    else:
        return "Raw element not found in the XML structure."

Parsing the query. The content of the < nafHeader > tag in each document is stored in a variable called query. A flag is used to tell program if we want a query from the queries subfolder or get the query from user input.

In [19]:
# Boolean which informs program whether to get random query from queries subfolder
get_random_query = False

if (get_random_query):    # get random query
    index = random.randint(0, len(queries)-1)
    print("Index chosen: "+str(index))

    xml_file_path = queries_file_path+ '/' +str(queries[index])

    query = parse_query_xml(xml_file_path)
else:    # ask user to enter query
    query = input("Enter a query: ")

# Printing the query
print(query)

Enter a query: Astronomy
Astronomy


## Step 2 - Preprocess the query  (tokenisation, case-folding, stop-word removal and stemming).

## Step 2.1 - Performing tokenisation

In [20]:
# Extracting the different tokens from the query
query_tokens = nltk.tokenize.word_tokenize(query)

# Printing the tokens that got extracted (for testing purposes)
print(query_tokens)

['Astronomy']


## Step 2.2 - Performing case folding

In [21]:
# Converting all the query tokens to lowercase
lowercase_query_tokens = [qt.lower() for qt in query_tokens]

# Printing the new tokens
print(lowercase_query_tokens)

['astronomy']


Filtering the tokens to only include alphabetical tokens

In [22]:
# Removing any non-alphabetical query tokens
alpha_query_tokens = [at for at in lowercase_query_tokens if at.isalpha ()]

# Printing the new tokens
print(alpha_query_tokens)

['astronomy']


## Step 2.3 - Performing stop-word removal

In [23]:
# Getting a list of English stop-words
stopwords = nltk.corpus.stopwords.words("english")

# Removing the stop-words from our tokens
filt_query_tokens = [ft for ft in alpha_query_tokens if (not ft in stopwords)]

# Printing the filtered tokens
print(filt_query_tokens)

['astronomy']


## Step 2.4 - Performing stemming

In [24]:
# Declaring and initialising a stemmer
stemmer = nltk.stem.PorterStemmer()

stemmed_query_tokens = [stemmer.stem(t) for t in filt_query_tokens]
print(stemmed_query_tokens);

['astronomi']


## Step 3 - Using cosine similarity to calculate the similarity between query and rach document.

Creating several functions in order to perform several calculations (i.e. cosine similarity, creating a vector and creating a vocabulary).

In [25]:
def calculate_cosine_similarity(query_vector, document_vector):
    # Calculate cosine similarity between two vectors
    dot_product = np.dot(query_vector, document_vector)
    norm_query = np.linalg.norm(query_vector)
    norm_document = np.linalg.norm(document_vector)

    # Avoid division by zero
    if norm_query == 0 or norm_document == 0:
        return 0.0

    cosine_similarity = dot_product / (norm_query * norm_document)
    return cosine_similarity

def create_vector(text, vocabulary):
    # Create a vector representation of the text based on a vocabulary
    vector = np.zeros(len(vocabulary))
    for word in text:
        if word in vocabulary:
            vector[vocabulary[word]] += 1
    return vector

def create_vocabulary(texts):
    # Create a vocabulary based on a list of texts
    vocabulary = {}
    for text in texts:
        for word in text:
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
    return vocabulary

Using the above functions to calculate the similarity between user query and each document.

In [26]:
# Creating a list storing the similarity scores since this will be used when outputting list of documents in order
similarity_list = []

In [27]:
# Create a vocabulary
for i in range(0, len(xml_docs_tokens)):
    all_texts = [stemmed_query_tokens] + [xml_docs_tokens[i]]
    
    vocabulary = create_vocabulary(all_texts)

    # Create vectors for user query and documents
    query_vector = create_vector(stemmed_query_tokens, vocabulary)
    document_vectors = create_vector(xml_docs_tokens[i], vocabulary)

    # Calculate cosine similarity with each document
    similarity_scores = calculate_cosine_similarity(query_vector, document_vectors)
    similarity_list.append(similarity_scores)
    print("Similarity Score with Document "+str((i+1))+": "+str(similarity_scores))

Similarity Score with Document 1: 0.0
Similarity Score with Document 2: 0.0
Similarity Score with Document 3: 0.0
Similarity Score with Document 4: 0.0
Similarity Score with Document 5: 0.0
Similarity Score with Document 6: 0.0
Similarity Score with Document 7: 0.0
Similarity Score with Document 8: 0.0
Similarity Score with Document 9: 0.0
Similarity Score with Document 10: 0.0
Similarity Score with Document 11: 0.0
Similarity Score with Document 12: 0.0
Similarity Score with Document 13: 0.0
Similarity Score with Document 14: 0.0
Similarity Score with Document 15: 0.0
Similarity Score with Document 16: 0.0
Similarity Score with Document 17: 0.0
Similarity Score with Document 18: 0.0
Similarity Score with Document 19: 0.0
Similarity Score with Document 20: 0.0
Similarity Score with Document 21: 0.0
Similarity Score with Document 22: 0.0
Similarity Score with Document 23: 0.0
Similarity Score with Document 24: 0.0
Similarity Score with Document 25: 0.0
Similarity Score with Document 26:

Similarity Score with Document 263: 0.023796037813647444
Similarity Score with Document 264: 0.0
Similarity Score with Document 265: 0.031992834407549194
Similarity Score with Document 266: 0.0
Similarity Score with Document 267: 0.0
Similarity Score with Document 268: 0.0
Similarity Score with Document 269: 0.0
Similarity Score with Document 270: 0.0
Similarity Score with Document 271: 0.0
Similarity Score with Document 272: 0.0
Similarity Score with Document 273: 0.0
Similarity Score with Document 274: 0.07357118927777097
Similarity Score with Document 275: 0.0
Similarity Score with Document 276: 0.0
Similarity Score with Document 277: 0.0
Similarity Score with Document 278: 0.0
Similarity Score with Document 279: 0.0
Similarity Score with Document 280: 0.0
Similarity Score with Document 281: 0.04981354813867179
Similarity Score with Document 282: 0.09467916046467048
Similarity Score with Document 283: 0.0
Similarity Score with Document 284: 0.0
Similarity Score with Document 285: 0.

## Step 4 - Outputting the list of documents as a ranked list.

Creating a function, similar to the one used to parse the XML from a document, but this time to the content from < fileDesc > tag under the < nafHeader > tag.

In [28]:
def parse_doc_title(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    # This is assuming that the document only contains one <nafHeader> tag in the XML document
    file_desc_element = root.find(".//nafHeader/fileDesc")
    
    # Getting the title from the <nafHeader> tag
    raw_data = file_desc_element.get("title")

    return raw_data

Parsing all the documents found in the dataset subfolder. The content of the < fileDesc > tag in each document is stored in a list.

In [29]:
# Creating a list to store the document titles
docs_titles = []

# Loop through each document to get its title
for doc in docs:
    file_path = docs_file_path+ '/' +str(doc)
    
    raw_data = parse_doc_title(file_path)
    
    docs_titles.append(raw_data)

Printing the contents of the fileDesc tag - for testing purposes

In [30]:
print(docs_titles[6])

Bernard Mandeville and the Fable of the Bees


Creating a dictionary which stores the document titles and their respective similarity scores. This is useful when actually sorting the titles by their score.

In [31]:
score_dict = dict()

for i in range(0, len(docs_titles)):
    score_dict.update({docs_titles[i]: similarity_list[i]})


# Printing the contents from the dictionary (for testing purposes)
#print(score_dict)

Sorting the similarity scores list in descending order.

In [32]:
# Sort dictionary in descending order
doc_titles_ranked = sorted(score_dict.items(), key=lambda x:x[1], reverse=True)

# Output the ranked list of documents (including those with 0 similarity)
counter = 1

print("Ranked List of Documents:")

# Looping through the dictionary and print each document in order with its score
for x,y in doc_titles_ranked:
    print(str(counter)+ ". Document: "+str(x)+ "\nSimilarity Score: "+str(y)+"\n")
    counter += 1

Ranked List of Documents:
1. Document: Johannes Hevelius and his Selenographia
Similarity Score: 0.18578433666678973

2. Document: Fred Whipple and the Dirty Snowballs
Similarity Score: 0.14285714285714285

3. Document: Johann Heinrich von Mädler and the First Accurate Map of the Moon
Similarity Score: 0.12898007411770432

4. Document: Jean Picard and his Love for Accuracy
Similarity Score: 0.12869789041755736

5. Document: Pierre Mechain and the Meridian Survey Expedition
Similarity Score: 0.1281682672224003

6. Document: Sir Bernard Lovell and the Radioastronomy
Similarity Score: 0.12216944435630522

7. Document: Abu Ma’shar al-Balkhi – The Prince of Astrologers
Similarity Score: 0.10297110457020561

8. Document: Friedrich Bessel and the Distances of Stars
Similarity Score: 0.10222859593214292

9. Document: Heinrich Olbers and the Olbers’ Paradox
Similarity Score: 0.10063092108532552

10. Document: Ulugh Beg – Astronomer
Similarity Score: 0.09467916046467048

11. Document: Eudoxus an