<a href="https://colab.research.google.com/github/KrishanAI-ML/IR_DCU_Project-1/blob/main/IR_system_Cranfield.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import os
import nltk
import math
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
from operator import itemgetter
from nltk.tokenize import word_tokenize
import subprocess

# Download the NLTK resources used by the preprocessing step:
#   stopwords - the English stop-word list
#   punkt     - tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab - tokenizer tables that recent NLTK releases load instead of punkt
# Requesting both tokenizer packages keeps the notebook runnable regardless of
# which NLTK version the runtime ships with.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(_nltk_resource)

# Mount Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Root folder (on the mounted Google Drive) that holds the project data
data_path = '/content/drive/MyDrive/DCU_IR_PROJECT'

# Input files of the Cranfield collection, all located directly in data_path:
#   data_file  - the document collection
#   query_file - the queries
#   comp_file  - the relevance judgments
data_file, query_file, comp_file = (
    os.path.join(data_path, file_name)
    for file_name in ('cran.all.1400', 'cran.qry', 'cranqrel')
)

# Folders (below data_path) that receive the generated artefacts
Formated_Document = os.path.join(data_path, 'Formated Documents')            # one text file per document
PreProcessed_Documents = os.path.join(data_path, 'PreProcessed_Documents')  # one token file per document
Output_Directory = os.path.join(data_path, 'Output_Directory')              # ranking output files


In [3]:
def load_passage(data_file):
  """Loads the document collection and writes one text file per document.

  The file is split on the literal marker '.I'; anything before the first
  marker is discarded. Within each chunk, the text before the first '.T\n'
  is the document ID and everything after it is kept as the document text
  (so any later field markers remain part of the text). Each document's text
  is also written to '<Formated_Document>/<Document_ID>.txt'.

  Args:
      data_file (str): The path to the file containing passage data.

  Returns:
      dict: Document IDs (str) mapped to their stripped text, in file order.

  Raises:
      ValueError: If a chunk contains no '.T\n' marker.
  """

  # Read the whole file and cut it into per-document chunks
  with open(data_file, 'r', encoding='utf-8') as file:
    passage = file.read().split('.I')[1:]

  # Create the output folder once, up front, instead of re-checking it for
  # every document inside the loop
  os.makedirs(Formated_Document, exist_ok=True)

  Data = {}

  for doc_data in passage:
    # maxsplit=1: only the FIRST '.T\n' separates the ID from the text, so a
    # later occurrence of the marker inside the text no longer breaks unpacking
    Document_ID, TextData = doc_data.split('.T\n', 1)
    Document_ID = Document_ID.strip()
    TextData = TextData.strip()
    Data[Document_ID] = TextData

    # Persist the document text under its ID
    Document_File = os.path.join(Formated_Document, f'{Document_ID}.txt')
    with open(Document_File, 'w', encoding='utf-8') as f:
      f.write(TextData)

  return Data


In [4]:
# Stop-word set and stemmer, built on the first call to PreProcessing_Text and
# reused afterwards (the function is called once per document per query, so
# rebuilding them on every call is pure overhead).
_PREPROCESSING_RESOURCES = {}


def PreProcessing_Text(Document_ID, Text):
  """Preprocesses text data for information retrieval tasks.

  Steps performed on the input text:
      1. Lowercase and tokenize with NLTK word_tokenize.
      2. Drop tokens that appear in NLTK's English stop-word list.
      3. Stem the remaining tokens with the Porter stemmer.
      4. Write the stemmed tokens, space-separated, to
         '<PreProcessed_Documents>/<Document_ID>.txt' (overwriting any
         existing file of that name).

  Args:
      Document_ID (str): The ID of the document being processed; also used as
          the output file name.
      Text (str): The text data to be preprocessed.

  Returns:
      list: The stemmed tokens, in their original order.
  """

  # Load the stop words and create the stemmer only once. Both are built
  # before the cache is touched so a failure leaves the cache empty.
  if not _PREPROCESSING_RESOURCES:
    Loaded = {
        'stopwords': frozenset(stopwords.words('english')),
        'stemmer': PorterStemmer(),
    }
    _PREPROCESSING_RESOURCES.update(Loaded)
  StopWords = _PREPROCESSING_RESOURCES['stopwords']
  Stemmer = _PREPROCESSING_RESOURCES['stemmer']

  # Tokenize the lowercased text, filter stop words, then stem what is left
  Stemmed_Tokens = [
      Stemmer.stem(Token)
      for Token in word_tokenize(Text.lower())
      if Token not in StopWords
  ]

  # Make sure the output folder exists (no error if it already does)
  os.makedirs(PreProcessed_Documents, exist_ok=True)

  # Write the preprocessed text as a space-separated string
  Document_file = os.path.join(PreProcessed_Documents, f'{Document_ID}.txt')
  with open(Document_file, 'w', encoding='utf-8') as f:
    f.write(' '.join(Stemmed_Tokens))

  return Stemmed_Tokens


In [5]:
def Module_Inverted_Index(Text):
  """Creates an inverted index from a dictionary of document data.

  Every document is preprocessed with PreProcessing_Text, and each distinct
  token of a document contributes that document's ID once to the token's
  posting list. Posting lists therefore hold document IDs in the iteration
  order of the input dictionary.

  Args:
      Text (dict): A dictionary containing document IDs and text data.

  Returns:
      defaultdict: The inverted index with tokens as keys and lists of document IDs as values.
  """

  InvertedIndex = defaultdict(list)

  for Document_ID, Paragraph in Text.items():
    # Preprocess the text (tokenize, remove stop words, and stem)
    Tokens = PreProcessing_Text(Document_ID, Paragraph)

    # dict.fromkeys de-duplicates the tokens while keeping first-occurrence
    # order, so each posting list gets the document ID exactly once without
    # scanning the (potentially long) list for every token occurrence
    for Token in dict.fromkeys(Tokens):
      InvertedIndex[Token].append(Document_ID)

  return InvertedIndex


In [6]:
def Calculate_InverseDocumentFrequency(Invert_Index, NumberOfDocuments):
  """Computes a smoothed inverse document frequency for every indexed token.

  For a token whose posting list has length df, the score is

      log(NumberOfDocuments / (df + 1))

  The +1 in the denominator is a smoothing term; as a consequence the score
  is negative for a token whose posting list is at least as long as
  NumberOfDocuments.

  Args:
      Invert_Index (defaultdict): The inverted index with tokens as keys and lists of document IDs as values.
      NumberOfDocuments (int): The total number of documents in the collection.

  Returns:
      dict: Tokens mapped to their IDF scores, in the index's iteration order.
  """

  return {
      Term: math.log(NumberOfDocuments / (len(Postings) + 1))
      for Term, Postings in Invert_Index.items()
  }


In [7]:
def Load_Queries(query_file):
  """Loads queries from a file.

  The file is split on the literal marker '.I'; anything before the first
  marker is discarded. Within each chunk, the text before the first '.W\n'
  is the query ID and everything after it is the query text.

  NOTE(review): the keys are the IDs exactly as written in the file. In the
  commonly distributed Cranfield files these reportedly do not line up with
  the query numbers used in the relevance judgments - confirm for your copy
  before comparing results against the judgments.

  Args:
      query_file (str): The path to the file containing queries.

  Returns:
      dict: Query IDs (str) mapped to their stripped query text, in file order.

  Raises:
      ValueError: If a chunk contains no '.W\n' marker.
  """

  # Read the whole file and cut it into per-query chunks
  with open(query_file, 'r', encoding='utf-8') as file:
    Queries = file.read().split('.I')[1:]

  Queries_Dictionary = {}

  for Query in Queries:
    # maxsplit=1: only the FIRST '.W\n' separates the ID from the text, so a
    # later occurrence of the marker inside the text no longer breaks unpacking
    QueryID, QueryContent = Query.split('.W\n', 1)
    Queries_Dictionary[QueryID.strip()] = QueryContent.strip()

  return Queries_Dictionary


In [8]:
def Calculate_TermFrequency(Query_Token):
  """Counts how often each token occurs in a token sequence.

  Despite the parameter name, the function works on any iterable of tokens.

  Args:
      Query_Token (list): The tokens to count.

  Returns:
      defaultdict: Tokens mapped to their number of occurrences, keyed in
          order of first appearance; unseen tokens read as 0.
  """

  Counts = defaultdict(int)

  for Term in Query_Token:
    # A missing key starts at 0 thanks to defaultdict(int)
    Counts[Term] = Counts[Term] + 1

  return Counts


In [9]:
def Module_Term_Document_Matrix(Documents, Inverted_Index, InverseDocumentFrequency):
  """Builds a sparse term-document matrix of TF-IDF weights.

  Each document is preprocessed with PreProcessing_Text and its tokens are
  counted with Calculate_TermFrequency. For every token that has an IDF
  score, the cell [token][document] is set to

      (occurrences of the token in the document) * (IDF of the token)

  Tokens without an IDF score are left out. Inverted_Index is accepted for
  signature compatibility but is not read by this function.

  Args:
      Documents (dict): A dictionary containing document IDs and text data.
      Inverted_Index (defaultdict): The inverted index (unused here).
      InverseDocumentFrequency (dict): Tokens mapped to their IDF scores.

  Returns:
      defaultdict(dict): token -> {document ID -> TF-IDF weight}.
  """

  Matrix = defaultdict(dict)

  for Document_ID, Document_Content in Documents.items():
    # Raw term counts of this document
    Counts = Calculate_TermFrequency(PreProcessing_Text(Document_ID, Document_Content))

    for Term, Count in Counts.items():
      # Terms without an IDF score get no cell
      if Term not in InverseDocumentFrequency:
        continue
      Matrix[Term][Document_ID] = Count * InverseDocumentFrequency[Term]

  return Matrix


In [10]:
def VSM_DocumentRanking(Query_Tokens, Term_Document_Matrix):
  """Ranks documents by the summed TF-IDF weight of the query's tokens.

  For every token in Query_Tokens (repeated tokens are counted each time
  they occur), the token's weight in each document is added to that
  document's score. Tokens absent from the matrix are ignored, and only
  documents containing at least one query token receive a score.

  Args:
      Query_Tokens (list): A list of tokens representing the query.
      Term_Document_Matrix (defaultdict(dict)): token -> {document ID -> TF-IDF weight}.

  Returns:
      list: (document ID, score) tuples sorted by descending score; documents
          with equal scores keep the order in which they were first scored.
  """

  Scores = defaultdict(float)

  for Term in Query_Tokens:
    # Skip query terms the collection does not contain
    if Term not in Term_Document_Matrix:
      continue
    for Doc, Weight in Term_Document_Matrix[Term].items():
      Scores[Doc] += Weight

  # Stable sort on the score component, best first
  return sorted(Scores.items(), key=lambda Pair: Pair[1], reverse=True)


In [11]:
def Output_File(Query_ID, Document_Ranking, Model, Output_Directory):
  """Appends one query's ranking to the model's run file.

  The run file is '<Output_Directory>/<Model>_output.txt'; the directory is
  created when missing. One line is appended per ranked document:

      <Query_ID> 0 <Document_ID> <rank> <score with 6 decimals> <run tag>

  where rank starts at 1 and the run tag is a fixed prefix followed by the
  model name. Because the file is opened in append mode, calling the
  function again adds to the existing content instead of replacing it.

  Args:
      Query_ID (str): The ID of the query.
      Document_Ranking (list): (document ID, score) tuples, best first.
      Model (str): The name of the retrieval model used (e.g., "VSM").
      Output_Directory (str): The path to the directory for storing output files.
  """

  if not os.path.exists(Output_Directory):
    os.makedirs(Output_Directory)

  # Values that are identical for every line of this call
  Iteration_Value = 0
  Run_Tag = 'Information_Retrival_System' + Model
  Run_Path = os.path.join(Output_Directory, f'{Model}_output.txt')

  with open(Run_Path, 'a') as Run_Handle:
    Run_Handle.writelines(
        f"{Query_ID} {Iteration_Value} {Document_ID} {Position} {Score:.6f} {Run_Tag}\n"
        for Position, (Document_ID, Score) in enumerate(Document_Ranking, start=1)
    )


In [12]:
def BM25_DocumentRanking(QueryTokens, Documents, Inverted_Index, NumberOfDocuments, AverageDocumentLength):
  """Scores every document against a query with BM25 and ranks them.

  Each document is preprocessed with PreProcessing_Text and scored with
  BM25_ScoreCalculation; every document in Documents appears in the result,
  whatever its score. Note that the documents are preprocessed again on
  every call.

  Args:
      QueryTokens (list): A list of tokens representing the query.
      Documents (dict): A dictionary containing document IDs and text data.
      Inverted_Index (defaultdict): The inverted index with tokens as keys and lists of document IDs as values.
      NumberOfDocuments (int): The total number of documents in the collection.
      AverageDocumentLength (float): The average document length in the collection.

  Returns:
      list: (document ID, score) tuples sorted by descending score; documents
          with equal scores keep their order from Documents.
  """

  Scores = {}

  for DocumentID, DocumentContent in Documents.items():
    Tokens = PreProcessing_Text(DocumentID, DocumentContent)
    Scores[DocumentID] = BM25_ScoreCalculation(
        QueryTokens, Tokens, Inverted_Index, NumberOfDocuments, AverageDocumentLength)

  # Stable sort on the score component, best first
  return sorted(Scores.items(), key=itemgetter(1), reverse=True)


In [13]:
# NOTE(review): this cell defines BM25_DocumentRanking a second time with the
# same behaviour as the definition in the previous cell. After "Run All" this
# later definition is the one in effect; one of the two cells should be removed.
def BM25_DocumentRanking(QueryTokens, Documents, Inverted_Index, NumberOfDocuments, AverageDocumentLength):
  """Ranks all documents for a query by their BM25 score.

  Every document is run through PreProcessing_Text and then scored by
  BM25_ScoreCalculation. All documents in Documents are returned, including
  those that score zero.

  Args:
      QueryTokens (list): A list of tokens representing the query.
      Documents (dict): A dictionary containing document IDs and text data.
      Inverted_Index (defaultdict): The inverted index with tokens as keys and lists of document IDs as values.
      NumberOfDocuments (int): The total number of documents in the collection.
      AverageDocumentLength (float): The average document length in the collection.

  Returns:
      list: (document ID, score) tuples, highest score first; ties keep the
          iteration order of Documents.
  """

  Scored = [
      (
          DocumentID,
          BM25_ScoreCalculation(
              QueryTokens,
              PreProcessing_Text(DocumentID, DocumentContent),
              Inverted_Index,
              NumberOfDocuments,
              AverageDocumentLength,
          ),
      )
      for DocumentID, DocumentContent in Documents.items()
  ]

  # In-place stable sort on the score, best first
  Scored.sort(key=itemgetter(1), reverse=True)
  return Scored


In [14]:
def BM25_ScoreCalculation(QueryTokens, DocumentTokens, Inverted_Index, NumberOfDocuments, AverageDocumentLength, k1=1.5, b=0.75):
  """Calculates the BM25 score of one document for a query.

  For each query token (a token repeated in the query contributes once per
  repetition) the contribution is

      idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))

  with tf the token's count in the document, dl the document length in
  tokens, avgdl AverageDocumentLength, df the length of the token's posting
  list, and

      idf = log((NumberOfDocuments - df + 0.5) / (df + 0.5) + 1)

  The inverted index is only read: looking up a query token that is not
  indexed does not add an entry to it.

  Args:
      QueryTokens (list): A list of tokens representing the query.
      DocumentTokens (list): A list of tokens representing the preprocessed document text.
      Inverted_Index (defaultdict): The inverted index with tokens as keys and lists of document IDs as values.
      NumberOfDocuments (int): The total number of documents in the collection.
      AverageDocumentLength (float): The average document length in the collection.
          A value of 0 is treated as "no length normalisation information":
          the document is handled as if it had average length.
      k1 (float, optional): BM25 parameter controlling term frequency scaling (default 1.5).
      b (float, optional): BM25 parameter controlling document length impact (default 0.75).

  Returns:
      float: The BM25 score for the document relative to the query.
  """

  DocumentScore = 0.0
  DocumentLength = len(DocumentTokens)

  # Count the document's tokens once instead of scanning the token list again
  # for every query token
  TermCounts = {}
  for Token in DocumentTokens:
    TermCounts[Token] = TermCounts.get(Token, 0) + 1

  # The length-normalisation part of the denominator is the same for every
  # query token, so compute it once. Guard against avgdl == 0.
  if AverageDocumentLength:
    LengthRatio = DocumentLength / AverageDocumentLength
  else:
    LengthRatio = 1.0
  LengthNorm = k1 * (1 - b + b * LengthRatio)

  for Token in QueryTokens:
    TermFrequency = TermCounts.get(Token, 0)

    # A token that does not occur in the document contributes nothing
    if TermFrequency == 0:
      continue

    # .get() instead of [] so that a defaultdict index is not mutated by
    # query tokens it does not contain
    DocumentFrequency = len(Inverted_Index.get(Token, ()))

    InverseDocumentFrequency = math.log((NumberOfDocuments - DocumentFrequency + 0.5) / (DocumentFrequency + 0.5) + 1)

    Numerator = TermFrequency * (k1 + 1)
    Denominator = TermFrequency + LengthNorm

    DocumentScore += InverseDocumentFrequency * (Numerator / Denominator)

  return DocumentScore


In [15]:
def OKAPI_BM25_DocumentRanking(QueryTokens, Documents, TokenFrequency, NumberOfTokens):
  """Scores every document with OKAPI_BM25_ScoreCalculation and ranks them.

  Each document is preprocessed with PreProcessing_Text, scored against the
  query by OKAPI_BM25_ScoreCalculation, and all documents are returned in
  descending score order. The documents are preprocessed again on every call.

  Args:
      QueryTokens (list): A list of tokens representing the query.
      Documents (dict): A dictionary containing document IDs and text data.
      TokenFrequency (dict): Token counts over the whole collection; passed
          through to the scoring function.
      NumberOfTokens (int): The total number of tokens in the collection;
          passed through to the scoring function.

  Returns:
      list: (document ID, score) tuples sorted by descending score; documents
          with equal scores keep their order from Documents.
  """

  Scores = {}

  for DocumentID, DocumentContent in Documents.items():
    Tokens = PreProcessing_Text(DocumentID, DocumentContent)
    Scores[DocumentID] = OKAPI_BM25_ScoreCalculation(QueryTokens, Tokens, TokenFrequency, NumberOfTokens)

  # Stable sort on the score component, best first
  return sorted(Scores.items(), key=itemgetter(1), reverse=True)


In [16]:
def OKAPI_BM25_ScoreCalculation(QueryTokens, DocumentTokens, TermFrequencyInCollection, NumberOfTokens, lambd=0.5):
  """Scores a document as a product of smoothed per-token probabilities.

  Despite its name, this function does not evaluate the BM25 formula. For
  each query token it linearly interpolates the token's relative frequency
  in the document with its relative frequency in the whole collection
  (Jelinek-Mercer style smoothing),

      p = (1 - lambd) * tf / dl + lambd * cf / NumberOfTokens

  and the score is the product of p over all query tokens. tf is the token's
  count in the document, dl the document length, and cf the token's count in
  TermFrequencyInCollection (0 when absent).

  A relative frequency whose denominator is 0 (empty document, or
  NumberOfTokens == 0) is taken as 0 instead of raising ZeroDivisionError.

  Args:
      QueryTokens (list): A list of tokens representing the query terms.
      DocumentTokens (list): A list of tokens representing the preprocessed document text.
      TermFrequencyInCollection (dict): Token counts over the whole collection.
      NumberOfTokens (int): The total number of tokens in the collection.
      lambd (float, optional): Interpolation weight given to the collection
          frequency (default 0.5).

  Returns:
      float: The score for the document relative to the query; 1 for an
          empty query (the empty product).
  """

  # Multiplicative identity: the score is built up as a product
  DocumentScore = 1

  DocumentLength = len(DocumentTokens)

  for Token in QueryTokens:
    TermFrequency = DocumentTokens.count(Token)

    # Relative frequency in the document; 0 for an empty document
    if DocumentLength:
      DocumentProbability = TermFrequency / DocumentLength
    else:
      DocumentProbability = 0.0

    # Relative frequency in the collection; 0 for an empty collection
    if NumberOfTokens:
      CollectionProbability = TermFrequencyInCollection.get(Token, 0) / NumberOfTokens
    else:
      CollectionProbability = 0.0

    probability = (1 - lambd) * DocumentProbability + lambd * CollectionProbability

    DocumentScore *= probability

  return DocumentScore


In [None]:
# Run the full pipeline: index the collection, rank every query with the three
# models, write one run file per model, and evaluate the run files with trec_eval.
if __name__ == '__main__':
  import shutil  # local import: only needed here, to locate the trec_eval binary

  # Model names; Output_File writes each model's run to '<Model>_output.txt'
  Model_Names = ('VSM', 'BM25', 'OKAPI_BM25')

  # ==================== Indexing ====================

  # Load the documents from the data file
  Documents = load_passage(data_file)

  # Preprocess the documents and generate the inverted index
  Inverted_Index = Module_Inverted_Index(Documents)

  # Calculate the inverse document frequency (IDF) for all terms in the inverted index
  NumberOfDocuments = len(Documents)
  InverseDocumentFrequency = Calculate_InverseDocumentFrequency(Inverted_Index, NumberOfDocuments)

  # Load the queries from the query file
  Queries = Load_Queries(query_file)

  # Output_File opens the run files in append mode, so runs left over from an
  # earlier execution of this cell must be removed or every ranking would be
  # written twice.
  Run_Files = {}
  for Model in Model_Names:
    Run_Files[Model] = os.path.join(Output_Directory, f'{Model}_output.txt')
    if os.path.exists(Run_Files[Model]):
      os.remove(Run_Files[Model])

  # Preprocess every query once and reuse the tokens for all three models.
  #  * The token file is saved under a 'query_' prefix because PreProcessing_Text
  #    writes into the same folder as the document token files.
  #  * The ID written to the run files is the query's position in the file
  #    (1, 2, 3, ...). NOTE(review): this assumes the relevance judgments number
  #    the queries by position, as in the standard Cranfield distribution where
  #    the IDs inside cran.qry have gaps - confirm for your copy of the data.
  Prepared_Queries = []
  for Query_Number, (Query_ID, Query_Content) in enumerate(Queries.items(), start=1):
    Query_Tokens = PreProcessing_Text(f'query_{Query_Number}', Query_Content)
    Prepared_Queries.append((str(Query_Number), Query_Tokens))

  # Collection statistics, gathered in a single pass over the documents:
  # total token count (for the average document length) and per-token counts.
  TotalLength = 0
  TokenFrequency = defaultdict(int)
  for Document_ID, Document_Content in Documents.items():
    DocumentTokens = PreProcessing_Text(Document_ID, Document_Content)
    TotalLength += len(DocumentTokens)
    for Token in DocumentTokens:
      TokenFrequency[Token] += 1
  NumberOfToken = TotalLength
  AverageDocumentLength = TotalLength / NumberOfDocuments

  # ==================== VSM Model ====================

  # Create the term-document matrix of TF-IDF weights
  Term_Document_Matrix = Module_Term_Document_Matrix(Documents, Inverted_Index, InverseDocumentFrequency)

  for Query_Number, Query_Tokens in Prepared_Queries:
    VectorSpaceModel_DocumentRanking = VSM_DocumentRanking(Query_Tokens, Term_Document_Matrix)

    # Output top 100 ranked documents for the query
    Output_File(Query_Number, VectorSpaceModel_DocumentRanking[:100], 'VSM', Output_Directory)

  # ==================== BM25 Model ====================

  # NOTE: BM25_DocumentRanking and OKAPI_BM25_DocumentRanking preprocess every
  # document again for each query, so these two loops dominate the run time.
  for Query_Number, Query_Tokens in Prepared_Queries:
    BestMatch25_DocumentRanking = BM25_DocumentRanking(Query_Tokens, Documents, Inverted_Index, NumberOfDocuments, AverageDocumentLength)

    # Output top 100 ranked documents for the query
    Output_File(Query_Number, BestMatch25_DocumentRanking[:100], 'BM25', Output_Directory)

  # ==================== OKAPI BM25 Model ====================

  for Query_Number, Query_Tokens in Prepared_Queries:
    OKAPI_DocumentRanking = OKAPI_BM25_DocumentRanking(Query_Tokens, Documents, TokenFrequency, NumberOfToken)

    # Output top 100 ranked documents for the query
    Output_File(Query_Number, OKAPI_DocumentRanking[:100], 'OKAPI_BM25', Output_Directory)

  # ==================== Evaluation ====================

  # subprocess needs a local executable; the Drive share link below is only
  # where the binary can be downloaded from and cannot be executed itself.
  trec_eval_url = 'https://drive.google.com/file/d/1cwvL5uHI7OxmSMqLShZMEJ-J4VeijMfk/view?usp=drive_link'

  # Look for trec_eval on PATH first, then next to the data files
  trec_eval_path = shutil.which('trec_eval') or os.path.join(data_path, 'trec_eval')

  # Path to the qrel file containing relevance judgments
  qrel_file_path = comp_file

  if not os.path.isfile(trec_eval_path):
    print(f'trec_eval executable not found on PATH or at {trec_eval_path}; skipping evaluation.')
    print(f'Download it from {trec_eval_url} and place it there (it must be executable).')
  else:
    for Model in Model_Names:
      # Evaluate the run file that Output_File actually wrote for this model.
      # The tool's output is captured and printed so it appears in the cell output.
      Evaluation = subprocess.run(
          [trec_eval_path, qrel_file_path, Run_Files[Model]],
          capture_output=True,
          text=True,
      )
      print(f'===== {Model} =====')
      print(Evaluation.stdout)
      if Evaluation.returncode != 0:
        print(f'trec_eval exited with code {Evaluation.returncode}:')
        print(Evaluation.stderr)
