<a href="https://colab.research.google.com/github/NIROHAN/Retrieve/blob/main/Week5_Binary_Independence_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Required libraries for handling data, text processing, and mathematical operations
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Required libraries for handling data, text processing, and mathematical operations
import numpy as np
import random
import re
import os
from collections import defaultdict
from math import log

In [5]:
def clean_and_tokenize(text_content):
    return re.findall(r'\b\w+\b', text_content.lower())

In [6]:
def load_documents(directory_path):
    document_store = {}
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.txt'):
            with open(os.path.join(directory_path, file_name), 'r', encoding='utf-8') as doc_file:
                document_store[file_name] = clean_and_tokenize(doc_file.read())

    return document_store


In [7]:
def load_queries(query_file_path):
    with open(query_file_path, 'r') as query_file:
        return [line.strip() for line in query_file.readlines()]

In [8]:
def calculate_statistics(document_store):
    # Initialize statistics tracking
    total_docs = len(document_store)
    word_doc_frequency = defaultdict(int)
    word_frequency = defaultdict(lambda: defaultdict(int))

    for doc_name, terms in document_store.items():
        unique_terms = set(terms)
        for term in terms:
            word_frequency[doc_name][term] += 1
        for term in unique_terms:
            word_doc_frequency[term] += 1

    return word_frequency, word_doc_frequency, total_docs

In [9]:
def compute_relevance_scores(query_terms, word_frequency, word_doc_frequency, total_docs):
    relevance_scores = {}
    for doc_name in word_frequency:
        doc_score = 1.0
        for term in query_terms:
            term_freq = word_frequency[doc_name].get(term, 0)
            doc_freq = word_doc_frequency.get(term, 0)
            p_relevant = (term_freq + 1) / (sum(word_frequency[doc_name].values()) + len(word_doc_frequency))
            p_not_relevant = (doc_freq + 1) / (total_docs - doc_freq + len(word_doc_frequency))
            doc_score *= (p_relevant / p_not_relevant)
        relevance_scores[doc_name] = doc_score
    return relevance_scores

In [10]:
def assign_random_relevance(query_list, doc_list, relevance_range=(0, 1)):
    random_relevance = {}

    for query in query_list:
        random_relevance[query] = {}
        for document in doc_list:
            random_relevance[query][document] = random.randint(relevance_range[0], relevance_range[1])

    return random_relevance

In [11]:
def save_relevance_scores(output_scores, output_file_path):

    with open(output_file_path, 'w') as output_file:
        for query, doc_scores in output_scores.items():
            for doc, score in doc_scores.items():
                output_file.write(f"{query},{doc},{score}\n")

In [12]:
# Main function to process documents and queries for scoring based on Binary Independence Model
def run_binary_independence_model(data_directory, query_file):
    document_store = load_documents(data_directory)
    query_list = load_queries(query_file)


    word_frequency, word_doc_frequency, total_docs = calculate_statistics(document_store)


    for query in query_list:
        query_terms = clean_and_tokenize(query)
        relevance_scores = compute_relevance_scores(query_terms, word_frequency, word_doc_frequency, total_docs)
        sorted_docs = sorted(relevance_scores.items(), key=lambda item: item[1], reverse=True)
        print(f"Query: {query}")
        for doc_name, score in sorted_docs:
            print(f"Document: {doc_name}, Score: {score:.4f}")
        print()


    random_relevance_scores = assign_random_relevance(query_list, document_store.keys())
    output_file_name = 'rohan_output.txt'
    save_relevance_scores(random_relevance_scores, output_file_name)

    print(f"Relevance scores saved to {output_file_name}")


In [14]:
print(os.listdir('/content/drive/MyDrive/Dataset/Query'))

['Text1.txt', 'text2.txt', 'text3.txt', 'text4.txt', 'text5.txt', 'text6.txt', 'text7.txt', 'text8.txt', 'text9.txt', 'queries.txt']


In [16]:
folder_location = '/content/drive/MyDrive/Dataset/Query'
query_file_location = '/content/drive/MyDrive/Dataset/Query/queries.txt'
run_binary_independence_model(folder_location, query_file_location)

Query: Foundation
Document: Text1.txt, Score: 0.5907
Document: queries.txt, Score: 0.5010
Document: text6.txt, Score: 0.3879
Document: text4.txt, Score: 0.1969
Document: text3.txt, Score: 0.1966
Document: text5.txt, Score: 0.1960
Document: text2.txt, Score: 0.1957
Document: text8.txt, Score: 0.1954
Document: text7.txt, Score: 0.1951
Document: text9.txt, Score: 0.1939

Query: Triumph
Document: text4.txt, Score: 0.5907
Document: queries.txt, Score: 0.5010
Document: text3.txt, Score: 0.3932
Document: Text1.txt, Score: 0.1969
Document: text5.txt, Score: 0.1960
Document: text2.txt, Score: 0.1957
Document: text8.txt, Score: 0.1954
Document: text7.txt, Score: 0.1951
Document: text6.txt, Score: 0.1939
Document: text9.txt, Score: 0.1939

Query: Resilience
Document: queries.txt, Score: 0.3327
Document: text5.txt, Score: 0.2603
Document: text8.txt, Score: 0.2595
Document: text7.txt, Score: 0.2591
Document: text9.txt, Score: 0.2576
Document: Text1.txt, Score: 0.1308
Document: text4.txt, Score: 0.1