<a href="https://colab.research.google.com/github/Neermalsha/poverty-of-Nepal-/blob/main/BM25_And_JM_Score_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Install necessary library for .docx file reading
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
from docx import Document
import os
from collections import defaultdict
import numpy as np
from math import log
import re
import math


In [21]:
# Function to preprocess the text (same as before)
def preprocess(text):
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    return text.lower()

In [22]:
# Function to load documents from a folder (handling .docx files)
def get_documents_from_drive(folder_path):
    documents = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):  # For .docx files
            file_path = os.path.join(folder_path, filename)
            doc = Document(file_path)
            full_text = []
            for para in doc.paragraphs:
                full_text.append(para.text)
            documents[filename] = preprocess(' '.join(full_text))
    return documents

# Set the folder path to your Google Drive documents folder
folder_path = '/content/drive/MyDrive/Document/'  # Adjust this path to your actual folder
documents = get_documents_from_drive(folder_path)


In [36]:
# BM25 and Jelinek-Mercer functions (same as before)
def compute_jm_scores(query, documents, term_frequency, doc_lengths, corpus_prob, lambda_param=0.1):
    scores = {}
    for doc_id, doc_text in documents.items():
        score = 0
        for term in query.split():
            p_doc = term_frequency[doc_id].get(term, 0) / doc_lengths[doc_id]
            p_corpus = corpus_prob.get(term, 0)

            # Add a check to avoid log(0) or log(negative)
            jm_value = (lambda_param * p_doc) + ((1 - lambda_param) * p_corpus)
            if jm_value > 0:  # Only calculate log if value is positive
                score += log(jm_value)
            # else:  # You can handle the case where jm_value <= 0 differently if needed
            #     score += 0  # For example, ignore the term and add 0 to the score
        scores[doc_id] = score
    return scores


def compute_jm_scores(query, documents, term_frequency, doc_lengths, corpus_prob, lambda_param=0.1):
    scores = {}
    for doc_id, doc_text in documents.items():
        score = 0
        for term in query.split():
            # Calculate term probabilities
            p_doc = term_frequency[doc_id].get(term, 0) / doc_lengths[doc_id]
            p_corpus = corpus_prob.get(term, 0)

            # Check if the term is absent in both the document and corpus
            if p_doc == 0 and p_corpus == 0:
                continue  # Skip this term because log(0) is undefined

            # Smooth probabilities to avoid math domain error
            smoothed_prob = (lambda_param * p_doc) + ((1 - lambda_param) * p_corpus)

            # Ensure the smoothed probability is greater than 0
            if smoothed_prob > 0:
                score += log(smoothed_prob)

        scores[doc_id] = score
    return scores


In [37]:
# Build term frequencies, document frequencies, and corpus statistics
term_frequency = defaultdict(lambda: defaultdict(int))
document_frequency = defaultdict(int)
doc_lengths = {}
corpus_term_count = defaultdict(int)

# Calculate term frequencies and document lengths
for doc_id, doc_text in documents.items():
    terms = doc_text.split()
    if len(terms) == 0:  # Skip empty documents
        continue
    doc_lengths[doc_id] = len(terms)
    for term in terms:
        term_frequency[doc_id][term] += 1
        corpus_term_count[term] += 1
    for term in set(terms):
        document_frequency[term] += 1

# Calculate average document length
avg_doc_length = np.mean(list(doc_lengths.values()))

# Calculate corpus probability for Jelinek-Mercer
total_terms_in_corpus = sum(corpus_term_count.values())
corpus_prob = {term: count / total_terms_in_corpus for term, count in corpus_term_count.items()}


In [40]:
# Specify the path to the file in Google Drive
queries_file_path = '/content/drive/MyDrive/Document/Quries.txt'

# Read the queries from the file
with open(queries_file_path, 'r') as f:
    queries = [line.strip() for line in f.readlines()]

# Verify the loaded queries
print("Loaded Queries:", queries)


Loaded Queries: ['Digital marketing', 'Search engine optimization', 'Content marketing', 'Marketing', 'Video', 'Editing']


In [41]:
for query in queries:
    bm25_scores = compute_bm25_scores(query, documents, term_frequency, document_frequency, doc_lengths, avg_doc_length)
    jm_scores = compute_jm_scores(query, documents, term_frequency, doc_lengths, corpus_prob)

    print(f"Query: {query}")
    print("BM25 Scores:")
    for doc_id, score in bm25_scores.items():
        print(f"{doc_id}: {score}")

    print("Jelinek-Mercer Scores:")
    for doc_id, score in jm_scores.items():
        print(f"{doc_id}: {score}")

    print("\n")


Query: Digital marketing
BM25 Scores:
Doc 1.docx: 0.7000546958296271
Doc 2.docx: 0
Doc 3.docx: 0.5656739385876729
Doc 4.docx: 0.5572184238105776
Doc 5.docx: 0.5864380579904716
Doc 6.docx: 0
Doc 7.docx: 0.6351783661044088
Doc 8.docx: 0.6424764706555894
Doc 9.docx: 0.3902084356552091
Doc 10.docx: 0
Jelinek-Mercer Scores:
Doc 1.docx: -3.5220888393508583
Doc 2.docx: -3.7598311900862207
Doc 3.docx: -3.623934700199401
Doc 4.docx: -3.630102083411437
Doc 5.docx: -3.6069741715001826
Doc 6.docx: -3.7598311900862207
Doc 7.docx: -3.5839688454147614
Doc 8.docx: -3.5764379038778866
Doc 9.docx: -3.6938930885942263
Doc 10.docx: -3.7598311900862207


Query: Search engine optimization
BM25 Scores:
Doc 1.docx: 0
Doc 2.docx: 4.83747321907206
Doc 3.docx: 0
Doc 4.docx: 0
Doc 5.docx: 0
Doc 6.docx: 0
Doc 7.docx: 0
Doc 8.docx: 0
Doc 9.docx: 0
Doc 10.docx: 1.3839684131473797
Jelinek-Mercer Scores:
Doc 1.docx: -11.394329599056817
Doc 2.docx: -10.16516595374328
Doc 3.docx: -11.394329599056817
Doc 4.docx: -11.3943