In [1]:
!pip install sentence-transformers scikit-learn spacy numpy faiss-cpu PyMuPDF python-docx
!python -m spacy download en_core_web_sm


Collecting sentence-transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting numpy
  Using cached numpy-2.3.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting faiss-cpu
  Using cached faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting PyMuPDF
  Using cached pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting python-docx
  Using cached python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp312-cp312-


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 2.2 MB/s eta 0:00:06
     --- ------------------------------------ 1.0/12.8 MB 2.2 MB/s eta 0:00:06
     ---- ----------------------------------- 1.3/12.8 MB 1.6 MB/s eta 0:00:08
     ---- ----------------------------------- 1.6/12.8 MB 1.5 MB/s eta 0:00:08
     ----- ---------------------------------- 1.8/12.8 MB 1.4 MB/s eta 0:00:08
     ----- ---------------------------------- 1.8/12.8 MB 1.4 MB/s eta 0:00:08
     ------ --------------------------------- 2.1/12.8 MB 1.4 MB/s eta 0:00:08
     ------- -------------------------------- 2.4/12.8 MB 1.3 MB/s eta 0:00:09
     -------- ------------------------------- 2.


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
documents = [
    {
        "id": "doc1",
        "title": "Income Tax Deduction",
        "body": "Under section 80C, individuals can claim deductions for education expenses and housing loan payments.",
        "law_type": "Income Tax Act"
    },
    {
        "id": "doc2",
        "title": "GST on Textiles",
        "body": "The GST rate for textile products is 5% as per Schedule I of the GST Act.",
        "law_type": "GST Act"
    },
    {
        "id": "doc3",
        "title": "Property Registration",
        "body": "The process of property registration includes paying stamp duty and registering with the sub-registrar.",
        "law_type": "Property Law"
    },
    {
        "id": "doc4",
        "title": "Court Fee",
        "body": "Court fee for civil cases is based on the value of the suit as per the Court Fees Act.",
        "law_type": "Court Judgments"
    }
]


In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

for doc in documents:
    doc['embedding'] = model.encode(doc['body'])


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

def extract_entities(text):
    doc = nlp(text)
    return set(ent.text.lower() for ent in doc)


In [18]:
for doc in documents:
    doc['entities'] = extract_entities(doc['body'])

In [19]:
text='how are you'
ent=extract_entities(text)
ent

{'are', 'how', 'you'}

In [20]:
query = "Income tax deduction for education"
query_embedding = model.encode(query)
query_entities = extract_entities(query)


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_ranking(query_emb, docs):
    scores = [(doc, cosine_similarity([query_emb], [doc['embedding']])[0][0]) for doc in docs]
    return sorted(scores, key=lambda x: x[1], reverse=True)


In [22]:
from sklearn.metrics.pairwise import euclidean_distances

def euclidean_ranking(query_emb, docs):
    scores = [(doc, -euclidean_distances([query_emb], [doc['embedding']])[0][0]) for doc in docs]
    return sorted(scores, key=lambda x: x[1], reverse=True)


In [23]:
def mmr_ranking(query_emb, docs, lambda_param=0.7, top_k=3):
    selected = []
    candidates = docs.copy()

    while len(selected) < top_k and candidates:
        scores = []
        for doc in candidates:
            sim_query = cosine_similarity([query_emb], [doc['embedding']])[0][0]
            sim_redundancy = max([cosine_similarity([doc['embedding']], [d['embedding']])[0][0] for d in selected], default=0)
            mmr_score = lambda_param * sim_query - (1 - lambda_param) * sim_redundancy
            scores.append((doc, mmr_score))
        doc_max = max(scores, key=lambda x: x[1])
        selected.append(doc_max[0])
        candidates.remove(doc_max[0])
    return [(doc, cosine_similarity([query_emb], [doc['embedding']])[0][0]) for doc in selected]


In [24]:
def hybrid_ranking(query_emb, query_ents, docs, w_cos=0.6, w_ent=0.4):
    results = []
    for doc in docs:
        cosine_score = cosine_similarity([query_emb], [doc['embedding']])[0][0]
        entity_score = len(query_ents.intersection(doc['entities'])) / (len(query_ents.union(doc['entities'])) + 1e-5)
        hybrid_score = w_cos * cosine_score + w_ent * entity_score
        results.append((doc, hybrid_score))
    return sorted(results, key=lambda x: x[1], reverse=True)


In [25]:
def display_results(method_name, ranked_docs):
    print(f"\n🔍 {method_name} Results:")
    for i, (doc, score) in enumerate(ranked_docs[:3], 1):
        print(f"{i}. {doc['title']} ({doc['law_type']}) - Score: {score:.4f}")

display_results("Cosine Similarity", cosine_ranking(query_embedding, documents))
display_results("Euclidean Distance", euclidean_ranking(query_embedding, documents))
display_results("MMR", mmr_ranking(query_embedding, documents))
display_results("Hybrid Similarity", hybrid_ranking(query_embedding, query_entities, documents))



🔍 Cosine Similarity Results:
1. Income Tax Deduction (Income Tax Act) - Score: 0.6623
2. Property Registration (Property Law) - Score: 0.0843
3. GST on Textiles (GST Act) - Score: 0.0605

🔍 Euclidean Distance Results:
1. Income Tax Deduction (Income Tax Act) - Score: -0.8218
2. Property Registration (Property Law) - Score: -1.3533
3. GST on Textiles (GST Act) - Score: -1.3708

🔍 MMR Results:
1. Income Tax Deduction (Income Tax Act) - Score: 0.6623
2. GST on Textiles (GST Act) - Score: 0.0605
3. Property Registration (Property Law) - Score: 0.0843

🔍 Hybrid Similarity Results:
1. Income Tax Deduction (Income Tax Act) - Score: 0.4395
2. GST on Textiles (GST Act) - Score: 0.0563
3. Property Registration (Property Law) - Score: 0.0506


In [26]:
def evaluate_precision(ranked_docs, relevant_ids, k=3):
    top_k = [doc['id'] for doc, _ in ranked_docs[:k]]
    return len(set(top_k).intersection(relevant_ids)) / k

def evaluate_diversity(ranked_docs, k=3):
    top_k_laws = [doc['law_type'] for doc, _ in ranked_docs[:k]]
    return len(set(top_k_laws)) / k
