<a href="https://colab.research.google.com/github/KailashHari-creator/DL_Model_Trial/blob/main/BGWTSuperhero.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q sentence-transformers nltk tqdm

In [None]:
import json
# Read the context-enriched shloka records from the working directory.
with open("context_enriched.json", encoding="utf-8", mode="r") as f:
    shlokas = json.loads(f.read())

# Cell output: how many records were loaded.
len(shlokas)

In [None]:
def shloka_to_text(entry):
    """Flatten one shloka record into the text block that gets embedded.

    Args:
        entry: Mapping with the required keys ``"S.No"`` and ``"English"``
            and an optional ``"context"`` mapping whose ``"themes"``,
            ``"actions"`` and ``"entities"`` values are lists of strings.
            A missing or null context, or a missing or null list inside it,
            is treated as empty.

    Returns:
        A five-line string (``Shloka``, ``English Meaning``, ``Themes``,
        ``Actions``, ``Entities``) with no leading indentation on any line
        and no leading/trailing whitespace overall.

    Raises:
        KeyError: If ``"S.No"`` or ``"English"`` is absent from ``entry``.
    """
    # `or {}` also covers an explicit null context, which .get's default does not.
    context = entry.get("context") or {}

    def _joined(key):
        # A key that is absent or explicitly null contributes an empty string.
        return " ".join(context.get(key) or [])

    # Build the lines individually so source-code indentation cannot leak
    # into the text the way it does inside an indented triple-quoted string.
    lines = [
        f"Shloka: {entry['S.No']}",
        f"English Meaning: {entry['English']}",
        f"Themes: {_joined('themes')}",
        f"Actions: {_joined('actions')}",
        f"Entities: {_joined('entities')}",
    ]
    return "\n".join(lines).strip()

# One embedding-ready text per shloka record, in the same order as `shlokas`.
shloka_texts = list(map(shloka_to_text, shlokas))

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-transformers model identified by "all-MiniLM-L6-v2".
# The same `model` object is used to encode both the shloka texts and the
# transcript chunks, so the two sets of vectors are directly comparable.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
import numpy as np

# Encode every shloka text in one call, asking the encoder for normalized
# vectors and a progress bar while it runs.
shloka_embeddings = model.encode(
    shloka_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
)

In [None]:
# Persist the shloka embedding matrix to disk.
np.save("shloka_embeddings.npy", arr=shloka_embeddings)

In [None]:
import re

def clean_text(text):
    """Normalize raw transcript text before chunking.

    The text is lower-cased, the standalone filler words ``uh``, ``um``,
    ``ah`` and ``er`` are removed, every run of whitespace is collapsed to a
    single space, and leading/trailing whitespace is trimmed.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned string.
    """
    # Lower-case first so the lowercase-only filler pattern also catches "Um", "UH", ...
    without_fillers = re.sub(r"\b(uh|um|ah|er)\b", "", text.lower())
    # Removing fillers can leave doubled spaces; collapse all whitespace runs.
    collapsed = re.sub(r"\s+", " ", without_fillers)
    return collapsed.strip()

In [None]:
def chunk_text(text, words_per_chunk=150):
    """Split text into consecutive chunks of whitespace-separated words.

    Args:
        text: The string to split. Words are obtained with ``str.split()``,
            so any run of whitespace is a separator.
        words_per_chunk: Maximum number of words in each chunk. The final
            chunk may be shorter.

    Returns:
        A list of chunk strings, each with its words joined by single
        spaces. Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``words_per_chunk`` is not positive.
    """
    # A negative step would make range() empty and silently discard all of
    # the text; a zero step already raises ValueError, so reject both alike.
    if words_per_chunk <= 0:
        raise ValueError(
            f"words_per_chunk must be a positive integer, got {words_per_chunk!r}"
        )

    words = text.split()
    return [
        " ".join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]

In [None]:
# Read the raw discourse transcript.
with open("transcripts/discourse1.txt", encoding="utf-8", mode="r") as f:
    raw_text = f.read()

# Normalize the transcript, then cut it into fixed-size word chunks.
cleaned = clean_text(raw_text)
chunks = chunk_text(cleaned)

# Cell output: number of chunks produced.
len(chunks)

In [None]:
# Encode the transcript chunks with the same model and normalization setting
# used for the shloka texts.
chunk_embeddings = model.encode(
    chunks,
    show_progress_bar=True,
    normalize_embeddings=True,
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_top_shlokas(chunk_emb, k=3):
    """Return the ``k`` shlokas most similar to one chunk embedding.

    Args:
        chunk_emb: Embedding array for a single transcript chunk. It is
            flattened into one row before comparison, so its total length
            must match the row length of the module-level
            ``shloka_embeddings``.
        k: Maximum number of matches to return. ``0`` yields an empty list;
            a value larger than the number of shlokas returns all of them.

    Returns:
        A list of ``(S.No, similarity)`` tuples ordered from most to least
        similar. ``S.No`` is read from the module-level ``shlokas`` list at
        the same index as the matching row of ``shloka_embeddings``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    sims = cosine_similarity(
        chunk_emb.reshape(1, -1),
        shloka_embeddings
    )[0]

    # Reverse the ascending argsort first and then take the head: slicing the
    # tail with [-k:] would return *every* index when k == 0, because -0 == 0.
    top_idx = sims.argsort()[::-1][:k]
    return [
        (shlokas[i]["S.No"], sims[i])
        for i in top_idx
    ]

In [None]:
# Show the first three chunks together with their best-matching shlokas.
for i, chunk in enumerate(chunks[:3]):
    header = f"\n--- Chunk {i+1} ---"
    preview = chunk[:200]  # only the first 200 characters of the chunk
    print(header)
    print(preview, "...\n")

    results = find_top_shlokas(chunk_embeddings[i])
    for ref, score in results:
        rounded = round(score, 3)
        print(ref, rounded)