# Installing required libraries

In [None]:
!pip install \
  "langchain==1.0.8" \
  "langchain-community==0.4.1" \
  "langchain-core==1.1.0" \
  "langchain-text-splitters==1.0.0" \
  "chromadb==1.3.5" \
  "llama-cpp-python==0.3.16" \
  "sentence-transformers==5.1.2" \
  "scikit-learn==1.7.2" \
  "numpy==2.1.0" \
  "rouge-score==0.1.2" \
  "nltk==3.9.2" \
  "tqdm==4.67.1" \
  "huggingface-hub==0.36.0"


# Downloading the Mistral 7B Instruct v0.2 model (GGUF, Q8_0 quantization) from Hugging Face

In [None]:
!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf


# Importing required libraries

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
import os
from llama_cpp import Llama
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm

# loading data

In [None]:
!unzip data.zip

In [None]:
data_path = "/content/data/"

# Loading the model

In [None]:
# Load the local GGUF model file (same file name as the one fetched by the
# wget cell) through llama-cpp-python.
llm = Llama(
    model_path="mistral-7b-instruct-v0.2.Q8_0.gguf",
    n_ctx=4096,  # context size handed to llama.cpp (presumably in tokens — confirm in the library docs)
    n_threads=8,  # NOTE(review): hard-coded thread count — confirm it suits the host CPU
    verbose=False
)


# Downloading the embeddings model

In [None]:
# Embedding model used to index the chunks in Chroma and, later on, to compute
# the cosine-similarity metric between reference and generated answers.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Loading the text splitter for splitting the documents

In [None]:
# Splitter applied to every loaded document before indexing.
# NOTE(review): the sizes are presumably measured in characters, and
# CharacterTextSplitter presumably only cuts at its separator, so individual
# chunks may exceed chunk_size — confirm against the library docs.
splitter = CharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
)

# Loading the documents and splitting them

In [None]:
# Maps each speech file name to the list of chunks produced from that file.
chunks_dict = {}
# Names of all entries in the data directory; non-.txt entries are filtered
# out when the files are loaded.
speeches = os.listdir(data_path)

In [None]:
# Load every .txt speech, split it into chunks and store the chunks per file.
for speech in speeches:
    # Skip anything that is not a text file (other files, such as the JSON
    # test set, appear to live in the same folder).
    if os.path.splitext(speech)[1] != ".txt":
        continue

    # Decode explicitly as UTF-8: without an encoding the loader falls back to
    # the platform default, which makes the loaded text machine-dependent.
    loader = TextLoader(os.path.join(data_path, speech), encoding="utf-8")
    documents = loader.load()
    chunks = splitter.split_documents(documents)
    chunks_dict[speech] = chunks


# Converting the chunks into embeddings and storing them in chroma db

In [None]:
# Embed every chunk of every speech and persist the vectors in a local
# Chroma collection stored under "chroma_db".
chroma_store = Chroma.from_documents(
    documents=[
        doc
        for per_file_chunks in chunks_dict.values()
        for doc in per_file_chunks
    ],
    embedding=embeddings,
    persist_directory="chroma_db",
)

# Preparing a prompt template to generate input

In [None]:
# Prompt sent to the LLM: it restricts the answer to the retrieved context and
# fixes the exact fallback sentence to use when the context holds no answer.
prompt = PromptTemplate(
    input_variables=["question", "context"],  # placeholders filled in by prompt.format(...)
    template="""
You must answer the question strictly and exclusively using the information provided in the context below.

If the context does NOT contain information that directly answers the question, reply exactly with:
"No relevant information available."

Context:
{context}

Question:
{question}

Answer:
""",
)


# Testing everything:


*   sending query to get 2 relevant chunks from db.
*   passing the query and the chunk to the model to generate output.




In [None]:
query = "How does Ambedkar characterize the Hindu-Muslim problem?"
results = chroma_store.similarity_search(query, k=2)

# Show the source file name and the text of every retrieved chunk.
for hit in results:
    print(
        "\n--- RESULT ---",
        hit.metadata["source"].split("/")[-1],
        hit.page_content,
        sep="\n",
    )

# Context handed to the prompt: every retrieved chunk followed by a newline.
context = "".join(hit.page_content + "\n" for hit in results)

In [None]:
# Fill the prompt template with the test query and the concatenated chunks.
inp = prompt.format(
    question=query,
    context=context
)

In [None]:
# Run a completion on the filled prompt. The result is indexed like a dict:
# the generated text is read from response["choices"][0]["text"] further down.
response = llm(
    inp,
    max_tokens=200,
    temperature=0.7)

In [None]:
print(response)

In [None]:
print(response["choices"][0]["text"])

# Function to generate an answer directly from a query

In [None]:
def get_response(query, k=2, max_tokens=200, temperature=0.7):
    """Answer *query* with the LLM, using chunks retrieved from Chroma as context.

    Args:
        query: Question to answer.
        k: Number of chunks to retrieve from the vector store.
        max_tokens: ``max_tokens`` value passed to the LLM call.
        temperature: ``temperature`` value passed to the LLM call.

    Returns:
        A ``(output, fetched_documents)`` tuple: the generated text, and the
        source file name of each retrieved chunk in retrieval order. There is
        one entry per chunk, so a file name may appear more than once.
    """
    results = chroma_store.similarity_search(query, k=k)

    # Every chunk is followed by a newline so chunks stay separated in the prompt.
    context = "".join(r.page_content + "\n" for r in results)
    # Keep only the file name so it can be compared with the test-set entries.
    fetched_documents = [os.path.basename(r.metadata["source"]) for r in results]

    inp = prompt.format(question=query, context=context)
    response = llm(inp, max_tokens=max_tokens, temperature=temperature)

    output = response["choices"][0]["text"]
    return output, fetched_documents

In [None]:

print(get_response("How does Ambedkar define social democracy"))

# **Evaluating the whole pipeline**

# For evaluating relevant chunks

In [None]:
def recall(retrieved, truth):
    """Return the fraction of ground-truth items that appear in *retrieved*.

    Each ground-truth item is counted at most once, so retrieving the same
    item several times (e.g. two chunks from one file) cannot push the result
    above 1.0.

    Args:
        retrieved: Items returned by the retriever (may contain duplicates).
        truth: Ground-truth relevant items.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when either input is empty.
    """
    if len(truth) == 0 or len(retrieved) == 0:
        return 0.0
    relevant = set(truth)
    hits = len(relevant.intersection(retrieved))
    return hits / len(relevant)


def precision(retrieved, truth):
    """Return the share of retrieved items that belong to the ground truth.

    Yields ``0.0`` when either *retrieved* or *truth* is empty.
    """
    n_retrieved = len(retrieved)
    if n_retrieved == 0 or len(truth) == 0:
        return 0.0
    relevant_hits = [candidate for candidate in retrieved if candidate in truth]
    return len(relevant_hits) / n_retrieved


def mrr(retrieved, truth):
    """Return the reciprocal rank of the first retrieved item found in *truth*.

    Ranks start at 1; the result is ``0.0`` when no retrieved item is relevant.
    """
    reciprocal_ranks = (
        1.0 / rank
        for rank, candidate in enumerate(retrieved, start=1)
        if candidate in truth
    )
    return next(reciprocal_ranks, 0.0)

# For evaluating the model's answer

In [None]:
# ROUGE-L scorer with stemming enabled, built once and reused for every answer.
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)


def rouge_f1_score(reference, generated):
    """Return the ROUGE-L F-measure of *generated* scored against *reference*."""
    scores = rouge.score(reference, generated)
    rouge_l = scores['rougeL']
    return rouge_l.fmeasure

def cosine_score(reference, generated):
    """Return the cosine similarity between the embeddings of the two texts.

    Both texts are embedded with the notebook's ``embeddings`` model, the
    reference first and the generated answer second.
    """
    reference_vec, generated_vec = (
        embeddings.embed_query(text) for text in (reference, generated)
    )
    similarity_matrix = cosine_similarity([reference_vec], [generated_vec])
    # One vector on each side, so the matrix holds a single value.
    return similarity_matrix[0][0]


def bleu_score(reference, generated):
    """Return the smoothed sentence-level BLEU of *generated* against *reference*.

    Both texts are tokenized by whitespace, and the reference is passed as the
    only reference translation.
    """
    reference_tokens = reference.split()
    candidate_tokens = generated.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )


# Running it on Test Dataset

In [None]:
# Load the evaluation set. The file is read explicitly as UTF-8 so that the
# questions and answers decode the same way on every platform.
with open("data/test_dataset.json", "r", encoding="utf-8") as f:
    test_dataset_dict = json.load(f)

# The list of test items sits under the "test_questions" key.
test_dataset = test_dataset_dict["test_questions"]

In [None]:
print(len(test_dataset))

In [None]:
test_results = []

In [None]:
# Run the whole pipeline on every test question, score retrieval and answer
# quality, record one result entry per question and print dataset averages.

# Empty the shared list in place so re-running this cell does not append a
# second copy of every record (the running totals below restart from zero too).
test_results.clear()

total_recall = 0
total_precision = 0
total_mrr = 0

total_rouge_f1_score = 0
total_cosine_score = 0
total_bleu_score = 0

for idx, test_item in tqdm(enumerate(test_dataset), total=len(test_dataset)):

    query = test_item['question']
    correct_answer = test_item['ground_truth']
    correct_docs = test_item['source_documents']
    # NOTE(review): the original cell also read test_item['answerable'] but
    # never used it. Items without source documents score 0 on all retrieval
    # metrics — confirm whether unanswerable questions should be excluded
    # from the retrieval averages.

    output, docs = get_response(query)

    # evaluating retrieved documents
    current_recall = recall(docs, correct_docs)
    current_precision = precision(docs, correct_docs)
    current_mrr = mrr(docs, correct_docs)

    total_recall += current_recall
    total_precision += current_precision
    total_mrr += current_mrr

    # evaluating model's output
    current_rouge_f1_score = rouge_f1_score(correct_answer, output)
    # Cast to a plain float so the record stays JSON-serializable whatever
    # NumPy scalar type the similarity computation returns.
    current_cosine_score = float(cosine_score(correct_answer, output))
    current_bleu_score = bleu_score(correct_answer, output)

    total_rouge_f1_score += current_rouge_f1_score
    total_cosine_score += current_cosine_score
    total_bleu_score += current_bleu_score

    test_results.append({
        "id": idx + 1,
        "recall": current_recall,
        "precision": current_precision,
        "MRR": current_mrr,
        "ROUGE-L Score": current_rouge_f1_score,
        "Cosine Similarity": current_cosine_score,
        "BLEU Score": current_bleu_score,
        "question": query,
        "correct answer": correct_answer,
        "model output": output,
    })

    # tqdm.write keeps the per-question log from breaking the progress bar.
    tqdm.write(f"\n\nTest Set {idx+1}")
    tqdm.write(f"Current Recall: {current_recall}")
    tqdm.write(f"Current Precision: {current_precision}")
    tqdm.write(f"Current MRR: {current_mrr}")
    tqdm.write("\n")
    tqdm.write(f"Current Rouge F1 Score: {current_rouge_f1_score}")
    tqdm.write(f"Current Cosine Score: {current_cosine_score}")
    tqdm.write(f"Current Bleu Score: {current_bleu_score}")

    tqdm.write(f"Question: {query}")
    tqdm.write(f"Correct Answer: {correct_answer}")
    tqdm.write(f"Model Output: {output}")

num_tests = len(test_dataset)
if num_tests == 0:
    # Nothing was evaluated; dividing by zero below would raise.
    print("Test dataset is empty; no averages to report.")
else:
    print("Average Recall:", total_recall / num_tests)
    print("Average Precision:", total_precision / num_tests)
    print("Average MRR:", total_mrr / num_tests)

    print("Average Rouge F1 Score:", total_rouge_f1_score / num_tests)
    print("Average Cosine Score:", total_cosine_score / num_tests)
    print("Average Bleu Score:", total_bleu_score / num_tests)

In [None]:
# Persist the per-question records as JSON indented by four spaces.
with open("test_results.json", "w") as f:
    f.write(json.dumps(test_results, indent=4))