In [18]:
# Explicit %pip magic: installs into the running kernel's environment without relying on automagic.
%pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [19]:
# Explicit %pip magic: installs into the running kernel's environment without relying on automagic.
%pip install langchain

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy<2,>=1.26.4 (from langchain)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.3.3
    Uninstalling numpy-2.3.3:
      Successfully uninstalled numpy-2.3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.16.2 which is incompatible.
tensorflow 2.18.0 requires tensorboard<2.19,>=2.18, but you have tensorboard 2.20.0 which is incompatible.
fuzzy-c-means 1.7.2 requires typer<0.10.0,>=0.9.0, but you have typer 0.19.2 which is incompatible.
en-core-sci-m

In [20]:
# Explicit %pip magic: installs into the running kernel's environment without relying on automagic.
%pip install langchain_huggingface

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [21]:
# Explicit %pip magic: installs into the running kernel's environment without relying on automagic.
%pip install chromadb

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [22]:
# !pip install "opentelemetry-api<1.37" "opentelemetry-sdk<1.37" \
#             "opentelemetry-exporter-otlp-proto-grpc<1.37" --force-reinstall


# This notebook contains Step 2 (Answer Generation) and Step 3 (Assertions) of the pipeline

In [23]:
import json
import os
import numpy as np
import pandas as pd
import json
import re
import pandas as pd
from tqdm import tqdm  
from sentence_transformers import CrossEncoder
from transformers import pipeline

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import os
from langchain_huggingface import HuggingFaceEmbeddings
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import warnings
from transformers.utils import logging as hf_logging
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer, util
import pandas as pd

In [25]:
import warnings

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this kernel process.
# NOTE(review): presumably this silences the Hugging Face tokenizers parallelism/fork
# warning — confirm that is the intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [26]:
# Reduce notebook noise: restrict transformers logging to the error level and
# ignore every FutureWarning raised through the `warnings` module.
hf_logging.set_verbosity_error()
warnings.filterwarnings("ignore", category=FutureWarning)

### Fetch data from the ChromaDb Store

In [27]:
# Initialize the module-level cross-encoder (checkpoint "cross-encoder/ms-marco-MiniLM-L-6-v2").
# retrieve_chunks calls cross_encoder.predict on (question, chunk) pairs to score retrieved chunks.

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [28]:
def cosine_similarity(prediction, ground_truth):
    """Return the cosine similarity between the embeddings of two inputs.

    Both inputs are encoded (prediction first, then ground truth) with the
    module-level ``embedder`` using ``convert_to_tensor=True``; the result of
    ``util.cos_sim`` is unwrapped with ``.item()`` into a plain Python number.
    """
    pred_vec, truth_vec = (
        embedder.encode(text, convert_to_tensor=True)
        for text in (prediction, ground_truth)
    )
    similarity = util.cos_sim(pred_vec, truth_vec)
    return similarity.item()

In [36]:
def retrieve_chunks(chroma_collection, question, id=None, k=10, top_n=3):
    """Retrieve, rerank and clean the context for a question.

    Queries ``chroma_collection`` for up to ``k`` chunks matching ``question``,
    scores every (question, chunk) pair with the module-level
    ``cross_encoder``, keeps the ``top_n`` highest-scoring chunks and joins
    them into one string with inline ``$...$`` math removed.

    Args:
        chroma_collection: Collection object exposing ``query(query_texts=...,
            n_results=..., where=...)``.
        question: Query text; also the left-hand side of every reranking pair.
        id: Optional document id. When given (anything other than ``None`` or
            an empty string) the query is restricted with
            ``where={"id": str(id)}``.
        k: Number of candidate chunks requested from the collection.
        top_n: Number of reranked chunks kept in the returned context
            (``None`` keeps all of them).

    Returns:
        str: The cleaned context, or ``""`` when the query returned no documents.
    """
    # Compare against None instead of relying on truthiness so that a valid
    # id of 0 still produces a filter.
    has_id = id is not None and str(id) != ""
    where_clause = {"id": str(id)} if has_id else None

    result = chroma_collection.query(
        query_texts=[question],
        n_results=k,
        where=where_clause,
    )

    batches = result["documents"]
    documents = batches[0] if batches else []
    if not documents:
        # Nothing retrieved: skip the cross-encoder (no empty batch) and let
        # the caller decide how to handle an empty context.
        return ""

    scores = cross_encoder.predict([(question, doc) for doc in documents])

    # Rank by score, highest first; the sort is stable, so ties keep retrieval order.
    ranking = sorted(range(len(documents)), key=lambda i: scores[i], reverse=True)
    best_chunks = [documents[i] for i in ranking[:top_n]]

    # Build the context from the reranked chunks only (previously the reranked
    # selection was overwritten by the full, unranked result list).
    context_text = " ".join(best_chunks)

    # Remove inline math delimited by single dollar signs.
    return re.sub(r"\$.*?\$", "", context_text)


In [40]:
def qa_with_confidence(qa_model, qa_tokenizer, question, context):
    """Extract an answer span and a confidence score from an extractive QA model.

    The (question, context) pair is tokenized as PyTorch tensors, truncated to
    at most 512 tokens, and run through ``qa_model`` without gradient
    tracking. The start and end positions are chosen independently as the
    argmax of the start and end logits, and the tokens from start to end
    (inclusive) of the first sequence are decoded with special tokens skipped.

    Because the two positions are picked independently, the end may precede
    the start; the slice is then empty and the decoded answer comes from an
    empty token sequence, while the confidence is still computed as below.

    Returns:
        tuple: ``(decoded_answer, confidence)`` where ``confidence`` is the
        softmax probability of the chosen start position multiplied by the
        softmax probability of the chosen end position, as a Python float.
    """
    encoded = qa_tokenizer(
        question,
        context,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    with torch.no_grad():
        model_out = qa_model(**encoded)

    start_logits = model_out.start_logits
    end_logits = model_out.end_logits

    # Independent argmax over the start and end logits.
    span_start = start_logits.argmax()
    span_end = end_logits.argmax()

    span_ids = encoded.input_ids[0, span_start:span_end + 1]
    answer_text = qa_tokenizer.decode(span_ids, skip_special_tokens=True)

    # Confidence = P(start) * P(end) under the per-position softmax.
    p_start = F.softmax(start_logits, dim=-1)[0, span_start]
    p_end = F.softmax(end_logits, dim=-1)[0, span_end]
    score = (p_start * p_end).item()

    return answer_text, score


In [41]:
def run_pipeline(db_path: str, base_configs: dict, embedding_models: list, generation_models: list):
    """Run answer extraction and evaluation for every dataset/model combination.

    For each dataset in ``base_configs``, each embedding model and each
    generation (QA) model, the matching Chroma collection is queried through
    ``run_answer_extraction_and_evaluation`` and the mean confidence and mean
    cosine similarity of the returned scores are printed.

    Args:
        db_path: Path passed to ``chromadb.PersistentClient``.
        base_configs: Mapping of dataset name to a dict with the keys
            ``"collection"`` (collection name prefix; the embedding model name
            is appended as ``f"{prefix}_{emb_model}"``) and ``"questions"``
            (path of the questions CSV).
        embedding_models: Model names passed to
            ``SentenceTransformerEmbeddingFunction``.
        generation_models: Checkpoint names loaded with ``AutoTokenizer`` and
            ``AutoModelForQuestionAnswering``.

    Returns:
        None. Results are only printed.
    """
    client = chromadb.PersistentClient(path=db_path)

    for ds_name, ds_info in base_configs.items():
        # The questions depend only on the dataset: read the CSV once instead
        # of once per (embedding model, generation model) pair.
        questions_df = pd.read_csv(ds_info["questions"])

        for emb_model in embedding_models:
            collection_name = f"{ds_info['collection']}_{emb_model}"

            # The embedding function and collection handle do not depend on the
            # generation model, so build them once per embedding model.
            embedding_fn = SentenceTransformerEmbeddingFunction(model_name=emb_model)
            chroma_collection = client.get_collection(
                name=collection_name,
                embedding_function=embedding_fn
            )

            for gen_model in generation_models:
                print(f"  Running pipeline for {ds_name}")
                print(f"   Embedding Model : {emb_model}")
                print(f"   Generation Model: {gen_model}")
                print(f"   Collection Name : {collection_name}")

                # Load QA model
                qa_tokenizer = AutoTokenizer.from_pretrained(gen_model)
                qa_model     = AutoModelForQuestionAnswering.from_pretrained(gen_model)

                # Run QA + evaluation. Each run gets its own copy of the
                # questions so a run that modifies the frame cannot affect
                # later runs (the original re-read the CSV every time).
                scores_df = run_answer_extraction_and_evaluation(
                    qa_model, qa_tokenizer, chroma_collection, questions_df.copy()
                )

                print(f"   Average Confidence : {scores_df['confidence'].mean():.4f}")
                print(f"   Average Cosine Sim : {scores_df['cosine_sim'].mean():.4f}")


In [None]:
# Embedding models to evaluate; each name is also the suffix of the Chroma
# collection that run_pipeline opens for it.
EMBEDDING_MODELS = ["all-MiniLM-L6-v2", "all-mpnet-base-v2", "all-MiniLM-L12-v2"]

# Generation (extractive QA) checkpoints to evaluate.
GENERATION_MODELS = [
    "deepset/tinyroberta-squad2",
    "deepset/roberta-base-squad2",
    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
]

# Per-dataset settings: collection name prefix and questions CSV path.
BASE_CONFIGS = dict(
    HotpotQA=dict(
        collection="hpqa_data_collection",
        questions="processed_hotpot_df.csv",
    ),
    Qasper=dict(
        collection="qasper_data_collection",
        questions="processed_qasper_data.csv",
    ),
)

# Location of the persistent Chroma store.
DATASET_PATH = "./ChromaDb"

run_pipeline(
    db_path=DATASET_PATH,
    base_configs=BASE_CONFIGS,
    embedding_models=EMBEDDING_MODELS,
    generation_models=GENERATION_MODELS,
)


  Running pipeline for HotpotQA
   Embedding Model : all-MiniLM-L6-v2
   Generation Model: deepset/tinyroberta-squad2
   Collection Name : hpqa_data_collection_all-MiniLM-L6-v2
Skipping 5a8b57f25542995d1e6f1371 (no context retrieved)
Skipping 5a8c7595554299585d9e36b6 (no context retrieved)
Skipping 5a85ea095542994775f606a8 (no context retrieved)
Skipping 5adbf0a255429947ff17385a (no context retrieved)
Skipping 5a8e3ea95542995a26add48d (no context retrieved)
Skipping 5abd94525542992ac4f382d2 (no context retrieved)
Skipping 5a85b2d95542997b5ce40028 (no context retrieved)
Skipping 5a87ab905542996e4f3088c1 (no context retrieved)
Skipping 5a7bbb64554299042af8f7cc (no context retrieved)
Skipping 5a8db19d5542994ba4e3dd00 (no context retrieved)
Skipping 5a7166395542994082a3e814 (no context retrieved)
Skipping 5a877e5d5542993e715abf7d (no context retrieved)
Skipping 5ab3b0bf5542992ade7c6e39 (no context retrieved)
Skipping 5ab56e32554299637185c594 (no context retrieved)
Skipping 5ab6d09255429954