# This notebook contains Step 2 (Answer Generation) and Step 3 (Assertions) of the pipeline

In [None]:
import json
import os
import numpy as np
import pandas as pd
import json
import re
import pandas as pd
from tqdm import tqdm  
from sentence_transformers import CrossEncoder
from transformers import pipeline

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import os
from langchain_huggingface import HuggingFaceEmbeddings
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import warnings
from transformers.utils import logging as hf_logging

In [None]:
import warnings

# Set TOKENIZERS_PARALLELISM to "false" for this process
# (presumably to avoid the Hugging Face tokenizers fork/parallelism warning — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
# Restrict transformers logging to error-level messages.
hf_logging.set_verbosity_error()

# Ignore every FutureWarning raised from here on.
warnings.filterwarnings("ignore", category=FutureWarning)

### Fetch data from the ChromaDB store

In [None]:
# Open the on-disk Chroma store located at ./ChromaDb (relative to the working directory).
chroma_client = chromadb.PersistentClient(path="./ChromaDb")
collection_name = "qasper_data_collection"

# Look the collection up by name; it is presumably created by an earlier pipeline step — confirm.
chroma_collection = chroma_client.get_collection(collection_name)


In [None]:
# Load processed_qasper_data.csv (relative path) into a DataFrame.
questions_qasper = pd.read_csv('processed_qasper_data.csv')

In [None]:
# Preview the first rows of the loaded table.
questions_qasper.head()

In [None]:
# Collapse to one row per question_id, keeping only the columns used downstream.
# Note: GroupBy.first() returns the first non-null value of each column within a
# group (pandas default), so this deduplicates questions; it does not filter out
# rows that lack a question.
questions = (
    questions_qasper.groupby("question_id")
      .first()
      .reset_index()[["question_id","question",
                      "paper_id","free_form_answer"]]
)


In [None]:
# Preview the deduplicated question table.
questions.head()

In [None]:
# Cross-encoder used by retrieve_chunks to score (question, chunk) pairs.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


In [None]:
def retrieve_chunks(question, paper_id=None, k=10, top_n=3):
    """Retrieve context for *question* from Chroma and return it as one string.

    The collection is queried for ``k`` candidate chunks (optionally
    restricted to a single paper), the candidates are re-scored with the
    cross-encoder, and the ``top_n`` highest-scoring chunks are joined with
    single spaces, best first. Inline math delimited by ``$...$`` is then
    stripped from the joined text.

    Args:
        question: Query text.
        paper_id: If truthy, only chunks whose ``paper_id`` metadata equals
            ``str(paper_id)`` are considered.
        k: Number of candidates requested from the vector store.
        top_n: Number of re-ranked chunks kept in the returned context.

    Returns:
        The cleaned context as a single string; ``""`` when the query
        returned no documents.
    """
    where_clause = {"paper_id": str(paper_id)} if paper_id else None

    result = chroma_collection.query(
        query_texts=[question],
        n_results=k,
        where=where_clause,
    )

    documents = result["documents"][0]
    if not documents:
        # Nothing to re-rank; callers treat an empty string as "no context".
        return ""

    scores = cross_encoder.predict([(question, doc) for doc in documents])

    # Sort on the score only (never on the text) and keep the best chunks.
    ranked = sorted(zip(scores, documents), key=lambda pair: pair[0], reverse=True)
    final_chunks = [doc for _, doc in ranked[:top_n]]

    # Build the context from the re-ranked chunks. Previously this was
    # overwritten by a join of *all* raw results, which discarded the ranking.
    context_text = " ".join(final_chunks)

    # Remove inline math spans.
    return re.sub(r"\$.*?\$", "", context_text)


In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Extractive question-answering checkpoint and its matching tokenizer.
QA_MODEL = "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
qa_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL)
qa_model     = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL)

# Sentence-embedding model used by cosine_similarity to compare answers.
embedder = SentenceTransformer("all-MiniLM-L6-v2") 


In [None]:
def qa_with_confidence(question, context):
    """Extract an answer span for *question* from *context* with a confidence.

    The question/context pair is tokenized (truncated to 512 tokens) and run
    through the QA model. The answer is the span whose start and end
    probabilities have the largest product among spans with ``end >= start``.

    Choosing start and end by independent argmax can give ``end < start``,
    which decodes to an empty answer while still reporting a confidence.
    Restricting the search to valid spans avoids that. When the independent
    argmax already forms a valid span, the same span is selected.

    Args:
        question: Question text.
        context: Passage to extract the answer from.

    Returns:
        ``(answer, confidence)``: the decoded span with special tokens
        removed, and the product of the softmax start and end probabilities
        of the chosen span as a Python float.
    """
    inputs = qa_tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    with torch.no_grad():
        outputs = qa_model(**inputs)

    # Index [0]: a single question/context pair was encoded.
    start_probs = F.softmax(outputs.start_logits, dim=-1)[0]
    end_probs = F.softmax(outputs.end_logits, dim=-1)[0]

    # span_scores[s, e] = P(start=s) * P(end=e); the upper triangle keeps
    # only spans where the end is not before the start.
    span_scores = torch.triu(start_probs[:, None] * end_probs[None, :])
    start_idx, end_idx = divmod(int(span_scores.argmax()), span_scores.size(1))

    answer_tokens = inputs.input_ids[0, start_idx:end_idx + 1]
    decoded_answer = qa_tokenizer.decode(answer_tokens, skip_special_tokens=True)
    confidence = span_scores[start_idx, end_idx].item()

    return decoded_answer, confidence


In [None]:
def cosine_similarity(prediction, ground_truth):
    """Return the cosine similarity between the embeddings of two texts.

    Both strings are encoded with the module-level ``embedder`` (prediction
    first, then ground truth) and the similarity is returned as a float.
    """
    pred_vec, truth_vec = (
        embedder.encode(text, convert_to_tensor=True)
        for text in (prediction, ground_truth)
    )
    similarity = util.cos_sim(pred_vec, truth_vec)
    return similarity.item()

In [None]:
# For every question: retrieve context, extract an answer, and score the
# answer against the reference by embedding cosine similarity.
results = []
for _, row in questions.iterrows():
    chunks = retrieve_chunks(row.question, row.paper_id)

    # retrieve_chunks returns one string; empty or whitespace-only means
    # there is nothing to run the QA model on.
    if not chunks or not chunks.strip():
        print(f"Skipping {row.question_id} (no context retrieved)")
        continue

    # NOTE(review): str() turns a missing answer (NaN) into the literal "nan";
    # confirm whether such rows should be excluded from the averages.
    ground_truth = str(row.free_form_answer)

    pred, conf = qa_with_confidence(row.question, chunks)
    cos        = cosine_similarity(pred, ground_truth)
    results.append({
        "question_id": row.question_id,
        "question" : row.question,
        "model_answer": pred,
        "ground_truth": ground_truth,
        "confidence": conf,
        "cosine_sim": cos
    })

# Explicit columns keep the schema stable even if every question was skipped;
# without them an empty frame has no columns and the lookups below raise KeyError.
scores_df = pd.DataFrame(
    results,
    columns=[
        "question_id",
        "question",
        "model_answer",
        "ground_truth",
        "confidence",
        "cosine_sim",
    ],
)

if scores_df.empty:
    print("No questions were scored; averages are undefined.")
else:
    print("Average Confidence :", scores_df["confidence"].mean())
    print("Average Cosine Sim :", scores_df["cosine_sim"].mean())


In [None]:
# Display the per-question results table.
scores_df

In [None]:
# Load each QA checkpoint once (which downloads and caches it) and run one
# tiny inference per model as a smoke test.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("Starting download and caching of Hugging Face models...")

# Checkpoints to fetch.
QA_MODELS = [
      "deepset/tinyroberta-squad2",
      "deepset/roberta-base-squad2",
      "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad"
]

# NOTE: the names assigned inside the loop (tokenizer, model, question, context,
# inputs, outputs, decoded_answer, ...) stay bound at notebook scope afterwards.
for model_name in QA_MODELS:
    try:
        # Load the tokenizer for this checkpoint.
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Loading the model is the step that downloads and caches the weights.
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        # Run one fixed question/context pair through the model to confirm it loaded correctly.
        question = "How many programming languages does BLOOM support?"
        context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
        inputs = tokenizer(question, context, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**inputs)

        # Take the highest-scoring start and end positions independently and decode that span.
        answer_start_index = outputs.start_logits.argmax()
        answer_end_index = outputs.end_logits.argmax()
        predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        decoded_answer = tokenizer.decode(predict_answer_tokens)

        print(f"Successfully loaded and tested: {model_name}. Predicted answer: '{decoded_answer}'")

    except Exception as e:
        # Best effort: report the failure for this model and continue with the next one.
        print(f"Failed to load model {model_name}. Error: {e}")

print("Model loading process complete.")
     