# Ideas

- Consider running LaTeX chunking pipeline RAGAS test for part III.

## Creating a Golden Test Data Set

In [29]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = TextLoader("data/nist_ai.tex")

eval_document = loader.load()

text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 400
)

eval_documents = text_splitter_eval.split_documents(eval_document)

In [30]:
len(eval_documents)

132

In [3]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from pydantic import BaseModel

generator_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0)
critic_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0)
embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-large")

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

num_qa_pairs = 20 # You can reduce the number of QA pairs to 5 if you're experiencing rate-limiting issues

testset = generator.generate_with_langchain_docs(eval_documents, num_qa_pairs, distributions, raise_exceptions=False)
testset.to_pandas()

embedding nodes:   0%|          | 0/264 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What are the priorities outlined in the White ...,[The White House (2022) Roadmap for Researcher...,The answer to given question is not present in...,simple,"[{'source': 'data/nist_ai.tex'}, {'source': 'd...",True
1,What type of information can provenance metada...,[Provenance metadata can include information a...,Provenance metadata can include information ab...,simple,[{'source': 'data/nist_ai.tex'}],True
2,"What processes are defined, assessed, and docu...",[MAP 3.4: Processes for operator and practitio...,Processes for operator and practitioner profic...,simple,[{'source': 'data/nist_ai.tex'}],True
3,What is the purpose of field testing in the ev...,[\item Al Red-teaming: A structured testing ex...,The purpose of field testing in the evaluation...,simple,[{'source': 'data/nist_ai.tex'}],True
4,What is the purpose of conducting periodic mon...,[\begin{center}\n\begin{tabular}{|c|c|c|}\n\hl...,The purpose of conducting periodic monitoring ...,simple,[{'source': 'data/nist_ai.tex'}],True
5,How should GAI acceptable use policies be upda...,[may rely on embedded GAI technologies; Addres...,GAI acceptable use policies should be updated ...,simple,[{'source': 'data/nist_ai.tex'}],True
6,What are the environmental impacts associated ...,[\item Confabulation: The production of confid...,Impacts due to high compute resource utilizati...,simple,[{'source': 'data/nist_ai.tex'}],True
7,How could model distillation or compression he...,[Trustworthy Al Characteristics: Accountable a...,Methods for creating smaller versions of train...,simple,[{'source': 'data/nist_ai.tex'}],True
8,What actions are suggested to address Human-AI...,[MAP 3.4: Processes for operator and practitio...,The suggested actions to address Human-AI Conf...,simple,[{'source': 'data/nist_ai.tex'}],True
9,How can structured public feedback be used to ...,[Measurement gaps can arise from mismatches be...,Structured public feedback can be used to eval...,simple,"[{'source': 'data/nist_ai.tex'}, {'source': 'd...",True


Let's look at the output and see what we can learn about it!

In [4]:
testset.test_data[0]

DataRow(question='What are the priorities outlined in the White House Roadmap for Researchers on Information Integrity Research and Development?', contexts=['The White House (2022) Roadmap for Researchers on Priorities Related to Information Integrity Research and Development. \\href{https://www.whitehouse.gov/wp-content/uploads/2022/12/RoadmapInformation-Integrity-RD-2022.pdf}{https://www.whitehouse.gov/wp-content/uploads/2022/12/RoadmapInformation-Integrity-RD-2022.pdf}?\n\nThiel, D. (2023) Investigation Finds AI Image Generation Models Trained on Child Abuse. Stanford Cyber Policy Center. \\href{https://cyber.fsi.stanford.edu/news/investigation-finds-ai-image-generation-modelstrained-child-abuse}{https://cyber.fsi.stanford.edu/news/investigation-finds-ai-image-generation-modelstrained-child-abuse}\n\nTirrell, L. (2017) Toxic Speech: Toward an Epidemiology of Discursive Harm. Philosophical Topics, 45(2), 139-162. \\href{https://www.jstor.org/stable/26529441}{https://www.jstor.org/sta

In [5]:
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

### Test the pipeline

#### Create vector DB

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate
import tiktoken

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(
        text,
    )
    return len(tokens)

# Load the NIST AI document
PDF_LINK = "data/nist_ai.pdf"
loader = PyMuPDFLoader(file_path=PDF_LINK)
nist_doc = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100,
    length_function = tiktoken_len,
)

nist_chunks = text_splitter.split_documents(nist_doc)

embeddings_small = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-small")

qdrant_client = QdrantClient(":memory:") # set Qdrant DB and its location (in-memory)

qdrant_client.create_collection(
    collection_name="NIST_AI",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="NIST_AI",
    embedding=embeddings_small,
) # create a QdrantVectorStore object with the above specified client, collection name, and embedding model.

qdrant_vector_store.add_documents(nist_chunks) # add the documents to the QdrantVectorStore

retriever = qdrant_vector_store.as_retriever()

#### Set up RAG retrieval chain

In [7]:
qa_gpt4_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0) # GPT-4o model

# define a template for the RAG model
rag_template = """
You are a helpful assistant that helps users find information and answer their question. 
You MUST use ONLY the available context to answer the question.
If necessary information to answer the question cannot be found in the provided context, you MUST "I don't know."

Question:
{question}

Context:
{context}
"""
# create rag prompt object from the template
prompt = ChatPromptTemplate.from_template(rag_template)

# update the chain with LLM, prompt, and question variable.
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | qa_gpt4_llm, "context": itemgetter("context"), "question": itemgetter("question")}
)

#### Load test questions to a Pandas dataframe

In [104]:
import pandas as pd

test_df = pd.read_csv("testset.csv") # update the path to the testset.csv file
# test_df = test_df.drop([0, 18]) # drop the rows without ground truth
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()
# test_contexts = list(testset.to_pandas().contexts)

#### Generate answers

In [109]:
answers = []
contexts = []

# loop through the test questions
for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question}) # invoke the retrieval chain with the question
  answers.append(response["response"].content) # append the answer to the answers list
  contexts.append([context.page_content for context in response["context"]]) # append the context to the contexts list

In [30]:
import os
os.environ["LANGCHAIN_PROJECT"]="MidTerm-naive_rag_chain_test"

In [82]:
# update the chain with LLM, prompt, and question variable.
retrieval_baseline_qa_chain = (
    {"context": itemgetter("context"), "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | qa_gpt4_llm, "context": itemgetter("context"), "question": itemgetter("question")}
)

In [123]:
list(testset.to_pandas().contexts)[4]

['\\begin{center}\n\\begin{tabular}{|c|c|c|}\n\\hline\nAction ID & Suggested Action & GAI Risks \\\\\n\\hline\nMP-4.1-001 & \\begin{tabular}{l}\nConduct periodic monitoring of AI-generated content for privacy risks; address any \\\\\npossible instances of PII or sensitive data exposure. \\\\\n\\end{tabular} & Data Privacy \\\\\n\\hline\nMP-4.1-002 & \\begin{tabular}{l}\nImplement processes for responding to potential intellectual property infringement \\\\\nclaims or other rights. \\\\\n\\end{tabular} & Intellectual Property \\\\\n\\hline\nMP-4.1-003 & \\begin{tabular}{l}\nConnect new GAI policies, procedures, and processes to existing model, data, \\\\\nsoftware development, and IT governance and to legal, compliance, and risk \\\\\nmanagement activities. \\\\\n\\end{tabular} & Information Security; Data Privacy \\\\\n\\hline\nMP-4.1-004 & \\begin{tabular}{l}\nDocument training data curation policies, to the extent possible and according to \\\\\napplicable laws and policies. \\\\\n\\

In [105]:
answers_baseline = []
# contexts_baseline = []

# loop through the test questions
for question, context in zip(test_questions, list(testset.to_pandas().contexts)):
  response = retrieval_baseline_qa_chain.invoke({"question" : question, "context": context}) # invoke the retrieval chain with the question
  answers_baseline.append(response["response"].content) # append the answer to the answers list
  # contexts_baseline.append(context) # append the context to the contexts list

#### Create an dataset with questions, answers, context, and ground truth

In [110]:
from datasets import Dataset

naive_response_dataset_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [106]:
baseline_response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers_baseline,
    "contexts" : list(testset.to_pandas().contexts),
    "ground_truth" : test_groundtruths
})

#### Run evaluation on the naive RAG pipeline

In [55]:
from pydantic import BaseModel
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

embeddings_small = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-small") # keeping the embedding model the same as the one in the prototype

In [111]:
naive_retrieval_results = evaluate(naive_response_dataset_retrieval, 
                                               metrics, 
                                               llm=qa_gpt4_llm, 
                                               embeddings=embeddings_small
                                               )

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [112]:
naive_retrieval_results

{'faithfulness': 0.8023, 'answer_relevancy': 0.7886, 'context_recall': 0.6500, 'context_precision': 0.7139, 'answer_correctness': 0.3336}

In [107]:
baseline_retrieval_results = evaluate(baseline_response_dataset, 
                                               metrics, 
                                               llm=qa_gpt4_llm, 
                                               embeddings=embeddings_small
                                               )

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [113]:
baseline_retrieval_results

{'faithfulness': 0.8059, 'answer_relevancy': 0.8456, 'context_recall': 0.9500, 'context_precision': 0.9500, 'answer_correctness': 0.4912}

In [114]:
df_naive = pd.DataFrame(list(naive_retrieval_results.items()), columns=['Metric', 'Naive RAG - Prototype'])
df_baseline = pd.DataFrame(list(baseline_retrieval_results.items()), columns=['Metric', 'Baseline (testing on ground truth)'])
df_merged = pd.merge(df_naive, df_baseline, on='Metric')
df_merged

Unnamed: 0,Metric,Naive RAG - Prototype,Baseline (testing on ground truth)
0,faithfulness,0.802263,0.805924
1,answer_relevancy,0.788586,0.845593
2,context_recall,0.65,0.95
3,context_precision,0.713889,0.95
4,answer_correctness,0.333612,0.491195


## Fine-Tuning Open-Source Embeddings

### Chunking function

In [30]:
import tiktoken
def count_tokens(string: str) -> int:  
    """Return the number of tokens used by a list of string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [31]:
from langchain.docstore.document import Document

def extract_sections(lines: list,
                     number_of_tokens: int=1000) -> list:
    # the full code removed due to condifentiality    
    return    

    

  if lines[current_line].startswith('\\begin{abstract}') or lines[current_line].startswith('\\textbackslash begin\{abstract\}'):
  if lines[current_line].startswith('\includegraphics'):


In [6]:
file_path = "data/nist_ai.tex"
with open(file_path, 'r', encoding="utf-8") as file:
        if file.name.endswith('.tex'):
            lines = file.readlines()
            chunks = extract_sections(lines)

In [28]:
chunks[:36]

[Document(metadata={'source': 'data/nist_ai.tex'}, page_content='\\section*{NIST Trustworthy and Responsible AI NIST AI 600-1}\n\\section*{Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile}\nThis publication is available free of charge from:\\\\\n\\href{https://doi.org/10.6028/NIST.Al.600-1}{https://doi.org/10.6028/NIST.Al.600-1}\nJuly 2024\nU.S. Department of Commerce Gina M. Raimondo, Secretary\nAbout AI at NIST: The National Institute of Standards and Technology (NIST) develops measurements, technology, tools, and standards to advance reliable, safe, transparent, explainable, privacy-enhanced, and fair artificial intelligence (AI) so that its full commercial and societal benefits can be realized without harm to people or the planet. NIST, which has conducted both fundamental and applied work on AI for more than a decade, is also helping to fulfill the 2023 Executive Order on Safe, Secure, and Trustworthy AI. NIST established the U.S. AI Sa

### Training on naive chunking strategy

In [48]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = TextLoader("data/nist_ai.tex")

doc = loader.load()

text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200
)

training_documents = text_splitter_eval.split_documents(doc)

Next, we're going to associate each of our chunks with a unique identifier.

In [49]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

Let's create training, validation, and test datasets...

In [34]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.0 MB 5.6 MB/s eta 0:00:02
   ------- -------------------------------- 2.1/11.0 MB 7.3 MB/s eta 0:00:02
   ---------- ----------------------------- 2.9/11.0 MB 5.8 MB/s eta 0:00:02
   ---------- ----------------------------- 2.9/11.0 MB 5.8 MB/s eta 0:00:02
   ---------- ----------------------------- 2.9/11.0 MB 5.8 MB/s eta 0:00:02
   ------------ -------------------

In [52]:
from sklearn.model_selection import train_test_split

train_dataset, _x, _, _ = train_test_split(
    training_documents, training_documents, test_size=0.2, random_state=42)

In [138]:
del(train_dataset[221]) # remove the document with id 221 with a corrupt chunk

In [53]:
validation_dataset, test_dataset, _, _ = train_test_split(
    _x, _x, test_size=0.5, random_state=42)

#### Create fine-tuning dataset

In [55]:
from langchain_openai import AzureChatOpenAI

qa_chat_model = AzureChatOpenAI(
    azure_deployment="gpt-4",
    temperature=0
)

The original prompt did not work as it generated some invalid questions and output format. The template had to be modified:

In [103]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context below, you must generate {n_questions} questions based on only the provided context.

The rules:
1. You must generate questions based on ONLY the context provided.
2. You must generate {n_questions} questions.
3. The output must be a valid python list of questions as a string using double quotation marks without backticks like: ["question 1", "question 2", "question 3", ...]
4. The questions must be relevant to the context provided.
5. The questions must be in the form of a question, not a statement.
6. The questions must be unique.
7. The question most not be about LaTeX/Markdown formatting or the document structure.

Double-check that you produce correct questions and the correct output format.

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)
question_generation_chain = qa_prompt_template | qa_chat_model

Now let's modify the questions generating function to work asynchronously

In [124]:
import asyncio  
import json  
import uuid  
import nest_asyncio  
  
# Apply nest_asyncio to allow nested event loops in Jupyter  
nest_asyncio.apply()  
  
async def create_questions(documents, n_questions):  
    questions: dict[str, Any] = {}  
    relevant_docs: dict[str, Any] = {}
  
    lock = asyncio.Lock()  
    # semaphore = asyncio.Semaphore(5)  # Limit number of concurrent tasks if needed  
  
    async def process_document(document):  
        # async with semaphore:  
        try:  
            # Run the blocking I/O operation in a separate thread  
            questions_generated = await asyncio.to_thread(  
                question_generation_chain.invoke,  
                {"context": document.page_content, "n_questions": n_questions}  
            )
            questions_list = json.loads(questions_generated.content)  
            async with lock:  
                for question in questions_list:  
                    question_id = str(uuid.uuid4())  
                    questions[question_id] = question  
                    relevant_docs[question_id] = document.metadata["id"]  

        except Exception as e:  
            print(f"Error processing document {document.metadata.get('id', '')}: {e}")  
  
    tasks = [process_document(document) for document in documents]  
  
    # Run tasks concurrently and wait for them to finish  
    await asyncio.gather(*tasks)  
    
    print(f"Questions generated: {len(questions)}")

    return questions, relevant_docs

In [140]:
training_questions, training_relevant_contexts = await create_questions(train_dataset, 3)

Error processing document 38a58ac2-9521-4523-946c-fe901615f1f5: Invalid \escape: line 1 column 30 (char 29)
Questions generated: 678


In [149]:
# find unprocessed document
for seqid, doc in enumerate(train_dataset):
    if doc.metadata["id"] == "38a58ac2-9521-4523-946c-fe901615f1f5":
        print(seqid)

103


In [150]:
train_dataset.pop(103) # pop the unprocessed document

Document(metadata={'source': 'data/nist_ai.tex', 'id': '38a58ac2-9521-4523-946c-fe901615f1f5'}, page_content='\\documentclass[10pt]{article}\n\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{hyperref}\n\\hypersetup{colorlinks=true, linkcolor=blue, filecolor=magenta, urlcolor=cyan,}\n\\urlstyle{same}\n\\usepackage{graphicx}\n\\usepackage[export]{adjustbox}\n\\graphicspath{ {./images/} }\n\\usepackage{amsmath}\n\\usepackage{amsfonts}\n\\usepackage{amssymb}\n\\usepackage[version=4]{mhchem}\n\\usepackage{stmaryrd}\n\n\\title{NIST Trustworthy and Responsible AI NIST AI 600-1 }\n\n\\author{}\n\\date{}\n\n\n%New command to display footnote whose markers will always be hidden\n\\let\\svthefootnote\\thefootnote\n\\newcommand\\blfootnotetext[1]{%\n  \\let\\thefootnote\\relax\\footnote{#1}%\n  \\addtocounter{footnote}{-1}%\n  \\let\\thefootnote\\svthefootnote%\n}')

In [141]:
val_questions, val_relevant_contexts = await create_questions(validation_dataset, 3)

Questions generated: 87


In [142]:
test_questions, test_relevant_contexts = await create_questions(test_dataset, 3)

Questions generated: 87


In [151]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in train_dataset}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("data/training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [153]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in validation_dataset}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("data/val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [154]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_dataset}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("data/test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

#### Fine-tuning `snowflake-arctic-embed-m`

In [155]:
!pip install -qU sentence_transformers datasets pyarrow

In [2]:
!pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121


In [3]:
import torch

if torch.cuda.is_available():
    print(f"CUDA is available. Device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available.")

CUDA is available. Device name: NVIDIA RTX 2000 Ada Generation Laptop GPU


In [4]:
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id).to("cuda")

  from tqdm.autonotebook import tqdm, trange


In [5]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [6]:
BATCH_SIZE = 20

In [7]:
import json
# Load the saved training dataset
with open("data/training_dataset.jsonl", "r") as f:
    train_dataset = json.load(f)

In [9]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [10]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [11]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [12]:
# Load the saved validation dataset
with open("data/val_dataset.jsonl", "r") as f:
    val_dataset = json.load(f)

In [13]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [14]:
EPOCHS = 5

In [15]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

  0%|          | 0/170 [00:00<?, ?it/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

{'eval_cosine_accuracy@1': 0.8850574712643678, 'eval_cosine_accuracy@3': 0.9655172413793104, 'eval_cosine_accuracy@5': 1.0, 'eval_cosine_accuracy@10': 1.0, 'eval_cosine_precision@1': np.float64(0.8850574712643678), 'eval_cosine_precision@3': np.float64(0.32183908045977005), 'eval_cosine_precision@5': np.float64(0.19999999999999996), 'eval_cosine_precision@10': np.float64(0.09999999999999998), 'eval_cosine_recall@1': np.float64(0.02458492975734355), 'eval_cosine_recall@3': np.float64(0.026819923371647507), 'eval_cosine_recall@5': np.float64(0.027777777777777776), 'eval_cosine_recall@10': np.float64(0.027777777777777776), 'eval_cosine_ndcg@10': np.float64(0.20801986240857814), 'eval_cosine_mrr@10': 0.9270114942528737, 'eval_cosine_map@100': np.float64(0.02575031928480204), 'eval_dot_accuracy@1': 0.8850574712643678, 'eval_dot_accuracy@3': 0.9655172413793104, 'eval_dot_accuracy@5': 1.0, 'eval_dot_accuracy@10': 1.0, 'eval_dot_precision@1': np.float64(0.8850574712643678), 'eval_dot_precision

#### Evaluating retriever

In [1]:
import pandas as pd
import tqdm

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from langchain_core.documents import Document

Now we'll define a function that will help us evaluate our retrieval process.

> NOTE: We're assuming 1 correct document in a "hit".

In [3]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=True,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

All that's left to do is evaluate, we'll evaluate our model against:

1. OpenAI's closed source `text-embedding-3-small`
2. The base non-fine-tuned version of `Snowflake/snowflake-arctic-embed-m`.

Let's see how it stacks up!

In [4]:
# Load the saved test dataset
import json
with open("data/test_dataset.jsonl", "r") as f:
    test_dataset = json.load(f)

In [6]:
from tqdm import tqdm
te3_openai = AzureOpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

100%|██████████| 87/87 [00:20<00:00,  4.33it/s]


In [7]:
te3_results_df = pd.DataFrame(te3_results)

In [8]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

0.9655172413793104

##### Large model

In [62]:
from tqdm import tqdm
te3_openai_l = AzureOpenAIEmbeddings(model="text-embedding-3-large")
te3l_results = evaluate_openai(test_dataset, te3_openai_l)

100%|██████████| 87/87 [00:25<00:00,  3.47it/s]


In [65]:
te3l_results_df = pd.DataFrame(te3l_results)

In [66]:
te3l_hit_rate = te3l_results_df["is_hit"].mean()
te3l_hit_rate

1.0

### `Snowflake/snowflake-arctic-embed-m` (base)

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm

huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

100%|██████████| 87/87 [00:00<00:00, 143.97it/s]


In [7]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [8]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

0.6206896551724138

### `Snowflake/snowflake-arctic-embed-m` (fine-tuned)

In [9]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at finetuned_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 87/87 [00:00<00:00, 147.81it/s]


In [10]:
finetune_results_df = pd.DataFrame(finetune_results)

In [11]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

1.0

#### Upload finetuned_model

In [30]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
model.push_to_hub("Mr-Cool/midterm-finetuned-embedding")

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

'https://huggingface.co/Mr-Cool/midterm-finetuned-embedding/commit/927d0ef0bbf81c223d936e1dd95819a30a0c7416'

## Task 5

I was not satisfied with the original test and would like to do a better test by generating a bigger dataset and cleaning data manually if necessary

In [1]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = TextLoader("data/nist_ai.tex")

eval_document = loader.load()

text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 400
)

eval_documents = text_splitter_eval.split_documents(eval_document)

In [2]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from pydantic import BaseModel

generator_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0)
critic_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0)
embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-large")

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

num_qa_pairs = 100

testset = generator.generate_with_langchain_docs(eval_documents, num_qa_pairs, distributions, raise_exceptions=False)
testset.to_pandas()

embedding nodes:   0%|          | 0/264 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/100 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the purpose of implementing a use-case...,[\begin{center}\n\begin{tabular}{|c|c|c|}\n\hl...,The purpose of implementing a use-case based s...,simple,[{'source': 'data/nist_ai.tex'}],True
1,What actions are suggested to address the risk...,"[Al Actor Tasks: Al Impact Assessment, Domain ...",The suggested actions to address the risks of ...,simple,[{'source': 'data/nist_ai.tex'}],True
2,How are AI systems resourced according to orga...,"[Al Actor Tasks: Governance and Oversight, Ope...",Mechanisms are in place to inventory AI system...,simple,[{'source': 'data/nist_ai.tex'}],True
3,What are the risks associated with integrating...,[Risks from confabulations may arise when user...,Risks from confabulations may arise when users...,simple,[{'source': 'data/nist_ai.tex'}],True
4,What actions are suggested to address data pri...,[Security; Harmful Bias and \\\nHomogenization...,Conduct periodic monitoring of AI-generated co...,simple,"[{'source': 'data/nist_ai.tex'}, {'source': 'd...",True
...,...,...,...,...,...,...
95,How do test dataset label errors impact model ...,[Errors in third-party GAI components can also...,Test dataset label errors can impact the 'stab...,reasoning,[{'source': 'data/nist_ai.tex'}],True
96,How do biased data and performance gaps link t...,[To guide organizations in identifying and man...,Biased data and performance gaps link to GAI r...,reasoning,"[{'source': 'data/nist_ai.tex'}, {'source': 'd...",True
97,How could GAI content mislead doctors?,[Risks from confabulations may arise when user...,GAI content could mislead doctors by providing...,reasoning,[{'source': 'data/nist_ai.tex'}],True
98,How do GAI predictions cause hallucinations?,[While some of these described capabilities li...,GAI predictions cause hallucinations because g...,reasoning,[{'source': 'data/nist_ai.tex'}],True


Failed to batch ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/batch. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/batch', '{"detail":"Monthly unique traces usage limit exceeded"}')
post: trace=6c2933f5-31a1-4845-8c3c-07bc92e95234,id=6c2933f5-31a1-4845-8c3c-07bc92e95234; trace=08b96409-dfcc-4ced-899b-71b523e6c8c5,id=08b96409-dfcc-4ced-899b-71b523e6c8c5; trace=fcbef85b-9007-45fd-823c-fcdf7b718472,id=fcbef85b-9007-45fd-823c-fcdf7b718472; trace=a19371c9-13e2-4b1a-a91e-f7b6ee70f068,id=a19371c9-13e2-4b1a-a91e-f7b6ee70f068; trace=4855ba79-c9aa-4b6c-b8ef-b0b3ae12f5eb,id=4855ba79-c9aa-4b6c-b8ef-b0b3ae12f5eb; trace=b6c5b891-541e-43d9-bde3-1b8af4c9c34b,id=b6c5b891-541e-43d9-bde3-1b8af4c9c34b; patch: trace=396dc333-42ad-4f02-96ba-bc6bf891be58,id=396dc333-42ad-4f02-96ba-bc6bf891be58; trace=50cc5158-aa4d-4474-85a0-c5e0ef54695d,id=50cc5158-aa4d-4474-85a0-c5e0ef54695d; trace=82809d7f-fc

In [3]:
testset_df = testset.to_pandas()

In [15]:
testset_df[90:100]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
90,How do distillation & compression affect GAI's...,[Methods for creating smaller versions of trai...,Methods for creating smaller versions of train...,reasoning,[{'source': 'data/nist_ai.tex'}],True
91,Why might GAI validation fail in games and tests?,[\section*{Limitations of Current Pre-deployme...,GAI validation might fail in games and tests b...,reasoning,[{'source': 'data/nist_ai.tex'}],True
92,Which contract terms prevent liability and dat...,[\hline\nGV-6.2-007 & \begin{tabular}{l}\nRevi...,The contract terms that prevent liability and ...,reasoning,[{'source': 'data/nist_ai.tex'}],True
93,How do SBOMs & SLAs reduce IP risks in 3rd-par...,[Third party GAI integrations may give rise to...,The answer to given question is not present in...,reasoning,[{'source': 'data/nist_ai.tex'}],True
94,How might GAI errors cause wrong diagnoses in ...,[Risks from confabulations may arise when user...,"GAI errors, such as confabulated summaries of ...",reasoning,[{'source': 'data/nist_ai.tex'}],True
95,How do test dataset label errors impact model ...,[Errors in third-party GAI components can also...,Test dataset label errors can impact the 'stab...,reasoning,[{'source': 'data/nist_ai.tex'}],True
96,How do biased data and performance gaps link t...,[To guide organizations in identifying and man...,Biased data and performance gaps link to GAI r...,reasoning,"[{'source': 'data/nist_ai.tex'}, {'source': 'd...",True
97,How could GAI content mislead doctors?,[Risks from confabulations may arise when user...,GAI content could mislead doctors by providing...,reasoning,[{'source': 'data/nist_ai.tex'}],True
98,How do GAI predictions cause hallucinations?,[While some of these described capabilities li...,GAI predictions cause hallucinations because g...,reasoning,[{'source': 'data/nist_ai.tex'}],True
99,How to gather early AI feedback w/o field test...,[\item Al Red-teaming: A structured testing ex...,Organizations can gather early AI feedback wit...,reasoning,[{'source': 'data/nist_ai.tex'}],True


I have manually checked all 100 questions and ground truth pairs and have not found any weird ones that I found in the initial dataset (in the beginning of the notebook).

In [16]:
testset_df.to_csv("testset.csv")

### Test various pipelines

#### Naive pipeline

The original pipeline used in prototype and using simple pdf OCR, token based chunking, and small embedding model

In [56]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate
import tiktoken

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(
        text,
    )
    return len(tokens)

# Load the NIST AI document
PDF_LINK = "data/nist_ai.pdf"
loader = PyMuPDFLoader(file_path=PDF_LINK)
nist_doc = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100,
    length_function = tiktoken_len,
)

nist_chunks = text_splitter.split_documents(nist_doc)

embeddings_small = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-small")

qdrant_client = QdrantClient(":memory:") # set Qdrant DB and its location (in-memory)

qdrant_client.create_collection(
    collection_name="NIST_AI",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name="NIST_AI",
    embedding=embeddings_small,
) # create a QdrantVectorStore object with the above specified client, collection name, and embedding model.

qdrant_vector_store.add_documents(nist_chunks) # add the documents to the QdrantVectorStore

retriever = qdrant_vector_store.as_retriever()

##### Set up RAG retrieval chain

In [57]:
qa_gpt4_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0) # GPT-4o model

# define a template for the RAG model
rag_template = """
You are a helpful assistant that helps users find information and answer their question. 
You MUST use ONLY the available context to answer the question.
If necessary information to answer the question cannot be found in the provided context, you MUST reply "I don't know."

Question:
{question}

Context:
{context}
"""
# create rag prompt object from the template
prompt = ChatPromptTemplate.from_template(rag_template)

# update the chain with LLM, prompt, and question variable.
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | qa_gpt4_llm, "context": itemgetter("context"), "question": itemgetter("question")}
)

##### Load test questions to a Pandas dataframe

In [58]:
import pandas as pd

test_df = pd.read_csv("testset.csv") # update the path to the testset.csv file
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

##### Generate answers

In [59]:
answers = []
contexts = []

# loop through the test questions
for question in test_questions:
  response = retrieval_augmented_qa_chain.invoke({"question" : question}) # invoke the retrieval chain with the question
  answers.append(response["response"].content) # append the answer to the answers list
  contexts.append([context.page_content for context in response["context"]]) # append the context to the contexts list

##### Create a dataset with questions, answers, context, and ground truth

In [60]:
from datasets import Dataset

naive_response_dataset_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

##### Run evaluation on the naive RAG pipeline

In [61]:
from pydantic import BaseModel
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

embeddings_small = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-small") # keeping the embedding model the same as the one in the prototype

In [62]:
naive_retrieval_results = evaluate(naive_response_dataset_retrieval, 
                                               metrics, 
                                               llm=qa_gpt4_llm, 
                                               embeddings=embeddings_small
                                               )

Evaluating:  59%|█████▊    | 293/500 [04:10<05:39,  1.64s/it]No statements were generated from the answer.
Evaluating: 100%|██████████| 500/500 [07:43<00:00,  1.08it/s]


In [64]:
naive_retrieval_results

{'faithfulness': 0.7749, 'answer_relevancy': 0.9091, 'context_recall': 0.7477, 'context_precision': 0.7428, 'answer_correctness': 0.4041}

In [69]:
naive_correctness = naive_retrieval_results.to_pandas().answer_correctness

#### Fine-tuned `Snowflake/snowflake-arctic-embed-m` model

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_tuned = HuggingFaceEmbeddings(model_name="Mr-Cool/midterm-finetuned-embedding")

Some weights of BertModel were not initialized from the model checkpoint at Mr-Cool/midterm-finetuned-embedding and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate
import tiktoken

def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(
        text,
    )
    return len(tokens)

# Load the NIST AI document
PDF_LINK = "data/nist_ai.pdf"
loader = PyMuPDFLoader(file_path=PDF_LINK)
nist_doc = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 100,
    length_function = tiktoken_len,
)

nist_chunks = text_splitter.split_documents(nist_doc)


qdrant_client_tuned = QdrantClient(":memory:") # set Qdrant DB and its location (in-memory)

qdrant_client_tuned.create_collection(
    collection_name="NIST_AI",
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

qdrant_vector_store_tuned = QdrantVectorStore(
    client=qdrant_client_tuned,
    collection_name="NIST_AI",
    embedding=embedding_tuned,
) # create a QdrantVectorStore object with the above specified client, collection name, and embedding model.

qdrant_vector_store_tuned.add_documents(nist_chunks) # add the documents to the QdrantVectorStore

retriever_tuned = qdrant_vector_store_tuned.as_retriever()

In [12]:
qa_gpt4_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0) # GPT-4o model

# define a template for the RAG model
rag_template = """
You are a helpful assistant that helps users find information and answer their question. 
You MUST use ONLY the available context to answer the question.
If necessary information to answer the question cannot be found in the provided context, you MUST reply "I don't know."

Question:
{question}

Context:
{context}
"""
# create rag prompt object from the template
prompt = ChatPromptTemplate.from_template(rag_template)

# update the chain with LLM, prompt, and question variable.
retrieval_augmented_qa_chain_tuned = (
    {"context": itemgetter("question") | retriever_tuned, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | qa_gpt4_llm, "context": itemgetter("context"), "question": itemgetter("question")}
)

In [15]:
answers_tuned = []
contexts_tuned = []

# loop through the test questions
for question in test_questions:
  response = retrieval_augmented_qa_chain_tuned.invoke({"question" : question}) # invoke the retrieval chain with the question
  answers_tuned.append(response["response"].content) # append the answer to the answers list
  contexts_tuned.append([context.page_content for context in response["context"]]) # append the context to the contexts list

##### Create a dataset with questions, answers, context, and ground truth

In [16]:
from datasets import Dataset

naive_response_dataset_retrieval_tuned = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers_tuned,
    "contexts" : contexts_tuned,
    "ground_truth" : test_groundtruths
})

##### Run evaluation on the naive tuned RAG pipeline

In [20]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]




For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from ragas.metrics._context_entities_recall import (


In [23]:
naive_retrieval_results_tuned = evaluate(naive_response_dataset_retrieval_tuned, 
                                               metrics, 
                                               llm=qa_gpt4_llm, 
                                               embeddings=embedding_tuned
                                               )

Evaluating:  25%|██▌       | 126/500 [01:39<03:36,  1.73it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 500/500 [07:15<00:00,  1.15it/s]


In [24]:
naive_retrieval_results_tuned

{'faithfulness': 0.8038, 'answer_relevancy': 0.8670, 'context_recall': 0.8668, 'context_precision': 0.8067, 'answer_correctness': 0.4412}

In [66]:
naive_tuned_correctness = naive_retrieval_results_tuned.to_pandas().answer_correctness

In [28]:
naive_retrieval_results_tuned.to_pandas().faithfulness.std()

0.23725891000142335

In [29]:
naive_retrieval_results_tuned.to_pandas().faithfulness.mean()

0.8037863071392927

### Custom chunking pipeline + RAG

In [35]:
file_path = "data/nist_ai.tex"
with open(file_path, 'r', encoding="utf-8") as file:
        if file.name.endswith('.tex'):
            lines = file.readlines()
            chunks = extract_sections(lines, 500)

In [126]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from langchain.prompts import ChatPromptTemplate

embeddings_large = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-large")

qdrant_client_custom = QdrantClient(":memory:") # set Qdrant DB and its location (in-memory)

qdrant_client_custom.create_collection(
    collection_name="NIST_AI",
    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
)

qdrant_vector_store_custom = QdrantVectorStore(
    client=qdrant_client_custom,
    collection_name="NIST_AI",
    embedding=embeddings_large,
) # create a QdrantVectorStore object with the above specified client, collection name, and embedding model.

qdrant_vector_store_custom.add_documents(chunks) # add the documents to the QdrantVectorStore

retriever_custom = qdrant_vector_store_custom.as_retriever(kwargs={"k": 10})

##### Set up RAG retrieval chain

In [127]:
qa_gpt4_llm = AzureChatOpenAI(azure_deployment="gpt-4", temperature=0) # GPT-4o model

# define a template for the RAG model
rag_template = """
You are a helpful assistant that helps users find information and answer their question. 
You MUST use ONLY the available context to answer the question.
If necessary information to answer the question cannot be found in the provided context, you MUST reply "I don't know."

Question:
{question}

Context:
{context}
"""
# create rag prompt object from the template
prompt = ChatPromptTemplate.from_template(rag_template)

# update the chain with LLM, prompt, and question variable.
retrieval_augmented_qa_chain_custom = (
    {"context": itemgetter("question") | retriever_custom, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | qa_gpt4_llm, "context": itemgetter("context"), "question": itemgetter("question")}
)

##### Generate answers

In [128]:
answers_custom = []
contexts_custom = []

# loop through the test questions
for question in test_questions:
  response = retrieval_augmented_qa_chain_custom.invoke({"question" : question}) # invoke the retrieval chain with the question
  answers_custom.append(response["response"].content) # append the answer to the answers list
  contexts_custom.append([context.page_content for context in response["context"]]) # append the context to the contexts list

##### Create a dataset with questions, answers, context, and ground truth

In [129]:
from datasets import Dataset

custom_response_dataset_retrieval = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers_custom,
    "contexts" : contexts_custom,
    "ground_truth" : test_groundtruths
})

##### Run evaluation on the naive RAG pipeline

In [130]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [131]:
custom_retrieval_results = evaluate(custom_response_dataset_retrieval, 
                                               metrics, 
                                               llm=qa_gpt4_llm, 
                                               embeddings=embeddings_large
                                               )

Evaluating:  30%|███       | 150/500 [02:11<03:04,  1.90it/s]No statements were generated from the answer.
Evaluating:  55%|█████▍    | 273/500 [04:22<04:21,  1.15s/it]Failed to parse output. Returning None.
Evaluating: 100%|██████████| 500/500 [08:47<00:00,  1.06s/it]


In [132]:
custom_retrieval_results

{'faithfulness': 0.7631, 'answer_relevancy': 0.8824, 'context_recall': 0.7633, 'context_precision': 0.7547, 'answer_correctness': 0.3877}

In [67]:
custom_correctness = custom_retrieval_results.to_pandas().answer_correctness

### Test baseline on ground truth

In [74]:
# update the chain with LLM, prompt, and question variable.
retrieval_baseline_qa_chain = (
    {"context": itemgetter("context"), "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | qa_gpt4_llm, "context": itemgetter("context"), "question": itemgetter("question")}
)

In [97]:
import ast
answers_baseline = []

# loop through the test questions
for question, context in zip(test_questions, [ast.literal_eval(entry) for entry in test_df['contexts'].values.tolist()]):
  response = retrieval_baseline_qa_chain.invoke({"question" : question, "context": context}) # invoke the retrieval chain with the question
  answers_baseline.append(response["response"].content) # append the answer to the answers list

In [98]:
baseline_response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers_baseline,
    "contexts" : [ast.literal_eval(entry) for entry in test_df['contexts'].values.tolist()],
    "ground_truth" : test_groundtruths
})

In [99]:
from pydantic import BaseModel
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

embeddings_small = AzureOpenAIEmbeddings(azure_deployment="text-embedding-3-small") # keeping the embedding model the same as the one in the prototype

In [119]:
baseline_retrieval_results = evaluate(baseline_response_dataset, 
                                               metrics, 
                                               llm=qa_gpt4_llm, 
                                               embeddings=embeddings_small,
                                               )

TypeError: evaluate() got an unexpected keyword argument 'is_async'

In [120]:
baseline_retrieval_results

{'faithfulness': 0.7696, 'answer_relevancy': 0.9013, 'context_recall': 0.9897, 'context_precision': 0.9300, 'answer_correctness': 0.6087}

### Run a t-test to compare means

#### Naive vs Tuned

In [70]:
from scipy import stats

t_statistic, p_value = stats.ttest_ind(naive_correctness, naive_tuned_correctness)

# Print the results
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

T-statistic: -1.1349317414332412
P-value: 0.2577759616110477


We cannot reject the null hypothesis. Therefore, there is no evidence that the true means for answer correctness between the naive RAG pipeline and naive RAG using fine-tuned embedding model are different. 

In [71]:
t_statistic, p_value = stats.ttest_ind(naive_retrieval_results.to_pandas().context_recall, naive_retrieval_results_tuned.to_pandas().context_recall)

# Print the results
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

T-statistic: -2.366199523053206
P-value: 0.01893716873870553


Given significantly low p-value (0.0189), we cannot reject the null hypothesis that the means are equal and need to accept the alternative hypothesis. Long story short, the context recall true means for the two RAG pipelines are different. We can conclude that the fine-tuned embedding model does help to increase context recall.

In [72]:
t_statistic, p_value = stats.ttest_ind(naive_retrieval_results.to_pandas().context_precision, naive_retrieval_results_tuned.to_pandas().context_precision)

# Print the results
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

T-statistic: -1.3221955578032385
P-value: 0.18762839952756716


In [73]:
t_statistic, p_value = stats.ttest_ind(custom_retrieval_results.to_pandas().answer_correctness, naive_retrieval_results_tuned.to_pandas().answer_correctness)

# Print the results
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

T-statistic: -1.7746964163508423
P-value: 0.07748454636821077


In [125]:
t_statistic, p_value = stats.ttest_ind(baseline_retrieval_results.to_pandas().answer_correctness.replace(np.nan, 0), naive_retrieval_results_tuned.to_pandas().answer_correctness.replace(np.nan, 0))

# Print the results
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

T-statistic: 4.8161690666581505
P-value: 2.903042167699062e-06


In [110]:
t_statistic

nan

In [None]:
naive_retrieval_results_tuned.to_pandas().answer_correctness
naive_retrieval_results.to_pandas().answer_correctness
custom_retrieval_results.to_pandas().answer_correctness
baseline_retrieval_results.to_pandas().answer_correctness

In [117]:
import numpy as np
baseline_retrieval_results.to_pandas().context_recall.replace(np.nan, 0).unique()

array([1., 0.])

In [105]:
naive_retrieval_results_tuned.to_pandas().context_recall

0     1.000000
1     0.666667
2     0.000000
3     0.000000
4     1.000000
        ...   
95    1.000000
96    1.000000
97    1.000000
98    1.000000
99    1.000000
Name: context_recall, Length: 100, dtype: float64

In [137]:
df_naive = pd.DataFrame({'faithfulness': 0.7749, 'answer_relevancy': 0.9091, 'context_recall': 0.7477, 'context_precision': 0.7428, 'answer_correctness': 0.4041}.items(), columns=['Metric', 'text-embedding-3-small RAG'])
df_finetuned = pd.DataFrame({'faithfulness': 0.8038, 'answer_relevancy': 0.8670, 'context_recall': 0.8668, 'context_precision': 0.8067, 'answer_correctness': 0.4412}.items(), columns=['Metric', 'Fine-tuned RAG'])
df_merged = pd.merge(df_naive, df_finetuned, on='Metric')
df_merged

Unnamed: 0,Metric,text-embedding-3-small RAG,Fine-tuned RAG
0,faithfulness,0.7749,0.8038
1,answer_relevancy,0.9091,0.867
2,context_recall,0.7477,0.8668
3,context_precision,0.7428,0.8067
4,answer_correctness,0.4041,0.4412


In [138]:
df_naive = pd.DataFrame({'faithfulness': 0.7749, 'answer_relevancy': 0.9091, 'context_recall': 0.7477, 'context_precision': 0.7428, 'answer_correctness': 0.4041}.items(), columns=['Metric', 'Naive chunking small'])
df_naive_large = pd.DataFrame({'faithfulness': 0.7842, 'answer_relevancy': 0.8822, 'context_recall': 0.7207, 'context_precision': 0.7331, 'answer_correctness': 0.3921}.items(), columns=['Metric', 'Naive chunking large'])
df_custom_large = pd.DataFrame({'faithfulness': 0.7557, 'answer_relevancy': 0.8818, 'context_recall': 0.7533, 'context_precision': 0.7603, 'answer_correctness': 0.3842}.items(), columns=['Metric', 'Custom chunking large'])
df_baseline = pd.DataFrame({'faithfulness': 0.7696, 'answer_relevancy': 0.9013, 'context_recall': 0.9897, 'context_precision': 0.9300, 'answer_correctness': 0.6087}.items(), columns=['Metric', 'Ground truth small'])
df_merged = pd.merge(df_naive, df_naive_large, on='Metric')
df_merged = pd.merge(df_merged, df_custom_large, on='Metric')
df_merged = pd.merge(df_merged, df_baseline, on='Metric')
df_merged


Unnamed: 0,Metric,Naive chunking small,Naive chunking large,Custom chunking large,Ground truth small
0,faithfulness,0.7749,0.7842,0.7557,0.7696
1,answer_relevancy,0.9091,0.8822,0.8818,0.9013
2,context_recall,0.7477,0.7207,0.7533,0.9897
3,context_precision,0.7428,0.7331,0.7603,0.93
4,answer_correctness,0.4041,0.3921,0.3842,0.6087
