# Fine-tuning Embeddings for RAG on Specific Data




### Nest Asyncio

In [1]:
import nest_asyncio

nest_asyncio.apply()

### Provide OpenAI API Key

In [2]:
import os
import getpass

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

In [12]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import BSHTMLLoader

path = "data/"
text_loader = DirectoryLoader(path, glob="*.html", loader_cls=BSHTMLLoader)

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap  = 20,
    length_function = len
)

In [14]:
training_documents = text_splitter.split_documents(text_loader.load())

In [15]:
len(training_documents)

52

In [16]:
import uuid

id_set = set()

for document in training_documents:
  id = str(uuid.uuid4())
  while id in id_set:
    id = uuid.uuid4()
  id_set.add(id)
  document.metadata["id"] = id

In [17]:
training_documents

[Document(metadata={'source': 'data/podcast_blog.html', 'title': 'Tech & Science Podcast Blog', 'id': '844e47da-c0b5-42a2-a17e-929fb08381e5'}, page_content="Tech & Science Podcast Blog\n\n\n\nTech & Science Podcast Transcripts\n\nAre Humans Dumb?\nTopic: are humans dumb\n\n[INTRO]\n\nWelcome to our podcast where we delve into the intriguing question: Are humans dumb? Today, we will explore this topic from two contrasting perspectives - skepticism and belief. Let's navigate through the complexities of human cognition and behavior to uncover the opportunities, risks, key questions, and potential solutions surrounding this thought-provoking issue.\n\n[SKEPTIC PERSPECTIVE]"),
 Document(metadata={'source': 'data/podcast_blog.html', 'title': 'Tech & Science Podcast Blog', 'id': 'cd856edb-6390-443f-97af-34fc5c9ad6e7'}, page_content="Let's start with the skeptic's viewpoint. When examining the information related to human intelligence, it's essential to consider the evolutionary perspective. T

In [18]:
len(training_documents)

52

In [19]:
training_split_documents = training_documents[:len(training_documents) - 24]
val_split_documents = training_documents[-24: -12]
test_split_documents = training_documents[-12:]

In [20]:
len(training_split_documents)

28

In [21]:
len(val_split_documents)

12

In [22]:
len(test_split_documents)

12

In [23]:
from langchain_openai import ChatOpenAI

qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [24]:
from langchain_core.prompts import ChatPromptTemplate

qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [25]:
qa_prompt_template

ChatPromptTemplate(input_variables=['context', 'n_questions'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'n_questions'], input_types={}, partial_variables={}, template='Given the following context, you must generate questions based on only the provided context.\n\nYou are to generate {n_questions} questions which should be provided in the following format:\n\n1. QUESTION #1\n2. QUESTION #2\n...\n\nContext:\n{context}\n'), additional_kwargs={})])

In [26]:
question_generation_chain = qa_prompt_template | qa_chat_model

In [27]:
import uuid
import re

async def create_questions(documents, n_questions):
    questions = {}
    relevant_docs = {}

    for document in documents:
        # Generate questions using the chat model
        question_response = await qa_prompt_template.ainvoke({"context": document.page_content, "n_questions": n_questions})
        
        # Debug: Print the type and structure of the response
        print(f"Type of response: {type(question_response)}")
        print(f"Dir of response: {dir(question_response)}")
        print(f"Response: {question_response}")
        
        try:
            # Try different ways to get the content
            if hasattr(question_response, 'text'):
                question_text = question_response.text
            elif hasattr(question_response, 'message'):
                question_text = question_response.message.content
            else:
                # If all else fails, try string conversion
                question_text = str(question_response)
            
            # Parse the numbered questions from the response
            parsed_questions = re.findall(r'\d+\.\s+(.*?)(?=\d+\.|$)', question_text, re.DOTALL)
            
            # For each parsed question, create a unique ID and store the question
            for question in parsed_questions:
                question = question.strip()  # Clean up any extra whitespace
                if question:  # Only process non-empty questions
                    question_id = str(uuid.uuid4())
                    questions[question_id] = question
                    relevant_docs[question_id] = [document.metadata["id"]]
                    
        except Exception as e:
            print(f"Error processing document: {e}")
            raise

    return questions, relevant_docs

In [28]:
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)

Type of response: <class 'langchain_core.prompt_values.ChatPromptValue'>
Dir of response: ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__firstlineno__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_computed_fields__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_m

We'll use the function to generate training, validation, and test data.

In [29]:
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)

Type of response: <class 'langchain_core.prompt_values.ChatPromptValue'>
Dir of response: ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__firstlineno__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_computed_fields__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_m

In [30]:
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)

Type of response: <class 'langchain_core.prompt_values.ChatPromptValue'>
Dir of response: ['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__firstlineno__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_computed_fields__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_m

### Reformating and Saving Datasets

In [31]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_split_documents}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [32]:
val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in val_split_documents}

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [33]:
train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

## Task 4: Fine-tuning 

In [34]:
from sentence_transformers import SentenceTransformer

model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

In [35]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [36]:
BATCH_SIZE = 5

Let's move our dataset into the expected format for training.

In [37]:
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

In [38]:
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

In [44]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

matryoshka_dimensions = [48, 96, 192, 384]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [45]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [46]:
EPOCHS = 5

In [47]:
import wandb
wandb.init(mode="disabled")

In [62]:
warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned-minilm-l6-v2',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50
)

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
13,No log,No log,0.541667,0.625,0.708333,0.916667,0.541667,0.208333,0.141667,0.091667,0.541667,0.625,0.708333,0.916667,0.689315,0.62204,0.6293
26,No log,No log,0.541667,0.625,0.708333,0.916667,0.541667,0.208333,0.141667,0.091667,0.541667,0.625,0.708333,0.916667,0.689315,0.62204,0.6293
39,No log,No log,0.541667,0.625,0.708333,0.916667,0.541667,0.208333,0.141667,0.091667,0.541667,0.625,0.708333,0.916667,0.689315,0.62204,0.6293
50,No log,No log,0.541667,0.625,0.708333,0.916667,0.541667,0.208333,0.141667,0.091667,0.541667,0.625,0.708333,0.916667,0.689315,0.62204,0.6293
52,No log,No log,0.541667,0.625,0.708333,0.916667,0.541667,0.208333,0.141667,0.091667,0.541667,0.625,0.708333,0.916667,0.689315,0.62204,0.6293
65,No log,No log,0.541667,0.625,0.708333,0.916667,0.541667,0.208333,0.141667,0.091667,0.541667,0.625,0.708333,0.916667,0.689315,0.62204,0.6293


In [49]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [50]:
hf_username = "dataera2013"

In [63]:
model.push_to_hub(f"{hf_username}/midterm-small-model-2")

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

'https://huggingface.co/dataera2013/midterm-small-model-2/commit/3e2bc609b5d19419e0c5b87b6b07feac51af8ebd'

## Task 5: Evaluating our Retriever

In [64]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from tqdm.auto import tqdm

In [65]:
def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embed_model)

  retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

  eval_results = []
  for id, question in tqdm(questions.items()):
    retrieved_nodes = retriever.invoke(question)
    retrieved_ids = [node.metadata["id"] for node in retrieved_nodes]
    expected_id = relevant_docs[id][0]
    is_hit = expected_id in retrieved_ids
    eval_results.append({"id": id, "question": question, "expected_id": expected_id, "is_hit": is_hit})

  return eval_results

### `text-embedding-3-small`

In [66]:
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(test_dataset, te3_openai)

  0%|          | 0/24 [00:00<?, ?it/s]

In [68]:
te3_results_df = pd.DataFrame(te3_results)

In [69]:
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

np.float64(0.7083333333333334)

In [70]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
arctic_embed_m_results = evaluate_openai(test_dataset, huggingface_embeddings)

  0%|          | 0/24 [00:00<?, ?it/s]

In [71]:
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [72]:
arctic_embed_m_results_df

Unnamed: 0,id,question,expected_id,is_hit
0,802fc41d-f59f-4aeb-b4e0-40be6974385e,QUESTION #1\n,5921600e-f987-4e8f-8e44-8bfae2ffbe8c,False
1,e7431897-c386-4de0-bdba-4d9fa442c953,QUESTION #2\n...\n\nContext:\n[INTRO]\n\nWelco...,5921600e-f987-4e8f-8e44-8bfae2ffbe8c,True
2,d08b7d43-657b-49f1-91f7-3b4d20a0d01f,QUESTION #1\n,607b53c6-1622-4c81-af72-b6f738e5969a,True
3,e336d5f6-99b1-49ce-b15e-207f0973e6f9,QUESTION #2\n...\n\nContext:\nUnderstanding th...,607b53c6-1622-4c81-af72-b6f738e5969a,True
4,3038149e-df54-4321-86af-acd4d8a1b1d8,QUESTION #1\n,9deabcd2-a0fc-4b06-98b6-144ad9528770,True
5,d00aab12-b584-41fd-812f-54124abaa4a1,QUESTION #2\n...\n\nContext:\nUrbanization and...,9deabcd2-a0fc-4b06-98b6-144ad9528770,True
6,d705b6d3-1531-4bd8-b82b-27a1de72a821,QUESTION #1\n,753c2dce-bd8a-41fc-93ed-16fd987ad356,False
7,8b0abfee-adec-4c60-aadd-752c885843d2,QUESTION #2\n...\n\nContext:\nBy incorporating...,753c2dce-bd8a-41fc-93ed-16fd987ad356,True
8,397339ba-3cb7-4b7f-9279-fe597f1020de,QUESTION #1\n,b6754cac-0dc6-42b0-8eaf-9e4973308aa2,True
9,2c9b94f0-1897-4fa7-963f-d99158496560,QUESTION #2\n...\n\nContext:\nNutrition and Me...,b6754cac-0dc6-42b0-8eaf-9e4973308aa2,True


In [73]:
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

np.float64(0.7083333333333334)

In [74]:
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned-minilm-l6-v2")
finetune_results = evaluate_openai(test_dataset, finetune_embeddings)

  0%|          | 0/24 [00:00<?, ?it/s]

In [75]:
finetune_results_df = pd.DataFrame(finetune_results)

In [76]:
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

np.float64(0.7083333333333334)

Vibe Checking the RAG Pipeline

### Creating New Chunks

In order to try and evaluate our system more fairly, let's create new chunks that we will use to create our Vector Store.

In [77]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 50,
    length_function = len
)

training_documents = text_splitter.split_documents(text_loader.load())

### Base Chain

We'll start by constructing our base chain, which will use the untrained retrieval model.

#### R - Retrieval

In [78]:
from langchain_community.vectorstores import FAISS

base_vectorstore = FAISS.from_documents(training_documents, huggingface_embeddings)
base_retriever = base_vectorstore.as_retriever(search_kwargs={"k": 6})

#### A - Augmented

In [79]:
from langchain_core.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

Context:
{context}

Question:
{question}

Answer:
"""

rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)

#### G - Generation

In [80]:
rag_llm =  ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

#### RAG - LCEL RAG Pipeline

In [81]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

base_rag_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [82]:
base_rag_chain.invoke({"question" : "What is an agent?"})["response"]

'I do not know.'

In [83]:
base_rag_chain.invoke({"question" : "What are the main challenges and opportunities in implementing AI in healthcare?"})["response"]

'The main challenges in implementing AI in healthcare include:\n\n1. **Data Privacy and Security**: Ensuring that patient data is protected and used ethically.\n2. **Bias and Fairness in Algorithms**: Addressing potential biases in AI models that could lead to unfair treatment outcomes.\n3. **Lack of Interpretability**: Some AI models are complex and difficult to interpret, raising questions about accountability and trust in their decisions.\n\nThe opportunities in implementing AI in healthcare include:\n\n1. **Improved Diagnosis and Treatment**: Machine learning can enhance the accuracy and efficiency of diagnoses and treatment plans.\n2. **Personalized Medicine**: AI can help tailor treatments to individual patients based on their unique data.\n3. **Operational Efficiency**: AI can streamline administrative processes, reducing costs and improving patient care.\n\nOverall, while there are significant challenges to address, the potential benefits of AI in healthcare are substantial.'

In [84]:
base_rag_chain.invoke({"question" : "What are the ethical concerns surrounding CRISPR technology?"})["response"]

'The ethical concerns surrounding CRISPR technology include the ability to edit the human germline, which raises questions about heritable changes and long-term consequences. There are also concerns about moral responsibility related to altering the gene pool, informed consent, access to technology, unintended consequences, and the risks of eugenics and discrimination.'

In [85]:
base_rag_chain.invoke({"question" : "What is the skeptic talking about?"})["response"]

'The skeptic is discussing the psychological factors behind conspiracy theory beliefs, emphasizing the influences of confirmation bias, social dynamics, group polarization, and psychological vulnerabilities that contribute to the formation and maintenance of these beliefs. Additionally, in the context of human intelligence, the skeptic raises questions about the validity of studies suggesting a decline in human cognition over time, urging a critical assessment of the methodology and sample size used in such studies.'

### Fine-tuned Embedding Model

Now let's rebuild our RAG chain with the Fine-tuned model - the only component we need to change is our `FAISS` vectorstore!

In [86]:
finetune_vectorstore = FAISS.from_documents(training_documents, finetune_embeddings)
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [87]:
finetune_rag_chain = (
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt_template | rag_llm | StrOutputParser(), "context": itemgetter("context")}
)

In [89]:
finetune_rag_chain.invoke({"question" : "What is an Agent?"})["response"]

'I do not know.'

In [90]:
finetune_rag_chain.invoke({"question" : "What are the main challenges and opportunities in implementing AI in healthcare?"})["response"]

'The main challenges in implementing AI in healthcare include:\n\n1. **Data Privacy and Security**: Ensuring that patient data is protected and used ethically.\n2. **Bias and Fairness in Algorithms**: Addressing potential biases in AI models that could lead to unfair treatment or outcomes.\n3. **Lack of Interpretability**: Some AI models are complex and difficult to interpret, raising questions about accountability and trust in AI-driven decisions.\n\nThe opportunities in implementing AI in healthcare include:\n\n1. **Improved Diagnosis and Treatment**: AI can enhance the accuracy and efficiency of healthcare diagnosis and treatment.\n2. **Personalized Medicine**: AI can help tailor treatments to individual patients based on their unique data.\n3. **Operational Efficiency**: AI can streamline healthcare operations, reducing costs and improving patient care.\n\nOverall, while there are significant challenges to address, the potential benefits of AI in healthcare are substantial.'

In [91]:
finetune_rag_chain.invoke({"question" : "What are the ethical concerns surrounding CRISPR technology?"})["response"]

'The ethical concerns surrounding CRISPR technology include the ability to edit the human germline, which raises questions about heritable changes and long-term consequences. There are also concerns about moral responsibility related to altering the gene pool, issues of informed consent, access to technology, unintended consequences, and the risks of eugenics and discrimination.'

In [92]:
finetune_rag_chain.invoke({"question" : "What is the skeptic talking about?"})["response"]

'The skeptic is discussing the psychological factors behind conspiracy theory beliefs, emphasizing the influences such as confirmation bias, social dynamics, group polarization, and psychological vulnerabilities that contribute to the formation and maintenance of these beliefs. Additionally, in the context of human intelligence, the skeptic is questioning the validity of studies suggesting a decline in human cognition over time, urging a critical assessment of the methodology and sample size used in such studies.'

In [93]:
# First, install all required NLTK data
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download all NLTK data
nltk.download('all')

# Now proceed with loading
from langchain_community.document_loaders import DirectoryLoader
path = "data/"
loader = DirectoryLoader(path, glob="*.html")
docs = loader.load()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/nageshbm/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]  

In [94]:
# Now proceed with loading the HTML files
from langchain_community.document_loaders import DirectoryLoader

path = "data/"
loader = DirectoryLoader(path, glob="*.html")
docs = loader.load()

In [95]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

  for match in re.finditer('{0}\s*'.format(re.escape(sent)), self.original_text):
  txt = re.sub('(?<={0})\.'.format(am), '∯', txt)
  txt = re.sub('(?<={0})\.'.format(am), '∯', txt)


In [96]:
from ragas.testset import TestsetGenerator

generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Applying HeadlinesExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/1 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/6 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/13 [00:00<?, ?it/s]

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/1 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/10 [00:00<?, ?it/s]

In [97]:
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Wut iz Post-Quantum Cryptography?,[Tech & Science Podcast Transcripts Are Humans...,Post-Quantum Cryptography (PQC) involves the d...,single_hop_specifc_query_synthesizer
1,What are the latest developments in Quantum Co...,[Tech & Science Podcast Transcripts Are Humans...,The latest developments in quantum computing p...,single_hop_specifc_query_synthesizer
2,How duz AI impact job markits and wut skills w...,[Environmental Impact of Fast Fashion Topic: E...,AI impacts job markets by creating new job opp...,single_hop_specifc_query_synthesizer
3,What are the environmental impacts of fast fas...,[Environmental Impact of Fast Fashion Topic: E...,Fast fashion poses a significant environmental...,single_hop_specifc_query_synthesizer
4,How does the Tech Innovators Podcast address t...,[Social Media and Mental Health Topic: What ar...,The Tech Innovators Podcast discusses the chal...,single_hop_specifc_query_synthesizer
5,what space explorers podcast talk about?,[Social Media and Mental Health Topic: What ar...,The Space Explorers Podcast discusses the bene...,single_hop_specifc_query_synthesizer
6,how 5G tech change telecommunications and what...,[that while the advancements in cancer researc...,5G technology is reshaping telecommunications ...,single_hop_specifc_query_synthesizer
7,what climate change do?,[that while the advancements in cancer researc...,The context does not provide specific informat...,single_hop_specifc_query_synthesizer
8,What are the psychological factors behind cons...,[pave the way for a more resilient and sustain...,The psychological factors behind conspiracy th...,single_hop_specifc_query_synthesizer
9,What are the psychological factors that contri...,[pave the way for a more resilient and sustain...,The psychological factors behind conspiracy th...,single_hop_specifc_query_synthesizer


In [98]:
path = "data/"
loader = DirectoryLoader(path, glob="*.html")
docs = loader.load()

In [99]:
base_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")

In [100]:
ft_embeddings = HuggingFaceEmbeddings(model_name="dataera2013/legal-ft-2")

Some weights of BertModel were not initialized from the model checkpoint at dataera2013/legal-ft-2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [101]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

client = QdrantClient(":memory:")

client.create_collection(
    collection_name="base_ai_across_years",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

client.create_collection(
    collection_name="ft_ai_across_years",
    vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)

base_vector_store = QdrantVectorStore(
    client=client,
    collection_name="base_ai_across_years",
    embedding=base_embeddings,
)

ft_vector_store = QdrantVectorStore(
    client=client,
    collection_name="ft_ai_across_years",
    embedding=ft_embeddings,
)

In [102]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(docs)
len(split_documents)

38

In [103]:
_ = base_vector_store.add_documents(documents=split_documents)

In [104]:
__ = ft_vector_store.add_documents(documents=split_documents)

In [105]:
base_retriever = base_vector_store.as_retriever(search_kwargs={"k": 5})
ft_retriever = ft_vector_store.as_retriever(search_kwargs={"k": 5})

In [106]:
def base_retrieve(state):
  retrieved_docs = base_retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

def ft_retrieve(state): 
  retrieved_docs = ft_retriever.invoke(state["question"])
  return {"context" : retrieved_docs}

In [107]:
from langchain.prompts import ChatPromptTemplate

RAG_PROMPT = """\
You are a helpful assistant who answers questions based on provided context. You must only use the provided context, and cannot use your own knowledge.

### Question
{question}

### Context
{context}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [108]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [109]:
def generate(state):
  docs_content = "\n\n".join(doc.page_content for doc in state["context"])
  messages = rag_prompt.format_messages(question=state["question"], context=docs_content)
  response = llm.invoke(messages)
  return {"response" : response.content}

In [110]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class BaseState(TypedDict):
  question: str
  context: List[Document]
  response: str

class FState(TypedDict):
  question: str
  context: List[Document]
  response: str


In [111]:
base_graph_builder = StateGraph(BaseState).add_sequence([base_retrieve, generate])
base_graph_builder.add_edge(START, "base_retrieve")
base_graph = base_graph_builder.compile()

ft_graph_builder = StateGraph(FState).add_sequence([ft_retrieve, generate])
ft_graph_builder.add_edge(START, "ft_retrieve")
ft_graph = ft_graph_builder.compile()

In [113]:
response = base_graph.invoke({"question" : "Why do people believe in conspiracy theories?"})
response["response"]

'People believe in conspiracy theories due to several psychological factors. These include confirmation bias, social dynamics, group polarization, and psychological vulnerabilities. Confirmation bias leads individuals to seek out information that supports their preexisting beliefs, while social dynamics and group polarization can strengthen these beliefs within specific communities. Psychological vulnerabilities also play a significant role, as certain individuals may turn to conspiracy theories as a way to make sense of complex or uncertain situations in their lives. Understanding these factors offers opportunities for constructive interventions, such as enhancing critical thinking skills and building trust in institutions.'

In [114]:
response = ft_graph.invoke({"question" : "Why do people believe in conspiracy theories?"})
response["response"]

'People believe in conspiracy theories due to a variety of psychological factors. These include:\n\n1. **Confirmation Bias**: Individuals tend to favor information that confirms their existing beliefs, which can lead them to accept conspiracy theories that align with their worldview.\n\n2. **Social Dynamics**: Group polarization can occur within communities that support conspiracy theories, reinforcing these beliefs and making individuals more entrenched in their views.\n\n3. **Psychological Vulnerabilities**: Certain psychological traits or vulnerabilities can make individuals more susceptible to conspiracy narratives, particularly in times of uncertainty or fear.\n\nBy understanding these factors, there are opportunities to promote critical thinking, build trust in institutions, and enhance emotional intelligence, helping individuals cope with uncertainty without resorting to conspiracy theories.'

In [115]:
base_dataset = dataset
ft_dataset = dataset

for base_test_row in base_dataset:
  base_response = base_graph.invoke({"question" : base_test_row.eval_sample.user_input})
  base_test_row.eval_sample.response = base_response["response"]
  base_test_row.eval_sample.retrieved_contexts = [context.page_content for context in base_response["context"]]

for ft_test_row in ft_dataset:
  ft_response = ft_graph.invoke({"question" : ft_test_row.eval_sample.user_input})
  ft_test_row.eval_sample.response = ft_response["response"]
  ft_test_row.eval_sample.retrieved_contexts = [context.page_content for context in ft_response["context"]]

In [116]:
base_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,Wut iz Post-Quantum Cryptography?,[[BELIEVER PERSPECTIVE]\n\nThe latest quantum ...,[Tech & Science Podcast Transcripts Are Humans...,Post-Quantum Cryptography (PQC) refers to the ...,Post-Quantum Cryptography (PQC) involves the d...,single_hop_specifc_query_synthesizer
1,What are the latest developments in Quantum Co...,[Quantum Computing and Cryptography\n\nTopic: ...,[Tech & Science Podcast Transcripts Are Humans...,The latest developments in quantum computing h...,The latest developments in quantum computing p...,single_hop_specifc_query_synthesizer
2,How duz AI impact job markits and wut skills w...,[[SKEPTIC PERSPECTIVE]\n\nJob displacement and...,[Environmental Impact of Fast Fashion Topic: E...,AI impacts job markets by creating new job opp...,AI impacts job markets by creating new job opp...,single_hop_specifc_query_synthesizer
3,What are the environmental impacts of fast fas...,[[SKEPTIC PERSPECTIVE]\n\nLet's start by exami...,[Environmental Impact of Fast Fashion Topic: E...,Fast fashion has significant environmental imp...,Fast fashion poses a significant environmental...,single_hop_specifc_query_synthesizer
4,How does the Tech Innovators Podcast address t...,[[SKEPTIC PERSPECTIVE]\n\nWhen discussing bloc...,[Social Media and Mental Health Topic: What ar...,The Tech Innovators Podcast addresses the chal...,The Tech Innovators Podcast discusses the chal...,single_hop_specifc_query_synthesizer
5,what space explorers podcast talk about?,[Space Exploration Benefits\n\nTopic: How does...,[Social Media and Mental Health Topic: What ar...,The Space Explorers Podcast discusses the bene...,The Space Explorers Podcast discusses the bene...,single_hop_specifc_query_synthesizer
6,how 5G tech change telecommunications and what...,"[[CONCLUSION]\n\nAs we wrap up, it's evident t...",[that while the advancements in cancer researc...,5G technology is significantly changing teleco...,5G technology is reshaping telecommunications ...,single_hop_specifc_query_synthesizer
7,what climate change do?,[Climate Change Science and Impacts\n\nTopic: ...,[that while the advancements in cancer researc...,Climate change contributes to a variety of glo...,The context does not provide specific informat...,single_hop_specifc_query_synthesizer
8,What are the psychological factors behind cons...,"[[CONCLUSION]\n\nTo wrap up, stakeholders must...",[pave the way for a more resilient and sustain...,The psychological factors behind conspiracy th...,The psychological factors behind conspiracy th...,single_hop_specifc_query_synthesizer
9,What are the psychological factors that contri...,"[[CONCLUSION]\n\nTo wrap up, stakeholders must...",[pave the way for a more resilient and sustain...,The psychological factors contributing to beli...,The psychological factors behind conspiracy th...,single_hop_specifc_query_synthesizer


In [117]:
ft_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,Wut iz Post-Quantum Cryptography?,[[BELIEVER PERSPECTIVE]\n\nThe latest quantum ...,[Tech & Science Podcast Transcripts Are Humans...,Post-Quantum Cryptography (PQC) refers to the ...,Post-Quantum Cryptography (PQC) involves the d...,single_hop_specifc_query_synthesizer
1,What are the latest developments in Quantum Co...,[Quantum Computing and Cryptography\n\nTopic: ...,[Tech & Science Podcast Transcripts Are Humans...,The latest developments in quantum computing h...,The latest developments in quantum computing p...,single_hop_specifc_query_synthesizer
2,How duz AI impact job markits and wut skills w...,[[SKEPTIC PERSPECTIVE]\n\nJob displacement and...,[Environmental Impact of Fast Fashion Topic: E...,AI impacts job markets by creating new job opp...,AI impacts job markets by creating new job opp...,single_hop_specifc_query_synthesizer
3,What are the environmental impacts of fast fas...,[[SKEPTIC PERSPECTIVE]\n\nLet's start by exami...,[Environmental Impact of Fast Fashion Topic: E...,Fast fashion has significant environmental imp...,Fast fashion poses a significant environmental...,single_hop_specifc_query_synthesizer
4,How does the Tech Innovators Podcast address t...,[[SKEPTIC PERSPECTIVE]\n\nWhen discussing bloc...,[Social Media and Mental Health Topic: What ar...,The Tech Innovators Podcast addresses the chal...,The Tech Innovators Podcast discusses the chal...,single_hop_specifc_query_synthesizer
5,what space explorers podcast talk about?,[Space Exploration Benefits\n\nTopic: How does...,[Social Media and Mental Health Topic: What ar...,The Space Explorers Podcast discusses the bene...,The Space Explorers Podcast discusses the bene...,single_hop_specifc_query_synthesizer
6,how 5G tech change telecommunications and what...,"[[CONCLUSION]\n\nAs we wrap up, it's evident t...",[that while the advancements in cancer researc...,5G technology is significantly changing teleco...,5G technology is reshaping telecommunications ...,single_hop_specifc_query_synthesizer
7,what climate change do?,[Climate Change Science and Impacts\n\nTopic: ...,[that while the advancements in cancer researc...,Climate change contributes to a variety of glo...,The context does not provide specific informat...,single_hop_specifc_query_synthesizer
8,What are the psychological factors behind cons...,"[[CONCLUSION]\n\nTo wrap up, stakeholders must...",[pave the way for a more resilient and sustain...,The psychological factors behind conspiracy th...,The psychological factors behind conspiracy th...,single_hop_specifc_query_synthesizer
9,What are the psychological factors that contri...,"[[CONCLUSION]\n\nTo wrap up, stakeholders must...",[pave the way for a more resilient and sustain...,The psychological factors contributing to beli...,The psychological factors behind conspiracy th...,single_hop_specifc_query_synthesizer


In [118]:
from ragas import EvaluationDataset

base_evaluation_dataset = EvaluationDataset.from_pandas(base_dataset.to_pandas())
ft_evaluation_dataset = EvaluationDataset.from_pandas(ft_dataset.to_pandas())

In [119]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

In [120]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity
from ragas import evaluate, RunConfig

custom_run_config = RunConfig(timeout=360)

base_result = evaluate(
    dataset=base_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)

ft_result = evaluate(
    dataset=ft_evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],
    llm=evaluator_llm,
    run_config=custom_run_config
)

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

In [121]:
base_result

{'context_recall': 0.9167, 'faithfulness': 0.9222, 'factual_correctness': 0.5960, 'answer_relevancy': 0.8666, 'context_entity_recall': 0.5856, 'noise_sensitivity_relevant': 0.3764}

In [122]:
ft_result

{'context_recall': 0.8667, 'faithfulness': 0.9196, 'factual_correctness': 0.6200, 'answer_relevancy': 0.8643, 'context_entity_recall': 0.5689, 'noise_sensitivity_relevant': 0.3823}

# Analysis of Base and Fine-tuned Model Results

Here’s a detailed analysis comparing the `base_result` and `ft_result` dictionaries, including percentage changes for each metric.

# Model Performance Comparison Report

## 🔍 Key Findings
- Base model demonstrates superior retrieval capabilities
- Fine-tuned model shows slight improvement in factual accuracy
- Both models maintain consistent faithfulness scores

## 📊 Metric-by-Metric Analysis

| Metric | Base Model | Fine-tuned Model | Change |
|--------|------------|------------------|--------|
| Context Recall | 0.9167 | 0.8667 | -5.00% |
| Faithfulness | 0.9222 | 0.9196 | -0.26% |
| Factual Correctness | 0.5960 | 0.6200 | +2.40% |
| Answer Relevancy | 0.8666 | 0.8643 | -0.23% |
| Context Entity Recall | 0.5856 | 0.5689 | -1.67% |
| Noise Sensitivity | 0.3764 | 0.3823 | +0.59% |

## 💡 Conclusion
The fine-tuning process resulted in a trade-off:
- **Gains**: Improved factual correctness (+2.4%)
- **Losses**: Decreased context recall (-5%)
- **Stable**: Faithfulness and answer relevancy remained largely unchanged

The minimal differences suggest the base model was already well-optimized for the podcast content domain. This shows the benefits of choosing the fine-tuned model when factual accuracy is the priority for podcast related context.
