# Testset Generation for RAG

# Directory loader

In [None]:
from langchain_community.document_loaders import DirectoryLoader

async def adocument_loader(path: str = r"D:\My Files\RAG_with_RAGAS\Sample_Docs_Markdown"):
    """Asynchronously load the Markdown files found under ``path``.

    A ``DirectoryLoader`` is created with the ``**/*.md`` glob, so files in
    nested sub-directories are matched as well. The result of the loader's
    ``aload()`` coroutine is returned unchanged.
    """
    markdown_loader = DirectoryLoader(path, glob="**/*.md")
    return await markdown_loader.aload()

In [None]:
import nest_asyncio
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop, which would otherwise make asyncio.run() below fail — confirm.
nest_asyncio.apply()
import asyncio

# Run the async loader to completion with its default path and keep the
# resulting documents for the cells that follow.
docs = asyncio.run(adocument_loader())
# Display the first loaded document as a quick sanity check.
docs[0]

Document(metadata={'source': 'D:\\My Files\\RAG_with_RAGAS\\Sample_Docs_Markdown\\advisory-group-members.md'}, page_content='layout: default title: Advisory Group Members description: "This page lists the members of the Diversity, Inclusion & Belonging Advisory Group."\n\n{{< group-by-expertise "Diversity, Inclusion & Belonging Advisory Group" >}}')

# LLM

In [6]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if one exists) into the process
# environment before reading the key.
load_dotenv()

# Fail early with a clear message: assigning None into os.environ raises an
# opaque "str expected, not NoneType" TypeError when the key is missing.
groq_api_key = os.getenv("GROQ_API_KEY")
if groq_api_key is None:
    raise RuntimeError(
        "GROQ_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GROQ_API_KEY"] = groq_api_key

In [7]:
from langchain_groq import ChatGroq

# Chat model used as the generator LLM in the cells below.
# NOTE(review): presumably relies on GROQ_API_KEY being set in the environment — confirm.
llm = ChatGroq(model="llama3-8b-8192")

# Loading Embedding model & LLM chain

In [8]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama import OllamaEmbeddings

# Wrap the LangChain chat model and the Ollama embeddings in the ragas wrapper
# classes; these wrapped objects are what the ragas calls below receive.
generator_chain = LangchainLLMWrapper(llm)
# NOTE(review): presumably requires a reachable Ollama instance that serves "llama3.2:1b" — confirm.
generator_embedding = LangchainEmbeddingsWrapper(OllamaEmbeddings(model="llama3.2:1b"))

# Generate Testset

In [13]:
from ragas.testset import TestsetGenerator

# Build a generator from the wrapped LLM and embeddings, then request a
# 10-sample test set directly from the loaded LangChain documents.
generator = TestsetGenerator(llm=generator_chain, embedding_model=generator_embedding)
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Applying HeadlinesExtractor:   0%|          | 0/5 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/12 [00:00<?, ?it/s]         unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
Applying SummaryExtractor:  14%|█▍        | 1/7 [01:11<07:10, 71.81s/it]Property 'summary' already exists in node 'd4f8a8'. Skipping!
Applying SummaryExtractor:  43%|████▎     | 3/7 [01:41<01:52, 28.01s/it]Property 'summary' already exists in node '592e6d'. Skipping!
Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/25 [00:00<?, ?it/s]Property 'summary_embeddin

# Analyzing Dataset

In [14]:
# Show the generated test set in tabular form for inspection.
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What resources and tools does GitLab provide t...,"[title: ""The Ally Lab"" description: Learn what...","The Ally Lab at GitLab provides tools, resourc...",single_hop_specifc_query_synthesizer
1,whats zoom,[Empathy & Emotional Intelligence An example o...,Zoom is an example of a tool used in a meeting...,single_hop_specifc_query_synthesizer
2,what is asia in the context of allyship,[What it means to be an ally Take on the strug...,Asia is not mentioned in the context of allysh...,single_hop_specifc_query_synthesizer
3,what allyship trainings are avalible in europe,[Page. Think about what you have individually ...,There are no specific allyship trainings menti...,single_hop_specifc_query_synthesizer
4,How can a sponsor effectively use their power ...,"[<1-hop>\n\ntitle: "" Sales Sponsorship Pilot P...",A sponsor can effectively use their power and ...,multi_hop_abstract_query_synthesizer
5,What are the challenges faced by team members ...,"[<1-hop>\n\ntitle: "" Sales Sponsorship Pilot P...",The challenges faced by team members from unde...,multi_hop_abstract_query_synthesizer
6,How can cultural competency and consciousness ...,[<1-hop>\n\nWhat it means to be an ally Take o...,Fostering cultural competency and consciousnes...,multi_hop_abstract_query_synthesizer
7,What are some effective ways to foster a cultu...,"[<1-hop>\n\ntitle: "" Building an Inclusive Rem...",Fostering a culture of inclusivity and fairnes...,multi_hop_abstract_query_synthesizer
8,What are some ways to foster an inclusive remo...,[<1-hop>\n\nWhat it means to be an ally Take o...,Fostering an inclusive remote culture requires...,multi_hop_specific_query_synthesizer
9,What are some essential skills required to be ...,[<1-hop>\n\nPage. Think about what you have in...,"According to the context, some essential skill...",multi_hop_specific_query_synthesizer


# KnowledgeGraph creation

In [15]:
from ragas.testset.graph import KnowledgeGraph

# Start from a fresh KnowledgeGraph; document nodes are appended to it in the following cell.
kg = KnowledgeGraph()

In [16]:
from ragas.testset.graph import Node, NodeType

# Register every loaded document as a DOCUMENT node, storing its text and
# its metadata as node properties.
kg.nodes.extend(
    Node(
        type=NodeType.DOCUMENT,
        properties={
            "page_content": document.page_content,
            "document_metadata": document.metadata,
        },
    )
    for document in docs
)

In [None]:
from ragas.testset.transforms import default_transforms, apply_transforms

# Aliases naming the role each object plays in the transform step.
# `embedding_model` is also reused when the generator is rebuilt later on.
transformer_llm = generator_chain
embedding_model = generator_embedding

# Build the default transform pipeline for these documents and run it over
# `kg`. The LLM is passed through the `transformer_llm` alias, which was
# previously defined but never used.
trans = default_transforms(
    documents=docs,
    llm=transformer_llm,
    embedding_model=embedding_model,
)
apply_transforms(kg, trans)

Applying HeadlineSplitter:   0%|          | 0/12 [00:00<?, ?it/s]         unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
                                                                 

KeyboardInterrupt: 

Property 'summary' already exists in node '0b9c3f'. Skipping!
Property 'summary' already exists in node '028dcc'. Skipping!


In [None]:
# Persist the graph to a JSON file and read it straight back, so the
# following cells work from the saved copy.
kg.save("knowledge_graph.json")
loaded_kg = KnowledgeGraph.load("knowledge_graph.json")
# Display the reloaded graph.
loaded_kg

# Testset Generation

In [None]:
from ragas.testset import TestsetGenerator

# Rebind `generator` to an instance that is given the reloaded knowledge graph
# in addition to the LLM and embedding model.
generator = TestsetGenerator(llm=generator_chain, embedding_model=embedding_model, knowledge_graph=loaded_kg)

In [None]:
from ragas.testset.synthesizers import default_query_distribution

# Use the library's default query distribution, built from the generator LLM.
query_distribution = default_query_distribution(generator_chain)

In [None]:
# Request 10 samples using the query distribution chosen above, then show
# the result in tabular form.
testset = generator.generate(testset_size=10, query_distribution=query_distribution)
testset.to_pandas()