In [7]:
import os

from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

# Read the key from the process environment (load_dotenv() above may have
# populated it from a .env file).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values, so assigning a missing key (None) would
# raise an opaque TypeError. Fail early with an actionable message instead.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

### Building the Tests

[News Source](https://www.cbsnews.com/news/trump-shot-rally-assassination-attempt/)

In [12]:
# Read every file found under data/Trump/ into `documents`.
from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(input_dir="data/Trump/").load_data()


## 1. Using GPT-4o as the generator llm and critic llm

In [17]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# GPT-4o fills both the generator and the critic role for this run;
# text-embedding-ada-002 is the embedding model.
embeddings = OpenAIEmbedding(model="text-embedding-ada-002")
critic_llm = OpenAI(model="gpt-4o")
generator_llm = OpenAI(model="gpt-4o")

generator = TestsetGenerator.from_llama_index(
    embeddings=embeddings,
    critic_llm=critic_llm,
    generator_llm=generator_llm,
)

In [18]:
# Request a 10-question test set from the loaded documents:
# 50% simple, 25% reasoning and 25% multi-context questions.
testset = generator.generate_with_llamaindex_docs(
    documents,
    test_size=10,
    distributions={
        simple: 0.5,
        reasoning: 0.25,
        multi_context: 0.25,
    },
)

embedding nodes:   0%|          | 0/18 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

max retries exceeded for SimpleEvolution(generator_llm=LlamaIndexLLMWrapper(run_config=RunConfig(timeout=180, max_retries=15, max_wait=90, max_workers=16, exception_types=(<class 'Exception'>,), log_tenacity=False, seed=42)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x173a907d0>, nodes=[Node(metadata={'page_label': '1', 'file_name': 'Trump.pdf', 'file_path': '/Users/chris/Desktop/7980/CS7980_Capstone_RBCMuseum/eval/data/Trump/Trump.pdf', 'file_type': 'application/pdf', 'file_size': 86251, 'creation_date': '2024-10-15', 'last_modified_date': '2024-10-15'}, page_content='FormerPresidentDonaldTrumpwasshotandinjuredinanassassinationattempton\nSaturdaynightthatalsokilledaspectatorandcriticallyinjuredtwoothers.Sniperskilled\ntheshooter,a20-year-oldman,afterhefiredeightroundsattherallyinButler,\nPennsylvania.\nTheformerpresidentcouldbeseentouchinghisearastheshootingunfoldedbeforehe\nwasshieldedbySecretServiceandwhiskedoffstage—withbloo

In [19]:
# Display the generated TestDataset; its repr lists every DataRow in full.
testset

TestDataset(test_data=[DataRow(question='Who identified the suspicious person outside the security perimeter at the rally in Butler County?', contexts=['ofButlerCounty.\nThegunmanwasontheroofofabuilding,outsidetherally\'ssecurityperimetersetupby\ntheSecretService,andopenedfirefromabout410feetawayfromthestagewhere\nTrumpwasspeaking,lawenforcementsourcessaid.\nTwolawenforcementsourcestoldCBSNewsthattheshooterwasspottedoutsidethe\nsecurityperimeteraspeoplewerefilingintotherally,andhewasreportedbyabystander\ntotheButlerCountySheriff\'sOffice.Hewasidentifiedasasuspiciouspersonbypolice,\nthesourcessaid.Multipleattendeesalsosaidtheysawthegunmanandalertedofficers\nshortlybeforetheshooting.\nAnarmedmunicipalofficerwithButlerTownshipencounteredthegunmanbeforethe\nshooting,ButlerCountySheriffMichaelSloupeconfirmedtoCBSNews.Theofficerand\nothershadbeenpreviouslyalertedtoasuspiciouspersonandbegansearchingforhim\nrightaway,SloupetoldCBSPittsburghreporterJenBorrasso.\nAtsomepoint,theofficerwashoisted

In [20]:
# Tabular view of the generated samples; preview the first five rows.
df = testset.to_pandas()
df.head(n=5)

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,Who identified the suspicious person outside t...,[ofButlerCounty.\nThegunmanwasontheroofofabuil...,The suspicious person outside the security per...,simple,"[{'page_label': '4', 'file_name': 'Trump.pdf',...",True
1,What was found in the gunman's vehicle that ra...,[Former President Donald Trump was shot and in...,Suspicious devices were found in the gunman's ...,simple,"[{'file_name': 'Trump2.docx', 'file_path': '/U...",True
2,What was found in the gunman's vehicle that ra...,[Former President Donald Trump was shot and in...,Suspicious devices were found in the gunman's ...,simple,"[{'file_name': 'Trump2.docx', 'file_path': '/U...",True
3,What role did Corey Comperatore have in the co...,[ofButlerCounty.\nThegunmanwasontheroofofabuil...,The answer to given question is not present in...,reasoning,"[{'page_label': '4', 'file_name': 'Trump.pdf',...",True
4,Who reported the suspicious person at the rall...,[ofButlerCounty.\nThegunmanwasontheroofofabuil...,The suspicious person at the rally was reporte...,reasoning,"[{'page_label': '4', 'file_name': 'Trump.pdf',...",True


### Building the query engine

In [21]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.settings import Settings

# Build a vector index over the loaded documents, then expose it through
# a query engine created with default settings.
vector_index = VectorStoreIndex.from_documents(documents=documents)
query_engine = vector_index.as_query_engine()

simple questions

In [22]:
# Rebuild the DataFrame view of the test set and show the question stored
# under row label 0.
df = testset.to_pandas()
df.loc[0, "question"]

'Who identified the suspicious person outside the security perimeter at the rally in Butler County?'

In [23]:
# Send the first generated question to the query engine and print its answer.
response_vector = query_engine.query(df.loc[0, "question"])
print(response_vector)

A bystander identified the suspicious person outside the security perimeter at the rally in Butler County.


### Evaluate the query engine

In order to run an evaluation with Ragas and LlamaIndex:

- LlamaIndex QueryEngine: what we will be evaluating

- Metrics: Ragas defines a set of metrics that can measure different aspects of the QueryEngine. The available metrics and their meanings can be found [here](https://docs.ragas.io/en/stable/concepts/metrics/).

- Questions: A list of questions that ragas will test the QueryEngine against.

In [24]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# Metrics handed to the evaluation step: the four imported from ragas.metrics
# plus the harmfulness critique.
metrics = [
    faithfulness, answer_relevancy,
    context_precision, context_recall,
    harmfulness,
]

In [25]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Judge model used to score the metrics: GPT-4o.
evaluator_llm = OpenAI(model="gpt-4o")

In [26]:
# Convert the generated test set to a Hugging Face dataset for evaluation.
ds = testset.to_dataset()

# Plain-dict view for inspection. Only the last expression of a cell is
# rendered, so the bare `ds_dict["question"]` lookup that used to sit before
# the final line was a silent no-op and has been dropped.
ds_dict = ds.to_dict()
ds_dict["ground_truth"]

["The suspicious person outside the security perimeter at the rally in Butler County was identified by a bystander who reported him to the Butler County Sheriff's Office.",
 "Suspicious devices were found in the gunman's vehicle, along with a piece of commercially available equipment that appeared capable of initiating the devices. Bomb technicians were called to the scene to secure and investigate the devices.",
 "Suspicious devices were found in the gunman's vehicle, along with a piece of commercially available equipment that appeared capable of initiating the devices. Bomb technicians were called to the scene to secure and investigate the devices.",
 'The answer to given question is not present in context',
 "The suspicious person at the rally was reported by a bystander to the Butler County Sheriff's Office."]

evaluation

In [27]:
from ragas.integrations.llama_index import evaluate

# Run the dataset's questions through the query engine and score the answers
# on every selected metric, with evaluator_llm acting as the judge.
result = evaluate(
    dataset=ds,
    query_engine=query_engine,
    metrics=metrics,
    embeddings=OpenAIEmbedding(),
    llm=evaluator_llm,
)

Running Query Engine:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs


In [28]:
# Print the overall score for each metric.
print(result)

{'faithfulness': 0.9333, 'answer_relevancy': 0.9438, 'context_precision': 0.9000, 'context_recall': 0.8000, 'harmfulness': 0.0000}


In [29]:
# Per-question breakdown: one row per question, one column per metric.
result.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall,harmfulness
0,Who identified the suspicious person outside t...,[ofButlerCounty.\nThegunmanwasontheroofofabuil...,A bystander identified the suspicious person o...,The suspicious person outside the security per...,1.0,1.0,1.0,1.0,0
1,What was found in the gunman's vehicle that ra...,[Law enforcement sources told CBS News on Sund...,Suspicious devices were found in the gunman's ...,Suspicious devices were found in the gunman's ...,1.0,0.940589,1.0,1.0,0
2,What was found in the gunman's vehicle that ra...,[Law enforcement sources told CBS News on Sund...,Suspicious devices were found in the gunman's ...,Suspicious devices were found in the gunman's ...,0.666667,0.940589,1.0,1.0,0
3,What role did Corey Comperatore have in the co...,"[""Hiscommitmenttohiswifeandtwodaughters,andhis...",Corey Comperatore served as the fire chief for...,The answer to given question is not present in...,1.0,0.837873,0.5,0.0,0
4,Who reported the suspicious person at the rall...,[ofButlerCounty.\nThegunmanwasontheroofofabuil...,A bystander reported the suspicious person at ...,The suspicious person at the rally was reporte...,1.0,0.999999,1.0,1.0,0


## 2. Using GPT-3.5-turbo as the generator llm and critic llm

In [37]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# GPT-3.5-turbo plays both the generator and the critic role in this run;
# embeddings stay on text-embedding-ada-002.
generator_llm = OpenAI(model="gpt-3.5-turbo")
critic_llm = OpenAI(model="gpt-3.5-turbo")
embeddings = OpenAIEmbedding(model="text-embedding-ada-002")

generator = TestsetGenerator.from_llama_index(
    generator_llm=generator_llm, critic_llm=critic_llm, embeddings=embeddings
)

In [38]:
# Ask for 10 questions with the same 50/25/25 split of
# simple / reasoning / multi-context evolutions.
testset = generator.generate_with_llamaindex_docs(
    documents, test_size=10,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

embedding nodes:   0%|          | 0/18 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

max retries exceeded for SimpleEvolution(generator_llm=LlamaIndexLLMWrapper(run_config=RunConfig(timeout=180, max_retries=15, max_wait=90, max_workers=16, exception_types=(<class 'Exception'>,), log_tenacity=False, seed=42)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x31a55abd0>, nodes=[Node(metadata={'page_label': '1', 'file_name': 'Trump.pdf', 'file_path': '/Users/chris/Desktop/7980/CS7980_Capstone_RBCMuseum/eval/data/Trump/Trump.pdf', 'file_type': 'application/pdf', 'file_size': 86251, 'creation_date': '2024-10-15', 'last_modified_date': '2024-10-15'}, page_content='FormerPresidentDonaldTrumpwasshotandinjuredinanassassinationattempton\nSaturdaynightthatalsokilledaspectatorandcriticallyinjuredtwoothers.Sniperskilled\ntheshooter,a20-year-oldman,afterhefiredeightroundsattherallyinButler,\nPennsylvania.\nTheformerpresidentcouldbeseentouchinghisearastheshootingunfoldedbeforehe\nwasshieldedbySecretServiceandwhiskedoffstage—withbloo

In [39]:
# Display the regenerated TestDataset.
# NOTE(review): the recorded output is an empty dataset and the generation
# cell logged "max retries exceeded" - confirm generation succeeded before
# running any evaluation on this test set.
testset

TestDataset(test_data=[])