In [7]:
import os

from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()

# Read the OpenAI key from the process environment and fail fast when it is
# absent or empty: os.getenv() returns None for a missing variable, and
# assigning None into os.environ raises an opaque
# "TypeError: str expected, not NoneType".
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
# Kept from the original cell: write the validated value back to the
# process environment.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

### Building the test set

In [8]:
# Load the source documents from the data/NYC/ directory.
from llama_index.core import SimpleDirectoryReader

nyc_reader = SimpleDirectoryReader("data/NYC/")
documents = nyc_reader.load_data()


In [17]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# OpenAI models used by the test-set generator: an embedding model plus two
# LLMs, one passed as the generator and one passed as the critic.
embeddings = OpenAIEmbedding(model="text-embedding-ada-002")
critic_llm = OpenAI(model="gpt-4")
generator_llm = OpenAI(model="gpt-3.5-turbo")

# Wire the LlamaIndex model objects into a Ragas test-set generator.
generator = TestsetGenerator.from_llama_index(
    embeddings=embeddings,
    critic_llm=critic_llm,
    generator_llm=generator_llm,
)

In [20]:
# Generate a 5-item test set from the loaded documents. The distribution
# weights each evolution type: 0.5 simple, 0.25 reasoning, 0.25 multi-context.
evolution_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
testset = generator.generate_with_llamaindex_docs(
    documents,
    test_size=5,
    distributions=evolution_mix,
)

embedding nodes:   0%|          | 0/240 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

In [21]:
# View the generated test set as a DataFrame and preview its first rows.
df = testset.to_pandas()
df.head(n=5)

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the significance of Broadway theaters ...,"[History/The_Great_White_W ay/)\non May 4, 201...",The Great White Way is a phrase known worldwid...,simple,"[{'page_label': '64', 'file_name': 'NY.pdf', '...",True
1,What city has overtaken Hong Kong as the most ...,"[30. Goh Chiew Tong (June 7, 2023). ""New York ...",New York has overtaken Hong Kong as the most e...,simple,"[{'page_label': '37', 'file_name': 'NY.pdf', '...",True
2,How many homeless individuals were in NYC's ma...,"[ outdoors (72%), whereas the unsheltered home...","In November 2023, there were 92,824 homeless p...",reasoning,"[{'page_label': '55', 'file_name': 'NY.pdf', '...",True
3,What were the population figures for NYC in 20...,[135. Population - Decennial Census - Census 2...,"The population of New York City was 8,008,278 ...",multi_context,"[{'page_label': '43', 'file_name': 'NY.pdf', '...",True
4,What demographic changes happened in NYC due t...,"[The Battle of Long Island, one of the\nlarges...",The British occupation of New York during the ...,multi_context,"[{'page_label': '5', 'file_name': 'NY.pdf', 'f...",True


### Building the query engine

In [22]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core.settings import Settings

# Build a vector index over the loaded documents ...
vector_index = VectorStoreIndex.from_documents(documents)
# ... and derive a query engine from it using the default options.
query_engine = vector_index.as_query_engine()

simple questions

In [23]:
# Convert the test set to a DataFrame and show the first generated question.
df = testset.to_pandas()
df.loc[0, "question"]

'What is the significance of Broadway theaters in Times Square and New York City?'

In [24]:
# Ask the query engine the first generated question and print its answer.
first_question = df["question"][0]
response_vector = query_engine.query(first_question)
print(response_vector)

Broadway theaters in Times Square and New York City are significant as they form a major entertainment hub that attracts millions of visitors annually. They contribute significantly to the city's economy through ticket sales and tourism revenue. Additionally, Broadway theaters are a key part of the world-renowned entertainment industry in New York City, making it a central location for live performances and cultural experiences.


### Evaluate the query engine

To run an evaluation with Ragas and LlamaIndex, you need three things:

- LlamaIndex QueryEngine: what we will be evaluating

- Metrics: Ragas defines a set of metrics that can measure different aspects of the QueryEngine. The available metrics and their meanings are listed in the Ragas metrics documentation.

- Questions: a list of questions that Ragas will run against the QueryEngine (here, the generated test set, which also carries a ground-truth answer for each question).

In [25]:
from ragas.metrics import answer_relevancy, context_precision, context_recall, faithfulness
from ragas.metrics.critique import harmfulness

# Metrics the query engine is scored on, in the order they are passed to the evaluation.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]

In [26]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# LLM used for the evaluation. GPT 3.5 is used here; switch to GPT 4 / 4-turbo
# for better accuracy.
EVALUATOR_MODEL = "gpt-3.5-turbo"
evaluator_llm = OpenAI(model=EVALUATOR_MODEL)

In [29]:
# convert to HF dataset
ds = testset.to_dataset()

ds_dict = ds.to_dict()
ds_dict["question"]
ds_dict["ground_truth"]

["The Great White Way is a phrase known worldwide to describe Broadway's profusion of theaters in Times Square. By 1910, the blocks of Broadway just above 42nd Street were at the very heart of the Great White Way. The glow of Times Square symbolized the center of New York, if not of the world.",
 'New York has overtaken Hong Kong as the most expensive city in the world for expats, according to a new survey.',
 "In November 2023, there were 92,824 homeless people in New York City's main municipal shelter system, including 33,365 homeless children.",
 'The population of New York City was 8,008,278 in 2000 and 8,804,190 in 2020. The percentage increase from 2000 to 2020 was approximately 10%. The answer to the given question is present in the context.',
 'The British occupation of New York during the Revolutionary War led to an influx of Loyalist refugees and escaped slaves who joined the British forces for the promise of freedom. As many as 10,000 escaped slaves crowded into the city dur

In [32]:
# Display the dataset summary (its feature names and row count).
ds

Dataset({
    features: ['question', 'contexts', 'ground_truth', 'evolution_type', 'metadata', 'episode_done'],
    num_rows: 5
})

evaluation

In [33]:
from ragas.integrations.llama_index import evaluate

# Embedding model handed to the evaluation, constructed with its defaults.
evaluator_embeddings = OpenAIEmbedding()

# Run the query engine over the dataset and score it on the chosen metrics,
# using the evaluator LLM defined earlier in the notebook.
result = evaluate(
    query_engine=query_engine,
    dataset=ds,
    metrics=metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

Running Query Engine:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs
No statements were generated from the answer.
n values greater than 1 not support for LlamaIndex LLMs
n values greater than 1 not support for LlamaIndex LLMs


In [35]:
# Print the overall score for each metric.
print(result)

{'faithfulness': 0.9375, 'answer_relevancy': 0.9519, 'context_precision': 1.0000, 'context_recall': 0.7333, 'harmfulness': 0.0000}


In [36]:
# Per-question breakdown: one row per question with its answer and metric scores.
result.to_pandas()

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall,harmfulness
0,What is the significance of Broadway theaters ...,"[""2 plays + 9 nominations = good odds for loca...",Broadway theaters in Times Square and New York...,The Great White Way is a phrase known worldwid...,1.0,1.0,1.0,1.0,0
1,What city has overtaken Hong Kong as the most ...,"[30. Goh Chiew Tong (June 7, 2023). ""New York ...",New York,New York has overtaken Hong Kong as the most e...,,0.982605,1.0,1.0,0
2,How many homeless individuals were in NYC's ma...,[(https://ww\nw.cityandstateny .com/policy/202...,"In November 2023, there were 92,824 homeless p...","In November 2023, there were 92,824 homeless p...",1.0,0.953754,1.0,0.5,0
3,What were the population figures for NYC in 20...,"[239. ""Table PL-P1 NYC: Total Population New Y...","The population of New York City in 2000 was 8,...","The population of New York City was 8,008,278 ...",1.0,0.949369,1.0,0.666667,0
4,What demographic changes happened in NYC due t...,[The combined British-Hessian assault force of...,The British occupation during the Revolutionar...,The British occupation of New York during the ...,0.75,0.873914,1.0,0.5,0
