In [165]:
import os
import openai
from ragas import evaluate
from datasets import Dataset 
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, context_entity_recall, answer_similarity, answer_correctness

In [186]:

os.environ.pop("OPENAI_API_KEY", None)  # Remove the current value
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key
print(api_key)

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


In [187]:
urls = [
    "https://en.wikipedia.org/wiki/New_York_City",
    "https://en.wikipedia.org/wiki/Snow_leopard",
    "https://www.britannica.com/place/Galapagos-Islands",
    "https://www.birdlife.org/birds/penguins/#:~:text=The%20threats%20are%20numerous%2C%20including,is%20melting%20before%20their%20eyes."
]

In [188]:
# collect data using selenium url loader
loader = SeleniumURLLoader(urls=urls)
documents = loader.load()

Python(27642) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Python(27679) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [189]:
documents

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/New_York_City', 'title': 'New York City - Wikipedia', 'description': 'No description found.', 'language': 'en'}, page_content='New York City\n\nAfrikaans\n\nAlemannisch\n\nአማርኛ\n\nAnarâškielâ\n\nÆnglisc\n\nالعربية\n\nAragonés\n\nܐܪܡܝܐ\n\nԱրեւմտահայերէն\n\nArmãneashti\n\nArpetan\n\nঅসমীয়া\n\nAsturianu\n\nअवधी\n\nAvañe\'ẽ\n\nАвар\n\nAymar aru\n\nAzərbaycanca\n\nتۆرکجه\n\nBasa Bali\n\nBamanankan\n\nবাংলা\n\nBanjar\n\n閩南語 / Bân-lâm-gú\n\nBasa Banyumasan\n\nБашҡортса\n\nБеларуская\n\nБеларуская (тарашкевіца)\n\nभोजपुरी\n\nBikol Central\n\nBislama\n\nБългарски\n\nBoarisch\n\nབོད་ཡིག\n\nBosanski\n\nBrezhoneg\n\nБуряад\n\nCatalà\n\nЧӑвашла\n\nCebuano\n\nČeština\n\nChamoru\n\nChavacano de Zamboanga\n\nChi-Chewa\n\nChiTumbuka\n\nCorsu\n\nCymraeg\n\nDagbanli\n\nDansk\n\nالدارجة\n\nDeitsch\n\nDeutsch\n\nDiné bizaad\n\nDolnoserbski\n\nडोटेली\n\nEesti\n\nΕλληνικά\n\nEmiliàn e rumagnòl\n\nЭрзянь\n\nEspañol\n\nEsperanto\n\nEstremeñu\n\nEusk

In [190]:
documentList = []
for doc in documents:
    d = str(doc.page_content).replace("\\n", " ").replace("\\t"," ").replace("\n", " ").replace("\t", " ")
    documentList.append(d)

In [191]:
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [192]:
text_splitter = SemanticChunker(embedding_function)
docs = text_splitter.create_documents(documentList)

In [193]:
# storing embeddings in chromadb
vector_store = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")

In [194]:
# use this to load vector database
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

In [198]:

PROMPT_TEMPLATE = """
Go through the context and answer given question strictly based on context. 
Context: {context}
Question: {question}
Answer:
"""

qa_chain = RetrievalQA.from_chain_type(
        llm = ChatOpenAI(model= 'gpt-4o-mini', temperature=0),
        # retriever=vector_store.as_retriever(search_kwargs={'k': 3}),
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PromptTemplate.from_template(PROMPT_TEMPLATE)}
    )

In [199]:
queries = [
    "Who discovered the Galapagos Islands and how?",
    "What is Brooklyn–Battery Tunnel?",
    "Are Penguins found in the Galapagos Islands?",
    "How many languages are spoken in New York?",
    "In which countries are snow leopards found?",
    "What are the threats to penguin populations?",
    "What is the economic significance of New York City?",
    "How did New York City get its name?",
    "How did Galapagos Islands get its name?",
    "What is the significance of the Statue of Liberty in New York City?",
    
]

ground_truths = [
    "The Galapagos Islands were discovered in 1535 by the bishop of Panama, Tomás de Berlanga, whose ship had drifted off course while en route to Peru. He named them Las Encantadas (“The Enchanted”), and in his writings he marveled at the thousands of large galápagos (tortoises) found there. Numerous Spanish voyagers stopped at the islands from the 16th century, and the Galapagos also came to be used by pirates and by whale and seal hunters. ",
    "The Brooklyn-Battery Tunnel (officially known as the Hugh L. Carey Tunnel) is the longest continuous underwater vehicular tunnel in North America and runs underneath Battery Park, connecting the Financial District in Lower Manhattan to Red Hook in Brooklyn.[586]",
    "Penguins live on the galapagos islands side by side with tropical animals.",
    "As many as 800 languages are spoken in New York.",
    "Siberia, Tajikistan, Kyrgyzstan, Uzbekistan, Kazakhstan, Afghanistan, Pakistan, India, Nepal, Bhutan, Mongolia, and Tibet.",
    "The threats are numerous, including habitat loss, pollution, disease, and reduced food availability due to commercial fishing. Climate change is of particular concern for many species of penguin, as the sea ice that they depend on to find food or build nests is melting before their eyes.",
    "New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.",
    "New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.",
    "Tomás de Berlanga, who discovered the islands, named them Las Encantadas (“The Enchanted”), and in his writings he marveled at the thousands of large galápagos (tortoises) found there. Numerous Spanish voyagers stopped at the islands from the 16th century, and the Galapagos also came to be used by pirates and by whale and seal hunters.",
    "The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.",
    
]


In [200]:
results = []
contexts = []
for query in queries:
    result = qa_chain({"query": query})
   
    results.append(result['result'])
    sources = result["source_documents"]
    contents = []
    for i in range(len(sources)):
        contents.append(sources[i].page_content)
    contexts.append(contents)

In [201]:
d = {
    "question": queries,
    "answer": results,
    "contexts": contexts,
    "ground_truth": ground_truths
}

In [205]:
from datasets import Dataset 
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, context_entity_recall, answer_similarity, answer_correctness
from ragas import evaluate

dataset = Dataset.from_dict(d)
score = evaluate(dataset,metrics=[faithfulness, answer_relevancy, context_precision, context_recall, context_entity_recall, answer_similarity, answer_correctness])
score_df = score.to_pandas()
score_df

Evaluating:   0%|          | 0/70 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall,context_entity_recall,semantic_similarity,answer_correctness
0,Who discovered the Galapagos Islands and how?,[Because of subsequent evolutionary adaptation...,The Galapagos Islands were discovered in 1535 ...,The Galapagos Islands were discovered in 1535 ...,1.0,0.978762,0.25,1.0,0.272727,0.961835,0.540459
1,What is Brooklyn–Battery Tunnel?,"[(August 17, 2016). ""Marine Park, Brooklyn: Bl...",The context provided does not contain any info...,The Brooklyn-Battery Tunnel (officially known ...,0.5,0.0,0.0,0.0,0.25,0.806362,0.201591
2,Are Penguins found in the Galapagos Islands?,[The archipelago is renowned for its unusual a...,"Yes, penguins can be spotted on the volcanic i...",Penguins live on the galapagos islands side by...,1.0,0.98602,1.0,1.0,0.333333,0.914166,0.603541
3,How many languages are spoken in New York?,"[^ Jump up to: a b Semple, Kirk (June 8, 2013)...",There are as many as 800 languages spoken in N...,As many as 800 languages are spoken in New York.,1.0,0.991743,1.0,1.0,1.0,0.991481,0.99787
4,In which countries are snow leopards found?,[(eds.). Snow Leopards: Biodiversity of the Wo...,The context provided does not specify the coun...,"Siberia, Tajikistan, Kyrgyzstan, Uzbekistan, K...",0.5,0.0,0.0,0.0,0.166667,0.775482,0.19387
5,What are the threats to penguin populations?,[PENGUINS Watching penguins waddle across sli...,The threats to penguin populations include hab...,"The threats are numerous, including habitat lo...",1.0,1.0,1.0,1.0,0.0,0.963839,0.99096
6,What is the economic significance of New York ...,"[^ Homberger, Eric (2005). The Historical Atla...",The economic significance of New York City is ...,"New York City's economic significance is vast,...",0.5,1.0,0.0,0.0,0.0,0.951539,0.462885
7,How did New York City get its name?,[City in the United States New York City Midt...,New York City was named in honor of the Duke o...,New York City got its name when it came under ...,1.0,0.956777,1.0,1.0,0.2,0.943791,0.835948
8,How did Galapagos Islands get its name?,[Because of subsequent evolutionary adaptation...,The Galapagos Islands were named by the bishop...,"Tomás de Berlanga, who discovered the islands,...",0.75,0.923205,0.5,1.0,0.285714,0.966165,0.650632
9,What is the significance of the Statue of Libe...,[City in the United States New York City Midt...,"The Statue of Liberty, dedicated in 1886, is s...",The Statue of Liberty in New York City holds g...,1.0,1.0,1.0,0.666667,0.25,0.964455,0.616114


In [209]:
score_df[['faithfulness','answer_relevancy', 'context_precision', 'context_recall',
       'context_entity_recall', 'semantic_similarity', 'answer_correctness']].mean(axis=0)

faithfulness             0.825000
answer_relevancy         0.783651
context_precision        0.575000
context_recall           0.666667
context_entity_recall    0.275844
semantic_similarity      0.923912
answer_correctness       0.609387
dtype: float64