In [1]:
import pandas as pd
from datasets import Dataset, load_dataset
from ragas.evaluation import evaluate
from ragas.metrics import answer_relevancy, faithfulness, answer_correctness, context_recall, context_precision, answer_similarity
import os
from dotenv import load_dotenv

from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain import hub
import openai
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_community.retrievers import BM25Retriever
from langchain.docstore.document import Document
from langchain.retrievers import EnsembleRetriever
import asyncio
import nest_asyncio
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
nest_asyncio.apply()
import gradio as gr

# Read variables from a local .env file (if one exists) into the process
# environment. Without this call the .env file is never consulted and the key
# is only found when it was already exported in the shell.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")  # Ensure you have set this in your .env file
# Embedding model used to embed queries against the saved index.
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)
# Load the previously saved FAISS index from the "nba_vector_db_semantic" folder.
# SECURITY: allow_dangerous_deserialization=True permits un-pickling the stored
# docstore; only load index folders that you created yourself or fully trust.
vectorstore = FAISS.load_local("nba_vector_db_semantic", embeddings=embedding, allow_dangerous_deserialization=True)


In [2]:
import time
import openai

# 1) Import RateLimitError from wherever the installed openai SDK exposes it.
#    Trying the modern location first matters: if only the legacy path is tried,
#    the import fails on openai>=1.0 and the name silently becomes `Exception`,
#    so every failure would be reported and retried as a "rate limit".
try:
    # openai>=1.0 exports its exception classes at the package top level.
    from openai import RateLimitError
except ImportError:
    try:
        # Legacy (<1.0) SDK layout.
        from openai.error import RateLimitError
    except ImportError:
        # Last resort, kept from the original: fall back to a generic Exception
        # so the retry helper still has something to catch.
        RateLimitError = Exception

In [3]:
# Deterministic (temperature 0) gpt-3.5-turbo client; it is the `llm` passed to
# the ragas evaluate() calls further down in this notebook.
cheap_llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
)

In [4]:
# Collect every document held in the vector store's docstore; the BM25
# retriever below is built from this same collection.
# NOTE(review): `_dict` is a private attribute of the docstore — it may change
# between library versions; confirm it is still available after upgrades.
documents = vectorstore.docstore._dict.values()

In [5]:
# Dense retriever: vector search over the FAISS store, 10 results per query.
dense_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 10},
)

# Sparse retriever: BM25 over the same documents, also 10 results per query.
sparse_retriever = BM25Retriever.from_documents(
    list(documents),
    k=10,
)

# Hybrid retriever: combines both retrievers with equal weights.
# NOTE(review): `c` is presumably the rank-fusion constant; 0 is kept exactly
# as in the original — confirm this value is intended.
ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, sparse_retriever],
    weights=[0.5, 0.5],
    c=0,
)

In [6]:
# Pull the answer-generation prompt from the LangChain Hub; it is piped into
# the LLM in the "answer" branch of the chain defined below.
prompt = hub.pull("jclemens24/rag-prompt")



In [7]:
# Prompt asking the LLM to grade how relevant the retrieved context is to the
# question on a 1-5 scale. It takes two variables, {question} and
# {retrieved_context}, and instructs the model to reply with the number only.
relevance_prompt_template = PromptTemplate.from_template(
    """
    Given the following question and retrieved context, determine if the context is relevant to the question.
    Provide a score from 1 to 5, where 1 is not at all relevant and 5 is highly relevant.
    Return ONLY the numeric score, without any additional text or explanation.

    Question: {question}
    Retrieved Context: {retrieved_context}

    Relevance Score:"""
)

In [8]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents joined by a blank line (``"\\n\\n"``); an empty
        string when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [9]:
def extract_score(llm_output):
    """Parse the numeric relevance score out of the LLM's reply.

    The prompt asks for a bare number, but chat models sometimes wrap it
    (e.g. ``"Relevance Score: 4"`` or ``"4/5"``). A strictly numeric reply is
    parsed as before; otherwise the first number found in the text is used, so
    a usable score is not silently turned into 0.

    Args:
        llm_output: Raw text returned by the relevance-scoring chain. ``None``
            and non-string values are tolerated.

    Returns:
        The score as a ``float``, or ``0`` when no number can be found.
    """
    import re  # local import keeps this cell self-contained

    if llm_output is None:
        return 0

    text = str(llm_output).strip()

    # Fast path: the reply is exactly a number, as the prompt requests.
    try:
        return float(text)
    except ValueError:
        pass

    # Fallback: take the first (optionally signed, optionally decimal) number.
    match = re.search(r"[-+]?\d+(?:\.\d+)?", text)
    if match is None:
        return 0
    return float(match.group())

# Final gate of the chain: only surface the answer when the context was relevant.
def conditional_answer(x):
    """Return the generated answer unless the relevance score is below 2.

    Args:
        x: Mapping with a ``'relevance_score'`` entry (raw LLM text, parsed by
            ``extract_score``) and an ``'answer'`` entry.

    Returns:
        ``"I don't know."`` when the parsed score is below 2, otherwise
        ``x['answer']``.
    """
    parsed_score = extract_score(x['relevance_score'])
    return "I don't know." if parsed_score < 2 else x['answer']

In [10]:
## Define the output parser and the LLM used inside the RAG chain
# The parser is applied after each LLM call in the chain below.
str_output_parser = StrOutputParser()
# gpt-4o-mini handles both the relevance grading and the answer generation.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
)

In [11]:
## Define the chain
# Branch 1: render the relevance prompt from the question and the formatted
# context, then ask the LLM for a score and parse the reply to a string.
_relevance_branch = (
    RunnablePassthrough()
    | (
        lambda inputs: relevance_prompt_template.format(
            question=inputs['question'],
            retrieved_context=inputs['context'],
        )
    )
    | llm
    | str_output_parser
)

# Branch 2: answer the question with the hub prompt and the same LLM.
_answer_branch = RunnablePassthrough() | prompt | llm | str_output_parser

# Full pipeline: replace the document list under "context" with one formatted
# string, run both branches in parallel, then add "final_answer" via
# conditional_answer.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda inputs: format_docs(inputs["context"])))
    | RunnableParallel(
        {
            "relevance_score": _relevance_branch,
            "answer": _answer_branch,
        }
    )
    | RunnablePassthrough().assign(final_answer=conditional_answer)
)

In [12]:
# Dense-only pipeline: fetch context with the dense retriever while passing the
# question through unchanged, then attach the RAG chain output under "answer".
_dense_inputs = RunnableParallel(
    {
        "context": dense_retriever,
        "question": RunnablePassthrough(),
    }
)
rag_chain_with_source_dense = _dense_inputs.assign(answer=rag_chain_from_docs)

In [13]:
# Hybrid pipeline: same shape as the dense one, but the context comes from the
# ensemble (dense + BM25) retriever.
_ensemble_inputs = RunnableParallel(
    {
        "context": ensemble_retriever,
        "question": RunnablePassthrough(),
    }
)
rag_chain_with_source_ensemble = _ensemble_inputs.assign(answer=rag_chain_from_docs)

In [14]:
## Read the generated QA dataset and rename its "answer" column to
## "ground_truth" (done as one chained expression, without in-place mutation).
df = pd.read_csv("nba_generated_qa.csv").rename(
    columns={"answer": "ground_truth"},
)
df.head(10)

Unnamed: 0,question,ground_truth,context
0,How many 3-pointers did Kemba Walker make in t...,Walker made 250 3-pointers in the 2018-2019 NB...,"On November 17, he scored a career-high and fr..."
1,Who did Ray Allen co-star alongside in 2015?,"Kyrie Irving, Baron Davis, and J.","In 2015, Allen co-starred alongside Kyrie Irvi..."
2,What college did Miye Oni play basketball for?,Yale Bulldogs,"Olumiye Dimolu ""Miye"" Oni (born August 4, 1997..."
3,In what year did AJ Green record his fourth co...,2015,"In Week 5, against the New England Patriots, h..."
4,What was Ronald Dupree's shooting percentage f...,0.286,"He shot 0.286 from the field, 0.0 from 3-point..."
5,In which year did Kirk Haston play basketball?,Kirk Haston played basketball in the year ment...,"Basketball""."
6,In what year was Adonal Foyle inducted into th...,2009,He also became a member of the National Basket...
7,In which year did Cozell McQueen lead the Ital...,1988–89,Though he briefly played in the NBA for the De...
8,In which year did Brandon Ingram represent the...,...,Olympic basketball team.
9,What were Greg Brown III's shooting percentage...,"Greg Brown III shot 0.42 from the field, 0.33 ...","He shot 0.42 from the field, 0.33 from 3-point..."


In [15]:
# Convert every column to a list of strings and wrap the result in a
# Hugging Face Dataset so the questions can be mapped through the RAG chains.
saved_data = {
    column: df[column].astype(str).tolist()
    for column in df.columns
}
saved_testing_dataset = Dataset.from_dict(saved_data)
saved_testing_dataset

Dataset({
    features: ['question', 'ground_truth', 'context'],
    num_rows: 76
})

In [16]:
def generate_answer(question, ground_truth, rag_chain):
    """Run one question through ``rag_chain`` and build an evaluation row.

    Args:
        question: Question text passed to ``rag_chain.invoke``.
        ground_truth: Reference answer, copied unchanged into the row.
        rag_chain: Object whose ``invoke(question)`` returns a mapping with
            ``["answer"]["final_answer"]`` and a ``["context"]`` list of
            documents exposing ``page_content``.

    Returns:
        Dict with the keys ``question``, ``answer``, ``contexts`` and
        ``ground_truth``.
    """
    output = rag_chain.invoke(question)
    final_answer = output["answer"]["final_answer"]
    context_texts = [document.page_content for document in output["context"]]
    return dict(
        question=question,
        answer=final_answer,
        contexts=context_texts,
        ground_truth=ground_truth,
    )

In [17]:
def safe_generate_answer(question, ground_truth, chain, max_retries=5):
    """Call ``generate_answer`` with exponential back-off on rate limits.

    Args:
        question: Question text forwarded to ``generate_answer``.
        ground_truth: Reference answer forwarded to ``generate_answer``.
        chain: RAG chain forwarded to ``generate_answer``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The row produced by ``generate_answer`` on success. If every attempt
        hits a rate limit, a placeholder row with the SAME keys
        (``question``, ``answer``, ``contexts``, ``ground_truth``) is returned,
        with an empty answer and no contexts. Keeping the keys identical
        matters because these rows are fed to ``Dataset.map`` and then to the
        evaluator, which both rely on every row having the same columns.
    """
    delay = 1.0
    for attempt in range(1, max_retries + 1):
        try:
            return generate_answer(question, ground_truth, chain)
        except RateLimitError:
            if attempt == max_retries:
                # No further attempt follows, so sleeping again would only waste time.
                break
            print(f"Rate limit hit, retrying in {delay}s…")
            time.sleep(delay)
            delay *= 2  # exponential back-off: 1s, 2s, 4s, ...

    # Still failing: report it and return a schema-compatible placeholder row.
    print(f"Rate limit persisted after {max_retries} attempts; returning empty answer for: {question!r}")
    return {
        "question": question,
        "answer": "",
        "contexts": [],
        "ground_truth": ground_truth,
    }


In [18]:
def _answer_with_dense_chain(example):
    """Answer one dataset row using the dense-retriever RAG chain."""
    return safe_generate_answer(
        example["question"],
        example["ground_truth"],
        rag_chain_with_source_dense,
    )


# Generate an answer for every test question, one row at a time, and replace
# the original columns with the columns returned by the helper.
testing_dataset_similarity = saved_testing_dataset.map(
    _answer_with_dense_chain,
    batched=False,
    remove_columns=saved_testing_dataset.column_names,
)



Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…


In [20]:
# Score the dense-retriever answers on six ragas metrics, using cheap_llm as
# the judging model, then convert the result to a DataFrame.
# NOTE(review): the recorded output of this cell shows some jobs raising
# RateLimitError; presumably the affected metric values are missing for those
# rows — confirm before relying on the averages.
score_similarity = evaluate(
    testing_dataset_similarity,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness,
        answer_similarity,
    ],
    llm=cheap_llm,
)
similarity_df = score_similarity.to_pandas()

Evaluating:   0%|          | 0/456 [00:00<?, ?it/s]

Exception raised in Job[72]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-rdgbPDQbOKFwp8vaWQzMQLnQ on tokens per min (TPM): Limit 200000, Used 197695, Requested 3571. Please try again in 379ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[69]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-rdgbPDQbOKFwp8vaWQzMQLnQ on tokens per min (TPM): Limit 200000, Used 197971, Requested 8399. Please try again in 1.911s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[90]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-rdgbPDQbOKFwp8vaWQzMQLnQ on tokens per min (TPM): Limit 200

In [21]:
# Metric columns of the evaluation DataFrames that get averaged below.
key_columns = [
    'faithfulness',
    'answer_relevancy',
    'context_precision',
    'context_recall',
    'answer_correctness',
    'semantic_similarity',
]

In [22]:
# Display the per-question evaluation results for the dense-retriever run.
similarity_df

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall,answer_correctness,semantic_similarity
0,How many 3-pointers did Kemba Walker make in t...,"[Augustin, he became their starting point guar...",Kemba Walker made 250 three-pointers in the 20...,Walker made 250 3-pointers in the 2018-2019 NB...,0.0,0.996949,0.944444,1.00,0.729575,0.918301
1,Who did Ray Allen co-star alongside in 2015?,"[In 2015, Allen co-starred alongside Kyrie Irv...","In 2015, Ray Allen co-starred alongside Kyrie ...","Kyrie Irving, Baron Davis, and J.",0.0,1.000000,0.000000,1.00,0.977542,0.910168
2,What college did Miye Oni play basketball for?,"[Olumiye Dimolu ""Miye"" Oni (born August 4, 199...",Miye Oni played college basketball for the Yal...,Yale Bulldogs,1.0,0.982126,0.966667,0.20,0.714126,0.856506
3,In what year did AJ Green record his fourth co...,[He followed that up with eight receptions for...,"AJ Green recorded his fourth consecutive 1,000...",2015,0.0,0.911119,0.592619,0.75,0.839809,0.787749
4,What was Ronald Dupree's shooting percentage f...,"[Ronald Edmund Dupree Jr. (born January 26, 19...",Ronald Dupree's shooting percentage from the f...,0.286,1.0,0.970677,1.000000,0.00,0.193134,0.772537
...,...,...,...,...,...,...,...,...,...,...
71,What was Anthony Johnson's shooting percentage...,"[He shot 0.371 from the field, 0.328 from 3-po...",Anthony Johnson's shooting percentage from the...,0.5,0.5,1.000000,0.808333,0.60,0.192842,0.771424
72,In which year did Vlatko Čančar win his first ...,[Vlatko Čančar ( CHAHN-char; born 10 April 199...,Vlatko Čančar won his first NBA championship w...,In June 2023,1.0,0.958585,0.500000,1.00,0.704641,0.818565
73,What event led to Michael Jordan becoming a gr...,"[He has two older brothers, James Jr. and Larr...",Michael Jordan became a grandfather in 2019 wh...,"His daughter Jasmine giving birth to a son, wh...",0.0,0.953684,0.450000,0.00,0.722694,0.890777
74,What significant achievement did Rumeal Robins...,"[Shortly after he turned 10 years old, his gra...","During his junior year in 1989, Rumeal Robinso...",Rumeal Robinson sank two crucial free throws w...,1.0,0.989923,0.807341,1.00,0.673087,0.978061


In [23]:
# Average each metric column over all questions of the dense-retriever run.
similarity_means = similarity_df.loc[:, key_columns].mean()
similarity_means

faithfulness           0.569672
answer_relevancy       0.813273
context_precision      0.702329
context_recall         0.750758
answer_correctness     0.640485
semantic_similarity    0.858064
dtype: float64

In [24]:
# Save the dense-retriever evaluation to CSV, without the index column.
similarity_df.to_csv("evaluation_dense.csv", index=False)

In [25]:
def _answer_with_ensemble_chain(example):
    """Answer one dataset row using the hybrid (ensemble-retriever) RAG chain."""
    return safe_generate_answer(
        example["question"],
        example["ground_truth"],
        rag_chain_with_source_ensemble,
    )


# Same generation pass as for the dense run, this time with the hybrid chain;
# the original columns are replaced by the columns returned by the helper.
testing_dataset_hybrid = saved_testing_dataset.map(
    _answer_with_ensemble_chain,
    batched=False,
    remove_columns=saved_testing_dataset.column_names,
)

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 2.0s…
Rate limit hit, retrying in 4.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 2.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 2.0s…
Rate limit hit, retrying in 4.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 2.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…
Rate limit hit, retrying in 1.0s…


In [26]:
# Score the hybrid-retriever answers on the same six ragas metrics, with
# cheap_llm as the judging model. `score_similarity` is reassigned here, so
# after this cell it holds the hybrid scores rather than the dense ones.
# NOTE(review): the recorded output of this cell shows context-length
# (BadRequestError) and RateLimitError failures for some jobs; presumably the
# affected metric values are missing for those rows — confirm before relying
# on the averages.
score_similarity = evaluate(
    testing_dataset_hybrid,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness,
        answer_similarity,
    ],
    llm=cheap_llm,
)
hybrid_df = score_similarity.to_pandas()

Evaluating:   0%|          | 0/456 [00:00<?, ?it/s]

Exception raised in Job[21]: BadRequestError(Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 18514 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}})
Exception raised in Job[18]: BadRequestError(Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 18463 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}})
Exception raised in Job[51]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-3.5-turbo in organization org-rdgbPDQbOKFwp8vaWQzMQLnQ on tokens per min (TPM): Limit 200000, Used 196680, Requested 11157. Please try again in 2.351s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', '

In [27]:
# Average each metric column over all questions of the hybrid-retriever run.
hybrid_means = hybrid_df.loc[:, key_columns].mean()
hybrid_means

faithfulness           0.565972
answer_relevancy       0.809999
context_precision      0.687688
context_recall         0.819565
answer_correctness     0.638502
semantic_similarity    0.864974
dtype: float64

In [28]:
# Re-display the dense-retriever averages for comparison with the hybrid ones.
similarity_means

faithfulness           0.569672
answer_relevancy       0.813273
context_precision      0.702329
context_recall         0.750758
answer_correctness     0.640485
semantic_similarity    0.858064
dtype: float64

In [29]:
# Save the hybrid-retriever evaluation to CSV, without the index column.
hybrid_df.to_csv("evaluation_hybrid.csv", index=False)