In [1]:
# !pip install openai -U
# !pip install llama-index llama-index-core llama-index-readers-file llama-index-llms-ollama llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-llms-openai llama-index-embeddings-openai -U
# !pip install spacy -U

In [2]:
import os
import torch
import tiktoken
import pandas as pd
import matplotlib.pyplot as plt
from time import sleep
from tqdm import tqdm

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_relevancy, answer_similarity

from llama_index.llms.openai import OpenAI
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, \
    ServiceContext, StorageContext, load_index_from_storage, get_response_synthesizer, \
    PromptTemplate
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor, MetadataReplacementPostProcessor
from llama_index.core.node_parser import  SentenceSplitter, SentenceWindowNodeParser
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.llms import ChatMessage, MessageRole

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from dotenv import load_dotenv
load_dotenv()
#os.environ['HUGGINGFACE_API_KEY']=os.getenv("HUGGINGFACE_API_KEY")
#os.environ['OPENAI_API_KEY']=os.getenv("OPENAI_API_KEY")


C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


True

In [3]:
# ---- Experiment configuration: everything the grid search below reads ----
LLM_NAME = "gpt-3.5-turbo"                              # OpenAI model name, passed as `model=` when the LLM is built
TOKENIZER_NAME = "gpt-3.5-turbo"                        # Model name given to tiktoken.encoding_for_model for Settings.tokenizer
INPUT_DATA_PATH = '../data/txt_from_pdf_with_file_name' # Directory read by SimpleDirectoryReader
# NOTE(review): QUESTION_ANS_PATH is not referenced by any cell visible in this notebook;
# the QA-loading cell reads "../data/question_answer.json" directly. Confirm which file is intended.
QUESTION_ANS_PATH = '../data/question_answer_short.json'       # Path to the question list
QCR_EVAL_SAVE_PATH = "../RAG_eval_results/QCR_eval_results.csv" # CSV of per-question evaluation results
# CSV of evaluation results averaged over the sample questions, one row per hyperparameter choice
HYPER_PARAM_EVAL_SAVE_PATH = "../RAG_eval_results/HYPER_PARAM_eval_results.csv" 

# Candidate embedding models; ids containing 'bge-small' are loaded through HuggingFaceEmbedding,
# all others through OpenAIEmbedding (see the grid-search cell)
cand_EMBED_MODEL = ["text-embedding-3-small","BAAI/bge-small-en-v1.5"]

# Candidate (chunk size, chunk overlap, similarity top-k) combinations; each dict is applied as a unit
cand_CHUNK_SIM_TOP_K = [{'chunk_size':512, 'chunk_overlap':20, 'sim_top_k':2},
                        {'chunk_size':256, 'chunk_overlap':10, 'sim_top_k':4},
                        {'chunk_size':128, 'chunk_overlap':10, 'sim_top_k':8}] 
# Candidate system prompts. The grid search records index 0 as "long" and any other index as "short".
cand_SYS_PROMPT=["""
You are an AI teaching Assistant for the SEP 775 course. 
You will provide an interactive platform for students to ask questions and receive guidance on course materials.
Your goal is to answer questions as accurately as possible based on the instructions and context provided.
If you found the answer based on the context provided, you should provide the answer first, then at the end, beginning a new sentence with the words "Source:", followed by the name of the lecture, or assignment, or paper if possible.
""",
                """
You are an AI Teaching Assistant for the SEP 775 course. 
Your job is to answer students' questions about course materials according to the instructions and context provided.
If you found the answer based on the context provided, you should provide the answer first, then at the end, beginning a new sentence with the words "Source:", followed by the name of the lecture, or assignment, or paper if possible.
"""]     

# Question-answering prompt installed into the query engine by init_query_engine.
# It contains the {context_str} and {query_str} placeholders.
QA_PROMPT_TPL_str = (
    "Below is the context information related to a course: SEP 775 - Computational Natural Language Processing.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

In [4]:
# Load the course material found under INPUT_DATA_PATH.
# The grid-search cell rebuilds a vector index from `documents` for every chunk setting.
documents = SimpleDirectoryReader(INPUT_DATA_PATH).load_data()

In [5]:
# Read QA pairs generated using ChatGPT with GPT-4 ('queries' / 'responses' columns).
# NOTE(review): this path is hard-coded and differs from QUESTION_ANS_PATH in the config cell
# ('../data/question_answer_short.json'). Confirm which file the evaluation should use.
sample_QA_pairs_df = pd.read_json("../data/question_answer.json")

In [6]:
# Preview the first rows to check that the 'queries' and 'responses' columns loaded as expected
sample_QA_pairs_df.head()

Unnamed: 0,queries,responses
0,What is the instructor's name for this course?,The instructor's name of the course is Hamidre...
1,What is the email address of the instructor fo...,The email address of the instructor is mahyarh...
2,When is this course held every week?,This course is held every Wednesday from 3:30p...
3,What is the name of the TA for this course?,The TA's name for this course is Reza Namazi.
4,What is Word2vec?,Word2vec is a framework for learning word vect...


In [7]:
def gen_QCR_data(query_engine, QA_pairs_df, max_eval_samples=40):
    """
    Function to generate a question-context-response dataset to be evaluated, 
    the returned dataset also includes reference "ground-truth" response from the input.
    
    Parameters:
    query_engine: LlamaIndex query engine object; only `query(question)` is used, and the
                  response must stringify to the answer and expose `source_nodes`
    QA_pairs_df: Dataframe that includes sample QA pairs in the columns 'queries' and 'responses'
    max_eval_samples: number of leading QA pairs kept in the returned dataset
                      (default 40, the value that used to be hard-coded); None keeps all pairs
    
    Output:
    QCR_ds: question-context-response dataset with features [question, answer, contexts, ground_truth]
    num_no_ans_Q: number of queried questions whose answer contains "Empty Response"
    
    Raises:
    ValueError: if max_eval_samples is negative
    """
    if max_eval_samples is not None and max_eval_samples < 0:
        raise ValueError("max_eval_samples must be None or a non-negative integer")

    sample_questions = QA_pairs_df['queries'].tolist()
    ref_answers = QA_pairs_df['responses'].tolist()
    
    print("Performing queries for %d sample questions..."%len(sample_questions))
    
    contexts = []
    answers = []
    num_no_ans_Q = 0
    # Every question is queried (not only the evaluated subset) so that the
    # no-answer count keeps covering the whole question list.
    for Q in tqdm(sample_questions):
        response = query_engine.query(Q)
        answer = str(response)

        if "Empty Response" in answer:
            num_no_ans_Q += 1

        contexts.append([x.node.get_content() for x in response.source_nodes])
        answers.append(answer)
    
    print("In the %d sample questions, %d questions could not find relevant context."%(len(sample_questions),num_no_ans_Q))
    # Keep the question, context, response, and reference response of the leading
    # `max_eval_samples` queries for later evaluations (a slice bound of None keeps all)
    keep = slice(None, max_eval_samples)
    QCR_ds = Dataset.from_dict(
        {
            "question": sample_questions[keep],
            "answer": answers[keep],
            "contexts": contexts[keep],
            "ground_truth": ref_answers[keep],
        }
    )
    
    return QCR_ds, num_no_ans_Q

In [8]:
def eval_QCR_data(QCR_ds):
    """
    Evaluate a question-context-response dataset with the answer_relevancy,
    faithfulness, answer_similarity, and context_relevancy metrics.

    Parameters:
    QCR_ds: dataset with features [question, answer, contexts, ground_truth];
            it must support select_columns()

    Outputs:
    ans_eval_result: Evaluation results of answer_relevancy and faithfulness metrics
    ans_sim_eval_result: Evaluation results of answer_similarity metric
    contexts_eval_result: Evaluation results of context_relevancy metric
    """
    print("Performing evaluation...")

    # Occasionally an evaluation hits an exception caused by a closed AsyncClient.
    # That was not found to affect the evaluation results, so exceptions are
    # deliberately not raised during evaluation.
    answer_metrics = [answer_relevancy, faithfulness]
    ans_eval_result = evaluate(QCR_ds, answer_metrics, raise_exceptions=False)

    # Answer similarity only needs the question, the answer and the reference answer
    similarity_view = QCR_ds.select_columns(["question", "answer", "ground_truth"])
    ans_sim_eval_result = evaluate(similarity_view, [answer_similarity], raise_exceptions=False)

    # Context relevancy only needs the question and the retrieved contexts
    context_view = QCR_ds.select_columns(["question", "contexts"])
    contexts_eval_result = evaluate(context_view, [context_relevancy], raise_exceptions=False)

    for result in (ans_eval_result, contexts_eval_result, ans_sim_eval_result):
        print(result)

    return ans_eval_result, ans_sim_eval_result, contexts_eval_result

In [9]:
def save_QCR_eval_result(filepath, QCR_ds, embed_model, chunk_top_k_settings, sys_prompt,
                         ans_eval_result, ans_sim_eval_result, contexts_eval_result):
    """
    Function to append evaluation results for each of the sample questions 
    with corresponding hyperparameter/embedding model choices into CSV file
    
    Parameters:
    filepath: path of the CSV file; it is created (with its parent directory) if missing
    QCR_ds: dataset indexable by 'question', 'answer' and 'contexts'
    embed_model: name of the embedding model used, repeated on every row
    chunk_top_k_settings: dict with keys 'chunk_size', 'chunk_overlap' and 'sim_top_k'
    sys_prompt: system prompt text used, repeated on every row
    ans_eval_result: result whose `scores` holds 'answer_relevancy' and 'faithfulness'
    ans_sim_eval_result: result whose `scores` holds 'answer_similarity'
    contexts_eval_result: result whose `scores` holds 'context_relevancy'
    """
    # Size the constant columns from the dataset itself: a hard-coded row count
    # raises a length-mismatch error whenever the dataset has a different size.
    n_rows = len(QCR_ds['question'])

    eval_df = pd.DataFrame({
        "question": QCR_ds['question'],
        "answer": QCR_ds['answer'],
        "contexts": QCR_ds['contexts'],
        "embedding_model": [embed_model]*n_rows,
        "chunk_size": [chunk_top_k_settings['chunk_size']]*n_rows,
        "chunk_overlap": [chunk_top_k_settings['chunk_overlap']]*n_rows,
        "sim_top_k": [chunk_top_k_settings['sim_top_k']]*n_rows,
        "sys_prompt": [sys_prompt]*n_rows,
        "Answer Relevancy": ans_eval_result.scores['answer_relevancy'],
        "Faithfulness": ans_eval_result.scores['faithfulness'],
        "Answer Semantics Similarity": ans_sim_eval_result.scores['answer_similarity'],
        "Context Relevancy": contexts_eval_result.scores['context_relevancy']
        })

    if os.path.exists(filepath):
        # The file is always written with the row index as its first column,
        # so read that column back as the index instead of as data.
        curr_eval_df = pd.read_csv(filepath, index_col=0)
        eval_df = pd.concat([curr_eval_df, eval_df], ignore_index=True)
    else:
        # Make sure the output directory exists so that results of a long
        # evaluation run are not lost to a missing folder.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    eval_df.to_csv(filepath, mode='w')

In [10]:
def append_hyper_param_eval_result(filepath, embedding_model, chunk_size, chunk_overlap, sim_top_k, long_or_short_sys_prompt, 
                                   answer_relevance, faithfulness, answer_similarity, context_relevance, 
                                   num_no_ans_Q):
    """
    Append one row of evaluation results, averaged over the sample questions,
    for a given hyperparameter/embedding model choice to a CSV file.

    The header row is written first when the file does not exist yet.
    The new row is also printed.
    """
    # First use: create the file and write its header row
    if not os.path.exists(filepath):
        header_columns = (
            'embedding_model', 'chunk_size', 'chunk_overlap', 'sim_top_k', 'long_or_short_sys_prompt',
            'Answer Relevance', 'Faithfulness', 'Answer Semantics Similarity', 'Context Relevance',
            'Number of No Answer Question',
        )
        with open(filepath, 'a') as header_file:
            header_file.write(','.join(header_columns) + '\n')

    # Metric values are written with three decimals
    record_values = (
        embedding_model, chunk_size, chunk_overlap, sim_top_k, long_or_short_sys_prompt,
        answer_relevance, faithfulness, answer_similarity, context_relevance,
        num_no_ans_Q,
    )
    new_record_str = "%s,%d,%d,%d,%s,%.3f,%.3f,%.3f,%.3f,%d\n" % record_values
    print(new_record_str)

    with open(filepath, 'a') as results_file:
        results_file.write(new_record_str)

In [11]:
def init_query_engine(vec_store_index, SIM_TOP_K, QA_PROMPT_TPL, similarity_cutoff=0.30):
    """
    Function to initialize LlamaIndex query engine, with specified similarity top k for retriever
    and query prompt template
    
    Parameters:
    vec_store_index: vector store index handed to the retriever
    SIM_TOP_K: value passed to the retriever as similarity_top_k
    QA_PROMPT_TPL: prompt template string installed as the text QA template
    similarity_cutoff: value passed to SimilarityPostprocessor as similarity_cutoff
                       (default 0.30, the value that used to be hard-coded)
    
    Output:
    query_engine: RetrieverQueryEngine with the customized QA prompt installed
    """
    print("Initializing Query Engine...")
    # configure retriever
    retriever = VectorIndexRetriever(
        index=vec_store_index,
        similarity_top_k=SIM_TOP_K,
    )

    # configure postprocessor
    postprocessor = SimilarityPostprocessor(
        similarity_cutoff=similarity_cutoff,
    )

    # configure response synthesizer
    response_synthesizer = get_response_synthesizer()

    # assemble query engine
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
        node_postprocessors=[postprocessor],
    )

    # Create a prompt template for the QA task
    qa_prompt_tmpl = PromptTemplate(QA_PROMPT_TPL)

    # Update the prompt in the query engine
    query_engine.update_prompts({"response_synthesizer:text_qa_template": qa_prompt_tmpl})
    return query_engine

In [12]:
# Grid search: nested loops over the candidate embedding models, the
# (chunk size, chunk overlap, similarity top-k) combinations and the system prompts.
# Each combination is queried, evaluated and appended to both result CSV files.
for embed_model_id in cand_EMBED_MODEL:
    print("Evaluating with Embedding Model: %s"%embed_model_id)
    # Set up embedding model: 'bge-small' ids go through HuggingFace, everything else through OpenAI
    if 'bge-small' in embed_model_id:
        Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_id, max_length=512)
    else:
        Settings.embed_model = OpenAIEmbedding(model=embed_model_id, max_length=1024)
    for chunk_top_k_settings in cand_CHUNK_SIM_TOP_K:
        print("Evaluating with (chunk size, chunk overlap, similarity-top-k): (%d,%d,%d)"%(chunk_top_k_settings['chunk_size'],
                                                                                           chunk_top_k_settings['chunk_overlap'],
                                                                                           chunk_top_k_settings['sim_top_k']))
        
        # Set up chunk settings
        Settings.chunk_size = chunk_top_k_settings['chunk_size']
        Settings.chunk_overlap = chunk_top_k_settings['chunk_overlap']
        print("Initializing index from documents...")
        # The index is rebuilt once per chunk setting and shared by both system prompts
        index = VectorStoreIndex.from_documents(documents)
        for ith_sys_prompt, sys_prompt in enumerate(cand_SYS_PROMPT):
            # Set up LLM and tokenizer 
            llm = OpenAI(
                temperature=0.3, 
                model=LLM_NAME, 
                system_prompt=sys_prompt,
            )
            Settings.llm = llm
            Settings.tokenizer = tiktoken.encoding_for_model(TOKENIZER_NAME).encode
            # Initialize query engine
            query_engine = init_query_engine(index,chunk_top_k_settings['sim_top_k'],QA_PROMPT_TPL_str)
            # perform queries and generate question-context-response data to be evaluated
            QCR_ds, num_no_ans_Q = gen_QCR_data(query_engine, sample_QA_pairs_df)
            # perform evaluation
            ans_eval, ans_sim_eval, contexts_eval = eval_QCR_data(QCR_ds)
            # Saving evaluation results
            save_QCR_eval_result(QCR_EVAL_SAVE_PATH, QCR_ds, embed_model_id, chunk_top_k_settings, 
                                 sys_prompt, ans_eval, ans_sim_eval, contexts_eval)
            
            # The first candidate prompt is recorded as "long", any other as "short";
            # the label is the only thing that differs between the two records.
            prompt_label = "long" if ith_sys_prompt == 0 else "short"
            append_hyper_param_eval_result(HYPER_PARAM_EVAL_SAVE_PATH,embed_model_id,chunk_top_k_settings['chunk_size'],
                                           chunk_top_k_settings['chunk_overlap'],chunk_top_k_settings['sim_top_k'],prompt_label,
                                           ans_eval['answer_relevancy'],ans_eval['faithfulness'],
                                           ans_sim_eval['answer_similarity'],contexts_eval['context_relevancy'],
                                           num_no_ans_Q)
            
            # Let the program sleep for 2 minutes to prevent exceeding OpenAI API rate limit
            print("Waiting for 2 minute...")
            sleep(120)
            
            

Evaluating with Embedding Model: text-embedding-3-small
Evaluating with (chunk size, chunk overlap, similarity-top-k): (512,20,2)
Initializing index from documents...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:27<00:00,  2.30s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9410, 'faithfulness': 0.8923}
{'context_relevancy': 0.1160}
{'answer_similarity': 0.9252}
text-embedding-3-small,512,20,2,long,0.941,0.892,0.925,0.116,0

Waiting for 1 minute...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:21<00:00,  2.24s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9113, 'faithfulness': 0.8769}
{'context_relevancy': 0.1091}
{'answer_similarity': 0.9248}
text-embedding-3-small,512,20,2,short,0.911,0.877,0.925,0.109,0

Waiting for 1 minute...
Evaluating with (chunk size, chunk overlap, similarity-top-k): (256,10,4)
Initializing index from documents...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:36<00:00,  2.41s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9456, 'faithfulness': 0.8813}
{'context_relevancy': 0.0755}
{'answer_similarity': 0.9240}
text-embedding-3-small,256,10,4,long,0.946,0.881,0.924,0.075,0

Waiting for 1 minute...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:34<00:00,  2.39s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9416, 'faithfulness': 0.8475}
{'context_relevancy': 0.0642}
{'answer_similarity': 0.9252}
text-embedding-3-small,256,10,4,short,0.942,0.847,0.925,0.064,0

Waiting for 1 minute...
Evaluating with (chunk size, chunk overlap, similarity-top-k): (128,10,8)
Initializing index from documents...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [04:27<00:00,  2.97s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9742, 'faithfulness': 0.8645}
{'context_relevancy': 0.0772}
{'answer_similarity': 0.9296}
text-embedding-3-small,128,10,8,long,0.974,0.864,0.930,0.077,0

Waiting for 1 minute...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [04:31<00:00,  3.02s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9745, 'faithfulness': 0.8733}
{'context_relevancy': 0.0654}
{'answer_similarity': 0.9253}
text-embedding-3-small,128,10,8,short,0.974,0.873,0.925,0.065,0

Waiting for 1 minute...
Evaluating with Embedding Model: BAAI/bge-small-en-v1.5
Evaluating with (chunk size, chunk overlap, similarity-top-k): (512,20,2)
Initializing index from documents...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [02:48<00:00,  1.87s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Task exception was never retrieved
future: <Task finished name='Task-2296' coro=<AsyncClient.aclose() done, defined at C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection.py", 

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9442, 'faithfulness': 0.9006}
{'context_relevancy': 0.0509}
{'answer_similarity': 0.9160}
BAAI/bge-small-en-v1.5,512,20,2,long,0.944,0.901,0.916,0.051,0

Waiting for 1 minute...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [02:49<00:00,  1.88s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9613, 'faithfulness': 0.8843}
{'context_relevancy': 0.0509}
{'answer_similarity': 0.9139}
BAAI/bge-small-en-v1.5,512,20,2,short,0.961,0.884,0.914,0.051,0

Waiting for 1 minute...
Evaluating with (chunk size, chunk overlap, similarity-top-k): (256,10,4)
Initializing index from documents...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [02:52<00:00,  1.92s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9670, 'faithfulness': 0.8514}
{'context_relevancy': 0.0654}
{'answer_similarity': 0.9220}
BAAI/bge-small-en-v1.5,256,10,4,long,0.967,0.851,0.922,0.065,0

Waiting for 1 minute...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [02:56<00:00,  1.96s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Task exception was never retrieved
future: <Task finished name='Task-3415' coro=<AsyncClient.aclose() done, defined at C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection.py", 

Task exception was never retrieved
future: <Task finished name='Task-3425' coro=<AsyncClient.aclose() done, defined at C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection.py", 

{'answer_relevancy': 0.9713, 'faithfulness': 0.8408}
{'context_relevancy': 0.0647}
{'answer_similarity': 0.9270}
BAAI/bge-small-en-v1.5,256,10,4,short,0.971,0.841,0.927,0.065,0

Waiting for 1 minute...
Evaluating with (chunk size, chunk overlap, similarity-top-k): (128,10,8)
Initializing index from documents...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:14<00:00,  2.16s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9503, 'faithfulness': 0.8164}
{'context_relevancy': 0.0966}
{'answer_similarity': 0.9252}
BAAI/bge-small-en-v1.5,128,10,8,long,0.950,0.816,0.925,0.097,0

Waiting for 1 minute...
Initializing Query Engine...
Performing queries for 90 sample questions...


100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [03:17<00:00,  2.19s/it]


In the 90 sample questions, 0 questions could not find relevant context.
Performing evaluation...


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

{'answer_relevancy': 0.9790, 'faithfulness': 0.8633}
{'context_relevancy': 0.0953}
{'answer_similarity': 0.9235}
BAAI/bge-small-en-v1.5,128,10,8,short,0.979,0.863,0.923,0.095,0

Waiting for 1 minute...


Task exception was never retrieved
future: <Task finished name='Task-718' coro=<AsyncClient.aclose() done, defined at C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection.py", l

Task exception was never retrieved
future: <Task finished name='Task-1438' coro=<AsyncClient.aclose() done, defined at C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection.py", 

Task exception was never retrieved
future: <Task finished name='Task-3604' coro=<AsyncClient.aclose() done, defined at C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 313, in aclose
    await self._close_connections(closing_connections)
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection_pool.py", line 305, in _close_connections
    await connection.aclose()
  File "C:\Users\jimta\anaconda3\envs\NLP_env\lib\site-packages\httpcore\_async\connection.py", 

In [14]:
# Display the averaged evaluation results written by append_hyper_param_eval_result
pd.read_csv(HYPER_PARAM_EVAL_SAVE_PATH)

Unnamed: 0,embedding_model,chunk_size,chunk_overlap,sim_top_k,long_or_short_sys_prompt,Answer Relevance,Faithfulness,Answer Semantics Similarity,Context Relevance,Number of No Answer Question
0,text-embedding-3-small,512,20,2,long,0.941,0.892,0.925,0.116,0
1,text-embedding-3-small,512,20,2,short,0.911,0.877,0.925,0.109,0
2,text-embedding-3-small,256,10,4,long,0.946,0.881,0.924,0.075,0
3,text-embedding-3-small,256,10,4,short,0.942,0.847,0.925,0.064,0
4,text-embedding-3-small,128,10,8,long,0.974,0.864,0.93,0.077,0
5,text-embedding-3-small,128,10,8,short,0.974,0.873,0.925,0.065,0
6,BAAI/bge-small-en-v1.5,512,20,2,long,0.944,0.901,0.916,0.051,0
7,BAAI/bge-small-en-v1.5,512,20,2,short,0.961,0.884,0.914,0.051,0
8,BAAI/bge-small-en-v1.5,256,10,4,long,0.967,0.851,0.922,0.065,0
9,BAAI/bge-small-en-v1.5,256,10,4,short,0.971,0.841,0.927,0.065,0
