# RAG Evaluations

In [1]:
import os
import dspy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir('../')

In [3]:
from src.chromadb_rm import ChromadbRM

In [4]:
class GenerateAnswer(dspy.Signature):
    """Answer questions given the context"""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Short factual answer to the question. 1 - 5 words long.")

class RAG(dspy.Module):
    def __init__(self, num_passages=5):
        super().__init__()
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=context, answer=prediction.answer)

In [5]:
def setup():
    """
    Setup the dsypy and retrieval models
    """

    turbo = dspy.OpenAI(model='gpt-3.5-turbo')

    chroma_rm = ChromadbRM(collection_name="test-overlap-0", persist_directory="chroma.db", local_embed_model="sentence-transformers/paraphrase-MiniLM-L6-v2",
                                   openai_api_key=os.environ["OPENAI_API_KEY"])

    dspy.settings.configure(lm=turbo, rm=chroma_rm)
    
    rag = RAG()

    return rag

In [9]:
rag = setup()

Collection Count: 0




In [29]:
# Read question, ground_truths from ./data/processed/synthetic_dataset.csv
import pandas as pd

df = pd.read_excel("data/processed/nba.xlsx")

# df = df[['Question', 'Answer']]

In [34]:
df2 = pd.read_csv("data/processed/xx.csv")
df2.head()

Unnamed: 0,Question,Answer,Context,Source
0,How many fouls do you get in an NBA game?,A player is disqualified from the game when he...,Each team shall consist of five players. A pla...,2023-24-NBA-Season-Official-Playing-Rules.pdf
1,How many seconds can you hold the ball without...,The “shot clock” refers to the timing device t...,The “shot clock” refers to the timing device t...,2023-24-NBA-Season-Official-Playing-Rules.pdf
2,How many times do they change sides in basketb...,The teams change baskets for the second half. ...,a. A team’s basket consists of the basket ring...,2023-24-NBA-Season-Official-Playing-Rules.pdf
3,What constitutes a technically foul nowadays?,A technical foul is the penalty for unsportsma...,b. A technical foul is the penalty for unsport...,2023-24-NBA-Season-Official-Playing-Rules.pdf
4,Giannis takes 15 seconds to shoot free throws....,This attempt of free throw must be made within...,A free throw is the privilege given a player t...,2023-24-NBA-Season-Official-Playing-Rules.pdf


In [30]:
df.head()

Unnamed: 0,Question,Answer,Context,Source
0,How many fouls do you get in an NBA game?,A player is disqualified from the game when he...,Each team shall consist of five players. A pla...,2023-24-NBA-Season-Official-Playing-Rules.pdf
1,How many seconds can you hold the ball without...,The “shot clock” refers to the timing device t...,The “shot clock” refers to the timing device t...,2023-24-NBA-Season-Official-Playing-Rules.pdf
2,How many times do they change sides in basketb...,The teams change baskets for the second half. ...,a. A team’s basket consists of the basket ring...,2023-24-NBA-Season-Official-Playing-Rules.pdf
3,What constitutes a technically foul nowadays?,A technical foul is the penalty for unsportsma...,b. A technical foul is the penalty for unsport...,2023-24-NBA-Season-Official-Playing-Rules.pdf
4,Giannis takes 15 seconds to shoot free throws....,This attempt of free throw must be made within...,A free throw is the privilege given a player t...,2023-24-NBA-Season-Official-Playing-Rules.pdf


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# split the data into train and test
train, test = train_test_split(df, test_size=0.2)

In [16]:
# save the train and test data
train.to_csv("./data/processed/train_synthetic.csv", index=False)
test.to_csv("./data/processed/test_synthetic.csv", index=False)

# load the train and test data
train = pd.read_csv("./data/processed/train_synthetic.csv")
test = pd.read_csv("./data/processed/test_synthetic.csv")

In [25]:
import torch.nn as nn

# Example model definition
model = nn.Linear(10, 2)  # A simple linear model


In [27]:
import torch

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Move tensors and models to MPS
model = model.to(device)
input_tensor = input_tensor.to(device)


NameError: name 'input_tensor' is not defined

In [28]:
import tqdm

# Create an empty list to store rows
eval_results_rows = []

for index, row in test.iterrows():
    # Get the question
    question = row['Question']
    # Response from rag
    response = rag(question)
    # Create a dictionary to represent a row
    row_dict = {'question': question, 'contexts': response.context, 'answer': response.answer, 'ground_truths' : row['ground_truths']}
    # Append the row dictionary to the list
    eval_results_rows.append(row_dict)

# Create the df_eval_results DataFrame from the list of rows
df_eval_results = pd.DataFrame(eval_results_rows)


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [13]:
df_eval_results

Unnamed: 0,question,contexts,answer,ground_truths
0,Who developed the Dvorak technique?,"[= dvorak technique =, . development of the ob...",Vernon Dvorak,['Vernon Dvorak']
1,When did No. 20 Squadron relocate to Cairns?,"[. now comprising 252 officers and men, the sq...",11 November 1942.,['11 November 1942.']
2,Who won the Claxton Shield's Most Valuable Pla...,"[. at the baseball australia diamond awards, h...",Wayne Lundgren,['Wayne Lundgren.']
3,Q: Who did Lesnar attack after Rollins refused...,[. lesnar won the match and his third wwe cham...,"The new day, the league of nations, and Kevin ...","['A: Booker T, John ""Bradshaw"" Layfield, Micha..."
4,"Who is the ghost in the poem ""Little Gidding""?","[. within the poem, the narrator meets a ghost...",Combination of poets and literary figures.,['Combination of literary figures.']
...,...,...,...,...
81,Who did Brock Lesnar defeat to become the new ...,"[. with the victory, lesnar became the undispu...",Shane Carwin,['Randy Couture.']
82,When did Typhoon Kujira occur?,"[. however, ultimately any effects in the arch...",April,['April.']
83,What is the main sport in Manila?,[sports in manila have a long and distinguishe...,Basketball.,['Basketball.']
84,Who ordered No. 202 Squadron RAF to Gibraltar?,[. so at 09 : 00 ( utc ) on the 9 september 19...,The order was given by an unspecified authority.,['Admiralty']


In [14]:
import ast

# df_eval_results ground_truths to list
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(lambda x: ast.literal_eval(x))

In [15]:
# Save the df_eval_results DataFrame to a csv file
import time
EXP_NAME = "SIMPLE_RAG_NO_OVERLAP"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
df_eval_results.to_csv('./results/inference_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

Now, that we have answers for all the questions, we can evaluate the RAG model.

In [16]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_similarity,
    context_relevancy
)
from datasets import Dataset
from ragas import evaluate

ds = Dataset.from_pandas(df_eval_results)

result = evaluate(
    ds,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_similarity,
        context_relevancy
    ],
)

evaluating with [context_precision]


100%|██████████| 6/6 [00:28<00:00,  4.83s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [faithfulness]


100%|██████████| 6/6 [00:29<00:00,  5.00s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_relevancy]


100%|██████████| 6/6 [00:52<00:00,  8.83s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_recall]


100%|██████████| 6/6 [00:22<00:00,  3.69s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_similarity]


100%|██████████| 6/6 [00:03<00:00,  1.56it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_relevancy]


100%|██████████| 6/6 [00:12<00:00,  2.01s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [17]:
result

{'context_precision': 0.5798, 'faithfulness': 0.6705, 'answer_relevancy': 0.8434, 'context_recall': 0.6988, 'answer_similarity': 0.8933, 'context_relevancy': 0.1628}

In [18]:
# save the result
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [19]:
result.to_pandas().head(20)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,question,contexts,answer,ground_truths,context_precision,faithfulness,answer_relevancy,context_recall,answer_similarity,context_relevancy
0,Who developed the Dvorak technique?,"[= dvorak technique =, . development of the ob...",Vernon Dvorak,[Vernon Dvorak],0.866667,1.0,0.999999,1.0,1.000000,0.125000
1,When did No. 20 Squadron relocate to Cairns?,"[. now comprising 252 officers and men, the sq...",11 November 1942.,[11 November 1942.],1.000000,1.0,0.952935,1.0,1.000000,0.181818
2,Who won the Claxton Shield's Most Valuable Pla...,"[. at the baseball australia diamond awards, h...",Wayne Lundgren,[Wayne Lundgren.],1.000000,1.0,0.844966,1.0,0.985646,0.000000
3,Q: Who did Lesnar attack after Rollins refused...,[. lesnar won the match and his third wwe cham...,"The new day, the league of nations, and Kevin ...","[A: Booker T, John ""Bradshaw"" Layfield, Michae...",0.000000,0.0,0.887968,0.0,0.792667,0.111111
4,"Who is the ghost in the poem ""Little Gidding""?","[. within the poem, the narrator meets a ghost...",Combination of poets and literary figures.,[Combination of literary figures.],0.200000,1.0,0.892179,1.0,0.958397,0.181818
...,...,...,...,...,...,...,...,...,...,...
81,Who did Brock Lesnar defeat to become the new ...,"[. with the victory, lesnar became the undispu...",Shane Carwin,[Randy Couture.],0.500000,1.0,0.982244,0.0,0.870798,0.428571
82,When did Typhoon Kujira occur?,"[. however, ultimately any effects in the arch...",April,[April.],0.638889,1.0,0.926135,1.0,0.929672,0.000000
83,What is the main sport in Manila?,[sports in manila have a long and distinguishe...,Basketball.,[Basketball.],1.000000,1.0,1.000000,1.0,1.000000,0.300000
84,Who ordered No. 202 Squadron RAF to Gibraltar?,[. so at 09 : 00 ( utc ) on the 9 september 19...,The order was given by an unspecified authority.,[Admiralty],1.000000,0.5,0.987047,0.0,0.812652,0.200000


In [20]:
# Logging to wandb

import wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",
    
    # track hyperparameters and run metadata
    config={
        "number_of_questions": len(ds),
        "comments": "Simple QA RAG model with no teleprompter - chunk overlap size 0",
        "model": "RAG",
        "dataset": "Synthetic",
        "num_passages": 5,
        "openai_model": "gpt-3.5-turbo",
        "chroma_collection_name": "test-overlap-64",
        "chroma_persist_directory": "chroma.db",
        "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",

    }
)

wandb.log(result)

wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mprasadshreyas[0m. Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_relevancy,▁
answer_similarity,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.84338
answer_similarity,0.89332
context_precision,0.57984
context_recall,0.69884
context_relevancy,0.16279
faithfulness,0.67054


----

Now, let's compile the RAG using teleprompters.

In [21]:
train.reset_index(inplace=True, drop=True)

In [22]:
train = train[:10]

In [23]:
train

Unnamed: 0,question,ground_truths
0,Who was the youngest Cy Young winner before th...,['Dwight Gooden']
1,How many crewmen were estimated to be wounded ...,['111-121 crewmen.']
2,What were the design faults of the Tetrarch tank?,"['Size limitation, crew shortage, cooling syst..."
3,What was the maximum speed of the Mk VII tank?,['40 miles per hour.']
4,What is the distinguishing feature of metoposa...,['Positioning of eye sockets.']
5,How do temnospondyls change during metamorphosis?,['Reshaping and strengthening of bones.']
6,"Who directed the film ""The Heart of Ezra Greer""?",['Emile Chautard']
7,Who owned the Spanish villa used as a base by ...,['Italian officer']
8,Where is San Lorenzo Colossal Head 2 currently...,['Mexico City.']
9,Q: What was the mission of the US 2nd Division...,['A: Destroy North Koreans and restore river l...


In [24]:
import ast

trainset = []
for i in range(5):
    ex = dspy.Example(
        question=train['question'].iloc[i],
        answer=ast.literal_eval(train['ground_truths'].iloc[i])[0]
    )
    ex = ex.with_inputs('question')
    trainset.append(ex)

In [25]:
trainset

[Example({'question': 'Who was the youngest Cy Young winner before the person mentioned in the context?', 'answer': 'Dwight Gooden'}) (input_keys={'question'}),
 Example({'question': 'How many crewmen were estimated to be wounded during the attack on Hyūga?', 'answer': '111-121 crewmen.'}) (input_keys={'question'}),
 Example({'question': 'What were the design faults of the Tetrarch tank?', 'answer': 'Size limitation, crew shortage, cooling system.'}) (input_keys={'question'}),
 Example({'question': 'What was the maximum speed of the Mk VII tank?', 'answer': '40 miles per hour.'}) (input_keys={'question'}),
 Example({'question': 'What is the distinguishing feature of metoposaurids compared to capitosauroids?', 'answer': 'Positioning of eye sockets.'}) (input_keys={'question'})]

In [26]:
from dspy.teleprompt import BootstrapFewShot

# Validation logic: check that the predicted answer is correct.
# Also check that the retrieved context does actually contain that answer.
def validate_context_and_answer(example, pred, trace=None):
    answer_EM = dspy.evaluate.answer_exact_match(example, pred)
    answer_PM = dspy.evaluate.answer_passage_match(example, pred)
    return answer_EM and answer_PM

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

100%|██████████| 5/5 [00:05<00:00,  1.01s/it]

Bootstrapped 1 full traces after 5 examples in round 0.





In [27]:
import ast
def get_evals(dataset, rag):
    # Create an empty list to store rows
    eval_results_rows = []

    for index, row in dataset.iterrows():
        # Get the question
        question = row['question']
        # Response from rag
        response = rag(question)
        # Create a dictionary to represent a row
        row_dict = {'question': question, 'contexts': response.context, 'answer': response.answer, 'ground_truths' : row['ground_truths']}
        # Append the row dictionary to the list
        eval_results_rows.append(row_dict)

    # Create the df_eval_results DataFrame from the list of rows
    df_eval_results = pd.DataFrame(eval_results_rows)

    # Convert 'ground_truths' column to list
    df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(lambda x: ast.literal_eval(x))

    return df_eval_results


In [28]:

df_eval_results = get_evals(test, compiled_rag)


In [29]:
# Save the df_eval_results DataFrame to a csv file
import time
EXP_NAME = "COMPILED_RAG_OVERLAP_0"
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
df_eval_results.to_csv('./results/inference_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

Now, that we have answers for all the questions, we can evaluate the RAG model.

In [30]:
ds = Dataset.from_pandas(df_eval_results)

result = evaluate(
    ds,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        answer_similarity,
        context_relevancy
    ],
)

evaluating with [context_precision]


100%|██████████| 6/6 [00:19<00:00,  3.27s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [faithfulness]


100%|██████████| 6/6 [00:29<00:00,  4.97s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_relevancy]


100%|██████████| 6/6 [00:48<00:00,  8.14s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_recall]


100%|██████████| 6/6 [00:19<00:00,  3.19s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [answer_similarity]


100%|██████████| 6/6 [00:05<00:00,  1.08it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


evaluating with [context_relevancy]


100%|██████████| 6/6 [00:12<00:00,  2.02s/it]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [31]:
result

{'context_precision': 0.5776, 'faithfulness': 0.6512, 'answer_relevancy': 0.8535, 'context_recall': 0.6734, 'answer_similarity': 0.8925, 'context_relevancy': 0.1689}

In [32]:
# save the result
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [33]:
result.to_pandas()

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,question,contexts,answer,ground_truths,context_precision,faithfulness,answer_relevancy,context_recall,answer_similarity,context_relevancy
0,Who developed the Dvorak technique?,"[= dvorak technique =, . development of the ob...",Vernon Dvorak.,[Vernon Dvorak],0.866667,1.0,1.000000,1.0,0.986971,0.125000
1,When did No. 20 Squadron relocate to Cairns?,"[. now comprising 252 officers and men, the sq...","November 11, 1942.",[11 November 1942.],1.000000,1.0,0.952935,1.0,0.979184,0.181818
2,Who won the Claxton Shield's Most Valuable Pla...,"[. at the baseball australia diamond awards, h...",Wayne Lundgren.,[Wayne Lundgren.],1.000000,1.0,0.844997,1.0,1.000000,0.000000
3,Q: Who did Lesnar attack after Rollins refused...,[. lesnar won the match and his third wwe cham...,"The New Day, the League of Nations, and Kevin ...","[A: Booker T, John ""Bradshaw"" Layfield, Michae...",0.000000,0.0,0.887968,0.0,0.812121,0.111111
4,"Who is the ghost in the poem ""Little Gidding""?","[. within the poem, the narrator meets a ghost...",The ghost represents a combination of various ...,[Combination of literary figures.],0.200000,1.0,0.918157,1.0,0.879476,0.181818
...,...,...,...,...,...,...,...,...,...,...
81,Who did Brock Lesnar defeat to become the new ...,"[. with the victory, lesnar became the undispu...",Shane Carwin.,[Randy Couture.],0.500000,1.0,0.982244,0.0,0.880024,0.428571
82,When did Typhoon Kujira occur?,"[. however, ultimately any effects in the arch...",April.,[April.],0.638889,1.0,0.926135,1.0,0.999999,0.000000
83,What is the main sport in Manila?,[sports in manila have a long and distinguishe...,Basketball.,[Basketball.],1.000000,1.0,1.000000,1.0,0.999999,0.300000
84,Who ordered No. 202 Squadron RAF to Gibraltar?,[. so at 09 : 00 ( utc ) on the 9 september 19...,The person who gave the order is not mentioned...,[Admiralty],1.000000,0.0,0.000000,0.0,0.786328,0.200000


In [34]:
# Logging to wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",
    
    # track hyperparameters and run metadata
    config={
        "number_of_questions": len(ds),
        "comments": "Compiled QA RAG model with teleprompter - OVERLAP 0",
        "model": "RAG",
        "dataset": "Synthetic",
        "num_passages": 5,
        "openai_model": "gpt-3.5-turbo",
        "chroma_collection_name": "test",
        "chroma_persist_directory": "chroma.db",
        "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",

    }
)

wandb.log(result)

wandb.finish()



VBox(children=(Label(value='0.003 MB of 0.020 MB uploaded\r'), FloatProgress(value=0.15790969820438716, max=1.…

0,1
answer_relevancy,▁
answer_similarity,▁
context_precision,▁
context_recall,▁
context_relevancy,▁
faithfulness,▁

0,1
answer_relevancy,0.85346
answer_similarity,0.89248
context_precision,0.57757
context_recall,0.67345
context_relevancy,0.16885
faithfulness,0.65116


-------

No Retrieval
---

In [None]:
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""

    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [None]:
# Define the predictor.
generate_answer = dspy.Predict(BasicQA)

In [None]:
eval_results_rows = []

for index, row in test.iterrows():
    # Get the question
    question = row['question']
    # Response from rag
    response = generate_answer(question = question)
    # Create a dictionary to represent a row
    row_dict = {'question': question, 'answer': response.answer, 'ground_truths' : row['ground_truths']}
    # Append the row dictionary to the list
    eval_results_rows.append(row_dict)

# Create the df_eval_results DataFrame from the list of rows
df_eval_results = pd.DataFrame(eval_results_rows)

# Convert 'ground_truths' column to list
df_eval_results['ground_truths'] = df_eval_results['ground_truths'].apply(lambda x: ast.literal_eval(x))

In [None]:
ds = Dataset.from_pandas(df_eval_results)

result = evaluate(
    ds,
    metrics=[
        answer_similarity
    ],
)

evaluating with [answer_similarity]


100%|██████████| 6/6 [00:03<00:00,  1.59it/s]
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
result

{'answer_similarity': 0.8535}

In [None]:
EXP_NAME = "BASIC_QA_OVERLAP_64"
# save the result
result.to_pandas().to_csv('./results/evaluation_' + EXP_NAME + '_' + TIMESTAMP + '.csv', index=False)

In [None]:
result.to_pandas()

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Unnamed: 0,question,answer,ground_truths,answer_similarity
0,What type of teeth do temnospondyls have on th...,Conical teeth,[Tusks.],0.815168
1,Who were the Principal Architects for Palestin...,Sir Ronald Storrs and Sir William Fisher,[Sir John James Burnet and Thomas Smith Tait.],0.844589
2,What is the title of Brock Lesnar's autobiogra...,Death Clutch,[Death Clutch],1.000000
3,Where is the replica of San Lorenzo Head 8 loc...,"Museo Nacional de Antropología, Mexico City",[Utah Cultural Celebration Center.],0.809204
4,What was the main flaw in the design of the Fu...,Weak armor,[Midships gun turrets.],0.792137
...,...,...,...,...
81,Q: How many Marines and Navy SEALs were part o...,A: 30,[A: 51 Marines and 9 Navy SEALs.],0.803497
82,What is the estimated weight of the La Cobata ...,Approximately 20 tons.,[40 tons.],0.878407
83,What is the name of the group of temnospondyls...,Metoposaurids,[Stereospondyli.],0.824162
84,Who promoted Brad Stevens to a full-time assis...,Doc Rivers,[Todd Lickliter.],0.790983


In [None]:
# Logging to wandb

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="wikitext-rag-eval",
    
    # track hyperparameters and run metadata
    config={
        "number_of_questions": len(ds),
        "comments": "No RAG model - just basic QA model - OVERLAP 64",
        "model": "RAG",
        "dataset": "Synthetic",
        "num_passages": 5,
        "openai_model": "gpt-3.5-turbo",
        "chroma_collection_name": "test",
        "chroma_persist_directory": "chroma.db",
        "chroma_local_embed_model": "sentence-transformers/paraphrase-MiniLM-L6-v2",

    }
)

wandb.log(result)

wandb.finish()



VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
answer_similarity,▁

0,1
answer_similarity,0.85353
