# Notebook for exploring variations of the LLM, embeddings, RAG architecture and prompt

## DBs that are going to be retrieved

In [1]:
# names of the vector databases available to the retriever
dbs = ["full_docs", "fragments_docs", "posts_forum"]

## Embedding, chat and vectorstore options to be tested

In [2]:
# OpenAI embedding models under comparison
embedding_models = [
    "text-embedding-3-small",
    "text-embedding-3-large",
    "text-embedding-ada-002",
]

# OpenAI chat models under comparison
chat_models = [
    "gpt-3.5-turbo-0125",
    "gpt-4o",
]

# vector store backends under comparison (only FAISS for now)
vectorstores = ["faiss"]

## Imports

In [3]:
# general
import pandas as pd
import numpy as np
import os, asyncio, time, re
from getpass import getpass
from datetime import datetime
import tiktoken # metrics
import nest_asyncio
nest_asyncio.apply()

# embedding and chat
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# openai api key: asked for interactively (getpass does not echo the input)
# instead of being hard-coded in the notebook
openai_api_key = getpass("Enter the OpenAI API key: ")

# vectorstore
if 'faiss' in vectorstores:
    from langchain_community.vectorstores import FAISS

# for tracking
import weave
from weave import Evaluation

## Import test data 

In [4]:
test_path = "../002_create_test_dataset/questions_test_dataset.csv"

# Build the evaluation set in one pass:
#   1. read the questions CSV
#   2. expose the reference answer under the column name 'expected'
#   3. keep a reproducible sample of 50 questions
#   4. convert to a list of records (one dict per question)
# The 'origin' column is not dropped.
test_dataset = (
    pd.read_csv(test_path)
    .rename(columns={'answer': 'expected'})
    .sample(50, random_state=42)
    .to_dict(orient='records')
)
test_dataset

[{'question': 'How can I get involved in the Optimism Collective?',
  'expected': 'You can get involved in the Optimism Collective by following three principles: do what you love, fix problems together, and do it with optimism. There are various ways to contribute, such as helping with translations, improving documentation, participating in local events, or joining support programs like the NERD program.',
  'origin': 'documentation'},
 {'question': 'Can Optimism currently censor user transactions?',
  'expected': "No, even though the Optimism Foundation currently runs the sole sequencer on OP Mainnet, it does not have the ability to censor user transactions. However, decentralizing the sequencer is still a goal to further enhance the network's robustness and inclusivity.",
  'origin': 'documentation'},
 {'question': 'Who are the members of the proposed Decentralized Finance Governance Committee for Optimism?',
  'expected': 'The committee consists of Katie Garcia, GFX Labs, Flipside C

## General definitions for accessing data and creating model

In [5]:
def load_db(dbs, model_embeddings, vectorstore = 'faiss'):
    """Load one or more vector databases from disk and merge them into one.

    Args:
        dbs: non-empty list of database names; each one is read from
            ``dbs/{name}_db/faiss/{model_embeddings}``.
        model_embeddings: name of the OpenAI embedding model. It is used both to
            build the embedding function and to select the on-disk index.
        vectorstore: vector store backend; only ``'faiss'`` is supported.

    Returns:
        The first loaded FAISS store, with every other store merged into it.

    Raises:
        ValueError: if ``vectorstore`` is not supported or ``dbs`` is empty.
    """
    # fail fast with a clear message instead of an UnboundLocalError (unknown
    # backend) or an IndexError (no database names) further down
    if vectorstore != 'faiss':
        raise ValueError(f"Unsupported vectorstore: {vectorstore!r} (only 'faiss' is available)")
    if not dbs:
        raise ValueError("At least one database name is required")

    embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)

    db_paths = [f"dbs/{name}_db/faiss/{model_embeddings}" for name in dbs]
    # NOTE(security): allow_dangerous_deserialization lets the loader unpickle
    # the stored index; only point this at indexes from a trusted source
    stores = [
        FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
        for db_path in db_paths
    ]

    # merge_from works in place, so the first store accumulates all the others
    merged = stores[0]
    for other in stores[1:]:
        merged.merge_from(other)

    return merged

In [7]:
@weave.op()
def build_model(dbs_name, embeddings_name, chat_pars, prompt_template, vectorstore = 'faiss', retriever_pars = {}):
    """Build the two pieces of the RAG pipeline: a retriever and a prompt|LLM chain.

    Args:
        dbs_name: list of database names, forwarded to ``load_db``.
        embeddings_name: name of the embedding model, forwarded to ``load_db``.
        chat_pars: keyword arguments for ``ChatOpenAI`` (the API key is added here).
        prompt_template: template string handed to ``ChatPromptTemplate.from_template``.
        vectorstore: vector store backend; only ``'faiss'`` is supported.
        retriever_pars: keyword arguments for ``db.as_retriever``. The default
            dict is only unpacked, never mutated.

    Returns:
        A ``(retriever, chain)`` tuple.

    Raises:
        ValueError: if ``vectorstore`` is not supported.
    """
    # reject unknown backends before loading any index or creating a client;
    # otherwise `retriever` would never be assigned below
    if vectorstore != 'faiss':
        raise ValueError(f"Unsupported vectorstore: {vectorstore!r} (only 'faiss' is available)")

    db = load_db(dbs_name, embeddings_name, vectorstore)

    prompt = ChatPromptTemplate.from_template(prompt_template)
    llm = ChatOpenAI(**chat_pars, openai_api_key=openai_api_key)
    chain = prompt | llm

    retriever = db.as_retriever(**retriever_pars)

    return retriever, chain
    

class RAGModel(weave.Model):
    """Simple RAG pipeline: retrieve context for a question, then ask the LLM.

    The retriever and the chain are rebuilt through ``build_model`` on every
    call to ``predict``.
    """
    structure : str = "simple-rag" # just a retriever and a llm

    dbs_name : list # database names, forwarded to build_model
    embeddings_name : str # embedding model name, forwarded to build_model

    vectorstore : str # vector store backend, forwarded to build_model
    retriever_pars : dict # keyword arguments for the retriever

    prompt_template : str # prompt template, filled with "context" and "question"
    chat_pars : dict[str, str|int] # keyword arguments for the chat model

    @weave.op()
    def predict(self, question: str):
        """Answer ``question`` using the retrieved context.

        Returns:
            A dict with the retrieved context as a string (``"context"``) and the
            content of the model response (``"answer"``).
        """
        retriever, chain = build_model(self.dbs_name, self.embeddings_name, self.chat_pars, self.prompt_template, self.vectorstore, self.retriever_pars)

        # build_model only hands back a retriever for a supported backend, so no
        # backend check is needed here; this also guarantees `context` is bound
        context = retriever.invoke(question)

        response = chain.invoke(
            {
                "context": context,
                "question": question,
            }
        )

        return {"context": str(context), "answer": response.content}

    def ask(self, question: str):
        """Convenience wrapper around ``predict`` that returns only the answer text."""
        out = self.predict(question)
        return out["answer"]
    

## Metrics

In [8]:
# reference embedding model used by the embedding-based metrics
reference_embedding = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai_api_key)

# reference chat model used by the LLM-based metrics
reference_chat = ChatOpenAI(model="gpt-4o", openai_api_key=openai_api_key)

# reference tokenizer used to count tokens in the conciseness metrics
reference_tokenization = tiktoken.get_encoding("cl100k_base")

In [9]:
# EVALUATE THE RETRIEVER
def calc_context_recall(context, question, expected):
    """Estimate how much of the expected answer is supported by the context.

    Makes two calls to ``reference_chat``: the first breaks the expected answer
    into statements, the second asks for a Yes/No verdict per statement with the
    verdict as the final word of each line.

    Args:
        context: retrieved context, inserted as text into the judging prompt.
        question: the question being answered.
        expected: the answer whose statements are checked against the context.

    Returns:
        The fraction of parsed verdicts that are "yes"; 0 when no verdict could
        be parsed from the reply.
    """
    # measure if the expected meaning is contained in the context meaning
    # info: https://aclanthology.org/2024.eacl-demo.16.pdf

    # we try to find a set of fundamental statements that encompass the meaning of the answer
    statements = reference_chat.invoke(
        f"Given a question and answer, return the fundamental statements from the answer's meaning. \n question: {question} \n answer: {expected}"
    ).content
    # we try to find if the statements are supported by the context
    veredicts = reference_chat.invoke(
        f"Consider the given context and following statements, then determine whether they are supported by the information present in the context. Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Let the veredict be the final word of each line. \n\n <context> \n {context} <\\context> \n\n <statements> \n {statements} \n <\\statements>"
    ).content

    n_yes = 0
    n_no = 0
    for line in veredicts.splitlines():
        # take the last alphabetic token of the line, so punctuation, markdown
        # emphasis or trailing spaces around the verdict do not hide it
        tokens = re.findall(r"[a-z]+", line.lower())
        if not tokens:
            continue
        # exact comparison: a substring test would count words such as
        # "notes", "known" or "eyes" as verdicts
        veredict = tokens[-1]
        if veredict == "yes":
            n_yes += 1
        elif veredict == "no":
            n_no += 1

    n_total = n_yes + n_no
    if n_total == 0:
        # no verdict could be parsed
        return 0

    return n_yes / n_total

def calc_context_conciseness(context, expected):
    """Compare the size of the context with the size of the expected answer.

    Both texts are tokenized with ``reference_tokenization``.

    Args:
        context: retrieved context as a string.
        expected: expected answer as a string.

    Returns:
        Number of tokens in ``expected`` divided by the number of tokens in
        ``context``; 0 when the context has no tokens.
    """
    # measure the size of the context compared to the expected answer
    num_tokens_context = len(reference_tokenization.encode(context))
    num_tokens_expected = len(reference_tokenization.encode(expected))

    # an empty context cannot be scored as a ratio; report 0 instead of raising
    if num_tokens_context == 0:
        return 0

    return num_tokens_expected / num_tokens_context

# EVALUATE THE LLM
def calc_answer_relevance(answer, question):
    """Score how much the answer looks like an answer to the question.

    ``reference_chat`` is asked to generate a question for the answer; the
    score is the cosine similarity between the embedding of that generated
    question and the embedding of the real question.
    info: https://aclanthology.org/2024.eacl-demo.16.pdf
    """
    generated_question = reference_chat.invoke(
        f"Generate a question for the given answer. \n answer: {answer}"
    ).content

    vec_generated = np.array(reference_embedding.embed_query(generated_question))
    vec_question = np.array(reference_embedding.embed_query(question))

    # cosine similarity = dot product over the product of the norms
    norm_product = np.linalg.norm(vec_generated) * np.linalg.norm(vec_question)
    return np.dot(vec_generated, vec_question) / norm_product

def calc_faithfulness(context, question, answer):
    """Score whether the meaning of the answer is contained in the context.

    Same procedure as ``calc_context_recall``, with the generated answer taking
    the place of the expected answer.
    info: https://aclanthology.org/2024.eacl-demo.16.pdf
    """
    return calc_context_recall(context=context, question=question, expected=answer)


# EVALUATE END-TO-END
def calc_answer_semantic_similarity(answer, expected):
    """Cosine similarity between the embeddings of the answer and the expected answer.

    Both texts are embedded with ``reference_embedding``.
    info: https://docs.ragas.io/en/latest/concepts/metrics/semantic_similarity.html
    """
    vec_answer = np.array(reference_embedding.embed_query(answer))
    vec_expected = np.array(reference_embedding.embed_query(expected))

    # cosine similarity = dot product over the product of the norms
    norm_product = np.linalg.norm(vec_answer) * np.linalg.norm(vec_expected)
    return np.dot(vec_answer, vec_expected) / norm_product

def calc_answer_conciseness(answer, expected):
    """Compare the size of the answer with the size of the expected answer.

    Both texts are tokenized with ``reference_tokenization``.

    Args:
        answer: generated answer as a string.
        expected: expected answer as a string.

    Returns:
        Number of tokens in ``expected`` divided by the number of tokens in
        ``answer``; 0 when the answer has no tokens.
    """
    # measure the size of the answer compared to the expected answer
    num_tokens_answer = len(reference_tokenization.encode(answer))
    num_tokens_expected = len(reference_tokenization.encode(expected))

    # an empty answer cannot be scored as a ratio; report 0 instead of raising
    if num_tokens_answer == 0:
        return 0

    return num_tokens_expected / num_tokens_answer

# ALL
def calc_metrics(question, expected, context, answer):
    """Compute every metric for one question and group the scores by pipeline stage.

    Returns:
        A dict with the keys ``"retriever"``, ``"llm"`` and ``"end-to-end"``,
        each one mapping metric names to their scores.
    """
    # the metrics are evaluated in the same order as they appear in the result
    retriever_scores = {
        "context_recall": calc_context_recall(context, question, expected),
        "context_conciseness": calc_context_conciseness(context, expected),
    }
    llm_scores = {
        "answer_relevance": calc_answer_relevance(answer, question),
        "faithfulness": calc_faithfulness(context, question, answer),
    }
    end_to_end_scores = {
        "answer_semantic_similarity": calc_answer_semantic_similarity(answer, expected),
        "answer_conciseness": calc_answer_conciseness(answer, expected),
    }

    return {
        "retriever": retriever_scores,
        "llm": llm_scores,
        "end-to-end": end_to_end_scores,
    }

In [10]:
@weave.op()
def eval_model(question: str, expected: str, model_output: dict) -> dict:
    """Scorer: compute all metrics for one dataset row and its model output.

    ``model_output`` must provide the keys ``"context"`` and ``"answer"``.
    """
    context = model_output["context"]
    answer = model_output["answer"]
    return calc_metrics(question, expected, context, answer)

In [11]:
def run_rag_evaluation(rag, test_dataset=test_dataset):
    """Evaluate ``rag.predict`` on ``test_dataset`` with ``eval_model`` as the scorer.

    The configuration of ``rag`` is attached to the run through
    ``weave.attributes``. Nothing is returned.
    """
    evaluation = Evaluation(dataset=test_dataset, scorers=[eval_model])

    # configuration of the model under test, attached to the evaluation run
    run_attributes = {
        'dbs': rag.dbs_name,
        'embeddings': rag.embeddings_name,
        'chat_pars': rag.chat_pars,
        'prompt_template': rag.prompt_template,
        'retriever_pars': rag.retriever_pars,
        'vectorstore': rag.vectorstore,
        "structure": rag.structure,
    }

    with weave.attributes(run_attributes):
        asyncio.run(evaluation.evaluate(rag.predict))

## Tests

In [12]:
# initialise Weave for the 'op-ai-tools' project (see the output below for
# where the data can be viewed)
weave.init('op-ai-tools')

weave version 0.50.6 is available!  To upgrade, please run:
 $ pip install weave --upgrade
Logged in as Weights & Biases user: victorsouza.
View Weave data at https://wandb.ai/bleu-builders/op-ai-tools/weave




In [20]:
# chat model settings for this run (max_tokens and timeout are not set)
chat_pars = dict(
    model=chat_models[1],
    temperature=0,
    max_retries=2,
)

# the doubled braces keep {context} and {question} as template placeholders
prompt_template = f"""Answer politely the question at the end, using only the following context. The user is not necessarily a specialist, so please avoid jargon and explain any technical terms.

<context>
{{context}} 
</context>

Question: {{question}}
"""

# configuration under test: first database only, third embedding model,
# FAISS retriever returning a single document
rag = RAGModel(
    dbs_name=[dbs[0]],
    embeddings_name=embedding_models[2],
    vectorstore='faiss',
    retriever_pars={"search_kwargs": {'k': 1}},
    prompt_template=prompt_template,
    chat_pars=chat_pars,
)

run_rag_evaluation(rag)

🍩 https://wandb.ai/bleu-builders/op-ai-tools/r/call/054987ad-f50c-49be-a52d-a507849907f4
