# Notebook for exploring variations on LLM, embeddings, RAG arch and prompt

## DBs that are going to be retrieved

In [3]:
dbs = [
    "full_docs",
    "fragments_docs",
    "posts_forum"
]

## Options of embedding, chat and vectorscore to be tested

In [4]:
# we are going to test the open ai api for embeddings
embedding_models = ["text-embedding-3-small", "text-embedding-3-large", "text-embedding-ada-002"]

# we are going to test the open ai api for chat
chat_models = ["gpt-3.5-turbo-0125", "gpt-3.5-turbo-instruct", "gpt-4o", "gpt-4o-2024-05-13"]

# we are going to use faiss
vectorstores = ['faiss']

## Imports

In [5]:
# general
import pandas as pd
import numpy as np
import os, asyncio, time, re
from getpass import getpass
from datetime import datetime

# embedding and chat
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# openai api key
openai_api_key = getpass("Enter the OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = openai_api_key

# vectorstore
if 'faiss' in vectorstores:
    from langchain_community.vectorstores import FAISS

# for tracking
import weave
from weave import Evaluation

## Import test data 

In [6]:
test_path = "../002_create_test_dataset/questions_test_dataset.csv"
test_dataset = pd.read_csv(test_path)

# drop origin
test_dataset = test_dataset.drop(columns=['origin'])

# change columns
test_dataset = test_dataset.rename(columns={'answer': 'expected'})

# sample 50 questions
test_dataset = test_dataset.sample(5, random_state=42)

# as dict
test_dataset = test_dataset.to_dict(orient='records')
test_dataset

[{'question': 'How can I get involved in the Optimism Collective?',
  'expected': 'You can get involved in the Optimism Collective by following three principles: do what you love, fix problems together, and do it with optimism. There are various ways to contribute, such as helping with translations, improving documentation, participating in local events, or joining support programs like the NERD program.'},
 {'question': 'Can Optimism currently censor user transactions?',
  'expected': "No, even though the Optimism Foundation currently runs the sole sequencer on OP Mainnet, it does not have the ability to censor user transactions. However, decentralizing the sequencer is still a goal to further enhance the network's robustness and inclusivity."},
 {'question': 'Who are the members of the proposed Decentralized Finance Governance Committee for Optimism?',
  'expected': 'The committee consists of Katie Garcia, GFX Labs, Flipside Crypto, StableNode, and Linda Xie.'},
 {'question': 'What w

## General definitions for accessing data and creating model

In [7]:
def load_db(dbs, model_embeddings, vectorstore = 'faiss'):
    embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)
    if vectorstore == 'faiss':
        dbs = [f"dbs/{name}_db/faiss/{model_embeddings}" for name in dbs]
        dbs = [FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True) for db_path in dbs]
        db = dbs[0]
        for db_ in dbs[1:]:
            db.merge_from(db_)
    
    return db

In [8]:
class RAGModel():
    @weave.op()
    def __init__(self, dbs, model_embeddings, chat_pars, prompt_template, vectorstore = 'faiss'):
        self.dbs_name = dbs
        self.embeddings_name = model_embeddings
        self.vectorstore_name = vectorstore

        self.db = load_db(dbs, model_embeddings, vectorstore)

        prompt = ChatPromptTemplate.from_template(prompt_template)
        llm = ChatOpenAI(**chat_pars, openai_api_key=openai_api_key)
        self.chain = prompt | llm

        self.retriever = None
            
    @weave.op()
    def def_faiss_retriever(self, **retriever_kwargs):
        self.retriever = self.db.as_retriever(**retriever_kwargs)
        return self.retriever
    
    @weave.op()
    def find_similar_docs(self, query):
        if self.vectorstore_name == 'faiss':
            return self.retriever.invoke(query)
        
    @weave.op()
    def get_answer(self, question: str):
        context = self.find_similar_docs(question)

        response = self.chain.invoke(
            {
                "context": context,
                "question": question,
            }
        )
        
        return {"context": str(context), "answer": response.content}
    
    def ask(self, question: str):
        out = self.get_answer(question)
        return out["answer"]
    

## Tests

In [9]:
# reference embedding
reference_embedding = "text-embedding-ada-002"
reference_embedding = OpenAIEmbeddings(model=reference_embedding, openai_api_key=openai_api_key)

# reference chat
reference_chat = "gpt-4o"
reference_chat = ChatOpenAI(model=reference_chat, openai_api_key=openai_api_key)

import nest_asyncio
nest_asyncio.apply()

def calc_answer_semantic_similarity(answer, expected):
    answer_embedding = np.array(reference_embedding.embed_query(answer))
    expected_embedding = np.array(reference_embedding.embed_query(expected))

    # https://docs.ragas.io/en/latest/concepts/metrics/semantic_similarity.html
    return np.dot(answer_embedding, expected_embedding) / (np.linalg.norm(answer_embedding) * np.linalg.norm(expected_embedding))

def calc_answer_relevance(answer, question):
    # https://aclanthology.org/2024.eacl-demo.16.pdf
    hipot_question = reference_chat.invoke(
        f"Generate a question for the given answer. \n answer: {answer}"
    ).content
    hipot_question_embedding = np.array(reference_embedding.embed_query(hipot_question))
    question_embedding = np.array(reference_embedding.embed_query(question))
    return np.dot(hipot_question_embedding, question_embedding) / (np.linalg.norm(hipot_question_embedding) * np.linalg.norm(question_embedding))

def calc_faithfulness(question, answer):
    # https://aclanthology.org/2024.eacl-demo.16.pdf
    statements = reference_chat.invoke(
        f"Given a question and answer, create one or more statements from each sentence in the given answer. \n question: {question} \n answer: {answer} \n\n Do not deviate from the specified format. \n statement: [statement 1] \n ... \n statement: [statement n]"
    ).content
    veredicts = reference_chat.invoke(
        f"Consider the given context and following statements, then determine whether they are supported by the information present in the context. Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Provide a final verdict for each statement in order at the end in the given format. List all veredicts at the end. \n\n Statements: \n {statements}"
    ).content
    veredicts = veredicts.split("\n")[::-1]
    veredicts = [veredict.lower() for veredict in veredicts]
    n_yes = 0
    n_no = 0
    for veredict in veredicts:
        if "yes" in veredict or "true" in veredict or "correct" in veredict:
            n_yes += 1
        elif "no" in veredict or "false" in veredict or "incorrect" in veredict:
            n_no += 1
        else:
            break
    try:
        faithfulness = n_yes / (n_yes + n_no)
    except ZeroDivisionError:
        faithfulness = 0

    return faithfulness


@weave.op()
def eval_model(question: str, expected: str, model_output: dict) -> dict:
    question = question
    answer = model_output['answer']
    context = model_output['context']
    expected = expected

    answer_semantic_similarity = calc_answer_semantic_similarity(answer, expected)

    answer_relevance = calc_answer_relevance(answer, question)

    faithfulness = calc_faithfulness(question, answer)

    return {
        "end_to_end": {
            "answer_semantic_similarity": answer_semantic_similarity, 
            },
        "component_wise": {
            "answer_relevance": answer_relevance,
            "faithfulness": faithfulness
        }
    }

In [10]:
weave.init('first-test')
chat_pars = {
    "model": chat_models[0],
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2
}

prompt_template = f"""Answer politely the question at the end, using only the following context. The user is not necessarily a specialist, so please avoid jargon and explain any technical terms.

<context>
{{context}} 
</context>

Question: {{question}}
"""

rag = RAGModel(
    dbs = dbs,
    model_embeddings = embedding_models[0],
    chat_pars=chat_pars,
    prompt_template = prompt_template
)

evaluation = Evaluation(
    dataset=test_dataset, scorers=[eval_model]
)

rag.def_faiss_retriever()
asyncio.run(evaluation.evaluate(rag.get_answer))

Logged in as Weights & Biases user: victorsouza.
View Weave data at https://wandb.ai/bleu-builders/first-test/weave
🍩 https://wandb.ai/bleu-builders/first-test/r/call/d7a5d9b9-d432-4259-9260-d978620a9933
🍩 https://wandb.ai/bleu-builders/first-test/r/call/22bb2160-efe5-4265-8d86-79c1bbfdd870


🍩 https://wandb.ai/bleu-builders/first-test/r/call/99cbcd8e-b9a1-4b2e-bc03-01bafef1ef5c


{'eval_model': {'end_to_end': {'answer_semantic_similarity': {'mean': 0.9370337957867892}},
  'component_wise': {'answer_relevance': {'mean': 0.9759252812378427},
   'faithfulness': {'mean': 0.72}}},
 'model_latency': {'mean': 4.5994775772094725}}