# Measure response time for the different models

In [None]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from getpass import getpass
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# List of models tested:
# gpt-3.5-turbo
# gpt-4-turbo
# gpt-4o
# gpt-4o-mini
# claude-3-haiku-20240307
# claude-3-sonnet-20240229
# claude-3-opus-20240229
# claude-3-5-sonnet-20240620
# open-mistral-7b
# open-mixtral-8x7b
# open-mixtral-8x22b
# mistral-large-2407
# open-mistral-nemo-2407
# llama-7b-chat
# llama-13b-chat
# llama-70b-chat
# llama3-8b
# llama3-70b
# llama3.1-8b
# llama3.1-70b
# llama3.1-405b
# Qwen2-72B
# gemma-7b
# gemma-2b

# Model to benchmark; pick one of the names from the list above.
llm_name = "llama3-70b"

# Number of dimensions requested from the embedding model
# (the original note lists 1536 and 3072 as the values in use).
embedding_dimensions = 3072

# Select the API key for the provider implied by the model name.
# A missing environment variable raises KeyError right here, which is
# preferable to failing later inside the provider client.
if "gpt" in llm_name:
    client = os.environ['OPENAI_API_KEY']
elif "claude" in llm_name:
    client = os.environ['ANTHROPIC_API_KEY']
elif "mistral" in llm_name or "mixtral" in llm_name:
    client = os.environ['MISTRAL_API_KEY']
elif "llama" in llm_name or "gemma" in llm_name or "Qwen" in llm_name:
    # Qwen models are served through the same endpoint as llama/gemma when the
    # chat model is built, so they need the same key.
    client = os.environ['LLAMA_API_KEY']
else:
    # Stop immediately: continuing would leave `client` undefined and fail
    # further down with an unrelated NameError.
    raise ValueError(f"INVALID MODEL! Unsupported llm_name: {llm_name!r}")

print(f"Using model {llm_name}")

# Embedding model used to embed queries against the vector database
embedding_model = OpenAIEmbeddings(
    dimensions=embedding_dimensions,
    model="text-embedding-3-large",
)

# Directory of the persisted vector database; its name encodes the embedding size
vectordb_directory = f'vector_database_chspark_{embedding_dimensions}'
print(f"Using vector database {vectordb_directory}")

## Load vector database

In [None]:
# Open the Chroma store persisted in vectordb_directory, embedding queries
# with the same embedding model configured above.
vectordb = Chroma(
    embedding_function=embedding_model,
    persist_directory=vectordb_directory
)

# count() reports the number of entries stored in the collection, not the
# number of collections. NOTE: `_collection` is a private attribute of the
# LangChain wrapper and may change between library versions.
document_count = vectordb._collection.count()
print(f"Loaded {document_count} documents from vector database")

if document_count == 0:
    # An empty store means retrieval will return no context for the prompt.
    print(f"WARNING: vector database {vectordb_directory} is empty; "
          "check that the directory exists and matches the embedding size")

In [None]:
# Prompt text for the question-answering chain. The {context} and {question}
# placeholders are filled in once this string is wrapped in a PromptTemplate
# and handed to the retrieval QA chain further down.
# NOTE: this is a runtime string sent to the model; whitespace and wording
# are part of the behaviour and must not be reformatted.
template = """\
You are an intelligent vehicle assistant and you have to answer the questions that are asked of you. 
If the question is about the vehicle, use the provided car manual information to answer the question at the end. 
If you don’t know the answer even with the car manual provided say "I am sorry, I did not find the answer in the car manual"
Don’t try to make up an answer.
Respond in a concrete way, provide the information from the car manual,
do not say where in the manual to look unless the user query asks for that but give the concrete response taken from the car manual.
Keep the answer as concise as possible. 
Always say “Do you have any other questions?” at the end of the answer.
Context: {context}
Question: {question}
Helpful Answer:
"""

## Set chat model and RAG pipeline

In [None]:

# Wrap the raw prompt text in a PromptTemplate so the chain can fill it in
qa_chain_prompt = PromptTemplate.from_template(template=template)

# Build the chat model for the provider implied by llm_name.
# Every provider is called with temperature=0.
if "gpt" in llm_name:
    llm = ChatOpenAI(model_name=llm_name, temperature=0)
elif "claude" in llm_name:
    llm = ChatAnthropic(model_name=llm_name, api_key=client, temperature=0)
elif "mistral" in llm_name or "mixtral" in llm_name:
    llm = ChatMistralAI(model=llm_name, api_key=client, temperature=0)
elif "llama" in llm_name or "gemma" in llm_name or "Qwen" in llm_name:
    # These models are reached through an OpenAI-compatible endpoint, hence
    # ChatOpenAI with a custom base_url.
    llm = ChatOpenAI(model_name=llm_name, api_key=client, temperature=0,
                     base_url="https://api.llama-api.com")
else:
    # Without this branch `llm` would stay undefined and the chain
    # construction below would fail with a confusing NameError.
    raise ValueError(f"No chat model configured for llm_name: {llm_name!r}")

#print(f"Using Model: {llm.model_name}")

# Retrieval-augmented QA chain: documents come from the vector database,
# the answer is produced by the selected chat model using the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": qa_chain_prompt},
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)

In [None]:
# Smoke test of the chain with a single question (Spanish variant active,
# English variant kept below for reference).
#query = "what is the recommended fuel for this vehicle?"
query = "cual es la presion de aire optima para las llantas? Dimelo en unidades"

model_response = qa_chain.invoke(input={"query": query})

# Only the generated answer is shown; source documents are also in the response
print(model_response["result"])

## Measure response time

In [None]:
import time
import json
import os

# JSON file that accumulates the timing results across runs
file_path = 'llm_response_times.json'

# Start from the previously saved results, or from an empty list when the
# file has not been created yet
try:
    with open(file_path, 'r') as file:
        results = json.load(file)
except FileNotFoundError:
    results = []

def measure_response_time(llm_name: str, qa_chain, query: str, runs: int = 10) -> float:
    """Time repeated invocations of a QA chain and record the average.

    The chain is invoked ``runs`` times with ``{"query": query}``. Each answer
    (the ``"result"`` entry of the response) is printed, and one summary entry
    ``{"llm_name": ..., "average_response_time": ...}`` is appended to the
    module-level ``results`` list.

    Args:
        llm_name: Label stored with the measurement.
        qa_chain: Object exposing ``invoke(dict)`` that returns a mapping
            with a ``"result"`` key.
        query: Question sent on every run.
        runs: Number of invocations to average over; must be at least 1.

    Returns:
        The average wall-clock time per invocation, in seconds.

    Raises:
        ValueError: If ``runs`` is smaller than 1.
    """
    if runs < 1:
        # Avoids a ZeroDivisionError (runs == 0) or a meaningless negative
        # average (runs < 0) after the loop.
        raise ValueError(f"runs must be a positive integer, got {runs!r}")

    total_time = 0.0
    for _ in range(runs):
        # perf_counter() is monotonic and high-resolution, so the measurement
        # is not affected by system clock adjustments as time.time() would be.
        start_time = time.perf_counter()
        model_response = qa_chain.invoke({"query": query})
        total_time += time.perf_counter() - start_time
        # Printing happens outside the timed region.
        print(model_response["result"])

    average_time = total_time / runs
    results.append({"llm_name": llm_name, "average_response_time": average_time})
    print(f"Average response time for {llm_name} over {runs} runs: {average_time} seconds")
    return average_time

# Benchmark the currently configured model with a fixed English question
query = "what is the recommended air pressure for the tires? Tell in pressure units."
measure_response_time(llm_name, qa_chain, query)

# Write the full results list (previous entries plus the new one) back to disk
with open(file_path, 'w') as out_file:
    out_file.write(json.dumps(results, indent=4))

print("Updated results saved to llm_response_times.json")

In [None]:
# JSON file holding the accumulated timing results
file_path = 'llm_response_times.json'

# Re-read the results from disk to check what was actually saved
if os.path.exists(file_path):
    with open(file_path, 'r') as file:
        results = json.load(file)
else:
    # No file yet: report zero entries rather than failing with a NameError
    # (or silently reporting a stale in-memory list).
    results = []

# Number of measurements stored so far
print(len(results))