In [45]:
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from openai import OpenAI
from tqdm import tqdm
import pandas as pd
import os
import json
import random

# Fixed seed so that random.sample() below draws the same evaluation subset on every run.
random.seed(42)

# Load variables from a local .env file (if one exists) so that OPENAI_API_KEY is
# present in the environment before it is read below and before the OpenAI client
# is constructed. Variables already set in the environment are left untouched.
load_dotenv()

API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI()

In [2]:
# Read the FAQ documents explicitly as UTF-8: JSON is UTF-8 by specification and the
# FAQ text can contain non-ASCII characters (e.g. typographic apostrophes), which the
# platform default encoding may not decode correctly.
with open('documents-with-ids.json', 'r', encoding='utf-8') as f_in:
    documents = json.load(f_in)

# Index the documents by doc_id so a ground-truth record can look up its source document.
doc_idx = {d['doc_id']: d for d in documents}

In [3]:
# Spot-check the index: display the document stored under one known doc_id.
doc_idx['46c6323c']

{'question': 'What is Garmin Connect and what purpose does it serve?',
 'context': 'A web and mobile platform where users can track and analyze their fitness, activities, and wellness data',
 'company_id': 'GRMN',
 'doc_id': '46c6323c'}

In [4]:
# Ground-truth questions, loaded as a DataFrame and then flattened into a list of
# per-row dicts so individual records can be sampled and passed to the RAG pipeline.
df_ground_truth = pd.read_csv("ground-truth-data.csv")
ground_truth = df_ground_truth.to_dict("records")

In [5]:
# Sentence-embedding model shared by the notebook: it encodes the search questions
# and, later, the answers that are compared in compute_similarity.
model_name = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(model_name)

#### This notebook assumes that the Elasticsearch service and index have already been created by running the `evaluate-retrieval.ipynb` notebook. It must be run while Elasticsearch is running and the index is populated.

In [6]:
# Client for the local Elasticsearch node and the name of the index queried below.
index_name = "financial-faq"
es_client = Elasticsearch("http://localhost:9200")

### Elasticsearch Initialisation

In [7]:
def elastic_search_hybrid(field, query, vector, company_id, num_results=5):
    """Run a hybrid (kNN + keyword) search restricted to one company.

    The request combines a kNN clause on the vector field ``field`` with a
    ``multi_match`` keyword clause over ``question`` and ``context``. Both
    clauses carry a boost of 0.5 and both are filtered on ``company_id``.

    Args:
        field: Name of the indexed vector field to run the kNN clause against.
        query: Question text used for the keyword clause.
        vector: Query embedding used for the kNN clause.
        company_id: Value that the ``company_id`` field of every hit must equal.
        num_results: Number of neighbours requested from the kNN clause and
            number of hits returned overall. Defaults to 5, the value that was
            previously hard-coded in both places.

    Returns:
        A list with the ``_source`` dict of each hit, in the order returned by
        Elasticsearch.
    """
    # The same company restriction is applied to the vector and the keyword clause.
    company_filter = {"term": {"company_id": company_id}}

    knn_query = {
        "field": field,
        "query_vector": vector,
        "k": num_results,
        "num_candidates": 10000,
        "boost": 0.5,
        "filter": company_filter,
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question", "context"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            },
            "filter": company_filter,
        }
    }

    search_query = {
        "knn": knn_query,
        "query": keyword_query,
        "size": num_results,
        "_source": ["question", "context", "company_id", "doc_id"],
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [8]:
def search_hybrid(search_function, q):
    """Embed a ground-truth question and run the hybrid search for it.

    Args:
        search_function: Despite its name, this value is forwarded as the
            ``field`` argument of ``elastic_search_hybrid``, i.e. it is the
            name of the vector field to search.
        q: Mapping with at least the keys ``question`` and ``company_id``.

    Returns:
        Whatever ``elastic_search_hybrid`` returns for the question.
    """
    query_text, company = q['question'], q['company_id']
    query_embedding = model.encode(query_text)
    return elastic_search_hybrid(search_function, query_text, query_embedding, company)

### LLM and RAG Initialisation

In [9]:
def build_prompt(query, search_results):
    """Render the prompt sent to the LLM for one question.

    Args:
        query: The question text placed in the QUESTION slot.
        search_results: Iterable of dicts with ``question`` and ``context``
            keys; each becomes a "question: ... / answer: ..." entry in the
            CONTEXT slot, in the given order.

    Returns:
        The formatted prompt with leading and trailing whitespace removed.
    """
    # .strip() only trims the ends of the template, so every line after the
    # first keeps its four-space indentation in the final prompt.
    prompt_template = """
    You're a course financial assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    context = "".join(
        f"question: {doc['question']}\nanswer: {doc['context']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [10]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text used as the content of the only message in the request.
        model: Model name passed to the chat-completions call.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
def rag(query: dict, model='gpt-4o-mini') -> str:
    """Answer one question with retrieval followed by generation.

    Args:
        query: Record passed to ``search_hybrid``; its ``question`` value is
            also used as the question text of the prompt.
        model: Model name forwarded to ``llm``.

    Returns:
        The value returned by ``llm`` for the built prompt.
    """
    # Retrieval always runs against the 'question_context_vector' field.
    retrieved = search_hybrid('question_context_vector', query)
    return llm(build_prompt(query['question'], retrieved), model=model)

In [12]:
# Smoke-test the retrieval step on its own for one Garmin question (no LLM call).
search_hybrid('question_context_vector', dict(question='What other advanced features does Garmin offer for sailing besides SailAssist?',
         company_id = 'GRMN'))

[{'question': 'What advanced features does Garmin’s SailAssist include?',
  'company_id': 'GRMN',
  'context': "Garmin's SailAssist features include an enhanced wind rose with true and apparent wind data, POLAR tables, pre-race guidance, synchronized race timer, virtual starting line, time to burn, and lay line data fields.",
  'doc_id': '3b5858c1'},
 {'question': 'What are the characteristics of the Garmin quatix series wearable devices?',
  'company_id': 'GRMN',
  'context': 'The Garmin quatix series wearable devices are GPS-enabled smartwatches with features tailored for mariners, such as navigation, sailing features, stereo control, autopilot functions, tidal info, a built-in LED flashlight, and solar charging depending on the model.',
  'doc_id': '36e4cff1'},
 {'question': 'What additional satellite systems do Garmin products utilize besides GPS?',
  'company_id': 'GRMN',
  'context': 'GLONASS, Galileo, BeiDou, and others',
  'doc_id': 'bf63bcc5'},
 {'question': 'What unique featu

In [13]:
# Run the full retrieve-then-generate pipeline on the same question with the default model.
rag(dict(question='What other advanced features does Garmin offer for sailing besides SailAssist?',
         company_id = 'GRMN'))

'Besides SailAssist, Garmin offers advanced features in their quatix series wearable devices, such as navigation, stereo control, autopilot functions, tidal info, and a built-in LED flashlight, along with solar charging depending on the model.'

In [18]:
# Shared worker pool used for the batch RAG runs below.
pool = ThreadPoolExecutor(max_workers=4)


def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` with a progress bar.

    All tasks are submitted first; the bar advances as each one finishes.

    Args:
        pool: Executor providing ``submit``.
        seq: Sized sequence of inputs (``len(seq)`` sets the bar total).
        f: Callable invoked with one element of ``seq``.

    Returns:
        A list of results in the same order as ``seq``. An exception raised
        by ``f`` propagates when that task's result is collected.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            # Called once per task when it completes.
            progress.update()

        pending = []
        for item in seq:
            task = pool.submit(f, item)
            task.add_done_callback(_tick)
            pending.append(task)

        # Collect in submission order, blocking on each task in turn.
        return [task.result() for task in pending]

In [19]:
# The ChatGPT model has a daily limit of 10000, so the entire ground-truth dataset
# cannot be evaluated. Instead, a random sample of about 10% of it (2100 records)
# is drawn.
sample_size = 2100
ground_truth_sample = random.sample(ground_truth, k=sample_size)

### Cosine Similarity and LLM as Judge for RAG Evaluation

In [27]:
def compute_similarity(record):
    """Score a generated answer against its reference answer.

    Both texts are embedded with the notebook's ``model`` and the dot product
    of the two embeddings is returned.

    NOTE(review): a dot product equals cosine similarity only when the
    embeddings are unit-length; this presumably holds for the configured
    model, but confirm before swapping in a different one.

    Args:
        record: Mapping with the keys ``answer_orig`` and ``answer_llm``.

    Returns:
        The dot product of the generated-answer and original-answer embeddings.
    """
    reference, generated = record['answer_orig'], record['answer_llm']
    return model.encode(generated).dot(model.encode(reference))

In [49]:
# Prompt template for the LLM-as-judge step. It is filled with str.format using the
# fields `question`, `answer_orig` and `answer_llm`; the doubled braces around the
# JSON skeleton are format escapes that render as literal `{` and `}`.
llm_as_judge_prompt = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Question: {question}
Original Answer: {answer_orig}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [54]:
def _parse_judge_reply(reply):
    """Parse one judge reply into a dict, tolerating a Markdown code fence."""
    text = (reply or "").strip()
    if text.startswith("```"):
        # Drop the opening fence line (with any language tag) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0]

    parsed = json.loads(text)
    if not isinstance(parsed, dict):
        raise ValueError("judge reply is not a JSON object")
    return parsed


def llm_as_judge(samples, model='gpt-4o-mini'):
    """Ask an LLM to grade each generated answer against its original answer.

    Each record is rendered into ``llm_as_judge_prompt`` and sent to ``llm``.
    A reply that cannot be parsed as a JSON object no longer aborts the whole
    run (which would discard every evaluation already collected); it is
    recorded as a row with ``Relevance == "PARSE_ERROR"`` and the raw reply in
    ``Explanation``, so the output stays aligned with ``samples`` row by row.

    Args:
        samples: Iterable of dicts providing at least ``question``,
            ``answer_orig`` and ``answer_llm``; extra keys are ignored.
        model: Judge model name forwarded to ``llm``. Defaults to the
            previously hard-coded ``'gpt-4o-mini'``.

    Returns:
        A DataFrame with one row per sample, built from the parsed replies.
    """
    json_evaluations = []

    for record in tqdm(samples):
        prompt = llm_as_judge_prompt.format(**record)
        evaluation = llm(prompt, model=model)

        try:
            json_eval = _parse_judge_reply(evaluation)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; keep the raw text for inspection.
            json_eval = {"Relevance": "PARSE_ERROR", "Explanation": evaluation}

        json_evaluations.append(json_eval)

    return pd.DataFrame(json_evaluations)

### ChatGPT-3.5 results and evaluation

In [20]:
def process_record(rec, model='gpt-3.5-turbo'):
    """Generate an answer for one ground-truth record and pair it with the original.

    The model is a parameter (defaulting to the previously hard-coded
    ``'gpt-3.5-turbo'``) so the same function can be reused for other models,
    e.g. via ``functools.partial``, instead of being redefined.

    Args:
        rec: Ground-truth record with the keys ``question``, ``company_id``
            and ``document_id``.
        model: Model name forwarded to ``rag``.

    Returns:
        A dict with the generated answer (``answer_llm``), the ``context`` of
        the source document (``answer_orig``), the document id (``document``),
        and the record's ``question`` and ``company_id``.

    Raises:
        KeyError: If ``rec['document_id']`` is not present in ``doc_idx``.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document_id']
    # The reference answer is the context text of the document the question was generated from.
    answer_orig = doc_idx[doc_id]['context']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'company_id': rec['company_id'],
    }

In [21]:
# Run process_record over the whole ground-truth sample on the shared thread pool.
results_gpt35 = map_progress(pool, ground_truth_sample, process_record)

100%|██████████| 2100/2100 [07:40<00:00,  4.56it/s]


In [32]:
df_gpt35 = pd.DataFrame(results_gpt35)

# to_csv does not create missing parent directories, so make sure 'data/' exists
# before writing; otherwise the results of the long API run above cannot be saved.
os.makedirs('data', exist_ok=True)
df_gpt35.to_csv('data/results-gpt35.csv', index=False)

In [28]:
# Score every GPT-3.5 answer against its original answer, in result order, and
# attach the scores to the results frame as the 'cosine' column.
similarity = [compute_similarity(record) for record in tqdm(results_gpt35)]

df_gpt35['cosine'] = similarity
df_gpt35['cosine'].describe()

100%|██████████| 2100/2100 [00:32<00:00, 63.86it/s]


count    2100.000000
mean        0.636314
std         0.270827
min        -0.190102
25%         0.410008
50%         0.677585
75%         0.865943
max         1.000000
Name: cosine, dtype: float64

In [55]:
# LLM as judge: grade a fixed-seed subsample of 210 rows rather than all results.
df_gpt35_sample = df_gpt35.sample(n=210, random_state=1)
gpt35_sample = df_gpt35_sample.to_dict('records')
gpt35_evaluations = llm_as_judge(gpt35_sample)

gpt35_evaluations['Relevance'].value_counts()

100%|██████████| 210/210 [06:20<00:00,  1.81s/it]


Relevance
RELEVANT           167
NON_RELEVANT        22
PARTLY_RELEVANT     21
Name: count, dtype: int64

In [56]:
# Persist the judge verdicts for the GPT-3.5 answers (row index is not written).
gpt35_evaluations.to_csv('data/results-gpt35-evaluations.csv', index=False)

### ChatGPT-4o-mini results and evaluation

In [29]:
def process_record(rec, model='gpt-4o-mini'):
    """Generate an answer for one ground-truth record and pair it with the original.

    The model is a parameter (defaulting to the previously hard-coded
    ``'gpt-4o-mini'``) so the function can be reused for other models, e.g.
    via ``functools.partial``, instead of being redefined per model.

    Args:
        rec: Ground-truth record with the keys ``question``, ``company_id``
            and ``document_id``.
        model: Model name forwarded to ``rag``.

    Returns:
        A dict with the generated answer (``answer_llm``), the ``context`` of
        the source document (``answer_orig``), the document id (``document``),
        and the record's ``question`` and ``company_id``.

    Raises:
        KeyError: If ``rec['document_id']`` is not present in ``doc_idx``.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document_id']
    # The reference answer is the context text of the document the question was generated from.
    answer_orig = doc_idx[doc_id]['context']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'company_id': rec['company_id'],
    }

In [30]:
# Run the process_record defined in the cell above over the same ground-truth sample.
results_gpt4o = map_progress(pool, ground_truth_sample, process_record)

100%|██████████| 2100/2100 [12:38<00:00,  2.77it/s]


100%|██████████| 2100/2100 [12:38<00:00,  2.27it/s]

In [33]:
df_gpt4o = pd.DataFrame(results_gpt4o)

# to_csv does not create missing parent directories, so make sure 'data/' exists
# before writing; otherwise the results of the long API run above cannot be saved.
os.makedirs('data', exist_ok=True)
df_gpt4o.to_csv('data/results-gpt4o.csv', index=False)

In [34]:
# Score every GPT-4o-mini answer against its original answer, in result order, and
# attach the scores to the results frame as the 'cosine' column.
similarity = [compute_similarity(record) for record in tqdm(results_gpt4o)]

df_gpt4o['cosine'] = similarity
df_gpt4o['cosine'].describe()

100%|██████████| 2100/2100 [00:32<00:00, 64.64it/s]


count    2100.000000
mean        0.630666
std         0.252160
min        -0.074854
25%         0.432983
50%         0.668283
75%         0.831950
max         1.000000
Name: cosine, dtype: float64

In [57]:
# LLM as judge: grade a fixed-seed subsample of 210 rows rather than all results.
df_gpt4o_sample = df_gpt4o.sample(n=210, random_state=1)
gpt4o_sample = df_gpt4o_sample.to_dict('records')
gpt4o_evaluations = llm_as_judge(gpt4o_sample)

gpt4o_evaluations['Relevance'].value_counts()

100%|██████████| 210/210 [06:12<00:00,  1.77s/it]


Relevance
RELEVANT           172
PARTLY_RELEVANT     26
NON_RELEVANT        12
Name: count, dtype: int64

In [58]:
# Persist the judge verdicts for the GPT-4o-mini answers (row index is not written).
gpt4o_evaluations.to_csv('data/results-gpt4o-evaluations.csv', index=False)

````
count    2100.000000
mean        0.630666
std         0.252160
min        -0.074854
25%         0.432983
50%         0.668283
75%         0.831950
max         1.000000
Name: cosine, dtype: float64

Relevance
RELEVANT           172
PARTLY_RELEVANT     26
NON_RELEVANT        12
Name: count, dtype: int64
````

### GPT-4o-mini answers prompts better with the retrieved context than GPT-3.5 does. The app and interface will therefore use hybrid search (question-context-vector) together with GPT-4o-mini.