In [42]:
import pandas as pd
import os
import sys
import importlib
from dotenv import load_dotenv
from tqdm.auto import tqdm
import random
from openai import OpenAI
import json

In [25]:
# Prerequisite (one-time, if the package is missing): %pip install openai

In [26]:
# Put the parent of the current working directory on the import path so the
# local `med-conv-ai` package can be resolved.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

# The package name contains hyphens, which a plain `import` statement cannot
# spell, so the module is loaded through its dotted string name.
minsearch = importlib.import_module("med-conv-ai.minsearch")

## Loading data

In [27]:
# Load the question/answer dataset that is indexed and searched further down.
df = pd.read_csv("../data/qa_data.csv")

In [28]:
# Show the last 10 rows as a quick check of the columns and their content.
df.tail(10)

Unnamed: 0,id,question_type,question,answer
200,224,outlook,What is the outlook for Narcolepsy ?,None of the currently available medications en...
201,158,outlook,What is the outlook for Inclusion Body Myositis ?,IBM is generally resistant to all therapies an...
202,109,outlook,What is the outlook for Dyssynergia Cerebellar...,The progression of the disorder is usually 10 ...
203,266,outlook,What is the outlook for Peripheral Neuropathy ?,"In acute neuropathies, such as Guillain-Barr s..."
204,61,outlook,What is the outlook for Cerebral Palsy ?,Cerebral palsy doesnt always cause profound di...
205,114,outlook,What is the outlook for Encephaloceles ?,The prognosis for individuals with encephaloce...
206,330,outlook,What is the outlook for Todd's Paralysis ?,Todd's paralysis is an indication that an indi...
207,110,outlook,What is the outlook for Dystonias ?,The initial symptoms can be very mild and may ...
208,290,outlook,What is the outlook for Restless Legs Syndrome ?,RLS is generally a life-long condition for whi...
209,138,outlook,What is the outlook for Gestational Trophoblas...,Certain factors affect prognosis (chance of re...


## Ingestion test

In [29]:
# Read variables from a .env file (if one is found) into the process environment.
load_dotenv()

# None when the variable is not set anywhere.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [30]:
# One dict per row (column name -> value); this list is what gets indexed.
docs = df.to_dict(orient='records')

In [31]:
# The three free-text columns are registered as text fields; `id` is
# registered as a keyword field.
index = minsearch.Index(
    text_fields=['question_type', 'question', 'answer'],
    keyword_fields=['id'],
)

In [32]:
# Fit the index on every document record.
index.fit(docs)

<med-conv-ai.minsearch.Index at 0x7e250ce453a0>

## Construction of RAG

In [33]:
# No credentials are passed explicitly; presumably the client reads
# OPENAI_API_KEY from the environment populated earlier — confirm.
client = OpenAI()

In [34]:
def search(query, num_results=10, boost=None):
    """Look up the documents that best match ``query`` in the global ``index``.

    Args:
        query: Free-text search string.
        num_results: Maximum number of documents to request. Defaults to 10,
            the value that was previously hard-coded.
        boost: Optional mapping forwarded as ``boost_dict``. ``None`` (the
            default) means an empty dict, i.e. the previous behaviour.

    Returns:
        The result of ``index.search``; callers in this notebook iterate it
        as a sequence of document dicts.
    """
    return index.search(
        query=query,
        filter_dict={},  # no keyword filtering is applied
        boost_dict={} if boost is None else boost,
        num_results=num_results,
    )

In [35]:
# Instruction prompt sent to the answering model. {question} is the user query
# and {context} is the concatenation of the retrieved entries.
prompt_template = """
You're a medical doctor. Answer the QUESTION based on the CONTEXT from our medical knowledge base.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Rendering of a single retrieved document inside the CONTEXT section.
entry_template = """
question: {question}
answer: {answer}
""".strip()

def build_prompt(query, search_results):
    """Render the final prompt for ``query`` from the retrieved documents.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of dicts that each provide at least the
            ``question`` and ``answer`` keys; other keys are ignored.

    Returns:
        The prompt string with surrounding whitespace removed. With no search
        results the CONTEXT section is simply empty.

    Raises:
        KeyError: If a document lacks ``question`` or ``answer``.
    """
    entries = [
        entry_template.format(question=doc['question'], answer=doc['answer'])
        for doc in search_results
    ]
    # Entries are separated by a blank line; the final strip() removes any
    # trailing whitespace so the prompt ends on the last entry.
    context = "\n\n".join(entries)

    return prompt_template.format(question=query, context=context).strip()

In [36]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text for the model.
        model: Name of the chat model to call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [37]:
def rag(query, model='gpt-4o-mini'):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves documents with ``search``, renders them into a prompt with
    ``build_prompt`` and returns what ``llm`` replies.

    Args:
        query: The user's question.
        model: Name of the chat model passed through to ``llm``.

    Returns:
        The model's answer as returned by ``llm``.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs), model=model)

## Testing RAG answers

In [33]:
# Smoke test: a question whose topic (Cerebral Palsy) appears in the dataset,
# so retrieval should supply usable context.
question = 'What is the prognosis for Cerebral Palsy ?'
answer = rag(question)
print(answer)

The prognosis for cerebral palsy (CP) varies widely among individuals. Many people with CP do not experience profound disabilities, and for most, the disorder does not affect life expectancy. Many children with CP have average to above-average intelligence and can attend the same schools as their peers. Through supportive treatments, medications, and surgery, many individuals can improve their motor skills and ability to communicate. However, the severity of CP can differ greatly; while some children may not need special assistance, others with severe CP may be unable to walk and require extensive, lifelong care.


In [15]:
# Smoke test: a question the dataset is not expected to cover, to check how
# the pipeline responds when the retrieved context does not help.
question = 'What is the prognosis for sea sickness ?'
answer = rag(question)
print(answer)

The context provided does not contain specific information about the prognosis for sea sickness. Therefore, based on the facts available, I cannot provide an answer regarding the prognosis for sea sickness.


## Retrieval evaluation

In [5]:
# Ground-truth file for retrieval scoring; each row carries a document `id`
# and a `question` (see the sample record displayed below).
gt_df = pd.read_csv('../data/ground-truth-retrieval-gpt-4-mini.csv')

In [8]:
# One dict per ground-truth row, the shape evaluate() iterates over.
ground_truth = gt_df.to_dict(orient='records')

In [9]:
# Inspect the first record to confirm the available keys.
ground_truth[0]

{'id': 681,
 'question': 'What kind of studies are being conducted regarding Striatonigral Degeneration?'}

In [13]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains a relevant document.

    Args:
        relevance_total: One sequence of booleans per query, in rank order;
            ``True`` marks a relevant result.

    Returns:
        Number of queries with at least one ``True`` divided by the number of
        queries, or ``0.0`` when no queries are given (instead of raising
        ``ZeroDivisionError``).
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence of booleans per query, in rank order;
            ``True`` marks a relevant result.

    Returns:
        The average over queries of ``1 / rank`` for the first relevant
        result (1-based rank), counting 0 for queries with no relevant
        result. Returns ``0.0`` when no queries are given.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a query
                # with several matches would push the score above 1.
                break

    return total_score / len(relevance_total)

In [14]:
def evaluate(ground_truth, search_function):
    """Score a search function against ground-truth records.

    Args:
        ground_truth: Iterable of dicts, each with an ``id`` key naming the
            document that should be retrieved for that record.
        search_function: Callable that receives one ground-truth record and
            returns an iterable of result dicts, each with an ``id`` key.

    Returns:
        Dict with the ``hit_rate`` and ``mrr`` of the collected relevance
        flags.
    """
    def relevance_flags(query):
        # Read the expected id first, then flag each returned document.
        expected_id = query['id']
        return [hit['id'] == expected_id for hit in search_function(query)]

    relevance_total = [relevance_flags(query) for query in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [38]:
# evaluate() passes the whole ground-truth record to the search function, so
# the lambda pulls out the question text before calling search().
evaluate(ground_truth, lambda q: search(q['question']))

  0%|          | 0/1050 [00:00<?, ?it/s]

{'hit_rate': 0.959047619047619, 'mrr': 0.8563711262282692}

## RAG evaluation

### first prompt

In [47]:
# Judge prompt (first variant): asks the model to rate how relevant a generated
# answer is to its question and to reply as JSON with the keys "Relevance" and
# "Explanation". The doubled braces are literal braces after str.format().
prompt_template_rag_eval = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [44]:
# Draw 200 ground-truth rows; the fixed random_state makes the draw repeatable.
samp_df = gt_df.sample(n=200, random_state=1)

In [45]:
# One dict per sampled row, iterated by the evaluation loops below.
sample = samp_df.to_dict(orient='records')

In [48]:
# Collects one (record, generated answer, judge verdict) tuple per sampled question.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question)

    prompt = prompt_template_rag_eval.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
        # Later cells read exactly these two keys, so anything else is
        # treated like unparsable output.
        if not isinstance(evaluation, dict) or not {'Relevance', 'Explanation'} <= evaluation.keys():
            raise ValueError('judge reply is missing the expected keys')
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers a None reply.
        # Keep the row, labelled PARSE_ERROR with the raw reply, so that one
        # malformed reply does not abort the run and discard earlier results.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [49]:
# Start from one row per (record, answer, evaluation) tuple.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

# Lift the fields of interest out of the nested dicts into flat columns, then
# discard the dict-valued columns they came from.
df_eval = df_eval.assign(
    id=df_eval['record'].map(lambda rec: rec['id']),
    question=df_eval['record'].map(lambda rec: rec['question']),
    relevance=df_eval['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=df_eval['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [50]:
# Share of each relevance label (normalize=True returns proportions, not counts).
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.91
PARTLY_RELEVANT    0.08
NON_RELEVANT       0.01
Name: proportion, dtype: float64

In [52]:
# Inspect the answers the judge labelled as not relevant.
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
67,The context provided does not specifically men...,388,Which organizations are involved in ADHD resea...,NON_RELEVANT,The generated answer does not address the ques...
116,The context provided does not explicitly menti...,430,What are the two types of radiation therapy us...,NON_RELEVANT,The generated answer does not address the ques...


In [53]:
# Persist the first-prompt judgements; index=False leaves the row index out of the file.
df_eval.to_csv('../data/rag-eval-first-prompt.csv', index=False)

### second prompt

In [54]:
# Judge prompt (second variant): the opening persona text differs from the
# first variant; the data section and the requested JSON reply format are the
# same. The doubled braces are literal braces after str.format().
prompt_template_rag_eval_2 = """
Imagine you are a seasoned quality assurance specialist for an advanced question-answering system. 
Your expertise lies in assessing the precision and relevance of AI-generated responses. 
Today, your mission is to scrutinize the correlation between a given question and its corresponding AI-generated answer.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [55]:
# Collects one (record, generated answer, judge verdict) tuple per sampled
# question, judged with the second prompt variant.
evaluations_2 = []

for record in tqdm(sample):
    question = record['question']
    # NOTE(review): rag() is called again here, so the answers judged in this
    # loop are freshly generated and may differ from the ones judged with the
    # first prompt; differences between the two runs can therefore mix prompt
    # effects with answer variation.
    answer_llm = rag(question)

    prompt = prompt_template_rag_eval_2.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
        # Later cells read exactly these two keys, so anything else is
        # treated like unparsable output.
        if not isinstance(evaluation, dict) or not {'Relevance', 'Explanation'} <= evaluation.keys():
            raise ValueError('judge reply is missing the expected keys')
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers a None reply.
        # Keep the row, labelled PARSE_ERROR with the raw reply, so that one
        # malformed reply does not abort the run and discard earlier results.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations_2.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [56]:
# Start from one row per (record, answer, evaluation) tuple.
df_eval_2 = pd.DataFrame(evaluations_2, columns=['record', 'answer', 'evaluation'])

# Lift the fields of interest out of the nested dicts into flat columns, then
# discard the dict-valued columns they came from.
df_eval_2 = df_eval_2.assign(
    id=df_eval_2['record'].map(lambda rec: rec['id']),
    question=df_eval_2['record'].map(lambda rec: rec['question']),
    relevance=df_eval_2['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=df_eval_2['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [57]:
# Share of each relevance label under the second judge prompt.
df_eval_2.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.955
PARTLY_RELEVANT    0.040
NON_RELEVANT       0.005
Name: proportion, dtype: float64

In [None]:
# Persist the second-prompt judgements; index=False leaves the row index out of the file.
df_eval_2.to_csv('../data/rag-eval-second-prompt.csv', index=False)