In [1]:
import pandas as pd

In [2]:
# Forward slashes work on every OS and avoid the invalid "\d" escape sequence.
df = pd.read_csv("../data/data.csv")

In [3]:
df.head()

Unnamed: 0,id,exercise_name,type_of_activity,type_of_equipment,body_part,type,muscle_groups_activated,instructions
0,0,Push-Ups,Strength,Bodyweight,Upper Body,Push,"Pectorals, Triceps, Deltoids",Start in a high plank position with your hands...
1,1,Squats,Strength,Bodyweight,Lower Body,Push,"Quadriceps, Glutes, Hamstrings",Stand with feet shoulder-width apart. Lower yo...
2,2,Plank,Strength/Mobility,Bodyweight,Core,Hold,"Rectus Abdominis, Transverse Abdominis",Start in a forearm plank position with your el...
3,3,Deadlift,Strength,Barbell,Lower Body,Pull,"Glutes, Hamstrings, Lower Back","Stand with feet hip-width apart, barbell in fr..."
4,4,Bicep Curls,Strength,Dumbbells,Upper Body,Pull,"Biceps, Forearms","Stand with a dumbbell in each hand, arms fully..."


In [4]:
df.shape

(207, 8)

In [5]:
# importing minsearch as a python module 

import minsearch

In [6]:
documents = df.to_dict(orient="records")

In [7]:
documents[0]

{'id': 0,
 'exercise_name': 'Push-Ups',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Bodyweight',
 'body_part': 'Upper Body',
 'type': 'Push',
 'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
 'instructions': 'Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.'}

In [8]:
# indexing using MinSearch

# All descriptive columns are registered as text fields; `id` is registered as a keyword field.
index = minsearch.Index(
    text_fields=[
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    ],
    keyword_fields=['id'],
)

In [9]:
index.fit(documents)

<minsearch.Index at 0x18ac69d55d0>

In [10]:
# sample search 

query = "give me exercises for chest"

In [11]:
index.search(query, num_results=5)

[{'id': 26,
  'exercise_name': 'Chest Press',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Dumbbells',
  'body_part': 'Upper Body',
  'type': 'Push',
  'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
  'instructions': 'Lie on a bench with a dumbbell in each hand. Press the weights straight up over your chest, then lower them back down.'},
 {'id': 80,
  'exercise_name': 'Cable Chest Press',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Cable Machine',
  'body_part': 'Upper Body',
  'type': 'Push',
  'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
  'instructions': 'Stand in front of a cable machine with handles at chest height. Press the handles forward, then return to the starting position.'},
 {'id': 156,
  'exercise_name': 'Machine Chest Press',
  'type_of_activity': 'Strength',
  'type_of_equipment': 'Machine',
  'body_part': 'Upper Body',
  'type': 'Push',
  'muscle_groups_activated': 'Pectorals, Triceps, Deltoids',
  'instructions': '

### Implementing RAG flow

In [None]:
import os

# Keep a key that is already exported in the environment; fall back to the
# placeholder only when none is set. Replace the placeholder locally and never
# commit a real key to the notebook.
os.environ.setdefault("OPENAI_API_KEY", "your_own_key")

key = os.getenv("OPENAI_API_KEY")

In [13]:
from openai import OpenAI

client = OpenAI(api_key=key)

In [14]:
def search(query):
    """Query the module-level ``index`` and return up to five results.

    No filters and no field boosts are applied.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=5,
    )

In [43]:
# Outer prompt: instructs the model and carries the question plus the rendered context.
prompt_template = """
You're a fitness instructor. Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Per-document block: one "field: value" line for each text field of an exercise record.
entry_template = """
exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}
""".strip()

def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Each document in ``search_results`` is formatted with ``entry_template``
    (extra keys such as ``id`` are ignored by ``str.format``), the entries are
    separated by blank lines, and the result is inserted into
    ``prompt_template``. Surrounding whitespace is stripped from the prompt.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n"
        for doc in search_results
    ]
    context = "".join(rendered_entries)
    return prompt_template.format(question=query, context=context).strip()

In [16]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the first choice's text.

    Uses the module-level ``client`` for the chat completion request.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [17]:
# full RAG workflow

def rag(query, model='gpt-4o-mini'):
    """Answer ``query`` end to end: retrieve documents, build the prompt, ask the LLM."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [18]:
query = "give me exercises for chest"

In [19]:
print(rag(query))

Here are some exercises for the chest:

1. **Chest Press**
   - *Type of Activity:* Strength
   - *Equipment:* Dumbbells
   - *Instructions:* Lie on a bench with a dumbbell in each hand. Press the weights straight up over your chest, then lower them back down.

2. **Cable Chest Press**
   - *Type of Activity:* Strength
   - *Equipment:* Cable Machine
   - *Instructions:* Stand in front of a cable machine with handles at chest height. Press the handles forward, then return to the starting position.

3. **Machine Chest Press**
   - *Type of Activity:* Strength
   - *Equipment:* Machine
   - *Instructions:* Sit at a chest press machine with your feet flat on the floor. Press the handles forward until your arms are fully extended.

4. **Machine Chest Fly**
   - *Type of Activity:* Strength
   - *Equipment:* Machine
   - *Instructions:* Sit at a chest fly machine, grasp the handles, and bring them together in front of your chest, then return to the starting position.

5. **Incline Chest Pre

## Evaluation

### Evaluation for MinSearch Retrieval Engine

The ground-truth dataset was generated beforehand with an LLM: for each exercise, a prompt produced 5 questions, and each question is stored with the id of the document it was generated from.

In [20]:
# Forward slashes work on every OS and avoid the invalid "\d" / "\g" escape sequences.
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [21]:
df_question.head()

Unnamed: 0,id,question
0,0,What is the starting position for doing push-ups?
1,0,Which muscle groups are activated during push-...
2,0,How do you know when to push back up while doi...
3,0,Do you need any equipment to perform push-ups?
4,0,What part of the body do push-ups primarily ta...


In [22]:
ground_truth = df_question.to_dict(orient="records")

In [23]:
ground_truth[0]

{'id': 0, 'question': 'What is the starting position for doing push-ups?'}

In [24]:
# evaluation metrics

def hit_rate(relevance_total):
    """Return the fraction of relevance lists that contain at least one ``True``.

    ``relevance_total`` holds one list of booleans per query. An empty
    ``relevance_total`` raises ``ZeroDivisionError``.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    ``relevance_total`` holds one list of booleans per query, ordered by
    rank. Each query contributes ``1 / rank`` of its first relevant result
    (rank starting at 1), or 0 when no result is relevant. An empty
    ``relevance_total`` raises ``ZeroDivisionError``.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, relevant in enumerate(line, start=1):
            if relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this a line with
                # several True entries would score more than 1.
                break

    return total_score / len(relevance_total)

In [25]:
def minsearch_search(query, boost=None):
    """Query the module-level ``index`` and return up to ten results.

    ``boost`` is an optional mapping of field name to weight passed to the
    index as ``boost_dict``; ``None`` means no boosting. This is the same
    signature as the redefinition in the tuning section, so re-running this
    cell after the tuning cells does not break callers that pass a boost.
    """
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [26]:
def evaluate(ground_truth, search_function):
    """Score ``search_function`` against ``ground_truth`` with hit rate and MRR.

    ``ground_truth`` is an iterable of records with ``'id'`` and
    ``'question'`` keys. ``search_function`` takes the question text and
    returns result dicts that carry an ``'id'`` key. A result is relevant
    when its id equals the record's id.

    Returns a dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    def relevance_for(record):
        expected_id = record['id']
        hits = search_function(record["question"])
        return [hit['id'] == expected_id for hit in hits]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [27]:
from tqdm.auto import tqdm

In [28]:
evaluate(ground_truth, minsearch_search)

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9478260869565217, 'mrr': 0.8227612913120158}

> Basic retrieval evaluation without any boosting parameters gives a hit rate of 94.8% and an MRR of 82.3%.

#### Finding the best boost parameters for retrieval search

In [29]:
# split our data into test and validation

# The first 100 question rows form the validation split; the remaining rows form the test split.
# NOTE(review): rows are taken in file order without shuffling; confirm this is intended.
df_validation, df_test = df_question[:100], df_question[100:]

In [30]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: Mapping of parameter name to a ``(min_val, max_val)``
            pair. When both bounds are ``int`` the value is drawn with
            ``randint`` (both ends inclusive); otherwise with ``uniform``.
        objective_function: Callable that takes the sampled parameter dict
            and returns a score; higher is better.
        n_iterations: Number of random candidates to evaluate.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so the search is reproducible. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        ``(best_params, best_score)``. When no candidate scores above
        ``float('-inf')`` (for example ``n_iterations=0``) the result is
        ``(None, float('-inf'))``.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats -inf

    for _ in range(n_iterations):
        # Draw one candidate value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the candidate only when it strictly improves the best score.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [31]:
ground_truth_val = df_validation.to_dict(orient="records")

In [32]:
# performing hyperparameter tuning for the boost parameter

def minsearch_search(query, boost= None):
    """Query the module-level ``index`` for up to ten results, optionally boosted.

    ``boost`` is passed to the index as ``boost_dict``; ``None`` is replaced
    by an empty dict.
    """
    boost_dict = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [33]:
# parameters to tune

# Every text field gets the same boost search interval, 0.0 to 3.0.
param_ranges = {
    field: (0.0, 3.0)
    for field in (
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    )
}

# objective function: maximize the MRR metric

def objective_function(boost_params):
    """Return the MRR on ``ground_truth_val`` when searching with ``boost_params``."""
    metrics = evaluate(
        ground_truth_val,
        lambda question: minsearch_search(question, boost_params),
    )
    return metrics['mrr']

In [34]:
# running the tuning pipeline

simple_optimize(param_ranges, objective_function, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.2712194763992146,
  'type_of_activity': 2.242684967456583,
  'type_of_equipment': 1.029466052402638,
  'body_part': 1.949161024733541,
  'type': 0.09471758441928757,
  'muscle_groups_activated': 0.06839437488384126,
  'instructions': 1.0572757748219206},
 0.8419444444444446)

In [35]:
# now with the best hyperparameters on the full dataset
def minsearch_improved(query):
    """Query the module-level ``index`` for up to ten results with fixed field boosts.

    NOTE(review): these hard-coded weights differ from the best parameters
    printed by the tuning cell above. They presumably come from a different
    random-search run; confirm.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=dict(
            exercise_name=2.11,
            type_of_activity=1.46,
            type_of_equipment=0.65,
            body_part=2.65,
            type=1.31,
            muscle_groups_activated=2.54,
            instructions=0.74,
        ),
        num_results=10,
    )

# NOTE: ground_truth is the full question set, so it includes the 100 rows in
# df_validation that the tuning objective scores against. This result is not a
# purely held-out estimate; df_test holds the remaining rows.
evaluate(ground_truth, minsearch_improved)

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9468599033816425, 'mrr': 0.9030039874242772}

### Evaluating RAG workflow

Here, we will use the LLM-as-a-judge approach to evaluate the RAG workflow.

In [44]:
# Judge prompt for the LLM-as-a-judge evaluation. It is filled with
# `question` and `answer_llm` via str.format; the doubled braces {{ }} render
# as literal { } so the JSON skeleton survives formatting.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [40]:
len(ground_truth)

1035

In [45]:
# example of how prompt looks for one record

record = ground_truth[0]
question = record["question"]
answer_llm = rag(question)

prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What is the starting position for doing push-ups?
Generated Answer: The starting position for doing push-ups is to start in a high plank position with your hands under your shoulders.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [46]:
llm(prompt)

'{\n  "Relevance": "RELEVANT",\n  "Explanation": "The generated answer accurately describes the starting position for doing push-ups, which directly addresses the question asked."\n}'

In [47]:
import json

In [48]:
# sampling only 200 records

df_sample = df_question.sample(n=200, random_state=1)

In [49]:
samples = df_sample.to_dict(orient="records")

In [None]:
# evaluations = []

# for record in tqdm(samples):
#     question = record["question"]
#     answer_llm = rag(question)

#     prompt = prompt2_template.format(
#         question=question, 
#         answer_llm=answer_llm)

#     evaluation = llm(prompt)
#     evaluation = json.loads(evaluation)

#     evaluations.append((record, answer_llm, evaluation))

In [52]:
# getting the dataset from author after evaluation 

df_eval_gpt_mini = pd.read_csv('../data/rag-eval-gpt-4o-mini.csv')
df_eval_gpt_mini.head(5)


Unnamed: 0,answer,id,question,relevance,explanation
0,The primary muscle group activated during the ...,171,What is the primary muscle group activated dur...,RELEVANT,The generated answer accurately identifies the...
1,"Yes, jumping squats can be performed without a...",115,Can jumping squats be performed without any eq...,RELEVANT,The generated answer directly addresses the qu...
2,"To properly execute a Dumbbell Lateral Raise, ...",53,Can you explain the proper technique for execu...,RELEVANT,The generated answer provides a clear and deta...
3,To ensure your arms are fully extended during ...,198,How do I ensure my arms are fully extended dur...,RELEVANT,The generated answer provides detailed instruc...
4,The part of the body primarily engaged in Gobl...,19,What part of the body is primarily engaged in ...,RELEVANT,The generated answer accurately identifies tha...


In [53]:
df_eval_gpt_mini["relevance"].value_counts(normalize=True)

relevance
RELEVANT           0.835
PARTLY_RELEVANT    0.150
NON_RELEVANT       0.015
Name: proportion, dtype: float64

> 83.5% of the evaluated records received answers judged RELEVANT, which is a good result for our RAG workflow using gpt-4o-mini.