In [2]:
import pandas as pd
import minsearch



In [3]:
df = pd.read_csv('data.csv')

In [4]:
documents = df.to_dict(orient='records')

In [5]:
documents[40]

{'id': 40,
 'exercise_name': 'Single-Leg Deadlift',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Dumbbells',
 'body_part': 'Lower Body',
 'type': 'Pull',
 'muscle_groups_activated': 'Hamstrings, Glutes, Lower Back',
 'instructions': 'Stand on one leg, holding a dumbbell in the opposite hand. Bend at the hips to lower the weight while extending the free leg behind you.'}

In [6]:
index = minsearch.Index(
    text_fields=['exercise_name', 'type_of_activity', 'type_of_equipment', 'body_part',
       'type', 'muscle_groups_activated', 'instructions'],
    keyword_fields=['id']
)

In [7]:
index.fit(documents)

<minsearch.Index at 0x755540191010>

In [8]:
prompt_template = """
You emulate a user of our finess assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercises.
The record should contain the answer to the questions, and the questions should be 
complete and not too short. Use as fewer words as possible from the record. 

The record: 

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()



In [9]:
prompt = prompt_template.format(**documents[0])

In [10]:
print(prompt)

You emulate a user of our finess assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercises.
The record should contain the answer to the questions, and the questions should be 
complete and not too short. Use as fewer words as possible from the record. 

The record: 

exercise_name: Push-Ups
type_of_activity: Strength
type_of_equipment: Bodyweight
body_part: Upper Body
type: Push
muscle_groups_activated: Pectorals, Triceps, Deltoids
instructions: Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.

Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", ..., "question5"]}


# RAG Flow

In [11]:
from mistralai import Mistral
from mistralai.models import UserMessage
import os
from dotenv import load_dotenv
import json

In [12]:
# loads variables from .env
load_dotenv()  

True

In [13]:
api_key = os.getenv("API_KEY")

In [14]:
def llm(prompt, model='mistral-large-2411'):
    client = Mistral(api_key = api_key)
    response = client.chat.complete(
        model= model,
        messages=[UserMessage(content=prompt)],
    )    
    return response.choices[0].message.content

In [15]:
questions = llm(prompt)

In [16]:
json.loads(questions)

{'questions': ['Which muscle groups are activated when performing Push-Ups?',
  'What type of equipment is needed to perform Push-Ups?',
  'What is the starting position for Push-Ups?',
  'Which part of the body does Push-Ups primarily target?',
  'What is the movement type involved in Push-Ups?']}

# Generate Questions

In [17]:
def generate_question(doc):
    prompt = prompt_template.format(**doc)
    client = Mistral(api_key = api_key)
    response = client.chat.complete(
        model= 'open-mistral-nemo',
        messages=[UserMessage(content=prompt)],
    )    
    json_response = response.choices[0].message.content
    return json_response

In [18]:
from tqdm.auto import tqdm

In [128]:
results = {}

In [129]:
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue 
    questions_raw = generate_question(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/207 [00:00<?, ?it/s]

In [140]:
results[0]

['What muscles are primarily targeted during push-ups?',
 'How low should I go in a push-up to ensure proper form?',
 'Can I do push-ups without any equipment?',
 'What is the starting position for a push-up?',
 'How many push-ups should I aim for in a set?']

In [141]:
# save the results

final_results = []
for doc_id, qs in results.items():
    for q in qs:
        final_results.append((doc_id, q))


In [142]:
final_results[10]

(2, 'What is the primary muscle group targeted in this exercise?')

In [145]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [147]:
df_results.to_csv('groud-truth-retrieval.csv', index=False)

In [148]:
!head groud-truth-retrieval.csv

id,question
0,What muscles are primarily targeted during push-ups?
0,How low should I go in a push-up to ensure proper form?
0,Can I do push-ups without any equipment?
0,What is the starting position for a push-up?
0,How many push-ups should I aim for in a set?
1,What are the primary muscle groups targeted during squats?
1,How should I position my feet for proper form during squats?
1,What is the main movement type of a squat?
1,What should I imagine to maintain a straight back while squatting?


# Retrieval Evaluation

In [43]:
df_question = pd.read_csv('groud-truth-retrieval.csv')

In [44]:
df_question

Unnamed: 0,id,question
0,0,What muscles are primarily targeted during pus...
1,0,How low should I go in a push-up to ensure pro...
2,0,Can I do push-ups without any equipment?
3,0,What is the starting position for a push-up?
4,0,How many push-ups should I aim for in a set?
...,...,...
1030,206,What muscles are primarily targeted in the Dum...
1031,206,How many dumbbells are used in this exercise?
1032,206,What is the direction of the movement when low...
1033,206,What body part is the Dumbbell Bench Press bes...


In [45]:
ground_truth = df_question.to_dict(orient='records')


In [46]:
ground_truth[0]

{'id': 0, 'question': 'What muscles are primarily targeted during push-ups?'}

In [47]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [48]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [49]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [50]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.4502415458937198, 'mrr': 0.3409592822636299}

# Finding the best paramters

In [51]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [62]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [63]:
gt_val = df_validation.to_dict(orient='records')

In [64]:
def minsearch_search(query, boost=None):
    
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [68]:
param_ranges = {
 'exercise_name': (0.0, 3.0),
 'type_of_activity': (0.0, 3.0),
 'type_of_equipment': (0.0, 3.0),
 'body_part': (0.0, 3.0),
 'type': (0.0, 3.0),
 'muscle_groups_activated': (0.0, 3.0),
 'instructions': (0.0, 3.0)
}

In [69]:
def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']
        

In [70]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.5849030051781536,
  'type_of_activity': 2.155834026824828,
  'type_of_equipment': 2.928957961281172,
  'body_part': 0.9060619449185641,
  'type': 1.6174118087979952,
  'muscle_groups_activated': 0.04807886542150075,
  'instructions': 1.7014512045047296},
 0.4590396825396826)

'mrr': 0.3409592822636299

In [73]:
# Apply the boost params in the whole dataset
def minsearh_improved(query):
    
    boost = {
          'exercise_name': 2.58,
          'type_of_activity': 2.15,
          'type_of_equipment': 2.92,
          'body_part': 0.90,
          'type': 1.61,
          'muscle_groups_activated': 0.04,
          'instructions': 1.70
        }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results


evaluate(ground_truth, lambda q: minsearh_improved(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.4579710144927536, 'mrr': 0.3408327582240625}

{'hit_rate': 0.4502415458937198, 'mrr': 0.3409592822636299}