In [1]:
import pandas as pd
import minsearch



In [2]:
import os
os.environ["SSL_CERT_FILE"] = "Fortinet_CA_SSL(15).cer"

In [3]:
df = pd.read_csv('data.csv')

In [4]:
documents = df.to_dict(orient='records')

In [5]:
documents[40]

{'id': 40,
 'exercise_name': 'Single-Leg Deadlift',
 'type_of_activity': 'Strength',
 'type_of_equipment': 'Dumbbells',
 'body_part': 'Lower Body',
 'type': 'Pull',
 'muscle_groups_activated': 'Hamstrings, Glutes, Lower Back',
 'instructions': 'Stand on one leg, holding a dumbbell in the opposite hand. Bend at the hips to lower the weight while extending the free leg behind you.'}

In [6]:
# Every descriptive column is indexed as free text; 'id' is registered as a keyword field.
searchable_columns = [
    'exercise_name',
    'type_of_activity',
    'type_of_equipment',
    'body_part',
    'type',
    'muscle_groups_activated',
    'instructions',
]
index = minsearch.Index(text_fields=searchable_columns, keyword_fields=['id'])

In [7]:
index.fit(documents)

<minsearch.Index at 0x79a8f4bc4440>

In [8]:
# Prompt used to generate ground-truth questions for one exercise record.
# The single-brace placeholders are filled with str.format(**record); the
# doubled braces around the JSON example render as literal braces.
prompt_template = """
You emulate a user of our fitness assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should be 
complete and not too short. Use as few words as possible from the record. 

The record: 

exercise_name: {exercise_name}
type_of_activity: {type_of_activity}
type_of_equipment: {type_of_equipment}
body_part: {body_part}
type: {type}
muscle_groups_activated: {muscle_groups_activated}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()



In [9]:
prompt = prompt_template.format(**documents[0])

In [10]:
print(prompt)

You emulate a user of our finess assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercises.
The record should contain the answer to the questions, and the questions should be 
complete and not too short. Use as fewer words as possible from the record. 

The record: 

exercise_name: Push-Ups
type_of_activity: Strength
type_of_equipment: Bodyweight
body_part: Upper Body
type: Push
muscle_groups_activated: Pectorals, Triceps, Deltoids
instructions: Start in a high plank position with your hands under your shoulders. Lower your body until your chest nearly touches the floor. Push back up to the starting position.

Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", ..., "question5"]}


# RAG Flow

In [11]:
from mistralai import Mistral
from mistralai.models import UserMessage
import os
from dotenv import load_dotenv
import json

In [12]:
# loads variables from .env
load_dotenv()  

True

In [13]:
api_key = os.getenv("API_KEY")

In [14]:
def llm(prompt, model='ministral-8b-latest'):
    """Send ``prompt`` as a single user message to a Mistral chat model.

    Args:
        prompt: Text placed in the user message.
        model: Model name passed to the chat completion call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # A new client is built on every call from the module-level ``api_key``.
    mistral_client = Mistral(api_key=api_key)
    user_messages = [UserMessage(content=prompt)]
    completion = mistral_client.chat.complete(model=model, messages=user_messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [15]:
questions = llm(prompt)

In [16]:
json.loads(questions)

{'questions': ['What is the starting position for push-ups?',
  'Which muscles are primarily activated during push-ups?',
  'How do you lower your body during a push-up?',
  'What is the final position after pushing back up?',
  'What equipment is needed for push-ups?']}

# Generate Questions

In [17]:
def generate_question(doc):
    """Ask the LLM for questions about one exercise record.

    Args:
        doc: Mapping that provides every placeholder used by
            ``prompt_template`` (extra keys are ignored by ``str.format``).

    Returns:
        The raw model reply as a string. The prompt asks for JSON, but the
        reply is neither parsed nor validated here.
    """
    prompt = prompt_template.format(**doc)
    # Reuse the shared ``llm`` helper instead of repeating the client setup;
    # only the model name differs from the helper's default.
    return llm(prompt, model='mistral-large-2411')


In [18]:
from tqdm.auto import tqdm


In [19]:
results = {}

In [29]:
# Generate questions for each document. Ids already present in `results` are
# skipped, so re-running this cell only fills in what is still missing.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions_raw = generate_question(doc)
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']


  0%|          | 0/207 [00:00<?, ?it/s]

In [30]:
results[0]

['What is the name of the exercise that involves lowering your body from a high plank position until your chest nearly touches the floor?',
 'Which type of activity does the exercise involving pectorals, triceps, and deltoids fall under?',
 'What kind of equipment is required for the exercise where you push your body back up to the starting position from a lowered plank?',
 'Which part of the body is primarily targeted by the exercise that activates pectorals, triceps, and deltoids?',
 'Can you describe the steps to perform the exercise that works the upper body using only bodyweight?']

In [31]:
# save the results

# Flatten {doc_id: [questions]} into (doc_id, question) pairs, one per question.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]


In [32]:
final_results[10]

(2, 'What muscle groups are activated during a Plank?')

In [33]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [34]:
df_results.to_csv('groud-truth-retrieval.csv', index=False)

In [35]:
!head groud-truth-retrieval.csv

id,question
0,What is the name of the exercise that involves lowering your body from a high plank position until your chest nearly touches the floor?
0,"Which type of activity does the exercise involving pectorals, triceps, and deltoids fall under?"
0,What kind of equipment is required for the exercise where you push your body back up to the starting position from a lowered plank?
0,"Which part of the body is primarily targeted by the exercise that activates pectorals, triceps, and deltoids?"
0,Can you describe the steps to perform the exercise that works the upper body using only bodyweight?
1,What muscles do squats primarily work on?
1,Do squats require any specific equipment?
1,Which part of the body do squats target?
1,How do you perform a squat correctly?


# Retrieval Evaluation

In [36]:
df_question = pd.read_csv('groud-truth-retrieval.csv')

In [37]:
df_question

Unnamed: 0,id,question
0,0,What is the name of the exercise that involves...
1,0,Which type of activity does the exercise invol...
2,0,What kind of equipment is required for the exe...
3,0,Which part of the body is primarily targeted b...
4,0,Can you describe the steps to perform the exer...
...,...,...
1030,206,What muscles does the Dumbbell Bench Press pri...
1031,206,What type of equipment is needed for the Dumbb...
1032,206,Which body part is worked by the Dumbbell Benc...
1033,206,How do you perform the Dumbbell Bench Press us...


In [38]:
ground_truth = df_question.to_dict(orient='records')


In [39]:
ground_truth[0]

{'id': 0,
 'question': 'What is the name of the exercise that involves lowering your body from a high plank position until your chest nearly touches the floor?'}

In [40]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains a relevant document.

    Args:
        relevance_total: Sequence of per-query lists of booleans, one flag
            per returned result.

    Returns:
        Number of lists containing ``True`` divided by the number of lists.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank over a set of queries.

    For each query the score is ``1 / rank`` of the first relevant result
    (ranks start at 1), or 0 when no result is relevant. Only the first hit
    counts: adding up every hit would over-count a query whose result list
    contains the relevant id more than once and could push the score above 1.

    Args:
        relevance_total: Sequence of per-query lists of booleans, one flag
            per returned result, in ranked order.

    Returns:
        The sum of per-query reciprocal ranks divided by the number of queries.
    """
    total_score = 0.0

    for flags in relevance_total:
        for position, is_relevant in enumerate(flags, start=1):
            if is_relevant:
                total_score += 1 / position
                break  # later hits must not add to this query's score

    return total_score / len(relevance_total)

In [41]:
def minsearch_search(query, boost=None):
    """Return the top 10 results for ``query`` from the module-level ``index``.

    Args:
        query: Query text passed to ``index.search``.
        boost: Optional mapping passed as ``boost_dict``. ``None`` (the
            default) sends an empty dict, which is what this function did
            before the parameter existed. The parameter matches the signature
            of the later redefinition in this notebook.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    results = index.search(
        query=query,
        filter_dict={},
        boost_dict={} if boost is None else boost,
        num_results=10,
    )

    return results

In [42]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each record's ``'id'`` is the
            document the search is expected to return.
        search_function: Callable that takes one record and returns an
            iterable of result dicts, each with an ``'id'`` key.

    Returns:
        Dict with ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    def _relevance_flags(record):
        # One boolean per returned result: does it match the expected document?
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    per_query_flags = [_relevance_flags(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_flags)
    metrics['mrr'] = mrr(per_query_flags)
    return metrics

In [43]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.9797101449275363, 'mrr': 0.8649490069779925}

# Finding the best parameters

In [44]:
# Positional split: the first 100 question rows are used for tuning, the rest are held out.
# NOTE(review): the CSV preview shows rows ordered by document id (five
# questions per id), so the first 100 rows appear to cover only the earliest
# documents rather than a random sample. Confirm whether shuffling is wanted.
df_validation = df_question.iloc[:100]
df_test = df_question.iloc[100:]

In [45]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: Mapping of parameter name to a ``(min_val, max_val)``
            pair. When both bounds are ints the value is drawn with
            ``randint`` (inclusive); otherwise it is drawn with ``uniform``.
        objective_function: Callable that takes a dict of sampled parameters
            and returns a score; higher is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed for a private random generator, making the search
            reproducible. ``None`` (the default) keeps using the module-level
            ``random`` functions, as before.

    Returns:
        ``(best_params, best_score)``. With ``n_iterations=0`` nothing is
        evaluated and the result is ``(None, float('-inf'))``.
    """
    # Without a seed, keep drawing from the global generator so an earlier
    # random.seed(...) call still controls the run.
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats this

    for _ in range(n_iterations):
        # Draw one candidate value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Keep the candidate only if it strictly improves the best score.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [46]:
gt_val = df_validation.to_dict(orient='records')

In [47]:
def minsearch_search(query, boost=None):
    """Return the top 10 results for ``query`` from the module-level ``index``.

    Args:
        query: Query text passed to ``index.search``.
        boost: Optional mapping passed as ``boost_dict``; ``None`` is
            replaced by an empty dict.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    field_weights = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=field_weights,
        num_results=10,
    )

In [48]:
# Every text field's boost is sampled from the same 0.0-3.0 interval.
param_ranges = {
    field: (0.0, 3.0)
    for field in (
        'exercise_name',
        'type_of_activity',
        'type_of_equipment',
        'body_part',
        'type',
        'muscle_groups_activated',
        'instructions',
    )
}

In [49]:
def objective(boost_params):
    """Score one set of boost weights on the validation questions.

    Args:
        boost_params: Mapping passed to ``minsearch_search`` as its boost
            argument.

    Returns:
        The ``'mrr'`` value reported by ``evaluate`` over ``gt_val``.
    """
    scores = evaluate(
        gt_val,
        lambda record: minsearch_search(record['question'], boost_params),
    )
    return scores['mrr']
        

In [50]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'exercise_name': 2.8680677352590713,
  'type_of_activity': 2.155045199162053,
  'type_of_equipment': 0.6202095792364494,
  'body_part': 2.322273524060162,
  'type': 2.2251659888513893,
  'muscle_groups_activated': 1.4368469636881422,
  'instructions': 1.9296650279967351},
 0.8911666666666668)

'mrr': 0.8649490069779925

In [51]:
# Apply the boost params in the whole dataset
def minsearh_improved(query):
    """Search the index with the boost weights found by the random search.

    The weights are the best parameters printed by the ``simple_optimize``
    run above, truncated to two decimals.

    Args:
        query: Query text to search for.

    Returns:
        The results returned by ``minsearch_search`` for the boosted query.
    """
    boost = {
        'exercise_name': 2.86,
        'type_of_activity': 2.15,
        'type_of_equipment': 0.62,
        'body_part': 2.32,
        'type': 2.22,
        'muscle_groups_activated': 1.43,
        'instructions': 1.92,
    }

    # Delegate to minsearch_search so the index.search call (empty filter,
    # 10 results) is defined in one place instead of being copied here.
    return minsearch_search(query, boost)


evaluate(ground_truth, lambda q: minsearh_improved(q['question']))

  0%|          | 0/1035 [00:00<?, ?it/s]

{'hit_rate': 0.978743961352657, 'mrr': 0.9417149758454105}

{'hit_rate': 0.9797101449275363, 'mrr': 0.8649490069779925}