In [1]:
import os
import json
import typing as tp
import warnings
import random

import pandas as pd
import numpy as np

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/numpy deprecation
# warnings; consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings("ignore")

# Client profile table; later cells read its dotted column names
# (e.g. 'personal_data.full_name') to build a text description per row.
client_profiles: pd.DataFrame = pd.read_csv("data/client_profiles.csv")

# Read the program descriptions inside a context manager so the file handle is
# closed deterministically (the bare `json.load(open(...))` form leaves it open
# until garbage collection). JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open("data/200_sport_programs.json", encoding="utf-8") as programs_file:
    course_descriptions: tp.List[tp.Dict[str, tp.Any]] = json.load(programs_file)

In [2]:
def prepare_documents(raw_docs: tp.List[tp.Dict[str, tp.Any]]) -> tp.List[str]:
    """Flatten each raw document into a ``key: value`` text block.

    Args:
        raw_docs: Documents as dictionaries; values may be of any type and are
            converted with ``str``.

    Returns:
        One string per input document, in input order. Within a document the
        fields appear in the dictionary's own order, one ``key: value`` pair
        per line; an empty dictionary yields an empty string.
    """
    return [
        "\n".join(f"{field}: {value!s}" for field, value in entry.items())
        for entry in raw_docs
    ]

In [3]:
def safe_str(val: tp.Any) -> str:
    """Convert a profile field to display text, tolerating missing values.

    Args:
        val: A scalar cell value or a list of values.

    Returns:
        * for a list: its elements converted with ``str`` and joined by
          newlines (an empty list gives an empty string; elements are not
          checked for missing values);
        * ``"N/A"`` when ``pd.isna(val)`` is true (``None``, ``NaN``, ``pd.NA``,
          ``NaT``);
        * ``str(val)`` otherwise.
    """
    # The list check has to come first: for a list, pd.isna returns an
    # element-wise boolean array, and using that array in `if` raises
    # ValueError as soon as the list has more than one element.
    if isinstance(val, list):
        return "\n".join(map(str, val))
    if pd.isna(val):
        return "N/A"
    return str(val)

def stringify_profile(row):
    """Render one client-profile row as a multi-line text description.

    Every field is looked up in ``row`` by its dotted column name and passed
    through ``safe_str``. The result is the lines below joined by newlines,
    with no trailing newline.

    Args:
        row: A mapping-like row (as supplied by ``DataFrame.apply(axis=1)``)
            that supports ``row[column_name]`` for every column referenced.

    Returns:
        The formatted profile text.
    """

    def field(column):
        # Single place where a column is fetched and made printable.
        return safe_str(row[column])

    lines = [
        f"Name: {field('personal_data.full_name')}.",
        f"Gender: {field('basic_information.gender')}, Age: {field('basic_information.age')}",
        f"Height: {field('basic_information.height_cm')} cm, Weight: {field('basic_information.weight_kg')} kg.",
        f"Training goals: {field('training_goals')}.",
        f"Experience: {field('training_experience.level')}",
        f"({field('training_experience.frequency_last_3_months')}).",
        f"Prefers training at: {field('preferences.training_location')}",
        f"({field('preferences.location_details')}),",
        f"Session duration: {field('preferences.session_duration')}.",
        f"Joint/back problems: {field('health.joint_back_problems')},",
        f"Chronic conditions: {field('health.chronic_conditions')},",
        f"Health details: {field('health.health_details')}.",
        f"Strength training: {field('training_types.strength_training')}",
        f"Cardio: {field('training_types.cardio')}",
        f"HIIT: {field('training_types.hiit')}",
        f"Yoga/Pilates: {field('training_types.yoga_pilates')}",
        f"Functional training: {field('training_types.functional_training')}",
        f"Stretching: {field('training_types.stretching')}",
    ]
    return "\n".join(lines)

# Apply `stringify_profile` row by row (axis=1) and store the resulting text in
# the 'stringified' column of the same DataFrame; later cells read this column
# when building prompts.
client_profiles['stringified'] = client_profiles.apply(stringify_profile, axis=1)

---

#### Create metadata for each user profile to ensure user vector diversity

In [12]:
from prompts import GENERATE_USER_METADATA_PROMPT as prompt

# Program catalogue offered to the model. Ids are sequential ("P001", "P002",
# ...) in the order the names are listed, so they are derived from the position
# instead of being typed out by hand.
available_programs: tp.List[tp.Dict[str, str]] = [
    {"program_id": f"P{number:03d}", "name": title}
    for number, title in enumerate(
        (
            "Beginner Home Strength",
            "Cardio Burnout",
            "Yoga for Flexibility",
            "Senior Functional Fitness",
            "Boxing Bootcamp",
            "HIIT Express",
            "Pilates Core Builder",
            "CrossFit Challenge",
            "Bodybuilding Basics",
            "Stretch & Recover",
            "Swimming Endurance",
            "Cycling Power",
            "Dance Cardio",
            "Rehabilitation Mobility",
            "Morning Energy Yoga",
        ),
        start=1,
    )
]

# Short persona descriptions, one per line of the literal below. Surrounding
# whitespace is stripped from each line, so only the phrase text is kept.
seed_phrases: tp.List[str] = [
    phrase.strip()
    for phrase in """
    Motivated beginner eager to get fit
    Experienced athlete recovering from injury
    Busy professional with limited time for workouts
    Senior looking to improve mobility and balance
    Young adult training for a marathon
    New mother seeking postnatal fitness
    Student interested in stress relief and flexibility
    Overweight individual aiming for weight loss
    Fitness enthusiast exploring new training styles
    Person with joint issues needing low-impact routines
    Gym regular wanting to build muscle mass
    Outdoor lover preferring nature workouts
    Cardio fan aiming to boost endurance
    Yoga devotee seeking advanced poses
    Beginner intimidated by gym equipment
    Parent looking for family-friendly workouts
    Traveler needing portable, equipment-free routines
    Retiree focused on active longevity
    Person with diabetes managing health through exercise
    Night owl preferring late evening sessions
    """.strip().splitlines()
]
# Fixed seed and a dedicated generator so the seed phrases drawn for each
# profile (and therefore the whole prompt list) are identical on every
# Restart & Run All, without touching the global `random` state.
METADATA_SEED: int = 42
seed_phrase_rng = random.Random(METADATA_SEED)

# The catalogue text is the same for every prompt, so build it once here
# instead of re-joining it inside the loop.
program_catalog: str = "\n".join(
    f"{p['program_id']}: {p['name']}" for p in available_programs
)

# Three prompts per client profile, each pairing the profile text with a
# different randomly drawn seed phrase. Prompts for the same profile are
# adjacent and profiles keep their DataFrame order.
metadata_prompts: tp.List[str] = []
for profile_text in client_profiles["stringified"]:
    for seed_phrase in seed_phrase_rng.sample(seed_phrases, 3):
        metadata_prompts.append(
            prompt.format(
                user_profile=profile_text,
                seed_phrase=seed_phrase,
                available_programs=program_catalog,
            )
        )

In [None]:
from client_form_synt_data import get_response

# Reply text from the model, one entry per element of `metadata_prompts` and in
# the same order.
# NOTE(review): the original loop called `metadata.append()` with no argument,
# which raises TypeError on the first iteration, and annotated `metadata` as a
# list of lists. The intended parsing/grouping of replies is not visible in
# this notebook, so the raw reply text is stored as-is — confirm and add the
# post-processing step if nested lists are required downstream.
metadata: tp.List[str] = []

for user_prompt in metadata_prompts:
    response: str = get_response(
        model_name="gemma",
        user_prompt=user_prompt,
        system_prompt="You are a helpful assistant.",
    ).choices[0].message.content
    metadata.append(response)




# Smoke test: send a trivial greeting through `get_response` using the same
# model name and system prompt as the metadata loop. The call's return value is
# not assigned; when this is the last expression of a cell the notebook shows
# the returned object.
get_response(
    model_name="gemma",
    user_prompt="Hello, how are you?",
    system_prompt="You are a helpful assistant.",
)

ChatCompletion(id='chatcmpl-e7fd0ea0ed274113a9a586290b597553', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Hello! As an AI, I don't *experience* feelings, but I'm functioning optimally and ready to help! So you could say I'm doing well. 😊 \n\nHow are *you* doing today? Is there anything I can assist you with?\n\n\n\n", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=106)], created=1750859071, model='google/gemma-3-27b-it', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=58, prompt_tokens=22, total_tokens=80, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)