In [2]:
import os
import json
import typing as tp
import warnings

import pandas as pd
import numpy as np
import sentence_transformers as sts

# NOTE(review): this silences *every* warning for the rest of the kernel
# session, including pandas deprecation/dtype warnings that may matter.
# Consider narrowing it to a specific category or module.
warnings.filterwarnings("ignore")

# Tabular client profiles, read with pandas' default CSV parsing options.
client_profiles: pd.DataFrame = pd.read_csv("data/client_profiles.csv")

# Open the JSON inside a context manager so the file handle is closed as soon
# as parsing finishes (a bare `json.load(open(...))` leaves it open until the
# garbage collector gets to it). The encoding is pinned to UTF-8 so the result
# does not depend on the platform's locale default.
with open("data/200_sport_programs.json", encoding="utf-8") as programs_file:
    course_descriptions: tp.List[tp.Dict[str, tp.Any]] = json.load(programs_file)

In [4]:
def prepare_documents(raw_docs: tp.List[tp.Dict[str, tp.Any]]) -> tp.List[str]:
    """Flatten each raw document dict into a single text block.

    Every key/value pair becomes one ``"key: value"`` line (the value is
    converted with ``str``), and the lines of one document are joined with
    newlines in the dict's iteration order.

    Args:
        raw_docs: Documents as dictionaries.

    Returns:
        One formatted string per input document, in the same order.
    """

    def render(record: tp.Dict[str, tp.Any]) -> str:
        # `!s` forces str() on the value, exactly like an explicit str(...) call.
        return "\n".join(f"{key}: {value!s}" for key, value in record.items())

    return [render(record) for record in raw_docs]

In [14]:
def safe_str(val: tp.Any) -> str:
    """Convert a single value to display text, with a placeholder for missing data.

    Args:
        val: The value to render.

    Returns:
        * for a ``list``: its items, each passed through ``str``, joined by
          newlines (an empty list yields ``""``);
        * for a value that ``pd.isna`` reports as missing: ``"N/A"``;
        * otherwise ``str(val)``.
    """
    # The list check has to come first. For a list, pd.isna() returns an
    # element-wise boolean array; using that array in an `if` raises
    # ValueError for multi-element lists and would turn `[nan]` into "N/A",
    # so the list branch below could never be reached reliably.
    if isinstance(val, list):
        return "\n".join(map(str, val))
    if pd.isna(val):
        return "N/A"
    return str(val)

def stringify_profile(row):
    """Render one client-profile row as a multi-line text description.

    Each referenced column is looked up on ``row`` and passed through
    ``safe_str`` before being interpolated. The resulting lines are joined
    with newlines; there is no trailing newline.

    Args:
        row: A mapping-like row (e.g. what ``DataFrame.apply(..., axis=1)``
            passes in) that can be indexed by the column names used below.

    Returns:
        The formatted profile text.
    """

    def field(column):
        # Fetch a column from the row and normalise it to text.
        return safe_str(row[column])

    lines = [
        f"Name: {field('personal_data.full_name')}.",
        f"Gender: {field('basic_information.gender')}, Age: {field('basic_information.age')}",
        f"Height: {field('basic_information.height_cm')} cm, Weight: {field('basic_information.weight_kg')} kg.",
        f"Training goals: {field('training_goals')}.",
        # Experience level and recent frequency sit on two consecutive lines.
        f"Experience: {field('training_experience.level')}",
        f"({field('training_experience.frequency_last_3_months')}).",
        # Same two-line layout for the preferred location and its details.
        f"Prefers training at: {field('preferences.training_location')}",
        f"({field('preferences.location_details')}),",
        f"Session duration: {field('preferences.session_duration')}.",
        f"Joint/back problems: {field('health.joint_back_problems')},",
        f"Chronic conditions: {field('health.chronic_conditions')},",
        f"Health details: {field('health.health_details')}.",
        f"Strength training: {field('training_types.strength_training')}",
        f"Cardio: {field('training_types.cardio')}",
        f"HIIT: {field('training_types.hiit')}",
        f"Yoga/Pilates: {field('training_types.yoga_pilates')}",
        f"Functional training: {field('training_types.functional_training')}",
        f"Stretching: {field('training_types.stretching')}",
    ]
    return "\n".join(lines)

# Apply stringify_profile row by row (axis="columns" is the named form of
# axis=1) and store the resulting text in a new "stringified" column.
client_profiles["stringified"] = client_profiles.apply(
    stringify_profile, axis="columns"
)