# 03 — Model 1: Content-Based Filtering
## Fitness Workout Recommender

This model recommends exercises by matching user profiles to exercise features
using cosine similarity. No interaction history needed — solves cold-start!

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

sns.set_style('whitegrid')
np.random.seed(42)
print('Libraries loaded!')

Libraries loaded!


In [2]:
exercises = pd.read_csv('../data/processed/exercises_clean.csv')
users = pd.read_csv('../data/processed/user_profiles.csv')
interactions = pd.read_csv('../data/processed/user_exercise_interactions.csv')

print(f'Exercises: {len(exercises)}')
print(f'Users: {len(users)}')
print(f'Interactions: {len(interactions)}')
exercises.head()

Exercises: 52
Users: 1000
Interactions: 15000


Unnamed: 0,exercise_id,exercise_name,body_part,muscle_type,equipment,movement_type,movement_pattern,difficulty,sets_min,sets_max,reps_min,reps_max,est_duration_min
0,0,incline dumbbell press,chest,upper chest,dumbbells,compound,push,intermediate,3.0,4.0,8,12,7.4
1,1,incline cable crossovers,chest,upper chest,machine,isolation,push,beginner,3.0,4.0,10,15,8.0
2,2,incline dumbbell flyes,chest,upper chest,dumbbells,isolation,push,beginner,3.0,4.0,10,15,8.0
3,3,decline dumbbell press,chest,lower chest,dumbbells,compound,push,intermediate,3.0,4.0,8,12,7.4
4,4,decline cable crossovers,chest,lower chest,machine,isolation,push,beginner,3.0,4.0,10,15,8.0


## 1. Build Exercise Feature Vectors

We encode each exercise as a numerical vector using one-hot encoding
of its categorical attributes.

In [3]:
# Select categorical features to encode
feature_cols = ['body_part', 'equipment', 'movement_type', 'movement_pattern', 'difficulty']

# One-hot encode
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
exercise_features = encoder.fit_transform(exercises[feature_cols])
feature_names = encoder.get_feature_names_out(feature_cols)

print(f'Exercise feature matrix: {exercise_features.shape}')
print(f'Features: {list(feature_names)}')

# Preview as DataFrame
feat_df = pd.DataFrame(exercise_features, columns=feature_names)
feat_df.insert(0, 'exercise_name', exercises['exercise_name'].values)
feat_df.head()

Exercise feature matrix: (52, 24)
Features: ['body_part_abs', 'body_part_arms', 'body_part_back', 'body_part_chest', 'body_part_forearms', 'body_part_legs', 'body_part_shoulders', 'equipment_barbell', 'equipment_bodyweight', 'equipment_dumbbells', 'equipment_machine', 'equipment_other', 'equipment_pull-up bar', 'movement_type_compound', 'movement_type_isolation', 'movement_pattern_core', 'movement_pattern_hinge', 'movement_pattern_other', 'movement_pattern_pull', 'movement_pattern_push', 'movement_pattern_squat', 'difficulty_advanced', 'difficulty_beginner', 'difficulty_intermediate']


Unnamed: 0,exercise_name,body_part_abs,body_part_arms,body_part_back,body_part_chest,body_part_forearms,body_part_legs,body_part_shoulders,equipment_barbell,equipment_bodyweight,...,movement_type_isolation,movement_pattern_core,movement_pattern_hinge,movement_pattern_other,movement_pattern_pull,movement_pattern_push,movement_pattern_squat,difficulty_advanced,difficulty_beginner,difficulty_intermediate
0,incline dumbbell press,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,incline cable crossovers,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,incline dumbbell flyes,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,decline dumbbell press,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,decline cable crossovers,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## 2. Build User Preference Profiles

For each user, we create a "preference vector" in the same feature space as exercises.
This maps user attributes (goals, level, equipment) to exercise feature weights.

In [4]:
def build_user_profile(user, feature_names):
    """
    Map user attributes to a vector in exercise feature space.
    This is the core of content-based filtering.
    """
    profile = np.zeros(len(feature_names))
    fname_list = list(feature_names)
    
    # --- Body part preference ---
    focus = user['preferred_body_focus']
    body_map = {
        'upper': ['chest', 'back', 'shoulders', 'arms'],
        'lower': ['legs'],
        'core': ['core'],
        'full': ['chest', 'back', 'shoulders', 'arms', 'legs', 'core'],
    }
    for bp in body_map.get(focus, []):
        col = f'body_part_{bp}'
        if col in fname_list:
            profile[fname_list.index(col)] = 1.0 if focus != 'full' else 0.6
    
    # --- Equipment compatibility ---
    equip_map = {
        'bodyweight_only': {'bodyweight': 1.0, 'pull-up bar': 0.5},
        'dumbbells': {'bodyweight': 0.7, 'dumbbells': 1.0, 'pull-up bar': 0.3},
        'home_gym': {'bodyweight': 0.5, 'dumbbells': 0.8, 'barbell': 0.8, 'pull-up bar': 0.6},
        'full_gym': {'bodyweight': 0.4, 'dumbbells': 0.7, 'barbell': 0.9, 'machine': 1.0, 'pull-up bar': 0.6},
    }
    for equip, weight in equip_map.get(user['equipment_access'], {}).items():
        col = f'equipment_{equip}'
        if col in fname_list:
            profile[fname_list.index(col)] = weight
    
    # --- Difficulty preference ---
    diff_map = {
        'beginner': {'beginner': 1.0, 'intermediate': 0.3, 'advanced': 0.0},
        'intermediate': {'beginner': 0.2, 'intermediate': 1.0, 'advanced': 0.4},
        'advanced': {'beginner': 0.0, 'intermediate': 0.4, 'advanced': 1.0},
    }
    for diff, weight in diff_map.get(user['fitness_level'], {}).items():
        col = f'difficulty_{diff}'
        if col in fname_list:
            profile[fname_list.index(col)] = weight
    
    # --- Movement type preference based on goal ---
    goal = user['fitness_goal']
    if goal in ['muscle_gain', 'weight_loss']:
        col = 'movement_type_compound'
        if col in fname_list:
            profile[fname_list.index(col)] = 0.8
        col = 'movement_type_isolation'
        if col in fname_list:
            profile[fname_list.index(col)] = 0.5
    elif goal == 'endurance':
        if 'movement_type_compound' in fname_list:
            profile[fname_list.index('movement_type_compound')] = 0.6
        if 'movement_type_isolation' in fname_list:
            profile[fname_list.index('movement_type_isolation')] = 0.6
    elif goal == 'flexibility':
        if 'movement_type_isolation' in fname_list:
            profile[fname_list.index('movement_type_isolation')] = 0.8
    else:  # general_fitness
        if 'movement_type_compound' in fname_list:
            profile[fname_list.index('movement_type_compound')] = 0.6
        if 'movement_type_isolation' in fname_list:
            profile[fname_list.index('movement_type_isolation')] = 0.5
    
    return profile

# Build all user profiles
user_profiles = np.array([build_user_profile(row, feature_names) for _, row in users.iterrows()])
print(f'User profile matrix: {user_profiles.shape}')

# Preview one user
print(f'\nExample user: {users.iloc[0].to_dict()}')
print(f'Profile vector: {user_profiles[0]}')

User profile matrix: (1000, 24)

Example user: {'user_id': 'user_0000', 'fitness_goal': 'muscle_gain', 'fitness_level': 'beginner', 'equipment_access': 'dumbbells', 'time_per_session': 45, 'preferred_body_focus': 'full', 'age_group': 'adult'}
Profile vector: [0.  0.6 0.6 0.6 0.  0.6 0.6 0.  0.7 1.  0.  0.  0.3 0.8 0.5 0.  0.  0.
 0.  0.  0.  0.  1.  0.3]


## 3. Generate Recommendations

For each user, compute cosine similarity between their profile and all exercises,
then rank exercises by similarity score.

In [5]:
def recommend_content_based(user_idx, user_profiles, exercise_features, exercises_df, top_k=10, exclude_ids=None):
    """
    Recommend top-K exercises for a user using cosine similarity.
    """
    user_vec = user_profiles[user_idx].reshape(1, -1)
    similarities = cosine_similarity(user_vec, exercise_features).flatten()
    
    # Rank by similarity
    ranked_indices = np.argsort(similarities)[::-1]
    
    # Exclude already-interacted exercises if provided
    if exclude_ids is not None:
        ranked_indices = [i for i in ranked_indices if exercises_df.iloc[i]['exercise_id'] not in exclude_ids]
    
    top_indices = ranked_indices[:top_k]
    results = exercises_df.iloc[top_indices][['exercise_id', 'exercise_name', 'body_part', 'equipment', 'difficulty']].copy()
    results['similarity_score'] = similarities[top_indices]
    return results

# Test: Recommend for user 0
print(f'User 0: {users.iloc[0][["fitness_goal", "fitness_level", "equipment_access", "preferred_body_focus"]].to_dict()}')
print()
recs = recommend_content_based(0, user_profiles, exercise_features, exercises, top_k=10)
print('Top 10 Recommendations:')
recs

User 0: {'fitness_goal': 'muscle_gain', 'fitness_level': 'beginner', 'equipment_access': 'dumbbells', 'preferred_body_focus': 'full'}

Top 10 Recommendations:


Unnamed: 0,exercise_id,exercise_name,body_part,equipment,difficulty,similarity_score
35,35,dumbbell front raises,shoulders,dumbbells,beginner,0.598817
8,8,dumbbell flyes,chest,dumbbells,beginner,0.598817
2,2,incline dumbbell flyes,chest,dumbbells,beginner,0.598817
5,5,decline dumbbell flyes,chest,dumbbells,beginner,0.598817
17,17,dumbbell curls,arms,dumbbells,beginner,0.598817
14,14,hyperextensions,back,bodyweight,beginner,0.540867
33,33,standing calf raises,legs,bodyweight,beginner,0.540867
15,15,bird dog,back,bodyweight,beginner,0.540867
0,0,incline dumbbell press,chest,dumbbells,intermediate,0.521551
3,3,decline dumbbell press,chest,dumbbells,intermediate,0.521551


In [6]:
# Test a few different user types
test_users = [
    ('Beginner, bodyweight only', users[users['fitness_level'].eq('beginner') & users['equipment_access'].eq('bodyweight_only')].index[0]),
    ('Advanced, full gym', users[users['fitness_level'].eq('advanced') & users['equipment_access'].eq('full_gym')].index[0]),
    ('Intermediate, muscle gain', users[users['fitness_level'].eq('intermediate') & users['fitness_goal'].eq('muscle_gain')].index[0]),
]

for label, idx in test_users:
    print(f'=== {label} ===')
    print(f'Profile: {users.iloc[idx][["fitness_goal", "fitness_level", "equipment_access", "preferred_body_focus"]].to_dict()}')
    recs = recommend_content_based(idx, user_profiles, exercise_features, exercises, top_k=5)
    for _, row in recs.iterrows():
        print(f'  {row["exercise_name"]:40s} | {row["body_part"]:12s} | {row["equipment"]:12s} | {row["difficulty"]:12s} | sim={row["similarity_score"]:.3f}')
    print()

=== Beginner, bodyweight only ===
Profile: {'fitness_goal': 'endurance', 'fitness_level': 'beginner', 'equipment_access': 'bodyweight_only', 'preferred_body_focus': 'full'}
  bird dog                                 | back         | bodyweight   | beginner     | sim=0.649
  hyperextensions                          | back         | bodyweight   | beginner     | sim=0.649
  standing calf raises                     | legs         | bodyweight   | beginner     | sim=0.649
  leg raises                               | abs          | bodyweight   | beginner     | sim=0.527
  crunches                                 | abs          | bodyweight   | beginner     | sim=0.527

=== Advanced, full gym ===
Profile: {'fitness_goal': 'general_fitness', 'fitness_level': 'advanced', 'equipment_access': 'full_gym', 'preferred_body_focus': 'lower'}
  barbell squats                           | legs         | barbell      | advanced     | sim=0.662
  romanian deadlifts                       | legs         | 

## 4. Evaluate with Train/Test Split

Split interactions into train (80%) and test (20%).
Use test set as ground truth to evaluate ranking quality.

In [7]:
from sklearn.model_selection import train_test_split

# Split interactions per user
train_interactions = []
test_interactions = []

for user_id in interactions['user_id'].unique():
    user_ints = interactions[interactions['user_id'] == user_id]
    if len(user_ints) >= 4:  # need at least 4 to split
        train, test = train_test_split(user_ints, test_size=0.2, random_state=42)
        train_interactions.append(train)
        test_interactions.append(test)
    else:
        train_interactions.append(user_ints)

train_df = pd.concat(train_interactions).reset_index(drop=True)
test_df = pd.concat(test_interactions).reset_index(drop=True)

print(f'Train: {len(train_df)} interactions')
print(f'Test: {len(test_df)} interactions')

# Build ground truth: for each user, the exercises they liked in test set (rating >= 3)
ground_truth = {}
for user_id in test_df['user_id'].unique():
    user_test = test_df[test_df['user_id'] == user_id]
    liked = set(user_test[user_test['rating'] >= 3.0]['exercise_id'].values)
    if liked:
        ground_truth[user_id] = liked

print(f'Users with test ground truth: {len(ground_truth)}')

Train: 12000 interactions
Test: 3000 interactions
Users with test ground truth: 919


In [8]:
# --- Evaluation Metrics ---

def precision_at_k(recommended, relevant, k):
    top_k = recommended[:k]
    return len(set(top_k) & relevant) / k

def recall_at_k(recommended, relevant, k):
    if not relevant:
        return 0.0
    top_k = recommended[:k]
    return len(set(top_k) & relevant) / len(relevant)

def ndcg_at_k(recommended, relevant, k):
    top_k = recommended[:k]
    dcg = sum(1.0 / np.log2(i + 2) for i, item in enumerate(top_k) if item in relevant)
    ideal_k = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_k))
    return dcg / idcg if idcg > 0 else 0.0

def hit_rate_at_k(recommended, relevant, k):
    return 1.0 if set(recommended[:k]) & relevant else 0.0

print('Evaluation metrics defined!')

Evaluation metrics defined!


In [9]:
# Generate recommendations for all users and evaluate
user_id_to_idx = {uid: idx for idx, uid in enumerate(users['user_id'])}
K_VALUES = [5, 10]

results = {k: {'precision': [], 'recall': [], 'ndcg': [], 'hit_rate': []} for k in K_VALUES}

all_recommendations = {}  # Save for later comparison

for user_id, relevant in ground_truth.items():
    user_idx = user_id_to_idx[user_id]
    
    # Get exercises the user already interacted with in training
    train_ids = set(train_df[train_df['user_id'] == user_id]['exercise_id'].values)
    
    # Get recommendations (excluding training exercises)
    recs = recommend_content_based(user_idx, user_profiles, exercise_features, exercises, top_k=20, exclude_ids=train_ids)
    rec_ids = recs['exercise_id'].tolist()
    all_recommendations[user_id] = rec_ids
    
    for k in K_VALUES:
        results[k]['precision'].append(precision_at_k(rec_ids, relevant, k))
        results[k]['recall'].append(recall_at_k(rec_ids, relevant, k))
        results[k]['ndcg'].append(ndcg_at_k(rec_ids, relevant, k))
        results[k]['hit_rate'].append(hit_rate_at_k(rec_ids, relevant, k))

print('=== Content-Based Filtering Results ===')
print(f'{"Metric":<20} {"@5":>10} {"@10":>10}')
print('-' * 42)
for metric in ['precision', 'recall', 'ndcg', 'hit_rate']:
    vals = [np.mean(results[k][metric]) for k in K_VALUES]
    print(f'{metric:<20} {vals[0]:>10.4f} {vals[1]:>10.4f}')

=== Content-Based Filtering Results ===
Metric                       @5        @10
------------------------------------------
precision                0.1071     0.0937
recall                   0.2795     0.4857
ndcg                     0.2029     0.2828
hit_rate                 0.4744     0.7138


In [10]:
# Save results for final comparison
import json

cb_results = {}
for k in K_VALUES:
    for metric in ['precision', 'recall', 'ndcg', 'hit_rate']:
        cb_results[f'{metric}@{k}'] = float(np.mean(results[k][metric]))

with open('../results/metrics/content_based_results.json', 'w') as f:
    json.dump(cb_results, f, indent=2)

print('Results saved to results/metrics/content_based_results.json')
print(cb_results)

Results saved to results/metrics/content_based_results.json
{'precision@5': 0.10707290533188248, 'recall@5': 0.27947043888284373, 'ndcg@5': 0.20292696743667085, 'hit_rate@5': 0.47442872687704024, 'precision@10': 0.09368879216539716, 'recall@10': 0.48567283278926365, 'ndcg@10': 0.2828460157499206, 'hit_rate@10': 0.7138193688792165}


## 5. Summary

### Content-Based Filtering
**How it works:** Maps user attributes to exercise feature space, ranks by cosine similarity.

**Strengths:**
- No interaction history needed (handles cold-start)
- Fully interpretable — we can explain WHY each exercise was recommended
- Fast inference

**Weaknesses:**
- Limited by hand-crafted feature mappings
- Can't discover surprising preferences (only recommends "obvious" matches)
- Doesn't learn from user behavior

**Next:** Notebook 04 — Collaborative Filtering (learns from interaction patterns)