# 💪 FitMatch — Workout Plan Recommender System

---

## Step 1: Problem Definition

**Domain:** Fitness / Workout Recommendation

**What is being recommended?**  
Workout plans (e.g., "Morning Run Blast", "Full Body Strength") to users based on their fitness profile and past preferences.

**Who are the users?**  
Fitness enthusiasts with varying ages (18–59), fitness levels (beginner/intermediate/advanced), and goals (weight loss, muscle gain, endurance, flexibility).

**Objective:**  
Top-N recommendation (Top-5) — for each user, suggest 5 workout plans they are most likely to enjoy.

**Algorithms:**
1. **Most Popular** (Baseline) — Recommend plans with highest average rating
2. **User-Based Collaborative Filtering** (Primary) — Cosine similarity on user-item rating matrix
3. **Content-Based Filtering** (Bonus) — TF-IDF on plan descriptions + cosine similarity
4. **Hybrid** (Bonus) — Weighted blend of CF + Content-Based

**Justification:**  
User-Based CF is ideal for this domain because users with similar fitness profiles tend to enjoy similar workouts. The content-based approach adds value by capturing workout attribute similarity, and the hybrid model combines the strengths of both.

---
## Step 2: Data Preparation

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress all warnings from here on; note this also hides deprecation
# notices that would otherwise appear in cell output.
warnings.filterwarnings('ignore')

# Import our modules
# `generate_dataset` and `recommender` are project-local modules (their
# source is not part of this notebook).
from generate_dataset import generate_users, generate_workout_plans, generate_ratings
from recommender import (
    MostPopularRecommender,
    UserBasedCFRecommender,
    ContentBasedRecommender,
    HybridRecommender,
    split_train_test,
    evaluate_model,
    precision_at_k,
    recall_at_k
)

# Only reached when every import above resolved without raising.
print('All modules imported successfully!')

In [None]:
# Build the users, plans, and ratings tables from a fixed NumPy seed so the
# notebook produces the same dataset on every run.
np.random.seed(42)

users = generate_users(n=100)
plans = generate_workout_plans(n=60)
ratings = generate_ratings(users, plans)

n_users = len(users)
n_plans = len(plans)
n_ratings = len(ratings)

print(f'Users:    {n_users}')
print(f'Plans:    {n_plans}')
print(f'Ratings:  {n_ratings}')

# Sparsity = share of the user x plan grid that has no rating row.
total_possible = n_users * n_plans
sparsity = 1 - n_ratings / total_possible
print(f'\nTotal possible interactions: {total_possible}')
print(f'Dataset Sparsity: {sparsity:.2%}')

In [None]:
# Explore user attributes: age spread plus fitness-level and goal counts.
ages = users['age']

print('=== User Demographics ===')
# The range separator is a real en dash (the original literal was mis-encoded).
print(f'\nAge range: {ages.min()} – {ages.max()}')
print(f'Mean age: {ages.mean():.1f}')
print('\nFitness Level Distribution:')
print(users['fitness_level'].value_counts())
print('\nGoal Distribution:')
print(users['goal'].value_counts())

# Last expression of the cell: rendered by the notebook as a table preview.
users.head(10)

In [None]:
# Explore workout plans: category counts and the duration span.
durations = plans['duration_min']

print('=== Workout Plans ===')
print('\nPlan Types:')
print(plans['type'].value_counts())
print('\nDifficulty Distribution:')
print(plans['difficulty'].value_counts())
print('\nTarget Goals:')
print(plans['target_goal'].value_counts())
# The range separator is a real en dash (the original literal was mis-encoded).
print(f'\nDuration range: {durations.min()} – {durations.max()} minutes')

# Last expression of the cell: rendered by the notebook as a table preview.
plans[['plan_id', 'name', 'type', 'difficulty', 'duration_min', 'target_goal']].head(10)

In [None]:
# Explore ratings: value histogram, overall mean, and per-user / per-plan
# rating counts.
print('=== Ratings ===')
print('\nRating Distribution:')
print(ratings['rating'].value_counts().sort_index())
print(f'\nMean rating: {ratings["rating"].mean():.2f}')

# Group once per key and reuse the result for min / max / mean instead of
# re-running the same groupby three times.
per_user = ratings.groupby('user_id').size()
per_plan = ratings.groupby('plan_id').size()

# The separators below are real em dashes (the original literals were
# mis-encoded).
print(f'Ratings per user  — min: {per_user.min()}, '
      f'max: {per_user.max()}, '
      f'mean: {per_user.mean():.1f}')
print(f'Ratings per plan  — min: {per_plan.min()}, '
      f'max: {per_plan.max()}, '
      f'mean: {per_plan.mean():.1f}')

# Last expression of the cell: rendered by the notebook as a table preview.
ratings.head(10)

In [None]:
# Draw four side-by-side bar charts: rating values, user fitness levels,
# user goals, and plan types.
fig, axes = plt.subplots(1, 4, figsize=(18, 4))

# (counts to plot, panel title, x-axis label, bar colour) for each panel,
# in left-to-right order. Only the rating counts are sorted by index.
panels = [
    (ratings['rating'].value_counts().sort_index(),
     'Rating Distribution', 'Rating', '#667eea'),
    (users['fitness_level'].value_counts(),
     'Fitness Level Distribution', 'Level', '#764ba2'),
    (users['goal'].value_counts(),
     'User Goal Distribution', 'Goal', '#f093fb'),
    (plans['type'].value_counts(),
     'Plans by Type', 'Type', '#4facfe'),
]

for ax, (counts, title, xlabel, color) in zip(axes, panels):
    counts.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

# Only the first panel carries an explicit y-axis label.
axes[0].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Hold out 20% of the ratings for evaluation (seeded for reproducibility).
train, test = split_train_test(ratings, test_size=0.2, seed=42)

n_all = len(ratings)
train_share = len(train) / n_all
test_share = len(test) / n_all
n_train_users = train['user_id'].nunique()
n_test_users = test['user_id'].nunique()

print(f'Train set: {len(train)} ratings')
print(f'Test set:  {len(test)} ratings')
print(f'Ratio:     {train_share:.0%} / {test_share:.0%}')
print(f'\nUsers in train: {n_train_users}')
print(f'Users in test:  {n_test_users}')

---
## Step 3: Model Development

In [None]:
# 1. Most Popular Recommender (Baseline)
# Fit on the training ratings only, then show the first ten rows of the
# model's `popular_plans` table joined with readable plan names.
pop_model = MostPopularRecommender()
pop_model.fit(train)

print('=== Most Popular Baseline ===')
print('Top 10 most popular workout plans:')

plan_labels = plans[['plan_id', 'name', 'type']]
leading_plans = pop_model.popular_plans.head(10)
top_popular = leading_plans.merge(plan_labels, on='plan_id')

# Last expression of the cell: rendered by the notebook as a table.
top_popular[['plan_id', 'name', 'type', 'mean', 'count']]

In [None]:
# 2. User-Based Collaborative Filtering
# Fit with 20 neighbours, report the fitted matrix shapes, then list the
# five users most similar to the first user the model knows about.
cf_model = UserBasedCFRecommender(k_neighbors=20)
cf_model.fit(train)

item_matrix_shape = cf_model.user_item_matrix.shape
sim_matrix_shape = cf_model.similarity_matrix.shape

print('=== User-Based CF ===')
print(f'User-Item matrix shape: {item_matrix_shape}')
print(f'Similarity matrix shape: {sim_matrix_shape}')

# Drop the user's own entry so they are not listed as their own neighbour.
user1 = cf_model.user_ids[0]
sims_to_others = cf_model.similarity_matrix[user1].drop(user1)
top_similar = sims_to_others.nlargest(5)

print(f'\nTop 5 similar users to User #{user1}:')
for uid, sim in top_similar.items():
    print(f'  User #{uid}: similarity = {sim:.4f}')

In [None]:
# 3. Content-Based Recommender (TF-IDF)
cb_model = ContentBasedRecommender()
cb_model.fit(plans, train)

print('=== Content-Based (TF-IDF) ===')
print(f'TF-IDF matrix shape: {cb_model.tfidf_matrix.shape}')

# Show plan similarity example.
# Take the reference plan's id from the same row whose name is printed, so the
# heading and the similarity lookup always refer to the same plan (the
# original hard-coded plan id 1, which only matches when the first plan
# happens to have that id).
ref_plan = plans.iloc[0]
ref_pid = ref_plan['plan_id']
print(f'\nMost similar plans to "{ref_plan["name"]}":')

# Drop the plan's own entry so it is not reported as most similar to itself.
sim_scores = cb_model.similarity_matrix[ref_pid].drop(ref_pid).nlargest(5)

# Build the id -> name lookup once instead of filtering `plans` per result.
# Keeping the first row per plan_id mirrors the original `.values[0]` lookup.
plan_names = plans.drop_duplicates('plan_id').set_index('plan_id')['name']
for pid, sim in sim_scores.items():
    plan_name = plan_names.loc[pid]
    print(f'  Plan #{pid} ({plan_name}): similarity = {sim:.4f}')

In [None]:
# 4. Hybrid Recommender (CF + Content-Based)
# Keep the blend weights in one place so the printed summary can never drift
# from the values actually passed to the model.
cf_weight, cb_weight = 0.6, 0.4

hybrid_model = HybridRecommender(cf_weight=cf_weight, cb_weight=cb_weight)
hybrid_model.fit(train, plans)

print('=== Hybrid Recommender ===')
print(f'Weights: CF = {cf_weight}, Content-Based = {cb_weight}')
print('Model trained successfully!')

In [None]:
# Generate sample recommendations from all models for User #1
sample_uid = 1
user_info = users[users['user_id'] == sample_uid].iloc[0]
print(f'Sample User #{sample_uid}: Age={user_info["age"]}, '
      f'Fitness={user_info["fitness_level"]}, Goal={user_info["goal"]}')
print('=' * 70)

# Display name -> fitted model. Later cells iterate over this mapping too.
models = {
    'Most Popular': pop_model,
    'User-Based CF': cf_model,
    'Content-Based': cb_model,
    'Hybrid': hybrid_model
}

# Index plans by id once; `.loc[rec_ids]` then returns the rows in the order
# the model ranked them, so no separate `isin` pre-filter is needed.
plans_by_id = plans.set_index('plan_id')

for name, model in models.items():
    rec_ids = model.recommend(sample_uid, train, n=5)
    rec_plans = plans_by_id.loc[rec_ids].reset_index()
    print(f'\n--- {name} ---')
    for _, row in rec_plans.iterrows():
        # The bullet is a real "•" (the original literal was mis-encoded).
        print(f'  • {row["name"]} ({row["type"]}, {row["difficulty"]}, '
              f'{row["duration_min"]}min, goal={row["target_goal"]})')

---
## Step 4: Evaluation

We evaluate all models using **Precision@5** and **Recall@5** on the held-out test set.

- **Precision@5** = fraction of the 5 recommended items that appear in the user's relevant test items (rated ≥ 4)
- **Recall@5** = fraction of the user's relevant test items that appear in the top-5 recommendations

In [1]:
# Score every model in `models` on the held-out test set at k=5, treating
# ratings of 4 or more as relevant, and collect the metrics per model name.
print('Evaluating models (this may take a moment)...\n')

results = {}
for name, model in models.items():
    metrics = evaluate_model(model, train, test, k=5, threshold=4)
    results[name] = metrics

    precision = metrics["Precision@5"]
    recall = metrics["Recall@5"]
    n_evaluated = metrics["Users Evaluated"]
    print(f'{name:20s}  Precision@5={precision:.4f}  '
          f'Recall@5={recall:.4f}  '
          f'(evaluated on {n_evaluated} users)')

# Comparison table: one row per model, one column per metric.
banner = '=' * 70
print('\n' + banner)
print('PERFORMANCE COMPARISON TABLE')
print(banner)
comparison = pd.DataFrame(results).T
comparison.index.name = 'Model'
# Last expression of the cell: rendered by the notebook as a table.
comparison

Evaluating models (this may take a moment)...



NameError: name 'models' is not defined

In [None]:
# Plot Precision@5 and Recall@5 per model as two horizontal bar charts,
# annotating each bar with its value.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

model_names = list(results)
precisions = [results[m]['Precision@5'] for m in model_names]
recalls = [results[m]['Recall@5'] for m in model_names]
colors = ['#667eea', '#764ba2', '#f093fb', '#4facfe']

# (panel title, x-axis label, values) for the left and right panel.
panels = [
    ('Precision@5 Comparison', 'Precision@5', precisions),
    ('Recall@5 Comparison', 'Recall@5', recalls),
]

for ax, (title, xlabel, values) in zip(axes, panels):
    ax.barh(model_names, values, color=colors)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    # Place the numeric label just past the end of each bar.
    for pos, val in enumerate(values):
        ax.text(val + 0.002, pos, f'{val:.4f}', va='center')

plt.tight_layout()
plt.show()

---
## Step 5: Demonstration

Below we show detailed recommendations for **3 different users** with different fitness profiles.

In [None]:
# Select 3 diverse demo users: the first user found for each of three goals.
demo_user_ids = []
for goal in ['weight_loss', 'muscle_gain', 'endurance']:
    uid = users[users['goal'] == goal].iloc[0]['user_id']
    demo_user_ids.append(uid)

# For each demo user print their profile, their training ratings, and the
# top-5 recommendations from the user-based CF model.
# All emoji and the arrow below are the intended characters; the original
# literals were mis-encoded.
for uid in demo_user_ids:
    user = users[users['user_id'] == uid].iloc[0]
    user_train_ratings = train[train['user_id'] == uid]

    print('=' * 70)
    print(f'👤 USER #{uid}')
    print(f'   Age: {user["age"]}  |  Fitness: {user["fitness_level"]}  |  Goal: {user["goal"]}')
    print(f'   Training ratings: {len(user_train_ratings)}')
    print('-' * 70)

    # Show what they rated in training
    rated_plans = user_train_ratings.merge(plans[['plan_id', 'name', 'type']], on='plan_id')
    print('\n   📝 Plans they rated (training set):')
    for _, r in rated_plans.iterrows():
        # Coerce to int: string repetition raises TypeError on a float
        # rating such as 4.0.
        stars = '⭐' * int(r['rating'])
        print(f'      {r["name"]:35s} ({r["type"]:15s}) → {stars}')

    # Show CF recommendations
    print('\n   🤖 Top 5 User-Based CF Recommendations:')
    cf_recs = cf_model.recommend(uid, train, n=5)
    for i, pid in enumerate(cf_recs, 1):
        p = plans[plans['plan_id'] == pid].iloc[0]
        print(f'      {i}. {p["name"]:35s} | {p["type"]:15s} | '
              f'{p["difficulty"]:12s} | {p["duration_min"]}min | 🎯 {p["target_goal"]}')

    # Brief explanation
    # NOTE(review): the wording implies neighbours share this user's goal and
    # fitness level; confirm the CF model actually uses those attributes.
    print(f'\n   💡 Explanation: These plans are recommended because users with similar')
    print(f'      rating patterns (who also enjoy {user["goal"].replace("_", " ")} workouts')
    print(f'      at the {user["fitness_level"]} level) rated these plans highly.')
    print()

In [None]:
# Final comparison summary table
print('\n' + '=' * 70)
print('FINAL PERFORMANCE COMPARISON')
print('=' * 70)

# Key in `results` -> label shown in the table. Driving every column from
# this one mapping keeps names and metric values aligned.
display_names = {
    'Most Popular': 'Most Popular (Baseline)',
    'User-Based CF': 'User-Based CF (Primary)',
    'Content-Based': 'Content-Based TF-IDF (Bonus)',
    'Hybrid': 'Hybrid CF+CB (Bonus)',
}

summary = pd.DataFrame({
    'Model': list(display_names.values()),
    'Precision@5': [results[key]['Precision@5'] for key in display_names],
    'Recall@5': [results[key]['Recall@5'] for key in display_names],
})

print(summary.to_string(index=False))

# Derive the headline from the measured metrics instead of asserting it
# unconditionally: CF "outperforms" the baseline only when it is at least as
# good on both metrics and strictly better on at least one.
baseline = results['Most Popular']
cf = results['User-Based CF']
cf_scores = (cf['Precision@5'], cf['Recall@5'])
baseline_scores = (baseline['Precision@5'], baseline['Recall@5'])
cf_beats_baseline = (
    cf_scores[0] >= baseline_scores[0]
    and cf_scores[1] >= baseline_scores[1]
    and cf_scores != baseline_scores
)

print('\n\nConclusion:')
if cf_beats_baseline:
    print('The User-Based Collaborative Filtering model outperforms the Most Popular')
    print('baseline by providing personalized recommendations. The Hybrid model')
    print('combines the strengths of both CF and Content-Based approaches.')
else:
    print('On this split the User-Based Collaborative Filtering model does not')
    print('outperform the Most Popular baseline on both Precision@5 and Recall@5;')
    print('see the table above for the measured values. The Hybrid model')
    print('combines the strengths of both CF and Content-Based approaches.')