# Exercise Data Exploration

This notebook explores and visualizes the exercise data fetched from the ExerciseDB and wger APIs.

In [3]:
# Import libraries
import ast
import io
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from IPython.display import Image, display
from PIL import Image as PILImage

# Set plot style
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')
%matplotlib inline

## Load Data

Let's load and examine the fetched data.

In [5]:
# Raw data locations; the "not found" messages below point at fetch_data.py
# as the script that produces these files.
raw_data_dir = "data/raw"
exercisedb_dir = f"{raw_data_dir}/exercisedb"
wger_dir = f"{raw_data_dir}/wger"

# Each source is optional: when its CSV is missing the frame is set to None,
# which later cells test before touching the data.
exercisedb_csv = f"{exercisedb_dir}/all_exercises.csv"
if not os.path.exists(exercisedb_csv):
    print("ExerciseDB data not found. Run the fetch_data.py script first.")
    exercisedb_df = None
else:
    exercisedb_df = pd.read_csv(exercisedb_csv)
    print(f"ExerciseDB data loaded: {len(exercisedb_df)} exercises")

wger_csv = f"{wger_dir}/all_exercises.csv"
if not os.path.exists(wger_csv):
    print("Wger data not found. Run the fetch_data.py script first.")
    wger_df = None
else:
    wger_df = pd.read_csv(wger_csv)
    print(f"Wger data loaded: {len(wger_df)} exercises")

ExerciseDB data not found. Run the fetch_data.py script first.
Wger data not found. Run the fetch_data.py script first.


## Explore ExerciseDB Data

Let's explore the structure and content of the ExerciseDB data.

In [6]:
if exercisedb_df is not None:
    # Print the frame's summary via DataFrame.info(); the whole cell is a
    # no-op when the raw ExerciseDB CSV was not loaded (exercisedb_df is None).
    print("ExerciseDB DataFrame Info:")
    exercisedb_df.info()

In [7]:
if exercisedb_df is not None:
    # Jupyter only auto-renders the last *top-level* expression of a cell.
    # An expression nested inside an `if` block is evaluated and discarded,
    # so the preview has to be rendered explicitly with display().
    print("\nSample ExerciseDB Data:")
    display(exercisedb_df.head())

### Distribution of Exercises by Body Part

In [9]:
if exercisedb_df is not None:
    # Exercises per body part, in value_counts() order (most frequent first)
    bodypart_counts = exercisedb_df['bodyPart'].value_counts()

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=bodypart_counts.index, y=bodypart_counts.values, ax=ax)
    ax.set_title('Number of Exercises by Body Part')
    ax.set_xlabel('Body Part')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()

    # Write each bar's count just above it (offset of 5 in data units)
    for bar_pos, n_exercises in enumerate(bodypart_counts.values):
        ax.text(bar_pos, n_exercises + 5, str(n_exercises), ha='center')

    plt.show()

### Distribution of Exercises by Target Muscle

In [10]:
if exercisedb_df is not None:
    # Only the 20 most frequent target muscles are plotted, for readability
    target_counts = exercisedb_df['target'].value_counts().head(20)

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(x=target_counts.index, y=target_counts.values, ax=ax)
    ax.set_title('Number of Exercises by Target Muscle (Top 20)')
    ax.set_xlabel('Target Muscle')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()

    # Write each bar's count just above it (offset of 2 in data units)
    for bar_pos, n_exercises in enumerate(target_counts.values):
        ax.text(bar_pos, n_exercises + 2, str(n_exercises), ha='center')

    plt.show()

### Distribution of Exercises by Equipment

In [11]:
if exercisedb_df is not None:
    # Exercises per equipment type, in value_counts() order (most frequent first)
    equipment_counts = exercisedb_df['equipment'].value_counts()

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=equipment_counts.index, y=equipment_counts.values, ax=ax)
    ax.set_title('Number of Exercises by Equipment')
    ax.set_xlabel('Equipment')
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()

    # Write each bar's count just above it (offset of 5 in data units)
    for bar_pos, n_exercises in enumerate(equipment_counts.values):
        ax.text(bar_pos, n_exercises + 5, str(n_exercises), ha='center')

    plt.show()

### Visualize Exercise GIFs

Let's display some exercise GIFs to get a better understanding of the data.

In [10]:
def display_exercise_gif(exercise_row, timeout=10):
    """Print an exercise's metadata and render its GIF inline.

    Parameters
    ----------
    exercise_row : mapping
        A record (e.g. a DataFrame row) providing the keys 'gifUrl', 'name',
        'bodyPart', 'target', 'equipment' and 'instructions'.
    timeout : float, optional
        Seconds to wait for the image server before giving up. Without a
        timeout, requests can block indefinitely on an unresponsive host and
        stall the whole notebook run.

    Notes
    -----
    A missing 'gifUrl' key raises KeyError. Every failure after that point
    (network error, HTTP error status, undecodable image, other missing keys)
    is reported as a printed message instead of being raised, so one bad row
    does not abort a loop over several exercises.
    """
    url = exercise_row['gifUrl']

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decoding with PIL validates the payload and reveals its real format.
        img = PILImage.open(io.BytesIO(response.content))

        # Display exercise info
        print(f"Name: {exercise_row['name']}")
        print(f"Body Part: {exercise_row['bodyPart']}")
        print(f"Target Muscle: {exercise_row['target']}")
        print(f"Equipment: {exercise_row['equipment']}")
        print("\nInstructions:")
        print(exercise_row['instructions'])

        # Displaying a PIL image goes through its PNG representation, which
        # is a single static frame. For real GIFs, hand the raw bytes to
        # IPython's Image so the front end can play the animation.
        if img.format == 'GIF':
            display(Image(data=response.content, format='gif'))
        else:
            display(img)

    except Exception as e:
        print(f"Error loading GIF: {e}")

In [11]:
if exercisedb_df is not None:
    # Display a random sample of exercises.
    # - n is capped at the row count because DataFrame.sample raises
    #   ValueError when asked for more rows than exist (replace=False).
    # - random_state is fixed so a Restart & Run All shows the same exercises.
    sample_exercises = exercisedb_df.sample(n=min(3, len(exercisedb_df)), random_state=42)

    for _, exercise in sample_exercises.iterrows():
        display_exercise_gif(exercise)
        print("\n" + "-"*80 + "\n")

## Load Processed Data

Let's load and examine the processed data produced by the `preprocessor.py` script.

In [12]:
# Processed data location; the "not found" message below names
# preprocessor.py as the script that produces this file.
processed_data_dir = "data/processed"

# A missing file sets processed_df to None, which the cells below test
# before using the frame.
processed_csv = f"{processed_data_dir}/exercisedb_processed.csv"
if not os.path.exists(processed_csv):
    print("Processed data not found. Run the preprocessor.py script first.")
    processed_df = None
else:
    processed_df = pd.read_csv(processed_csv)
    print(f"Processed data loaded: {len(processed_df)} exercises")
    
if processed_df is not None:
    # Display basic info
    print("\nProcessed DataFrame Info:")
    processed_df.info()

    # Display sample data
    print("\nSample Processed Data:")
    display(processed_df.head())

    # Explore derived features
    print("\nExtracted Movement Patterns:")
    movement_pattern_counts = processed_df['movement_pattern'].value_counts()
    display(movement_pattern_counts)

    print("\nExtracted Joints Used:")
    # joints_used comes back from the CSV as the string form of a list.
    # ast.literal_eval only accepts Python literals, whereas eval() would
    # execute any code that happened to be stored in the file.
    all_joints = []
    unparsed_rows = 0
    for joints_str in processed_df['joints_used']:
        if not isinstance(joints_str, str):
            continue  # non-string cells (e.g. NaN for missing) add no joints
        try:
            joints = ast.literal_eval(joints_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            unparsed_rows += 1
            continue
        if isinstance(joints, (list, tuple, set)):
            all_joints.extend(joints)
        else:
            # A scalar literal (e.g. a bare string) is not a joint list;
            # extending with it would add its individual characters.
            unparsed_rows += 1

    # Report skipped rows instead of silently hiding malformed data
    if unparsed_rows:
        print(f"Skipped {unparsed_rows} rows with unparseable joints_used values")

    joints_counts = pd.Series(all_joints).value_counts()
    display(joints_counts)

    print("\nCompound vs Isolation Exercises:")
    compound_counts = processed_df['is_compound'].value_counts()
    display(compound_counts)

    # Visualize movement patterns
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x=movement_pattern_counts.index, y=movement_pattern_counts.values)
    plt.title('Distribution of Exercise Movement Patterns')
    plt.xlabel('Movement Pattern')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')

    # Add count labels on top of bars
    for i, count in enumerate(movement_pattern_counts.values):
        ax.text(i, count + 2, str(count), ha='center')

    plt.tight_layout()
    plt.show()

    # Check for text features (TF-IDF): columns are recognised by name prefix
    tfidf_cols = [col for col in processed_df.columns if col.startswith('tfidf_')]
    if tfidf_cols:
        print(f"\nNumber of TF-IDF features: {len(tfidf_cols)}")
        print("Sample TF-IDF features:")
        display(processed_df[tfidf_cols[:5]].head())

        # Rank features by their mean value over all exercises; plot the top 15
        feature_means = processed_df[tfidf_cols].mean().sort_values(ascending=False)
        top_features = feature_means.head(15)

        plt.figure(figsize=(12, 6))
        ax = sns.barplot(x=top_features.index, y=top_features.values)
        plt.title('Most Common TF-IDF Features Across Exercises')
        plt.xlabel('Feature')
        plt.ylabel('Mean TF-IDF Value')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Optional evaluation section: runs only when a predictions file is present
if not os.path.exists("data/predictions.csv"):
    print("\nNo predictions found. Run the prediction script first.")
else:
    predictions_df = pd.read_csv("data/predictions.csv")
    print(f"\nPredictions loaded: {len(predictions_df)} exercises")

    # Each target is evaluated only if both the ground-truth column and its
    # "<name>_pred" counterpart are present in the file.
    for col in ['movement_pattern', 'is_compound', 'risk_assessment']:
        pred_col = f"{col}_pred"
        available = predictions_df.columns
        if pred_col not in available or col not in available:
            continue

        print(f"\nConfusion Matrix for {col}:")
        conf_matrix = pd.crosstab(
            predictions_df[col],
            predictions_df[pred_col],
            rownames=['Actual'],
            colnames=['Predicted'],
        )
        display(conf_matrix)

        # Share of rows where prediction equals ground truth, in percent
        is_correct = predictions_df[col] == predictions_df[pred_col]
        accuracy = is_correct.mean() * 100
        print(f"Accuracy: {accuracy:.2f}%")

        # Heatmap rendering of the same confusion matrix
        plt.figure(figsize=(10, 8))
        sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
        plt.title(f'Confusion Matrix for {col}')
        plt.tight_layout()
        plt.show()

# Conclusion: closing summary emitted as one newline-joined block
banner = "=" * 80
conclusion_lines = [
    "\n" + banner,
    "CONCLUSION",
    banner,
    "This notebook has helped us explore the exercise data and understand its structure.",
    "We've seen the distribution of exercises by body part, target muscle, and equipment.",
    "We've also analyzed the derived features such as movement patterns and compound vs isolation exercises.",
    "\nNext steps:",
    "1. Run the preprocessing script to extract features from the raw data",
    "2. Train the classification models to predict exercise attributes",
    "3. Evaluate model performance and refine the models",
    "4. Integrate the models into a full exercise classification system",
]
print("\n".join(conclusion_lines))

Processed data not found. Run the preprocessor.py script first.

No predictions found. Run the prediction script first.

CONCLUSION
This notebook has helped us explore the exercise data and understand its structure.
We've seen the distribution of exercises by body part, target muscle, and equipment.
We've also analyzed the derived features such as movement patterns and compound vs isolation exercises.

Next steps:
1. Run the preprocessing script to extract features from the raw data
2. Train the classification models to predict exercise attributes
3. Evaluate model performance and refine the models
4. Integrate the models into a full exercise classification system


In [13]:
# Pose-landmark features and cross-feature relationships of the processed data
if processed_df is not None:
    # Landmark columns are recognised by their name prefix
    landmark_cols = [name for name in processed_df.columns if name.startswith('landmark_')]
    if landmark_cols:
        print(f"\nNumber of landmark features: {len(landmark_cols)}")
        print("Sample landmark features:")
        display(processed_df[landmark_cols[:5]].head())

        # PCA visualization of landmarks
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import StandardScaler

        # Missing landmark values become 0, then every column is standardised
        landmark_data = processed_df[landmark_cols].fillna(0)
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(landmark_data)

        # Project onto the first two principal components
        pca = PCA(n_components=2)
        pca_result = pca.fit_transform(scaled_data)

        # Plotting frame: the two components plus the labels used for colouring
        pca_df = pd.DataFrame({
            'PCA1': pca_result[:, 0],
            'PCA2': pca_result[:, 1],
            'movement_pattern': processed_df['movement_pattern'].values,
            'bodypart': processed_df['bodypart'].values
        })

        # Same projection drawn twice, coloured by a different label each time
        for hue_col, palette_name, hue_label in [
            ('movement_pattern', 'viridis', 'Movement Pattern'),
            ('bodypart', 'tab10', 'Body Part'),
        ]:
            plt.figure(figsize=(12, 8))
            sns.scatterplot(x='PCA1', y='PCA2', hue=hue_col, data=pca_df, palette=palette_name)
            plt.title(f'PCA of Exercise Pose Landmarks (colored by {hue_label})')
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            plt.show()

    def _movement_heatmap(row_col, row_label):
        """Draw a heatmap of row_col against movement_pattern; return the crosstab."""
        plt.figure(figsize=(16, 10))
        table = pd.crosstab(processed_df[row_col], processed_df['movement_pattern'])
        sns.heatmap(table, annot=True, cmap="YlGnBu", fmt="d", linewidths=.5)
        plt.title(f'Relationship Between {row_label} and Movement Pattern')
        plt.ylabel(row_label)
        plt.xlabel('Movement Pattern')
        plt.tight_layout()
        plt.show()
        return table

    # Relationship between bodypart / equipment and movement pattern
    body_movement = _movement_heatmap('bodypart', 'Body Part')
    equip_movement = _movement_heatmap('equipment', 'Equipment')

    # Visualize compound vs isolation; compound_counts is computed in the
    # processed-data exploration cell, and absent categories count as 0
    plt.figure(figsize=(8, 6))
    type_flags = {'Compound': True, 'Isolation': False}
    compound_df = pd.DataFrame({
        'Exercise Type': list(type_flags),
        'Count': [compound_counts.get(flag, 0) for flag in type_flags.values()],
    })
    ax = sns.barplot(x='Exercise Type', y='Count', data=compound_df)
    plt.title('Compound vs Isolation Exercises')

    # Write each bar's count just above it (offset of 5 in data units)
    for bar_pos, n_exercises in enumerate(compound_df['Count']):
        ax.text(bar_pos, n_exercises + 5, str(n_exercises), ha='center')

    plt.tight_layout()
    plt.show()