# Exercise Classification Feature Engineering

This notebook demonstrates the feature engineering process for the exercise classification project. We'll extract features from exercise data to classify exercises based on various attributes including:

- Target muscle groups
- Required equipment
- Movement patterns
- Intensity/experience levels
- Exercise types
- Quality of movement
- Risk assessment

We'll use both text-based features (TF-IDF over exercise names and instructions) and pose-based features (MediaPipe pose landmarks extracted from the first frame of each exercise GIF).

In [1]:
# Import libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import requests
from io import BytesIO
import cv2
import mediapipe as mp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib
import warnings
from tqdm.notebook import tqdm

# Ignore warnings
# NOTE: this suppresses every warning category for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Set Matplotlib and Seaborn styles
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')

# Render Matplotlib figures inline in the notebook output
%matplotlib inline

## 1. Load Raw Exercise Data

First, let's load the raw exercise data that we fetched from the APIs.

In [2]:
# Directory layout: raw inputs, processed outputs, and saved models
RAW_DATA_DIR = "data/raw"
PROCESSED_DATA_DIR = "data/processed"
EXERCISEDB_DIR = f"{RAW_DATA_DIR}/exercisedb"
MODEL_DIR = "models"

# Make sure the output directories exist before anything is written to them
for output_dir in (PROCESSED_DATA_DIR, MODEL_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Location of the ExerciseDB export
exercisedb_path = f"{EXERCISEDB_DIR}/all_exercises.csv"

# df is set to None when the export is missing so that `df is not None`
# checks can be used to skip work that needs the data
if not os.path.exists(exercisedb_path):
    print("ExerciseDB data not found. Please run the data fetcher first.")
    df = None
else:
    df = pd.read_csv(exercisedb_path)
    print(f"Loaded {len(df)} exercises from ExerciseDB")

ExerciseDB data not found. Please run the data fetcher first.


In [3]:
# Show the frame's schema summary and a two-row preview (skipped when nothing was loaded)
if df is not None:
    print("\nDataFrame Info:")
    df.info()

    sample_rows = df.head(2)
    print("\nSample Data:")
    display(sample_rows)

## 2. Initial Data Preprocessing

Let's clean and prepare the data for feature extraction.

In [4]:
def clean_dataset(df):
    """Clean and preprocess the raw dataset.

    The input frame is never modified; all work happens on a copy.

    Steps:
        1. Column names are lowercased and spaces are replaced with underscores.
        2. The known text columns (``instructions``, ``name``, ``bodypart``,
           ``target``, ``equipment``) are lowercased when they are present and
           hold text (object or string dtype).
        3. Missing ``instructions`` values are replaced with empty strings.

    Args:
        df: Raw exercise DataFrame.

    Returns:
        A new DataFrame with normalized column names and text values.
    """
    # Make a copy to avoid modifying the original
    cleaned_df = df.copy()

    # Make column names lowercase and replace spaces with underscores
    cleaned_df.columns = [col.lower().replace(' ', '_') for col in cleaned_df.columns]

    # Lowercase the known text columns. Checking the dtype per column (instead
    # of select_dtypes(['object'])) also covers dedicated string dtypes.
    text_cols = ('instructions', 'name', 'bodypart', 'target', 'equipment')
    for col in text_cols:
        if col not in cleaned_df.columns:
            continue
        column = cleaned_df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            cleaned_df[col] = column.str.lower()

    # Fill missing values by assigning the result back: calling
    # fillna(inplace=True) on a column selection operates on a temporary under
    # pandas Copy-on-Write and would leave the frame unchanged.
    if 'instructions' in cleaned_df.columns:
        cleaned_df['instructions'] = cleaned_df['instructions'].fillna('')

    return cleaned_df

# Run the cleaning step only when the raw data was actually loaded
if df is not None:
    cleaned_df = clean_dataset(df)
    n_rows, n_cols = len(cleaned_df), len(cleaned_df.columns)
    print(f"Cleaned dataset with {n_rows} rows and {n_cols} columns")

In [5]:
# Check the unique values in key categorical columns
if 'cleaned_df' in locals():
    categorical_cols = ['bodypart', 'target', 'equipment']

    for col in categorical_cols:
        if col in cleaned_df.columns:
            # Drop missing values first: NaN cannot be ordered against strings,
            # so sorted() would raise TypeError on a column with gaps.
            unique_vals = sorted(cleaned_df[col].dropna().unique())
            print(f"\n{col.capitalize()} unique values ({len(unique_vals)}):")
            # Only show the ellipsis when the list was actually truncated
            if len(unique_vals) > 10:
                print(unique_vals[:10], '...')
            else:
                print(unique_vals)

## 3. Text Feature Extraction

Now, let's extract features from the text data (exercise names and instructions).

In [6]:
class TextFeatureExtractor:
    """Extract TF-IDF features from exercise text data.

    Args:
        max_features: Maximum vocabulary size passed to the TF-IDF vectorizer.

    Attributes:
        max_features: The vocabulary size limit given at construction.
        tfidf_vectorizer: The ``TfidfVectorizer`` (English stop words,
            unigrams and bigrams, ``min_df=2``) that is fitted by
            ``extract_features``.
    """
    def __init__(self, max_features=100):
        self.max_features = max_features
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=max_features,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2
        )

    def extract_features(self, df):
        """Extract text features from a DataFrame.

        The vectorizer is (re)fitted on every call. The input frame is not
        modified; the combined text and the TF-IDF columns are added to a copy.

        Args:
            df: DataFrame with a ``name`` column and optionally an
                ``instructions`` column.

        Returns:
            A copy of ``df`` with a ``text_combined`` column and one
            ``tfidf_<term>`` column per vocabulary term, sharing ``df``'s index.

        Raises:
            ValueError: If ``df`` has no ``name`` column.
        """
        if 'name' not in df.columns:
            raise ValueError("DataFrame must contain 'name' column")

        # Work on a copy so the caller's frame does not silently gain a column
        result_df = df.copy()

        # Missing text becomes '' so it cannot turn the combined document into
        # NaN, which the vectorizer would reject
        names = result_df['name'].fillna('').astype(str)
        if 'instructions' in result_df.columns:
            instructions = result_df['instructions'].fillna('').astype(str)
            result_df['text_combined'] = names + ' ' + instructions
        else:
            result_df['text_combined'] = names

        # Extract TF-IDF features
        print("Extracting TF-IDF features...")
        tfidf_matrix = self.tfidf_vectorizer.fit_transform(result_df['text_combined'])

        # Convert to DataFrame; reuse the source index so the column-wise
        # concat lines rows up even when df does not have a default RangeIndex
        feature_names = self.tfidf_vectorizer.get_feature_names_out()
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=[f"tfidf_{f}" for f in feature_names],
            index=result_df.index
        )

        return pd.concat([result_df, tfidf_df], axis=1)

# Build the text extractor and run it on the cleaned data (skipped when no data was loaded)
if 'cleaned_df' in locals():
    text_extractor = TextFeatureExtractor(max_features=100)
    text_features_df = text_extractor.extract_features(cleaned_df)

    # Collect the generated TF-IDF column names and report how many there are
    text_feature_cols = [
        column for column in text_features_df.columns
        if column.startswith('tfidf_')
    ]
    print(f"\nExtracted {len(text_feature_cols)} text features")

    # Persist the intermediate result
    text_features_df.to_csv(f"{PROCESSED_DATA_DIR}/text_features.csv", index=False)
    print(f"Text features saved to {PROCESSED_DATA_DIR}/text_features.csv")

## 4. Pose Feature Extraction

Now, let's extract features from exercise images using MediaPipe pose estimation.

In [7]:
class PoseFeatureExtractor:
    """Extract features from exercise images using MediaPipe pose estimation.

    Each image is reduced to a flat vector of 33 landmarks x (x, y, z,
    visibility). Images that cannot be downloaded or decoded, or in which no
    pose is detected, produce an all-zero vector.

    Args:
        confidence_threshold: Minimum detection confidence passed to MediaPipe.
        request_timeout: Seconds to wait for each image download before the
            request is abandoned (the row then gets the all-zero vector).
    """
    # 33 landmarks, each contributing x, y, z and visibility
    NUM_LANDMARKS = 33
    LANDMARK_DIMS = ('x', 'y', 'z', 'vis')

    def __init__(self, confidence_threshold=0.5, request_timeout=10):
        self.confidence_threshold = confidence_threshold
        self.request_timeout = request_timeout
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            static_image_mode=True,
            model_complexity=2,
            min_detection_confidence=confidence_threshold
        )

    @staticmethod
    def _find_url_column(df):
        """Return the name of the GIF URL column, matching case-insensitively."""
        if 'gifUrl' in df.columns:
            return 'gifUrl'
        # clean_dataset lowercases column names, turning 'gifUrl' into 'gifurl'
        for col in df.columns:
            if str(col).lower() == 'gifurl':
                return col
        raise ValueError("DataFrame must contain 'gifUrl' column")

    def _empty_landmarks(self):
        """Return the all-zero placeholder vector used when no pose is available."""
        return [0.0] * (self.NUM_LANDMARKS * len(self.LANDMARK_DIMS))

    def _landmarks_from_url(self, url):
        """Download one image and return its flattened landmark vector."""
        # Without a timeout a single stalled connection would hang the whole run
        response = requests.get(url, timeout=self.request_timeout)
        response.raise_for_status()

        # The context manager releases the decoder once the frame is copied out
        with Image.open(BytesIO(response.content)) as image:
            # For GIFs, extract the first frame
            if getattr(image, 'n_frames', 1) > 1:
                image.seek(0)
            image_np = np.array(image.convert('RGB'))

        # Process with MediaPipe
        results = self.pose.process(image_np)
        if not results.pose_landmarks:
            return self._empty_landmarks()

        landmarks = []
        for landmark in results.pose_landmarks.landmark:
            landmarks.extend([landmark.x, landmark.y, landmark.z, landmark.visibility])
        return landmarks

    def download_and_process_images(self, df, limit=None):
        """Download and process exercise images.

        Args:
            df: DataFrame with a GIF URL column named ``gifUrl`` (any casing,
                so the lowercased ``gifurl`` is accepted too).
            limit: Process only the first ``limit`` rows. A falsy value
                (``None`` or ``0``) processes every row.

        Returns:
            A new DataFrame holding the processed rows of ``df`` plus the
            ``landmark_<i>_<dim>`` columns, sharing the index of those rows.
            ``df`` itself is not modified.

        Raises:
            ValueError: If no GIF URL column is present.
        """
        url_col = self._find_url_column(df)

        # Limit the number of images to process
        process_df = df.head(limit) if limit else df

        # Process images and extract pose landmarks
        landmarks_list = []
        for url in tqdm(process_df[url_col], total=len(process_df), desc="Processing images"):
            try:
                landmarks_list.append(self._landmarks_from_url(url))
            except Exception as e:
                # Best effort: one bad image must not abort the whole batch
                print(f"Error processing image {url}: {e}")
                landmarks_list.append(self._empty_landmarks())

        # Create feature columns for landmarks
        landmark_columns = [
            f"landmark_{i}_{dim}"
            for i in range(self.NUM_LANDMARKS)
            for dim in self.LANDMARK_DIMS
        ]

        # Reuse the source index so rows stay aligned for any index, and attach
        # all columns in one concat instead of 132 single-column inserts
        landmarks_df = pd.DataFrame(
            landmarks_list, columns=landmark_columns, index=process_df.index
        )

        # Drop stale landmark columns (e.g. from an earlier run) so the fresh
        # values replace them rather than producing duplicate column names
        base_df = process_df.drop(columns=landmark_columns, errors='ignore')
        return pd.concat([base_df, landmarks_df], axis=1)

# Build the pose extractor and run it on the cleaned data (skipped when no data was loaded)
if 'cleaned_df' in locals():
    pose_extractor = PoseFeatureExtractor(confidence_threshold=0.5)
    # Only the first 100 rows are processed here (limit for testing)
    pose_features_df = pose_extractor.download_and_process_images(cleaned_df, limit=100)

    # Collect the generated landmark column names and report how many there are
    pose_feature_cols = [
        column for column in pose_features_df.columns
        if column.startswith('landmark_')
    ]
    print(f"\nExtracted {len(pose_feature_cols)} pose features")

    # Persist the intermediate result
    pose_features_df.to_csv(f"{PROCESSED_DATA_DIR}/pose_features.csv", index=False)
    print(f"Pose features saved to {PROCESSED_DATA_DIR}/pose_features.csv")

## 5. Feature Analysis

Let's analyze the extracted features to understand their distribution and importance.

In [8]:
# Text features: rank terms by mean TF-IDF weight, then plot the first ten columns
if 'text_features_df' in locals():
    print("Top TF-IDF Terms:")
    mean_tfidf = text_features_df[text_feature_cols].mean()
    feature_importance = pd.DataFrame({
        'term': text_extractor.tfidf_vectorizer.get_feature_names_out(),
        'importance': mean_tfidf
    })
    top_terms = feature_importance.sort_values('importance', ascending=False).head(10)
    display(top_terms)

    # Box plot of the value distribution for a sample of TF-IDF columns
    plt.figure(figsize=(12, 6))
    text_sample = text_features_df[text_feature_cols[:10]]
    text_sample.boxplot()
    plt.title("Distribution of Text Features")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Pose features: box plot of the first ten landmark columns
if 'pose_features_df' in locals():
    plt.figure(figsize=(12, 6))
    pose_sample = pose_features_df[pose_feature_cols[:10]]
    pose_sample.boxplot()
    plt.title("Distribution of Pose Features")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## 6. Feature Combination

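Finally, let's combine the text and pose features into a unified feature set. Each feature group is standardized and reduced with PCA, and the two resulting tables are then joined column-wise.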

In [9]:
class CombinedFeatureExtractor:
    """Combine text and pose features into a unified feature set.

    Args:
        text_extractor: Object with ``extract_features(df)``; a default
            ``TextFeatureExtractor`` is created when omitted.
        pose_extractor: Object with ``download_and_process_images(df, limit)``;
            a default ``PoseFeatureExtractor`` is created when omitted.
        use_pca: Whether to add PCA-reduced columns for each feature group.
        pca_components: Requested number of PCA components per feature group.
        random_state: Seed passed to both PCA models so repeated runs give
            the same components.
    """
    def __init__(self, text_extractor=None, pose_extractor=None, use_pca=True,
                 pca_components=50, random_state=42):
        self.text_extractor = text_extractor or TextFeatureExtractor()
        self.pose_extractor = pose_extractor or PoseFeatureExtractor()
        self.use_pca = use_pca
        self.pca_components = pca_components
        self.random_state = random_state

        self.text_scaler = StandardScaler()
        self.pose_scaler = StandardScaler()
        self.combined_scaler = StandardScaler()

        if use_pca:
            self.text_pca = PCA(n_components=pca_components, random_state=random_state)
            self.pose_pca = PCA(n_components=pca_components, random_state=random_state)

    def _scale_and_reduce(self, features_df, feature_cols, scaler, pca, prefix):
        """Fit ``scaler`` on the feature columns and, with PCA on, append ``<prefix>_pca_<i>`` columns."""
        scaled = scaler.fit_transform(features_df[feature_cols].values)
        if not self.use_pca:
            return features_df

        # PCA cannot produce more components than min(n_samples, n_features);
        # cap an integer request so small batches do not raise. The value is
        # recomputed from self.pca_components on every call.
        n_components = self.pca_components
        if isinstance(n_components, (int, np.integer)) and not isinstance(n_components, bool):
            n_components = int(min(n_components, *scaled.shape))
        pca.set_params(n_components=n_components)

        reduced = pca.fit_transform(scaled)
        pca_df = pd.DataFrame(
            reduced,
            columns=[f'{prefix}_pca_{i}' for i in range(reduced.shape[1])],
            index=features_df.index
        )
        # One concat instead of per-column inserts; stale PCA columns are replaced
        base_df = features_df.drop(columns=list(pca_df.columns), errors='ignore')
        return pd.concat([base_df, pca_df], axis=1)

    def extract_all_features(self, df, process_images=True, image_limit=None):
        """Extract and combine all features.

        Args:
            df: Cleaned exercise DataFrame.
            process_images: When False, image download and pose extraction are
                skipped.
            image_limit: Forwarded to the pose extractor as ``limit``.

        Returns:
            A DataFrame with the text feature frame's columns followed by the
            pose-only columns. Rows that were not image-processed (because of
            ``image_limit``) hold NaN in the pose columns.
        """
        # Extract text features
        text_features_df = self.text_extractor.extract_features(df)

        # Extract pose features if requested
        if process_images:
            pose_features_df = self.pose_extractor.download_and_process_images(df, limit=image_limit)
        else:
            pose_features_df = df.copy()

        text_feature_cols = [col for col in text_features_df.columns if col.startswith('tfidf_')]
        pose_feature_cols = [col for col in pose_features_df.columns if col.startswith('landmark_')]

        # Scale (and optionally reduce) each feature group
        if text_feature_cols:
            text_features_df = self._scale_and_reduce(
                text_features_df, text_feature_cols, self.text_scaler,
                self.text_pca if self.use_pca else None, 'text'
            )

        if pose_feature_cols:
            pose_features_df = self._scale_and_reduce(
                pose_features_df, pose_feature_cols, self.pose_scaler,
                self.pose_pca if self.use_pca else None, 'pose'
            )

        # Merge DataFrames. Both frames carry the original df columns, so only
        # the columns that are new in the pose frame are appended; otherwise
        # every original column would appear twice in the result.
        pose_only_cols = [
            col for col in pose_features_df.columns
            if col not in text_features_df.columns
        ]
        result_df = pd.concat([text_features_df, pose_features_df[pose_only_cols]], axis=1)

        return result_df

# Build the combined extractor from the two extractors created earlier and run it
if 'cleaned_df' in locals():
    combined_extractor = CombinedFeatureExtractor(
        text_extractor=text_extractor,
        pose_extractor=pose_extractor,
        use_pca=True,
        pca_components=50,
    )

    # Only the first 100 rows are image-processed (limit for testing)
    combined_features_df = combined_extractor.extract_all_features(
        cleaned_df, process_images=True, image_limit=100
    )

    # Report the overall shape and list the PCA columns
    print("\nCombined Features Shape:", combined_features_df.shape)
    print("\nCombined Feature Columns:")
    pca_prefixes = ('text_pca_', 'pose_pca_')
    combined_feature_cols = [
        name for name in combined_features_df.columns
        if name.startswith(pca_prefixes)
    ]
    for col in combined_feature_cols:
        print(f"- {col}")

    # Persist the final feature table
    combined_features_df.to_csv(f"{PROCESSED_DATA_DIR}/combined_features.csv", index=False)
    print(f"\nCombined features saved to {PROCESSED_DATA_DIR}/combined_features.csv")