# M2: Exploratory Data Analysis & Association Rule Mining
## Personal Strength Training Data

**Discovery Question Addressed:** What exercises are frequently performed together within the same workout session, and what compound movement patterns emerge across different training splits?

---

## 1. Setup and Data Loading

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Association rule mining
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Settings: show every column, cap printed rows at 100, then apply the plot theme
display_options = {'display.max_columns': None, 'display.max_rows': 100}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Libraries loaded successfully")

ModuleNotFoundError: No module named 'mlxtend'

In [None]:
# Load datasets
# Update these paths to match your file locations
sessions_df = pd.read_csv('training_sessions.csv')
sets_df = pd.read_csv('training_sets.csv')

# Parse the 'date' column of both tables into pandas datetimes
for frame in (sessions_df, sets_df):
    frame['date'] = pd.to_datetime(frame['date'])

n_session_rows, n_session_cols = sessions_df.shape
n_set_rows, n_set_cols = sets_df.shape
print(f"Sessions dataset: {n_session_rows} rows, {n_session_cols} columns")
print(f"Sets dataset: {n_set_rows} rows, {n_set_cols} columns")

# Span of the log, taken from the sessions table
session_dates = sessions_df['date']
print(f"\nDate range: {session_dates.min().date()} to {session_dates.max().date()}")

## 2. Data Quality Assessment

Before analysis, we need to understand and quantify data quality issues.

In [None]:
# Sessions dataset overview
banner = "=" * 60
print(banner)
print("SESSIONS DATASET QUALITY REPORT")
print(banner)
print(f"\nTotal sessions: {len(sessions_df)}")
print("\nMissing values per column:")

# Null count per column, and the same figure as a percentage of all sessions
sessions_missing = sessions_df.isnull().sum()
sessions_missing_pct = (sessions_missing / len(sessions_df) * 100).round(2)

# Only columns that actually have gaps are listed
incomplete_columns = sessions_missing[sessions_missing > 0]
for col, n_missing in incomplete_columns.items():
    print(f"  {col}: {n_missing} ({sessions_missing_pct[col]}%)")
if incomplete_columns.empty:
    print("  No missing values")

synthetic_flags = sessions_df['is_synthetic']
print(f"\nSynthetic sessions: {synthetic_flags.sum()} ({synthetic_flags.mean()*100:.1f}%)")

In [None]:
# Sets dataset quality
print("=" * 60)
print("SETS DATASET QUALITY REPORT")
print("=" * 60)
n_sets = len(sets_df)
print(f"\nTotal sets: {n_sets}")


def _pct_of_sets(count):
    """Return `count` as a percentage of all sets (0.0 when there are no sets)."""
    return count / n_sets * 100 if n_sets else 0.0


print("\nMissing values per column:")
sets_missing = sets_df.isnull().sum()
sets_missing_pct = (sets_missing / len(sets_df) * 100).round(2)
for col in sets_df.columns:
    if sets_missing[col] > 0:
        print(f"  {col}: {sets_missing[col]} ({sets_missing_pct[col]}%)")
# Same explicit "nothing missing" line as the sessions report, so an empty
# list is not mistaken for a skipped check
if sets_missing.sum() == 0:
    print("  No missing values")

# Weight notation analysis
print("\n--- Weight Data Quality ---")
missing_weight = sets_df['weight_lbs'].isnull().sum()
print(f"Sets with missing weight_lbs: {missing_weight} ({_pct_of_sets(missing_weight):.1f}%)")

# Reps analysis
missing_reps = sets_df['reps'].isnull().sum()
print(f"Sets with missing reps: {missing_reps} ({_pct_of_sets(missing_reps):.1f}%)")

# Volume analysis
missing_volume = sets_df['volume'].isnull().sum()
print(f"Sets with missing volume: {missing_volume} ({_pct_of_sets(missing_volume):.1f}%)")

# Check for paused sets (case-insensitive match in the free-text notes;
# rows without notes count as "not paused")
paused_sets = sets_df['notes'].str.contains('paused', case=False, na=False).sum()
print(f"\nPaused sets: {paused_sets} ({_pct_of_sets(paused_sets):.1f}%)")

# Bodyweight exercises (same notes-based detection)
bw_sets = sets_df['notes'].str.contains('bodyweight', case=False, na=False).sum()
print(f"Bodyweight sets: {bw_sets} ({_pct_of_sets(bw_sets):.1f}%)")

In [None]:
# Exercise name standardization check
divider = "=" * 60
print(divider, "EXERCISE STANDARDIZATION REPORT", divider, sep="\n")

# Distinct names before and after standardization
n_raw_exercises = sets_df['exercise_raw'].nunique()
n_standard_exercises = sets_df['exercise_standard'].nunique()

print(f"\nUnique raw exercise names: {n_raw_exercises}")
print(f"Unique standardized exercise names: {n_standard_exercises}")

names_merged = n_raw_exercises - n_standard_exercises
reduction_pct = (1 - n_standard_exercises / n_raw_exercises) * 100
print(f"Reduction from standardization: {names_merged} names consolidated ({reduction_pct:.1f}% reduction)")

## 3. Exploratory Data Analysis

### 3.1 Temporal Patterns

In [None]:
# Training frequency by day of week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# fill_value=0 keeps a weekday with no sessions as a real zero instead of NaN;
# without it the counts become floats and the bar labels read "nan" / "12.0"
day_counts = sessions_df['day_of_week'].value_counts().reindex(day_order, fill_value=0)

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(day_counts.index, day_counts.values, color=sns.color_palette('husl', 7))
ax.set_xlabel('Day of Week')
ax.set_ylabel('Number of Sessions')
ax.set_title('Training Frequency by Day of Week')

# Add value labels just above each bar
for bar, val in zip(bars, day_counts.values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
            str(val), ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig('fig1_training_frequency_by_day.png', dpi=150)
plt.show()

print(f"\nMost common training day: {day_counts.idxmax()} ({day_counts.max()} sessions)")
print(f"Least common training day: {day_counts.idxmin()} ({day_counts.min()} sessions)")

In [None]:
# Training volume over time
# Tag each session with its calendar month, then average total_volume per month
sessions_df['month'] = sessions_df['date'].dt.to_period('M')
monthly_volume = sessions_df.groupby('month')['total_volume'].mean()

fig, ax = plt.subplots(figsize=(14, 5))
monthly_volume.plot(ax=ax, kind='line', marker='o', linewidth=2, markersize=4)
ax.set(
    xlabel='Month',
    ylabel='Average Session Volume (lbs × reps)',
    title='Average Training Volume Over Time',
)
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig('fig2_volume_over_time.png', dpi=150)
plt.show()

In [None]:
# Days between sessions distribution
rest_days = sessions_df['days_since_last']
median_rest = rest_days.median()

fig, ax = plt.subplots(figsize=(10, 5))
rest_days.hist(bins=20, ax=ax, edgecolor='black', alpha=0.7)
# Dashed red line marks the median gap
ax.axvline(median_rest, color='red', linestyle='--',
           label=f"Median: {median_rest:.0f} days")
ax.set(
    xlabel='Days Since Last Session',
    ylabel='Frequency',
    title='Distribution of Rest Days Between Sessions',
)
ax.legend()
plt.tight_layout()
plt.savefig('fig3_rest_days_distribution.png', dpi=150)
plt.show()

print("\nRest days statistics:")
print(f"  Mean: {rest_days.mean():.1f} days")
print(f"  Median: {median_rest:.0f} days")
print(f"  Max gap: {rest_days.max()} days")

### 3.2 Workout Type Analysis

In [None]:
# Workout type distribution
workout_counts = sessions_df['workout_type'].value_counts()
# value_counts() sorts descending, so the head is the 15 most frequent types
top_workouts = workout_counts.head(15)

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(top_workouts.index, top_workouts.values)
ax.set(
    xlabel='Number of Sessions',
    ylabel='Workout Type',
    title='Top 15 Workout Types by Frequency',
)
ax.invert_yaxis()  # most frequent type at the top

# Add value labels to the right of each bar
for bar, val in zip(bars, top_workouts.values):
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(val + 0.5, label_y, str(val), va='center', fontsize=9)

plt.tight_layout()
plt.savefig('fig4_workout_types.png', dpi=150)
plt.show()

print(f"\nTotal unique workout types: {sessions_df['workout_type'].nunique()}")

In [None]:
# Volume by workout type
# Per-type mean/std/count of total_volume, keeping only types with at least
# 5 sessions, ordered from highest to lowest mean volume
workout_volume = (
    sessions_df.groupby('workout_type')['total_volume']
    .agg(['mean', 'std', 'count'])
    .loc[lambda stats: stats['count'] >= 5]
    .sort_values('mean', ascending=False)
)
top_volume = workout_volume.head(12)

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(top_volume.index, top_volume['mean'].values)
ax.set(
    xlabel='Average Total Volume (lbs × reps)',
    ylabel='Workout Type',
    title='Average Volume by Workout Type (min 5 sessions)',
)
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('fig5_volume_by_workout_type.png', dpi=150)
plt.show()

### 3.3 Exercise Analysis

In [None]:
# Most frequent exercises (counted in sets, not sessions)
exercise_counts = sets_df['exercise_standard'].value_counts()
top_exercises = exercise_counts.head(20)

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_exercises.index, top_exercises.values)
ax.set(
    xlabel='Number of Sets',
    ylabel='Exercise',
    title='Top 20 Most Frequent Exercises',
)
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('fig6_top_exercises.png', dpi=150)
plt.show()

print(f"\nTotal unique exercises: {sets_df['exercise_standard'].nunique()}")
# Share of all sets taken by the ten most frequent exercises
top10_sets = exercise_counts.head(10).sum()
print(f"\nTop 10 exercises account for {top10_sets/len(sets_df)*100:.1f}% of all sets")

In [None]:
# Distribution of sets by exercise category (muscle group)
# Create exercise categories based on common patterns
def categorize_exercise(name):
    """Map an exercise name to a coarse muscle-group category.

    Matching is keyword based and case-insensitive. Checks run from the most
    specific group to the most generic so that a generic keyword cannot
    capture a more specific exercise (e.g. 'press' in "Leg Press", 'curl' in
    "Leg Curl", 'extension' in "Leg Extension").

    Args:
        name: Exercise name. Anything that is not a string (NaN, None, ...)
            is categorised as 'Other' instead of raising.

    Returns:
        One of 'Legs', 'Shoulders', 'Chest', 'Back', 'Posterior Chain',
        'Biceps', 'Triceps' or 'Other'.
    """
    if not isinstance(name, str):
        return 'Other'

    name_lower = name.lower()
    # Whole words, split on anything that is not a letter or digit. Used for
    # short keywords that would otherwise match inside unrelated words
    # ('lat' in "lateral"/"plate", 'chin' in "machine", 'row' in "narrow").
    words = ''.join(ch if ch.isalnum() else ' ' for ch in name_lower).split()

    def contains(*keywords):
        return any(keyword in name_lower for keyword in keywords)

    def has_word_starting(*prefixes):
        return any(word.startswith(prefixes) for word in words)

    # Legs first: "leg press", "leg curl" and "leg extension" belong here, not
    # under Chest / Biceps / Triceps.
    if contains('squat', 'leg', 'lunge', 'hack', 'calf', 'abduct', 'adduct'):
        return 'Legs'
    # Shoulder-specific names before Chest/Back so "shoulder press",
    # "lateral raise", "rear delt fly" and "upright row" land here.
    if contains('lateral', 'delt', 'shoulder', 'shrug', 'upright', 'ohp'):
        return 'Shoulders'
    if contains('bench', 'chest', 'fly', 'dip', 'press'):
        # An overhead press is a shoulder movement; other presses are chest.
        return 'Shoulders' if 'overhead' in name_lower else 'Chest'
    if (contains('pull') or has_word_starting('row', 'chin')
            or 'lat' in words or 'lats' in words):
        return 'Back'
    # Back extensions are grouped with the hinge movements; checked before
    # Triceps so the generic 'extension' keyword does not claim them.
    if contains('deadlift', 'rdl', 'romanian', 'back extension', 'hyperextension'):
        return 'Posterior Chain'
    if contains('curl', 'bicep', 'hammer', 'preacher'):
        return 'Biceps'
    if contains('tricep', 'pushdown', 'skull', 'extension'):
        return 'Triceps'
    return 'Other'

# Label every set with its category
sets_df['exercise_category'] = sets_df['exercise_standard'].map(categorize_exercise)

# Plot: share of sets per category
category_counts = sets_df['exercise_category'].value_counts()
colors = sns.color_palette('husl', len(category_counts))

fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    category_counts.values,
    labels=category_counts.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Distribution of Sets by Muscle Group')
plt.tight_layout()
plt.savefig('fig7_muscle_group_distribution.png', dpi=150)
plt.show()

### 3.4 Session Characteristics

In [None]:
# Correlation matrix for session metrics
metric_columns = [
    'num_exercises', 'num_sets', 'total_volume',
    'avg_weight', 'avg_reps', 'session_duration_est', 'days_since_last',
]
session_metrics = sessions_df[metric_columns]

fig, ax = plt.subplots(figsize=(10, 8))
correlation_matrix = session_metrics.corr()
# Diverging colour map centred on 0 so positive and negative correlations
# are visually symmetric
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
)
ax.set_title('Correlation Matrix: Session Metrics')
plt.tight_layout()
plt.savefig('fig8_correlation_matrix.png', dpi=150)
plt.show()

In [None]:
# Session volume vs number of exercises
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(sessions_df['num_exercises'], sessions_df['total_volume'], alpha=0.5)
ax.set_xlabel('Number of Exercises')
ax.set_ylabel('Total Volume')
ax.set_title('Session Volume vs Number of Exercises')

# Add trend line (degree-1 least-squares fit).
# Fit on complete rows only: a single NaN in either column breaks np.polyfit.
# The fit also needs at least two distinct x values to be well defined.
trend_data = sessions_df[['num_exercises', 'total_volume']].dropna()
if len(trend_data) >= 2 and trend_data['num_exercises'].nunique() >= 2:
    z = np.polyfit(trend_data['num_exercises'], trend_data['total_volume'], 1)
    p = np.poly1d(z)
    x_line = np.linspace(trend_data['num_exercises'].min(),
                         trend_data['num_exercises'].max(), 100)
    ax.plot(x_line, p(x_line), "r--", alpha=0.8, label='Trend line')
    ax.legend()
plt.tight_layout()
plt.savefig('fig9_volume_vs_exercises.png', dpi=150)
plt.show()

## 4. Data Preprocessing for Association Rules

For association rule mining, we need to transform our data into a transactional format where each session is a transaction and exercises are items.

In [None]:
# Create transactions from sessions
# Each session becomes a transaction with exercises as items

def parse_exercises(exercises_str):
    """Parse a comma-separated exercise string into a list of exercise names.

    Each name is stripped of surrounding whitespace. Empty entries (from an
    empty string, a trailing comma or a doubled comma) are dropped so that no
    blank '' item ends up in a transaction.

    Args:
        exercises_str: Comma-separated exercise names, or a missing value
            (NaN / None).

    Returns:
        List of exercise names in their original order; [] for a missing or
        empty input.
    """
    if pd.isna(exercises_str):
        return []
    stripped_names = (ex.strip() for ex in exercises_str.split(','))
    return [ex for ex in stripped_names if ex]

# Create list of transactions (one list of exercise names per session)
transactions = sessions_df['exercises_list'].apply(parse_exercises).tolist()

print(f"Total transactions (sessions): {len(transactions)}")
print("\nSample transactions:")
# Slice instead of indexing 0..2 so fewer than three sessions does not raise
for i, transaction in enumerate(transactions[:3], start=1):
    print(f"  Session {i}: {transaction}")

In [None]:
# Transform to one-hot encoded format for mlxtend:
# one row per session, one boolean column per exercise
te = TransactionEncoder()
te_array = te.fit_transform(transactions)
transaction_df = pd.DataFrame(te_array, columns=te.columns_)

n_rows, n_cols = transaction_df.shape
print(f"Transaction matrix shape: {transaction_df.shape}")
print(f"  - {n_rows} sessions")
print(f"  - {n_cols} unique exercises")

# Check sparsity: fraction of (session, exercise) cells that are False
filled_cells = transaction_df.sum().sum()
sparsity = 1 - (filled_cells / (n_rows * n_cols))
print(f"\nMatrix sparsity: {sparsity*100:.1f}%")

In [None]:
# Examine exercise frequency in transactions
# Column sums give the number of sessions containing each exercise;
# dividing by the number of sessions gives its support
exercise_freq = transaction_df.sum().sort_values(ascending=False)
exercise_support = exercise_freq / len(transaction_df)

print("Top 15 exercises by support (frequency in sessions):")
print("="*50)
for ex, n_sessions_with_ex in exercise_freq.head(15).items():
    print(f"  {ex}: {exercise_support[ex]:.3f} ({int(n_sessions_with_ex)} sessions)")

## 5. Association Rule Mining

### 5.1 Parameter Selection

**Minimum Support:** 0.05 (5%)
- Rationale: With 419 sessions, this means an itemset must appear in at least ~21 sessions
- This filters out very rare combinations while still capturing meaningful patterns

**Minimum Confidence:** 0.5 (50%)
- Rationale: If exercise A appears, we want at least 50% chance of seeing exercise B
- This ensures rules have predictive value

**Minimum Lift:** 1.0
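- Rationale: Lift > 1 indicates a positive association (the exercises co-occur more often than expected if they were independent); lift = 1 means independence, so the `>= 1.0` filter only removes negatively associated rules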

In [None]:
# Apply Apriori algorithm
# Thresholds shared by the mining, filtering and summary cells
MIN_SUPPORT, MIN_CONFIDENCE, MIN_LIFT = 0.05, 0.5, 1.0

print(f"Running Apriori with min_support={MIN_SUPPORT}...")
frequent_itemsets = apriori(transaction_df, min_support=MIN_SUPPORT, use_colnames=True)
# Number of exercises in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

size_breakdown = frequent_itemsets['length'].value_counts().sort_index()
print(f"\nFrequent itemsets found: {len(frequent_itemsets)}")
print("\nBreakdown by itemset size:")
print(size_breakdown)

In [None]:
# Generate association rules
# mlxtend applies the confidence threshold; the lift threshold is applied afterwards
candidate_rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=MIN_CONFIDENCE,
)
rules = candidate_rules.loc[candidate_rules['lift'] >= MIN_LIFT]

print(f"Association rules generated: {len(rules)}")
print(f"\nRules filtered by: confidence >= {MIN_CONFIDENCE}, lift >= {MIN_LIFT}")

In [None]:
# Display top rules by lift
rules_sorted = rules.sort_values('lift', ascending=False)

print("\nTOP 20 ASSOCIATION RULES (sorted by lift)")
print("="*80)

for _, row in rules_sorted.head(20).iterrows():
    # antecedents/consequents are frozensets; sort them so the printed item
    # order is the same on every run instead of following set iteration order
    antecedent = ', '.join(sorted(row['antecedents']))
    consequent = ', '.join(sorted(row['consequents']))
    print(f"\n{antecedent}")
    print(f"  → {consequent}")
    print(f"  Support: {row['support']:.3f} | Confidence: {row['confidence']:.3f} | Lift: {row['lift']:.2f}")

In [None]:
# Visualize rules: Support vs Confidence colored by Lift
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    rules['support'],
    rules['confidence'],
    c=rules['lift'],
    cmap='viridis',
    s=50,
    alpha=0.6,
)
ax.set(
    xlabel='Support',
    ylabel='Confidence',
    title='Association Rules: Support vs Confidence (color = Lift)',
)
plt.colorbar(scatter, label='Lift')
plt.tight_layout()
plt.savefig('fig10_rules_support_confidence.png', dpi=150)
plt.show()

In [None]:
# Analyze rules by lift distribution
lift_values = rules['lift']
median_lift = lift_values.median()

fig, ax = plt.subplots(figsize=(10, 5))
lift_values.hist(bins=30, ax=ax, edgecolor='black', alpha=0.7)
# Dashed red line marks the median lift
ax.axvline(median_lift, color='red', linestyle='--',
           label=f"Median lift: {median_lift:.2f}")
ax.set(
    xlabel='Lift',
    ylabel='Number of Rules',
    title='Distribution of Lift Values Across Rules',
)
ax.legend()
plt.tight_layout()
plt.savefig('fig11_lift_distribution.png', dpi=150)
plt.show()

### 5.2 Workout-Specific Association Rules

Let's also analyze patterns within specific workout types.

In [None]:
# Analyze "Upper" workout associations
# Sessions whose workout_type mentions "Upper" or "Torso" (case-insensitive)
upper_sessions = sessions_df[sessions_df['workout_type'].str.contains('Upper|Torso', case=False, na=False)]
upper_transactions = upper_sessions['exercises_list'].apply(parse_exercises).tolist()

if len(upper_transactions) >= 20:  # Need enough sessions
    te_upper = TransactionEncoder()
    te_upper_array = te_upper.fit_transform(upper_transactions)
    upper_df = pd.DataFrame(te_upper_array, columns=te_upper.columns_)

    # Relative support of 0.08 is higher than the 0.05 used for the full
    # dataset; because this subset is smaller, the same fraction corresponds
    # to fewer sessions in absolute terms.
    upper_itemsets = apriori(upper_df, min_support=0.08, use_colnames=True)
    if len(upper_itemsets) > 0:
        upper_rules = association_rules(upper_itemsets, metric='confidence', min_threshold=0.5)
        upper_rules = upper_rules[upper_rules['lift'] >= 1.0]

        print(f"UPPER/TORSO WORKOUT RULES ({len(upper_sessions)} sessions)")
        print("="*60)

        if upper_rules.empty:
            print("\nNo rules met the confidence/lift thresholds")
        for _, row in upper_rules.sort_values('lift', ascending=False).head(10).iterrows():
            # Sort the frozenset items so the printed order is reproducible
            antecedent = ', '.join(sorted(row['antecedents']))
            consequent = ', '.join(sorted(row['consequents']))
            print(f"\n{antecedent} → {consequent}")
            print(f"  Support: {row['support']:.3f} | Confidence: {row['confidence']:.3f} | Lift: {row['lift']:.2f}")
    else:
        print("No frequent itemsets found for Upper workouts with current support threshold")
else:
    print(f"Not enough Upper workout sessions for analysis ({len(upper_transactions)} found)")

In [None]:
# Analyze "Legs/Limbs" workout associations
# Sessions whose workout_type mentions "Leg", "Limb" or "Lower" (case-insensitive)
legs_sessions = sessions_df[sessions_df['workout_type'].str.contains('Leg|Limb|Lower', case=False, na=False)]
legs_transactions = legs_sessions['exercises_list'].apply(parse_exercises).tolist()

if len(legs_transactions) >= 20:  # Need enough sessions
    te_legs = TransactionEncoder()
    te_legs_array = te_legs.fit_transform(legs_transactions)
    legs_df = pd.DataFrame(te_legs_array, columns=te_legs.columns_)

    # Relative support of 0.08 is higher than the 0.05 used for the full
    # dataset; because this subset is smaller, the same fraction corresponds
    # to fewer sessions in absolute terms.
    legs_itemsets = apriori(legs_df, min_support=0.08, use_colnames=True)
    if len(legs_itemsets) > 0:
        legs_rules = association_rules(legs_itemsets, metric='confidence', min_threshold=0.5)
        legs_rules = legs_rules[legs_rules['lift'] >= 1.0]

        print(f"LEGS/LIMBS WORKOUT RULES ({len(legs_sessions)} sessions)")
        print("="*60)

        if legs_rules.empty:
            print("\nNo rules met the confidence/lift thresholds")
        for _, row in legs_rules.sort_values('lift', ascending=False).head(10).iterrows():
            # Sort the frozenset items so the printed order is reproducible
            antecedent = ', '.join(sorted(row['antecedents']))
            consequent = ', '.join(sorted(row['consequents']))
            print(f"\n{antecedent} → {consequent}")
            print(f"  Support: {row['support']:.3f} | Confidence: {row['confidence']:.3f} | Lift: {row['lift']:.2f}")
    else:
        print("No frequent itemsets found for Legs workouts with current support threshold")
else:
    print(f"Not enough Legs workout sessions for analysis ({len(legs_transactions)} found)")

## 6. Findings and Interpretation

### 6.1 Summary Statistics

In [None]:
# Final summary
rule_line = "=" * 70
print(rule_line)
print("M2 ANALYSIS SUMMARY")
print(rule_line)

print("\n--- Dataset Overview ---")
session_dates = sessions_df['date']
print(f"Training period: {session_dates.min().date()} to {session_dates.max().date()}")
print(f"Total sessions: {len(sessions_df)}")
print(f"Total sets: {len(sets_df)}")
print(f"Unique exercises: {sets_df['exercise_standard'].nunique()}")
print(f"Unique workout types: {sessions_df['workout_type'].nunique()}")

print("\n--- Data Quality ---")
# Boolean masks: sum() is the count, mean() the fraction
weight_missing = sets_df['weight_lbs'].isnull()
reps_missing = sets_df['reps'].isnull()
synthetic_flags = sessions_df['is_synthetic']
print(f"Missing weight values: {weight_missing.sum()} ({weight_missing.mean()*100:.1f}%)")
print(f"Missing rep values: {reps_missing.sum()} ({reps_missing.mean()*100:.1f}%)")
print(f"Synthetic data: {synthetic_flags.sum()} sessions ({synthetic_flags.mean()*100:.1f}%)")

print("\n--- Association Rule Mining Results ---")
print(f"Parameters: min_support={MIN_SUPPORT}, min_confidence={MIN_CONFIDENCE}, min_lift={MIN_LIFT}")
print(f"Frequent itemsets discovered: {len(frequent_itemsets)}")
print(f"Association rules generated: {len(rules)}")
rule_lift = rules['lift']
print(f"Average lift of rules: {rule_lift.mean():.2f}")
print(f"Max lift: {rule_lift.max():.2f}")

### 6.2 Key Findings

**Finding 1: Exercise Pairing Patterns**

[Interpret your top rules here - what exercises frequently appear together?]

**Finding 2: Workout Structure**

[Discuss what the associations reveal about how workouts are structured]

**Finding 3: Muscle Group Synergies**

[Discuss any patterns related to muscle group pairings]

### 6.3 Limitations

1. The analysis treats all sessions equally, regardless of whether workout types changed over time
2. Association rules don't capture the *order* of exercises within a session
3. Some exercises may be paired due to equipment availability rather than training principles

### 6.4 Next Steps for M3

1. **Clustering (Q2):** Apply K-Means to identify distinct training phases based on volume, intensity, and exercise selection
2. **Anomaly Detection (Q3):** Use Isolation Forest to identify unusually high or low performance sessions

In [None]:
# Export rules for reference
rules_export = rules.copy()
# antecedents/consequents are frozensets; write them as sorted, comma-separated
# strings so the CSV content is identical from run to run
for itemset_column in ('antecedents', 'consequents'):
    rules_export[itemset_column] = rules_export[itemset_column].apply(
        lambda items: ', '.join(sorted(items))
    )
rules_export.to_csv('association_rules_output.csv', index=False)
print("Rules exported to 'association_rules_output.csv'")