In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate

# --- 1. Data Loading and Feature Engineering ---
# Read the workout records into a DataFrame.
df = pd.read_csv("expanded_fitness_data.csv")

# Engineered feature: Training_Volume is the element-wise product of Sets and Reps.
df['Training_Volume'] = df['Sets'].mul(df['Reps'])

# --- 2. Define Final Feature Set and Target ---
# Columns handed to the model as-is, including the engineered Training_Volume.
passthrough_features = [
    'Session_Duration (hours)',
    'Avg_BPM',
    'Max_BPM',
    'Resting_BPM',
    'Height (m)',
    'Age',
    'Fat_Percentage',
    'Experience_Level',
    'Workout_Frequency (days/week)',
    'Training_Volume',
]
# Columns that get one-hot encoded before modelling.
categorical_features = ['Workout_Type']
# Column order is kept stable: pass-through columns first, categorical last.
final_features = passthrough_features + categorical_features

y = df['Calories_Burned']
X = df[final_features]

# --- 3. Encoding and Data Preparation ---
# Expand each categorical column into indicator columns; drop_first=True
# omits the first level of each so the remaining indicators are not redundant.
X_encoded = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# --- 4. Cross-Validation (Model Training and Evaluation) with Random Forest ---
# Random Forest regressor with a fixed seed so repeated runs build the same
# trees; n_jobs=-1 asks scikit-learn to use all available processors.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Shuffled 5-fold splitter, seeded so the fold assignment is reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# With greater_is_better=False the scorer returns the negated metric (-MAE),
# because scikit-learn always treats larger scores as better.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

print("--- Training Random Forest Model (5-Fold Cross-Validation) ---")

# Evaluate both metrics in ONE cross-validation pass: each fold's model is
# fitted once and scored with both scorers, instead of re-training the whole
# forest once per metric.
# NOTE(review): the recorded run reports a mean R-squared of 1.0000. A score
# that perfect often indicates target leakage or (near-)duplicate rows landing
# in both the train and test folds -- confirm against the dataset.
cv_results = cross_validate(
    model_rf,
    X_encoded,
    y,
    scoring={'r2': 'r2', 'mae': mae_scorer},
    cv=cv,
)

# Per-fold R-squared on the held-out split.
r2_scores_rf = cv_results['test_r2']

# Per-fold MAE; flip the sign back because the scorer reports -MAE.
mae_scores_rf = -cv_results['test_mae']

# --- 5. Display Results and Feature Importance ---
# Summarise the per-fold scores once so the report lines below stay short.
r2_mean, r2_std = np.mean(r2_scores_rf), np.std(r2_scores_rf)
mae_mean, mae_std = np.mean(mae_scores_rf), np.std(mae_scores_rf)

print("\n=======================================================")
print("  FINAL CROSS-VALIDATION RESULTS (Random Forest) ")
print("=======================================================")
# Plain strings: these two lines interpolate nothing, so no f-prefix is needed.
print("Target Variable: Calories_Burned")
print("Model Type: Random Forest Regressor")
print("-" * 55)
print(f"Mean R-squared: {r2_mean:.4f} (+/- {r2_std:.4f})")
print(f"Mean Absolute Error (MAE): {mae_mean:.2f} calories (+/- {mae_std:.2f})")
print("=======================================================")

# Fit on the full dataset; the cross-validation above only fitted clones, so
# model_rf itself has no feature_importances_ until this call.
model_rf.fit(X_encoded, y)
feature_importances = pd.DataFrame({
    'feature': X_encoded.columns,
    'importance': model_rf.feature_importances_,
}).sort_values('importance', ascending=False)

print("\n--- Top 5 Feature Importance (Random Forest) ---")
top_features = feature_importances.head(5)
try:
    top_table = top_features.to_markdown(index=False, numalign="left", stralign="left")
except ImportError:
    # DataFrame.to_markdown needs the optional 'tabulate' package; fall back to
    # a plain-text table rather than failing after all the training is done.
    top_table = top_features.to_string(index=False)
print(top_table)

--- Training Random Forest Model (5-Fold Cross-Validation) ---

  FINAL CROSS-VALIDATION RESULTS (Random Forest) 
Target Variable: Calories_Burned
Model Type: Random Forest Regressor
-------------------------------------------------------
Mean R-squared: 1.0000 (+/- 0.0000)
Mean Absolute Error (MAE): 0.57 calories (+/- 0.03)

--- Top 5 Feature Importance (Random Forest) ---
| feature                  | importance   |
|:-------------------------|:-------------|
| Session_Duration (hours) | 0.385295     |
| Experience_Level         | 0.298987     |
| Workout_Type_Yoga        | 0.171566     |
| Workout_Type_HIIT        | 0.132247     |
| Workout_Type_Strength    | 0.01188      |
