In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# --- 1. Data Loading and Quality Checks ---
# Read the raw dataset into `df`; every later section works from this frame.
df = pd.read_csv("expanded_fitness_data.csv")
print("Data loaded successfully.")

# Summarise size, missing cells and exact duplicate rows before any transformation.
n_samples, n_columns = df.shape
null_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()

print("\n--- Initial Data Quality Check ---")
print(f"Total samples: {n_samples}, Total features: {n_columns}")
print(f"Total null values found: {null_total}")
print(f"Total duplicated rows found: {duplicate_total}")

# --- 2. Feature Engineering ---
# Both engineered columns are written onto `df` in place.

# Training volume: sets multiplied by reps, a measure of total work.
df['Training_Volume'] = df['Sets'].mul(df['Reps'])

# Calorie density: calories per gram of serving, a measure of food quality.
# `epsilon` is added to the denominator so a zero serving size cannot divide by zero.
epsilon = 1e-6
safe_serving_size = df['serving_size_g'].add(epsilon)
df['Calorie_Density'] = df['Calories'].div(safe_serving_size)

print("\nFeature Engineering complete: 'Training_Volume' and 'Calorie_Density' created.")
# --- 3. Define Final Feature Set (VIF-Treated) and Target ---
# Stability treatment: Weight (kg) and BMI were removed due to high VIF.
# The engineered Training_Volume column is part of the model inputs.
final_features = [
    'Session_Duration (hours)',
    'Avg_BPM',
    'Max_BPM',
    'Resting_BPM',
    'Height (m)',
    'Age',
    'Fat_Percentage',
    'Experience_Level',
    'Workout_Frequency (days/week)',
    'Training_Volume',
    'Workout_Type',  # the only categorical feature
]

# Model inputs (selected columns only) and the regression target.
X = df[final_features]
y = df['Calories_Burned']
# --- 4. Encoding and Final Data Preparation (Pipeline) ---
# Pipeline, ColumnTransformer, OneHotEncoder, StandardScaler and SimpleImputer are
# imported in the import block at the top of the notebook, together with every
# other dependency, so a fresh kernel shows all requirements in one place.

# Split the selected features by type. The numeric list is derived from the
# categorical one so the two lists cannot drift apart if a category is added.
categorical_features = ['Workout_Type']
numeric_features = [f for f in final_features if f not in categorical_features]

Data loaded successfully.

--- Initial Data Quality Check ---
Total samples: 20000, Total features: 44
Total null values found: 0
Total duplicated rows found: 0

Feature Engineering complete: 'Training_Volume' and 'Calorie_Density' created.


In [20]:
# ColumnTransformer: median imputation + scaling for numeric, one-hot for categorical.
# Build the OneHotEncoder in a version-compatible way (sparse vs sparse_output).
# The current keyword is tried first: scikit-learn 1.2/1.3 still accept the
# deprecated `sparse=` and would only emit a FutureWarning at fit time, so
# trying `sparse=` first silently selects the deprecated path on those versions.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
except TypeError:
    # older sklearn only knows the original 'sparse' keyword
    ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Columns not listed in either branch are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', ohe, categorical_features),
], remainder='drop')

# Full model: preprocessing followed by ordinary least squares.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LinearRegression())])

# Fit preprocessor separately to build a DataFrame for diagnostics (VIF) and to get feature names
X_processed = preprocessor.fit_transform(X)
num_cols = numeric_features

# Get the one-hot column names in a way that works across sklearn versions.
# Explicit capability checks are used instead of broad `except Exception`
# fallbacks, so a real failure (for example an unfitted encoder) surfaces
# instead of being hidden behind invented placeholder names.
fitted_ohe = preprocessor.named_transformers_['cat']
if hasattr(fitted_ohe, 'get_feature_names_out'):
    cat_cols = fitted_ohe.get_feature_names_out(categorical_features).tolist()
elif hasattr(fitted_ohe, 'get_feature_names'):
    # fallback for older sklearn versions
    cat_cols = fitted_ohe.get_feature_names(categorical_features).tolist()
else:
    # last resort: infer the number of encoded columns from the transformed shape
    n_cat_cols = X_processed.shape[1] - len(num_cols)
    cat_cols = [f"{categorical_features[0]}_lvl_{i}" for i in range(n_cat_cols)]

# The ColumnTransformer lists 'num' before 'cat', so names follow that order.
feature_names = num_cols + cat_cols
X_encoded_clean = pd.DataFrame(X_processed, columns=feature_names)
# Treat +/- infinity as missing so it is handled together with real NaNs.
X_encoded_clean = X_encoded_clean.replace([np.inf, -np.inf], np.nan)

# Report any column that still has gaps after preprocessing.
nan_counts = X_encoded_clean.isnull().sum()
has_missing = nan_counts.any()
if has_missing:
    print("Missing values per column after preprocessing (will fill with median for VIF):")
    print(nan_counts[nan_counts > 0])

# Fill remaining gaps with each column's median (safe for VIF and linear model).
column_medians = X_encoded_clean.median()
X_encoded_clean = X_encoded_clean.fillna(column_medians)
# Ensure the target is numeric; values that cannot be parsed become NaN.
y_clean = pd.to_numeric(y, errors='coerce')
n_unparseable = int(y_clean.isnull().sum())
if n_unparseable:
    print(f"Warning: {n_unparseable} target rows could not be coerced to numeric and will be dropped.")

# `mask` is indexed like `y`, whereas `X_encoded_clean` was rebuilt from a bare
# array and carries a fresh RangeIndex. Filtering by position (a plain boolean
# array) keeps the three objects row-aligned even when `df` does not have a
# default 0..n-1 index; label-based alignment would fail or mis-select there.
mask = y_clean.notnull()
keep_rows = mask.to_numpy()
X_encoded_clean = X_encoded_clean.loc[keep_rows].reset_index(drop=True)
y_clean = y_clean.loc[keep_rows].reset_index(drop=True)
# also keep the original X filtered for pipeline cross-val/training
X_filtered = X.loc[keep_rows].reset_index(drop=True)
# Variance inflation factors are computed on the design matrix with an intercept column.
X_vif_check = add_constant(X_encoded_clean)
design_matrix = X_vif_check.values

# One VIF per column; a failure on a single column is reported and recorded as NaN
# so the remaining columns are still evaluated.
vifs = []
for i, column_name in enumerate(X_vif_check.columns):
    try:
        vif_value = variance_inflation_factor(design_matrix, i)
    except Exception as e:
        vif_value = np.nan
        print(f"VIF computation failed for column index {i} ({column_name}): {e}")
    vifs.append(vif_value)

# Assemble the table, drop the intercept row and rank from most to least collinear.
vif_data = pd.DataFrame({"feature": X_vif_check.columns, "VIF": vifs})
is_feature_row = vif_data['feature'] != 'const'
vif_data = vif_data[is_feature_row].sort_values(by="VIF", ascending=False)

print("\n--- Final VIF Check (showing top 10) ---")
print(vif_data.head(10).to_markdown(index=False, numalign="left", stralign="left"))


--- Final VIF Check (showing top 10) ---
| feature                       | VIF     |
|:------------------------------|:--------|
| Experience_Level              | 4.66198 |
| Workout_Frequency (days/week) | 3.35366 |
| Session_Duration (hours)      | 2.36458 |
| Workout_Type_Strength         | 1.5162  |
| Workout_Type_Yoga             | 1.51448 |
| Workout_Type_HIIT             | 1.51128 |
| Fat_Percentage                | 1.03534 |
| Height (m)                    | 1.02778 |
| Max_BPM                       | 1.0153  |
| Training_Volume               | 1.0113  |


In [21]:
# --- 6. Cross-Validation (Model Training and Evaluation) using Pipeline ---
# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
# greater_is_better=False makes the scorer return negated MAE (higher is better).
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# 5-Fold Cross-Validation for R-squared and MAE using the pipeline and the original filtered X.
# A single cross_validate call evaluates both metrics on the same fitted fold
# models, so the pipeline is fitted 5 times instead of 10.
cv_results = cross_validate(
    pipeline,
    X_filtered,
    y_clean,
    scoring={'r2': 'r2', 'mae': mae_scorer},
    cv=cv,
)
r2_scores = cv_results['test_r2']
# Undo the scorer's sign flip so MAE is reported as a positive error.
mae_scores = -cv_results['test_mae']

# --- 7. Display Final Best Results and Coefficients ---
# Aggregate the per-fold scores once, then emit the whole report in one go.
r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)

banner = "======================================================="
report_lines = [
    "\n" + banner,
    "  FINAL CROSS-VALIDATION RESULTS (Pipeline Linear Regression)  ",
    banner,
    "Target Variable: Calories_Burned",
    "Model Type: sklearn Pipeline -> Linear Regression",
    "-" * 55,
    f"Mean R-squared (Consistency): {r2_mean:.4f} (+/- {r2_std:.4f})",
    f"Mean Absolute Error (Accuracy): {mae_mean:.2f} calories (+/- {mae_std:.2f})",
    banner,
]
print("\n".join(report_lines))

# Train the final pipeline on all filtered data to get the final coefficients
pipeline.fit(X_filtered, y_clean)
# Design matrix exactly as the regressor sees it, produced by the pipeline's own preprocessor step
X_all_processed = pipeline.named_steps['preprocessor'].transform(X_filtered)
coeffs = pipeline.named_steps['regressor'].coef_
coefficients = pd.DataFrame({'Coefficient': coeffs}, index=feature_names)

# Rank by magnitude but keep the signed values: taking .abs() of the whole table
# would hide whether a feature raises or lowers the prediction.
# Interpretation: numeric inputs are standardised, so their coefficients are per
# one standard deviation; one-hot coefficients are relative to the dropped
# first category, so the two groups are not on the same scale.
magnitude_order = np.argsort(-np.abs(coeffs), kind='stable')
coefficients_sorted = coefficients.iloc[magnitude_order]

print("\n--- Top 5 Feature Contributions (Coefficients) ---")
print(coefficients_sorted.head(5).to_markdown(numalign="left", stralign="left"))
# ...existing code...


  FINAL CROSS-VALIDATION RESULTS (Pipeline Linear Regression)  
Target Variable: Calories_Burned
Model Type: sklearn Pipeline -> Linear Regression
-------------------------------------------------------
Mean R-squared (Consistency): 0.9677 (+/- 0.0011)
Mean Absolute Error (Accuracy): 59.64 calories (+/- 1.50)

--- Top 5 Feature Contributions (Coefficients) ---
|                          | Coefficient   |
|:-------------------------|:--------------|
| Workout_Type_HIIT        | 451.314       |
| Session_Duration (hours) | 339.464       |
| Workout_Type_Yoga        | 297.73        |
| Workout_Type_Strength    | 151.466       |
| Experience_Level         | 91.3004       |
