In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

In [2]:
df = pd.read_csv("../../data/final/final_gym_dataset.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         973 non-null    float64
 1   weight                      973 non-null    float64
 2   height                      973 non-null    float64
 3   max_bpm                     973 non-null    float64
 4   avg_bpm                     973 non-null    float64
 5   resting_bpm                 973 non-null    float64
 6   session_duration            973 non-null    float64
 7   calories_burned             973 non-null    float64
 8   fat_percentage              973 non-null    float64
 9   water_intake                973 non-null    float64
 10  workout_frequency           973 non-null    float64
 11  experience_level            973 non-null    float64
 12  bmi                         973 non-null    float64
 13  gender_Female               973 non

In [4]:
x = df.drop('fat_percentage', axis=1)
y = df['fat_percentage']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 1. LinearRegression

In [5]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)
lr_scores = cross_val_score(lr, x, y, cv=kf, scoring='r2')

print("Linear Regression R2:", lr_score)
print("Linear Regression MAE:", lr_mae)
print("K-Fold mean:", lr_scores.mean())
print("K-Fold std:", lr_scores.std())

Linear Regression R2: 0.6326583761679292
Linear Regression MAE: 3.200625371154567
K-Fold mean: 0.6271518130420031
K-Fold std: 0.034441143105037986


# 2. Lasso

In [6]:
from sklearn.linear_model import LassoCV

lasso = LassoCV(cv=10, random_state=42)
lasso.fit(x_train, y_train)

importance = np.abs(lasso.coef_)
selected_features = x_train.columns[importance > 0]

lasso.fit(x_train[selected_features], y_train)
y_pred = lasso.predict(x_test[selected_features])

lasso_score = r2_score(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)
lasso_scores = cross_val_score(lasso, x[selected_features], y, cv=kf, scoring='r2')

print("Lasso Regression R2:", lasso_score)
print("Lasso Regression MAE:", lasso_mae)
print("K-Fold mean:", lasso_scores.mean())
print("K-Fold std:", lasso_scores.std())
print("Selected features:", list(selected_features))

Lasso Regression R2: 0.630737556287549
Lasso Regression MAE: 3.2064637056208873
K-Fold mean: 0.6349639520182755
K-Fold std: 0.03505750415844593
Selected features: ['resting_bpm', 'session_duration', 'calories_burned', 'water_intake', 'workout_frequency', 'experience_level', 'gender_Female', 'workout_type_Cardio', 'workout_type_HIIT', 'workout_type_Strength', 'workout_type_Yoga', 'age_category_40-49', 'weight_category_60-79', 'weight_category_80-99', 'bmi_category_healthy', 'bmi_category_obesity', 'bmi_category_overweight', 'water_per_session']


# 3. Ridge

In [7]:
from sklearn.linear_model import RidgeCV

ridge = RidgeCV(cv=10)
ridge.fit(x_train, y_train)

importance = np.abs(ridge.coef_)
selected_features = x_train.columns[importance > np.mean(importance)]

ridge.fit(x_train[selected_features], y_train)
y_pred = ridge.predict(x_test[selected_features])

ridge_score = r2_score(y_test, y_pred)
ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_scores = cross_val_score(ridge, x[selected_features], y, cv=kf, scoring='r2')

print("Ridge Regression R2:", ridge_score)
print("Ridge Regression MAE:", ridge_mae)
print("K-Fold mean:", ridge_scores.mean())
print("K-Fold std:", ridge_scores.std())
print("Selected features:", list(selected_features))

Ridge Regression R2: 0.6323339771358308
Ridge Regression MAE: 3.1811499627256254
K-Fold mean: 0.6403559066149989
K-Fold std: 0.025437919421281978
Selected features: ['session_duration', 'calories_burned', 'water_intake', 'workout_frequency', 'experience_level', 'gender_Female', 'gender_Male', 'weight_category_80-99', 'water_per_session', 'experience_intensity_ratio']


# 4. ElasticNet

In [8]:
from sklearn.linear_model import ElasticNetCV

elastic = ElasticNetCV(cv=10, random_state=42)
elastic.fit(x_train, y_train)

importance = np.abs(elastic.coef_)
selected_features = x_train.columns[importance > 0]

elastic.fit(x_train[selected_features], y_train)
y_pred = elastic.predict(x_test[selected_features])

elastic_score = r2_score(y_test, y_pred)
elastic_mae = mean_absolute_error(y_test, y_pred)
elastic_scores = cross_val_score(elastic, x[selected_features], y, cv=kf, scoring='r2')

print("ElasticNet R2:", elastic_score)
print("ElasticNet MAE:", elastic_mae)
print("K-Fold mean:", elastic_scores.mean())
print("K-Fold std:", elastic_scores.std())
print("Selected features:", list(selected_features))
print("--------------------------------------------------")


ElasticNet R2: 0.6324723087446029
ElasticNet MAE: 3.1877017816188213
K-Fold mean: 0.630530656427059
K-Fold std: 0.03493955817273135
Selected features: ['height', 'max_bpm', 'avg_bpm', 'resting_bpm', 'session_duration', 'calories_burned', 'water_intake', 'workout_frequency', 'experience_level', 'gender_Female', 'gender_Male', 'workout_type_Cardio', 'workout_type_HIIT', 'workout_type_Strength', 'workout_type_Yoga', 'age_category_30-39', 'age_category_40-49', 'age_category_50-100', 'weight_category_40-59', 'weight_category_60-79', 'weight_category_80-99', 'bmi_category_healthy', 'bmi_category_obesity', 'bmi_category_overweight', 'bmi_category_under weight', 'hrr', 'water_per_session', 'experience_intensity_ratio']
--------------------------------------------------


# 5. Decision Tree

In [9]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)
dt.fit(x_train, y_train)

importance = dt.feature_importances_
selected_features = x_train.columns[importance > np.mean(importance)]

dt.fit(x_train[selected_features], y_train)
y_pred = dt.predict(x_test[selected_features])

dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)
dt_scores = cross_val_score(dt, x[selected_features], y, cv=kf, scoring='r2')

print("Decision Tree R2:", dt_score)
print("Decision Tree MAE:", dt_mae)
print("K-Fold mean:", dt_scores.mean())
print("K-Fold std:", dt_scores.std())
print("Selected features:", list(selected_features))

Decision Tree R2: 0.8078610298785487
Decision Tree MAE: 2.4015135749596475
K-Fold mean: 0.8077986684058436
K-Fold std: 0.023107022876457176
Selected features: ['experience_level', 'gender_Female', 'gender_Male']


# 6. Random Forest

In [10]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(x_train, y_train)

importance = rf.feature_importances_
selected_features = x_train.columns[importance > np.mean(importance)]

rf.fit(x_train[selected_features], y_train)
y_pred = rf.predict(x_test[selected_features])

rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)
rf_scores = cross_val_score(rf, x[selected_features], y, cv=kf, scoring='r2')

print("Random Forest R2:", rf_score)
print("Random Forest MAE:", rf_mae)
print("K-Fold mean:", rf_scores.mean())
print("K-Fold std:", rf_scores.std())
print("Selected features:", list(selected_features))

Random Forest R2: 0.7557856131703684
Random Forest MAE: 2.5458638251203327
K-Fold mean: 0.7497906702624347
K-Fold std: 0.02763677774940567
Selected features: ['session_duration', 'experience_level', 'gender_Female', 'gender_Male']


# 7. Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train, y_train)

importance = gbr.feature_importances_
selected_features = x_train.columns[importance > np.mean(importance)]

gbr.fit(x_train[selected_features], y_train)
y_pred = gbr.predict(x_test[selected_features])

gbr_score = r2_score(y_test, y_pred)
gbr_mae = mean_absolute_error(y_test, y_pred)
gbr_scores = cross_val_score(gbr, x[selected_features], y, cv=kf, scoring='r2')

print("Gradient Boosting R2:", gbr_score)
print("Gradient Boosting MAE:", gbr_mae)
print("K-Fold mean:", gbr_scores.mean())
print("K-Fold std:", gbr_scores.std())
print("Selected features:", list(selected_features))

Gradient Boosting R2: 0.7966747822953737
Gradient Boosting MAE: 2.418425289533779
K-Fold mean: 0.7977559395993147
K-Fold std: 0.020910421295707134
Selected features: ['session_duration', 'experience_level', 'gender_Female', 'gender_Male']


# 8. Extra Tree

In [12]:
from sklearn.ensemble import ExtraTreesRegressor

et = ExtraTreesRegressor(random_state=42)
et.fit(x_train, y_train)

importance = et.feature_importances_
selected_features = x_train.columns[importance > np.mean(importance)]

et.fit(x_train[selected_features], y_train)
y_pred = et.predict(x_test[selected_features])

et_score = r2_score(y_test, y_pred)
et_mae = mean_absolute_error(y_test, y_pred)
et_scores = cross_val_score(et, x[selected_features], y, cv=kf, scoring='r2')

print("Extra Trees R2:", et_score)
print("Extra Trees MAE:", et_mae)
print("K-Fold mean:", et_scores.mean())
print("K-Fold std:", et_scores.std())
print("Selected features:", list(selected_features))

Extra Trees R2: 0.7837585737468733
Extra Trees MAE: 2.4545128205128224
K-Fold mean: 0.7719843795918809
K-Fold std: 0.028813152031781156
Selected features: ['session_duration', 'calories_burned', 'water_intake', 'workout_frequency', 'experience_level', 'gender_Female', 'gender_Male', 'weight_category_80-99', 'experience_intensity_ratio']


# 9. Hist Gradient Boosting

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor

hgb = HistGradientBoostingRegressor(random_state=42)
hgb.fit(x_train, y_train)

y_pred = hgb.predict(x_test)

hgb_score = r2_score(y_test, y_pred)
hgb_mae = mean_absolute_error(y_test, y_pred)
hgb_scores = cross_val_score(hgb, x, y, cv=kf, scoring='r2')

print("HistGradientBoosting R2:", hgb_score)
print("HistGradientBoosting MAE:", hgb_mae)
print("K-Fold mean:", hgb_scores.mean())
print("K-Fold std:", hgb_scores.std())

HistGradientBoosting R2: 0.7764844693793314
HistGradientBoosting MAE: 2.4744790608962037
K-Fold mean: 0.7698849274129183
K-Fold std: 0.02913555316869835


# 10. SVM

In [14]:
from sklearn.svm import SVR

svm = SVR(kernel='rbf', C=20.0)

svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

svm_score = r2_score(y_test, y_pred)
svm_mae = mean_absolute_error(y_test, y_pred)
svm_scores = cross_val_score(svm, x, y, cv=kf, scoring='r2')

print("SVM R2:", svm_score)
print("SVM MAE:", svm_mae)
print("K-Fold mean:", svm_scores.mean())
print("K-Fold std:", svm_scores.std())

SVM R2: 0.7159633413448081
SVM MAE: 2.873353522969533
K-Fold mean: 0.7023530991343359
K-Fold std: 0.03476933945123406


# 11. KNN

In [15]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)
knn_scores = cross_val_score(knn, x, y, cv=kf, scoring='r2')

print("KNN R2:", knn_score)
print("KNN MAE:", knn_mae)
print("K-Fold mean:", knn_scores.mean())
print("K-Fold std:", knn_scores.std())

KNN R2: 0.6343339035595035
KNN MAE: 3.1556410256410263
K-Fold mean: 0.6227322833819866
K-Fold std: 0.05286810308338416


# 12. Adaboost

In [16]:
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(random_state=42)
ada.fit(x_train, y_train)

importance = ada.feature_importances_
selected_features = x_train.columns[importance > np.mean(importance)]

ada.fit(x_train[selected_features], y_train)
y_pred = ada.predict(x_test[selected_features])

ada_score = r2_score(y_test, y_pred)
ada_mae = mean_absolute_error(y_test, y_pred)
ada_scores = cross_val_score(ada, x[selected_features], y, cv=kf, scoring='r2')

print("AdaBoost R2:", ada_score)
print("AdaBoost MAE:", ada_mae)
print("K-Fold mean:", ada_scores.mean())
print("K-Fold std:", ada_scores.std())
print("Selected features:", list(selected_features))

AdaBoost R2: 0.7980287604917363
AdaBoost MAE: 2.405070933973988
K-Fold mean: 0.8061185674351051
K-Fold std: 0.021483721528177697
Selected features: ['session_duration', 'experience_level', 'gender_Female', 'gender_Male']


# 13. XGBoost

In [17]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=42)
xgb.fit(x_train, y_train)

importance = xgb.feature_importances_
selected_features = x_train.columns[importance > np.mean(importance)]

xgb.fit(x_train[selected_features], y_train)
y_pred = xgb.predict(x_test[selected_features])

xgb_score = r2_score(y_test, y_pred)
xgb_mae = mean_absolute_error(y_test, y_pred)
xgb_scores = cross_val_score(xgb, x[selected_features], y, cv=kf, scoring='r2')

print("XGBoost R2:", xgb_score)
print("XGBoost MAE:", xgb_mae)
print("K-Fold mean:", xgb_scores.mean())
print("K-Fold std:", xgb_scores.std())
print("Selected features:", list(selected_features))

XGBoost R2: 0.7381580598910102
XGBoost MAE: 2.610483653728778
K-Fold mean: 0.7494813902877013
K-Fold std: 0.021129763900929115
Selected features: ['session_duration', 'gender_Female']


In [18]:
from rich.console import Console
from rich.table import Table

console = Console()

results = [
    ['Linear Regression', lr_score, lr_mae, lr_scores.mean(), lr_scores.std()],
    ['Lasso', lasso_score, lasso_mae, lasso_scores.mean(), lasso_scores.std()],
    ['Ridge', ridge_score, ridge_mae, ridge_scores.mean(), ridge_scores.std()],
    ['ElasticNet', elastic_score, elastic_mae, elastic_scores.mean(), elastic_scores.std()],
    ['Decision Tree', dt_score, dt_mae, dt_scores.mean(), dt_scores.std()],
    ['Random Forest', rf_score, rf_mae, rf_scores.mean(), rf_scores.std()],
    ['Gradient Boosting', gbr_score, gbr_mae, gbr_scores.mean(), gbr_scores.std()],
    ['Extra Trees', et_score, et_mae, et_scores.mean(), et_scores.std()],
    ['HistGradBoosting', hgb_score, hgb_mae, hgb_scores.mean(), hgb_scores.std()],
    ['SVR', svm_score, svm_mae, svm_scores.mean(), svm_scores.std()],
    ['KNN', knn_score, knn_mae, knn_scores.mean(), knn_scores.std()],
    ['XGBoost', xgb_score, xgb_mae, xgb_scores.mean(), xgb_scores.std()],
    ['AdaBoost', ada_score, ada_mae, ada_scores.mean(), ada_scores.std()]
]

result_sorted = sorted(results, key=lambda i: i[1], reverse=True)

best_model = max(results, key=lambda x: x[1])
worst_model = min(results, key=lambda x: x[1])

table = Table(title="Embedded Models Comparison", show_lines=True)
table.add_column("Algorithm")
table.add_column("R2 score")
table.add_column("MAE", justify="right")
table.add_column("K-Fold mean", justify="right")
table.add_column("K-Fold std", justify="right")

for row in result_sorted:
    algo, r2, mae, kmean, kstd = row
    if row == best_model:
        table.add_row(f"[bold green]{algo}[/bold green]",
                      f"[bold green]{r2:.6f}[/bold green]",
                      f"[bold green]{mae:.6f}[/bold green]",
                      f"[bold green]{kmean:.6f}[/bold green]",
                      f"[bold green]{kstd:.6f}[/bold green]")
    elif row == worst_model:
        table.add_row(f"[bold salmon1]{algo}[/bold salmon1]",
                      f"[bold salmon1]{r2:.6f}[/bold salmon1]",
                      f"[bold salmon1]{mae:.6f}[/bold salmon1]",
                      f"[bold salmon1]{kmean:.6f}[/bold salmon1]",
                      f"[bold salmon1]{kstd:.6f}[/bold salmon1]")
    else:
        table.add_row(algo, f"{r2:.6f}", f"{mae:.6f}", f"{kmean:.6f}", f"{kstd:.6f}")

console.print(table)