In [1]:
import pandas as pd
from rich.console import Console
from rich.table import Table

In [2]:
# Load the prepared gym dataset (path is relative to this notebook's folder).
df = pd.read_csv(filepath_or_buffer='../../data/final/final_gym_dataset.csv')

In [3]:
# Print column names, non-null counts and dtypes to sanity-check the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         973 non-null    float64
 1   weight                      973 non-null    float64
 2   height                      973 non-null    float64
 3   max_bpm                     973 non-null    float64
 4   avg_bpm                     973 non-null    float64
 5   resting_bpm                 973 non-null    float64
 6   session_duration            973 non-null    float64
 7   calories_burned             973 non-null    float64
 8   fat_percentage              973 non-null    float64
 9   water_intake                973 non-null    float64
 10  workout_frequency           973 non-null    float64
 11  experience_level            973 non-null    float64
 12  bmi                         973 non-null    float64
 13  gender_Female               973 non

In [4]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble   import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost

In [5]:
from sklearn.model_selection import train_test_split

# Target is the fat_percentage column; every other column is used as a predictor.
y = df['fat_percentage']
x = df.drop(columns='fat_percentage')

# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
from sklearn.metrics import r2_score, mean_absolute_error

In [7]:
from sklearn.model_selection import KFold, cross_val_score

# Shuffled, seeded 5-fold splitter passed as `cv` to the cross_val_score calls.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
from sklearn.feature_selection import RFE

# LinearRegression

In [9]:
# Ordinary least squares baseline with RFE keeping 10 features.
lr = LinearRegression()

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(lr, n_features_to_select=10).fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
y_pred = lr.fit(x_train[selected_features], y_train).predict(x_test[selected_features])
lr_score = r2_score(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
lr_scores = cross_val_score(lr, x[selected_features], y, cv=kf, scoring='r2')

print(
    f'linear regression score: {lr_score}',
    f'linear regression mae: {lr_mae}',
    '--------------------------------------',
    sep='\n',
)
print("K-Fold mean:", lr_scores.mean())
print("K-Fold std:", lr_scores.std())
print("Selected features:", list(selected_features))

linear regression score: 0.6275249670876648
linear regression mae: 3.2079282151822657
--------------------------------------
K-Fold mean: 0.6372488038142172
K-Fold std: 0.02730403248308836
Selected features: ['weight', 'height', 'calories_burned', 'workout_frequency', 'experience_level', 'bmi', 'gender_Female', 'gender_Male', 'weight_category_80-99', 'water_per_session']


# Lasso

In [10]:
# L1-regularised linear model with RFE keeping 10 features.
lasso = Lasso(alpha=0.01)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(lasso, n_features_to_select=10).fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
y_pred = lasso.fit(x_train[selected_features], y_train).predict(x_test[selected_features])
lasso_score = r2_score(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)

# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
lasso_scores = cross_val_score(lasso, x[selected_features], y, cv=kf, scoring='r2')

print(
    f'Lasso regression score: {lasso_score}',
    f'Lasso regression mae: {lasso_mae}',
    '--------------------------------------',
    sep='\n',
)
print("K-Fold mean:", lasso_scores.mean())
print("K-Fold std:", lasso_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.6314897510585967
Lasso regression mae: 3.2086478931536897
--------------------------------------
K-Fold mean: 0.6404757160556291
K-Fold std: 0.026507303402411777
Selected features: ['session_duration', 'calories_burned', 'water_intake', 'workout_frequency', 'experience_level', 'gender_Female', 'workout_type_HIIT', 'weight_category_80-99', 'bmi_category_healthy', 'water_per_session']


# Ridge

In [11]:
# L2-regularised linear model with RFE keeping 10 features.
ridge = Ridge(alpha=0.1)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(ridge, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
ridge.fit(x_train[selected_features], y_train)
y_pred = ridge.predict(x_test[selected_features])

ridge_score = r2_score(y_test, y_pred)
ridge_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
ridge_scores = cross_val_score(ridge, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'Ridge regression score: {ridge_score}')
print(f'Ridge regression mae: {ridge_mae}')
print('--------------------------------------')
print("K-Fold mean:", ridge_scores.mean())
print("K-Fold std:", ridge_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.6324716915011614
Lasso regression mae: 3.1738700759248193
--------------------------------------
K-Fold mean: 0.6398605395722701
K-Fold std: 0.026723476006248196
Selected features: ['height', 'session_duration', 'calories_burned', 'workout_frequency', 'experience_level', 'bmi', 'gender_Female', 'gender_Male', 'weight_category_80-99', 'water_per_session']


# ElasticNet

In [12]:
# Elastic-net linear model with RFE keeping 10 features.
en = ElasticNet(alpha=0.001, l1_ratio=0.9)  # l1_ratio=0.9 keeps the penalty close to Lasso

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(en, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
en.fit(x_train[selected_features], y_train)
y_pred = en.predict(x_test[selected_features])

en_score = r2_score(y_test, y_pred)
en_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
en_scores = cross_val_score(en, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'ElasticNet score: {en_score}')
print(f'ElasticNet mae: {en_mae}')
print('--------------------------------------')
print("K-Fold mean:", en_scores.mean())
print("K-Fold std:", en_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.6324979820626819
Lasso regression mae: 3.1933623053396576
--------------------------------------
K-Fold mean: 0.6409656346580926
K-Fold std: 0.028002239206908277
Selected features: ['height', 'session_duration', 'calories_burned', 'workout_frequency', 'experience_level', 'gender_Female', 'weight_category_80-99', 'bmi_category_healthy', 'bmi_category_under weight', 'water_per_session']


# Decision Tree

In [13]:
# Single decision tree (seeded) with RFE keeping 10 features.
dt = DecisionTreeRegressor(random_state=42)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(dt, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
dt.fit(x_train[selected_features], y_train)
y_pred = dt.predict(x_test[selected_features])

dt_score = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
dt_scores = cross_val_score(dt, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'Decision Tree score: {dt_score}')
print(f'Decision Tree mae: {dt_mae}')
print('--------------------------------------')
print("K-Fold mean:", dt_scores.mean())
print("K-Fold std:", dt_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.6022950156820823
Lasso regression mae: 3.186153846153846
--------------------------------------
K-Fold mean: 0.6003441388091189
K-Fold std: 0.061848992057947756
Selected features: ['weight', 'max_bpm', 'session_duration', 'calories_burned', 'bmi', 'gender_Male', 'hrr', 'calories_per_min', 'water_per_session', 'experience_intensity_ratio']


# RandomForest

In [14]:
# Random forest (seeded) with RFE keeping 10 features.
rf = RandomForestRegressor(random_state=42)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(rf, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
rf.fit(x_train[selected_features], y_train)
y_pred = rf.predict(x_test[selected_features])

rf_score = r2_score(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
rf_scores = cross_val_score(rf, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'Random Forest score: {rf_score}')
print(f'Random Forest mae: {rf_mae}')
print('--------------------------------------')
print("K-Fold mean:", rf_scores.mean())
print("K-Fold std:", rf_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.7928899893934587
Lasso regression mae: 2.421979487179488
--------------------------------------
K-Fold mean: 0.7916308275425007
K-Fold std: 0.023949327648549665
Selected features: ['weight', 'session_duration', 'experience_level', 'bmi', 'gender_Female', 'gender_Male', 'hrr', 'calories_per_min', 'water_per_session', 'experience_intensity_ratio']


# Gradient Boosting

In [15]:
# Gradient boosting (220 trees, depth 5, seeded) with RFE keeping 10 features.
gb = GradientBoostingRegressor(n_estimators=220, max_depth=5, random_state=42)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(gb, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
gb.fit(x_train[selected_features], y_train)
y_pred = gb.predict(x_test[selected_features])

gb_score = r2_score(y_test, y_pred)
gb_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
gb_scores = cross_val_score(gb, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'Gradient Boosting score: {gb_score}')
print(f'Gradient Boosting mae: {gb_mae}')
print('--------------------------------------')
print("K-Fold mean:", gb_scores.mean())
print("K-Fold std:", gb_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.7666686412078064
Lasso regression mae: 2.5213314128421196
--------------------------------------
K-Fold mean: 0.7598423598894464
K-Fold std: 0.029710731001005842
Selected features: ['weight', 'session_duration', 'experience_level', 'bmi', 'gender_Female', 'gender_Male', 'hrr', 'calories_per_min', 'water_per_session', 'experience_intensity_ratio']


# Extra Trees

In [16]:
# Extra-trees ensemble (seeded) with RFE keeping 10 features.
et = ExtraTreesRegressor(random_state=42)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(et, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
et.fit(x_train[selected_features], y_train)
y_pred = et.predict(x_test[selected_features])

et_score = r2_score(y_test, y_pred)
et_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
et_scores = cross_val_score(et, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'Extra Trees score: {et_score}')
print(f'Extra Trees mae: {et_mae}')
print('--------------------------------------')
print("K-Fold mean:", et_scores.mean())
print("K-Fold std:", et_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.7724340946950213
Lasso regression mae: 2.4945487179487182
--------------------------------------
K-Fold mean: 0.7822406349571885
K-Fold std: 0.026291304746741385
Selected features: ['session_duration', 'calories_burned', 'water_intake', 'workout_frequency', 'experience_level', 'bmi', 'gender_Female', 'gender_Male', 'weight_category_80-99', 'hrr']


# Hist Gradient Boosting

In [17]:
# Histogram-based gradient boosting, trained on every feature (no RFE step here).
hgb = HistGradientBoostingRegressor(max_iter=200, random_state=42)

# Fit on the training split and score on the hold-out split.
y_pred = hgb.fit(x_train, y_train).predict(x_test)
hgb_score = r2_score(y_test, y_pred)
hgb_mae = mean_absolute_error(y_test, y_pred)

# Cross-validated R2 over the full dataset using the shared splitter.
hgb_scores = cross_val_score(hgb, x, y, cv=kf, scoring='r2')

print(
    f'Hist Gradient Boosting score: {hgb_score}',
    f'Hist Gradient Boosting mae: {hgb_mae}',
    '--------------------------------------',
    sep='\n',
)
print("K-Fold mean:", hgb_scores.mean())
print("K-Fold std:", hgb_scores.std())

Hist Gradient Boosting score: 0.7699555866013328
Hist Gradient Boosting mae: 2.4959274524524804
--------------------------------------
K-Fold mean: 0.7644410448533296
K-Fold std: 0.027751653690838006


# SVM

In [18]:
# Support vector regression with an RBF kernel, trained on every feature (no RFE step).
# NOTE(review): no scaler is applied in this notebook — confirm the CSV columns are
# already standardised, since RBF kernels are sensitive to feature scale.
svm = SVR(kernel='rbf', C=10.0)

# Fit on the training split and score on the hold-out split.
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

svm_score = r2_score(y_test, y_pred)
svm_mae = mean_absolute_error(y_test, y_pred)
# Cross-validated R2 over the full dataset using the shared splitter.
svm_scores = cross_val_score(svm, x, y, cv=kf, scoring='r2')

# Labels previously read "CVM" (typo); the estimator is SVR.
print(f'SVR score: {svm_score}')
print(f'SVR mae: {svm_mae}')
print('--------------------------------------')
print("K-Fold mean:", svm_scores.mean())
print("K-Fold std:", svm_scores.std())

CVM score: 0.7242136105752499
CVM mae: 2.823779862134825
--------------------------------------
K-Fold mean: 0.7125039687130973
K-Fold std: 0.03185028154284607


# KNN

In [19]:
# k-nearest-neighbours regressor (k=10), trained on every feature (no RFE step here).
knn = KNeighborsRegressor(n_neighbors=10)

# Fit on the training split and score on the hold-out split.
y_pred = knn.fit(x_train, y_train).predict(x_test)
knn_score = r2_score(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

# Cross-validated R2 over the full dataset using the shared splitter.
knn_scores = cross_val_score(knn, x, y, cv=kf, scoring='r2')

print(
    f'KNN score: {knn_score}',
    f'KNN mae: {knn_mae}',
    '--------------------------------------',
    sep='\n',
)
print("K-Fold mean:", knn_scores.mean())
print("K-Fold std:", knn_scores.std())

KNN score: 0.6343339035595035
KNN mae: 3.1556410256410263
--------------------------------------
K-Fold mean: 0.6227322833819866
K-Fold std: 0.05286810308338416


# AdaBoost

In [20]:
# AdaBoost (200 estimators) with RFE keeping 10 features.
# random_state is now fixed: without it the selected features and every score
# changed between runs, unlike the other seeded models in this notebook.
ab = AdaBoostRegressor(n_estimators=200, random_state=42)

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(ab, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
ab.fit(x_train[selected_features], y_train)
y_pred = ab.predict(x_test[selected_features])

ab_score = r2_score(y_test, y_pred)
ab_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
ab_scores = cross_val_score(ab, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'AdaBoost score: {ab_score}')
print(f'AdaBoost mae: {ab_mae}')
print('--------------------------------------')
print("K-Fold mean:", ab_scores.mean())
print("K-Fold std:", ab_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.8082640437247374
Lasso regression mae: 2.3960266140716646
--------------------------------------
K-Fold mean: 0.807596154199975
K-Fold std: 0.023816357389747782
Selected features: ['weight', 'session_duration', 'experience_level', 'bmi', 'gender_Female', 'gender_Male', 'hrr', 'intensity', 'water_per_session', 'experience_intensity_ratio']


# XGBoost

In [21]:
# XGBoost regressor with library-default hyperparameters and RFE keeping 10 features.
xgb = xgboost.XGBRegressor()

# Rank features on the training split only and keep the surviving column names.
rfe = RFE(xgb, n_features_to_select=10)
rfe.fit(x_train, y_train)
selected_features = x_train.columns[rfe.support_]

# Refit on the selected columns and score on the hold-out split.
xgb.fit(x_train[selected_features], y_train)
y_pred = xgb.predict(x_test[selected_features])

xgb_score = r2_score(y_test, y_pred)
xgb_mae = mean_absolute_error(y_test, y_pred)
# NOTE(review): the features were chosen on x_train, yet CV runs over all of x/y,
# so these K-Fold scores are not independent of the selection step.
xgb_scores = cross_val_score(xgb, x[selected_features], y, cv=kf, scoring='r2')

# Labels previously said "Lasso regression" (copy-paste slip); they now name the model.
print(f'XGBoost score: {xgb_score}')
print(f'XGBoost mae: {xgb_mae}')
print('--------------------------------------')
print("K-Fold mean:", xgb_scores.mean())
print("K-Fold std:", xgb_scores.std())
print("Selected features:", list(selected_features))

Lasso regression score: 0.7400914579526354
Lasso regression mae: 2.6256833149836614
--------------------------------------
K-Fold mean: 0.7532311080531113
K-Fold std: 0.026181211561093574
Selected features: ['weight', 'session_duration', 'water_intake', 'bmi', 'gender_Female', 'bmi_category_healthy', 'hrr', 'calories_per_min', 'water_per_session', 'experience_intensity_ratio']


In [22]:
from tabulate import tabulate

In [23]:
# Build a rich table comparing every model, ranked by hold-out R2.
console = Console()

# One record per model: [name, hold-out R2, hold-out MAE, K-Fold mean R2, K-Fold std R2].
results = [
    ['Linear Regression', lr_score, lr_mae, lr_scores.mean(), lr_scores.std()],
    ['Lasso', lasso_score, lasso_mae, lasso_scores.mean(), lasso_scores.std()],
    ['Ridge', ridge_score, ridge_mae, ridge_scores.mean(), ridge_scores.std()],
    ['ElasticNet', en_score, en_mae, en_scores.mean(), en_scores.std()],
    ['Decision Tree', dt_score, dt_mae, dt_scores.mean(), dt_scores.std()],
    ['Random Forest', rf_score, rf_mae, rf_scores.mean(), rf_scores.std()],
    ['Gradient Boosting', gb_score, gb_mae, gb_scores.mean(), gb_scores.std()],
    ['Extra Trees', et_score, et_mae, et_scores.mean(), et_scores.std()],
    ['HistGradBoosting', hgb_score, hgb_mae, hgb_scores.mean(), hgb_scores.std()],
    ['SVR', svm_score, svm_mae, svm_scores.mean(), svm_scores.std()],
    ['KNN', knn_score, knn_mae, knn_scores.mean(), knn_scores.std()],
    ['XGBoost', xgb_score, xgb_mae, xgb_scores.mean(), xgb_scores.std()],
    ['AdaBoost', ab_score, ab_mae, ab_scores.mean(), ab_scores.std()]
]

# Rank by hold-out R2 (index 1), best first; also pick out the extremes for highlighting.
result_sorted = sorted(results, key=lambda rec: rec[1], reverse=True)
best_model = max(results, key=lambda rec: rec[1])
worst_model = min(results, key=lambda rec: rec[1])

table = Table(title="Wrapper Models Comparison", show_lines=True)
table.add_column("Algorithm")
table.add_column("R2 score")
for heading in ("MAE", "K-Fold mean", "K-Fold std"):
    table.add_column(heading, justify="right")

for row in result_sorted:
    algo, *metrics = row
    # Name stays as-is; every metric is rendered with six decimals.
    cells = [algo] + [f"{value:.6f}" for value in metrics]

    # Best row is highlighted green, worst row salmon, the rest are left plain.
    if row == best_model:
        markup = "bold green"
    elif row == worst_model:
        markup = "bold salmon1"
    else:
        markup = None

    if markup is not None:
        cells = [f"[{markup}]{cell}[/{markup}]" for cell in cells]
    table.add_row(*cells)

console.print(table)