In [13]:
import os

In [14]:
# Switch to the project root so the relative paths used further down
# (e.g. 'data/final/final_gym_dataset.csv') resolve correctly.
# The location can be overridden with the SML_PROJECT_ROOT environment variable so the
# notebook is not tied to a single machine; the original path remains the default.
PROJECT_ROOT = os.environ.get(
    'SML_PROJECT_ROOT',
    r'C:\SML_Projects\SML_gym_fatPercentage_predict_project',
)
os.chdir(PROJECT_ROOT)

In [15]:
# Echo the current working directory to confirm the chdir above took effect.
print(os.getcwd())

C:\SML_Projects\SML_gym_fatPercentage_predict_project


In [16]:
import pandas as pd

In [17]:
from src.training import Trainer
from src.logger import get_logger

In [18]:
# Logger used by the cells below for their info/error records.
# NOTE(review): the two arguments are presumably the logger name and the log file
# name — confirm against src.logger.get_logger.
logger = get_logger('notebook_training', 'model_training.log')

In [19]:
# Load the dataset CSV into `df`. The path is relative, so it is resolved against
# the current working directory.
try:
    df = pd.read_csv('data/final/final_gym_dataset.csv')
    logger.info(f'dataset imported successfully with shape: {df.shape}')
except Exception as e:
    logger.error(f'error during import dataset: {e}')
    print(f'error: {e}')
    # Re-raise after reporting: swallowing the error would leave `df` undefined
    # (or stale from an earlier run of this cell), so later cells would either
    # fail with a confusing NameError or silently work on old data.
    raise

In [20]:
# Print the frame summary (index range, column names, non-null counts, dtypes)
# as a quick sanity check of the loaded data.
# Any failure here (e.g. `df` not being defined) is logged and echoed, not raised.
try:
    df.info()
except Exception as e:
    logger.error(f'error during dataset validation : {e}')
    print(f'error: {e}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         973 non-null    float64
 1   weight                      973 non-null    float64
 2   height                      973 non-null    float64
 3   max_bpm                     973 non-null    float64
 4   avg_bpm                     973 non-null    float64
 5   resting_bpm                 973 non-null    float64
 6   session_duration            973 non-null    float64
 7   calories_burned             973 non-null    float64
 8   fat_percentage              973 non-null    float64
 9   water_intake                973 non-null    float64
 10  workout_frequency           973 non-null    float64
 11  experience_level            973 non-null    float64
 12  bmi                         973 non-null    float64
 13  gender_Female               973 non

# Imports

In [21]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost

from tabulate import tabulate
from rich.console import Console
from rich.table import Table

# Single Split

In [22]:
# Separate the regression target (`fat_percentage`) from the feature columns and
# create an 80/20 train/test partition with a fixed random seed.
# NOTE(review): the training cell below passes the full `x`/`y` to Trainer, so the
# partitions created here are not used by the cells visible in this notebook —
# confirm whether Trainer performs its own split or whether these should be used.
try:
    x = df.drop(columns='fat_percentage')
    y = df['fat_percentage']

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    logger.info(f'dataset splited to x: {x.shape}, y: {y.shape}')
    print(x.shape, y.shape)

except Exception as e:
    logger.error(f'error during splitting dataset : {e}')
    print(f'error: {e}')
    # Re-raise after reporting so a failed split cannot leave stale `x`/`y`
    # (from an earlier run) for the training cell to pick up silently.
    raise

(973, 36) (973,)


# Model Training

In [23]:
# Fit and evaluate every candidate regressor through the project's Trainer and
# collect one row of metrics per model for the comparison table.
# Each row is [class name, R2, MAE, k-fold mean, k-fold std]; the model with the
# highest `trainer.r2` is kept in `best_trained_model` / `best_model_name`.
# NOTE(review): `trainer.kfold` is presumably an array of cross-validation scores
# (it exposes .mean() and .std()) — confirm in src.training.
try:
    models = [
        LinearRegression(),
        LassoCV(cv=10, random_state=42),
        RidgeCV(cv=10),
        ElasticNetCV(cv=10, random_state=42),
        DecisionTreeRegressor(random_state=42),
        RandomForestRegressor(random_state=42, n_estimators=200),
        GradientBoostingRegressor(random_state=42),
        HistGradientBoostingRegressor(random_state=42),
        ExtraTreesRegressor(random_state=42),
        AdaBoostRegressor(random_state=42),
        xgboost.XGBRegressor(random_state=42),
        SVR(kernel='rbf', C=20.0),
        KNeighborsRegressor(n_neighbors=10)
    ]

    results = []
    best_r2 = -float("inf")
    best_trained_model = None
    best_model_name = ""

    for model in models:
        # Resolve the display name once; it is used for both the row and the winner.
        model_name = type(model).__name__

        trainer = Trainer(model, x, y)
        trainer.train().evaluate()

        results.append([model_name, trainer.r2, trainer.mae, trainer.kfold.mean(), trainer.kfold.std()])

        # Strict '>' keeps the earliest model on ties.
        if trainer.r2 > best_r2:
            best_r2 = trainer.r2
            best_trained_model = trainer.model
            best_model_name = model_name

    logger.info(f'all models trained successfully! best model: {best_model_name}')

except Exception as e:
    logger.error(f'error during training models: {e}')
    print(f'error: {e}')
    # Re-raise after reporting: otherwise a failure part-way through the loop
    # leaves partial (or stale) `results` / `best_*` values that the comparison
    # table would present as if the run had completed.
    raise

# Model comparison table

In [None]:
# Render the collected metrics as a Rich table, ordered by R2 (highest first).
# The top row is highlighted in green and the bottom row in salmon; all other
# rows are printed without markup. Failures are logged and echoed, not raised.
try:
    console = Console()
    results_sorted = sorted(results, key=lambda i: i[1], reverse=True)
    best_model, worst_model = results_sorted[0], results_sorted[-1]

    table = Table(title="Models Compare", show_lines=True)

    # (header, extra keyword arguments) for each column, in display order.
    column_specs = (
        ("Algorithm", {}),
        ("R2 score", {}),
        ("Mean Absolute Error", {"justify": "right"}),
        ("K-Fold Mean", {"justify": "right"}),
        ("K-Fold Std", {"justify": "right"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for row in results_sorted:
        algo, r2, mae, kmean, kstd = row

        # Pick the Rich markup tag for this row; the best row wins if both match.
        if row == best_model:
            markup = "bold green"
        elif row == worst_model:
            markup = "bold salmon1"
        else:
            markup = None

        cells = [algo, *(f"{metric:.6f}" for metric in (r2, mae, kmean, kstd))]
        if markup is not None:
            cells = [f"[{markup}]{cell}[/{markup}]" for cell in cells]

        table.add_row(*cells)

    console.print(table)

except Exception as e:
    logger.error(f'Error during create table: {str(e)}')
    print(f'error: {e}')