In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
import joblib

In [18]:
file_path = "../datasets/Bodyfat.csv"
data = pd.read_csv(file_path)
data = data.dropna()
data = data.drop(columns=['Density'])

In [19]:
X = data.drop("BodyFat", axis=1)
y = data["BodyFat"]

numerical_cols = X.columns.tolist()
preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])
X_processed = preprocessor.fit_transform(X)

# Split the data into training, validation, and test sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X_processed, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [20]:
# Define Keras models
def create_keras_model(hidden_layer_sizes=(100,), activation='relu', learning_rate=0.001):
    model = Sequential()
    for units in hidden_layer_sizes:
        model.add(Dense(units, activation=activation))
    model.add(Dense(1))  # Output layer for regression
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mean_squared_error')
    return model

# Train model
def train_keras_model(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', learning_rate=0.001, epochs=100, batch_size=10):
    model = create_keras_model(hidden_layer_sizes, activation, learning_rate)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    model.fit(X_train, y_train, validation_split=0.2, epochs=epochs, batch_size=batch_size, verbose=0, callbacks=[early_stopping])
    return model

In [21]:
# Define parameter grids for hyperparameter tuning
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],
    'activation': ['relu', 'tanh'],
    'learning_rate': [0.001, 0.01]
}

param_grid_rf = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10]
}

# Create pipelines for other models
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

pipeline_poly = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('regressor', LinearRegression())
])

pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [22]:
# Cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=cv, scoring='neg_mean_squared_error')

pipeline_lr.fit(X_train, y_train)
pipeline_poly.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)

# Get best Random Forest model
best_rf = grid_search_rf.best_estimator_

def evaluate_keras_model(model, X_val, y_val):
    y_pred = model.predict(X_val).flatten()
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    return mse, r2

In [23]:
# Perform grid search for neural network hyperparameters
best_nn_mse = float('inf')
best_nn_model = None

for hidden_layer_sizes in param_grid_nn['hidden_layer_sizes']:
    for activation in param_grid_nn['activation']:
        for learning_rate in param_grid_nn['learning_rate']:
            print(f"Training NN with hidden_layer_sizes={hidden_layer_sizes}, activation={activation}, learning_rate={learning_rate}")
            model = train_keras_model(X_train, y_train, hidden_layer_sizes, activation, learning_rate)
            mse, r2 = evaluate_keras_model(model, X_val, y_val)
            print(f"Validation MSE: {mse}, R^2: {r2}")
            if mse < best_nn_mse:
                best_nn_mse = mse
                best_nn_model = model

Training NN with hidden_layer_sizes=(50,), activation=relu, learning_rate=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Validation MSE: 21.287640622825656, R^2: 0.5332103660555059
Training NN with hidden_layer_sizes=(50,), activation=relu, learning_rate=0.01
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Validation MSE: 22.155353063352795, R^2: 0.5141834020222797
Training NN with hidden_layer_sizes=(50,), activation=tanh, learning_rate=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Validation MSE: 16.973614869484695, R^2: 0.6278071575886133
Training NN with hidden_layer_sizes=(50,), activation=tanh, learning_rate=0.01
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Validation MSE: 17.542759916386007, R^2: 0.61532709872199
Training NN with hidden_layer_sizes=(100,), activation=relu, learning_rate=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Val

In [24]:
# Define ensemble models
models = [
    ('Linear Regression', pipeline_lr),
    ('Polynomial Regression', pipeline_poly),
    ('Random Forest', best_rf)
]

# Stacking Regressor
stacking_reg = StackingRegressor(
    estimators=[('Linear Regression', pipeline_lr), ('Polynomial Regression', pipeline_poly), ('Random Forest', best_rf)],
    final_estimator=LinearRegression()
)

# Voting Regressor
voting_reg = VotingRegressor(
    estimators=[('Linear Regression', pipeline_lr), ('Polynomial Regression', pipeline_poly), ('Random Forest', best_rf)]
)

# Fit ensemble models
stacking_reg.fit(X_train, y_train)
voting_reg.fit(X_train, y_train)

In [25]:
# Evaluate models on validation set
best_model_name, best_model = None, None
best_r2_score = -float('inf')

for name, model in models + [('Stacking Regressor', stacking_reg), ('Voting Regressor', voting_reg)]:
    y_pred = model.predict(X_val)
    current_r2 = r2_score(y_val, y_pred)
    if current_r2 > best_r2_score:
        best_r2_score = current_r2
        best_model_name = name
        best_model = model

print(f"Best Model: {best_model_name}")

# Test set evaluation
y_test_pred = best_model.predict(X_test)
print(f"Best Model ({best_model_name}) Test MSE: {mean_squared_error(y_test, y_test_pred)}")
print(f"Best Model ({best_model_name}) Test R^2: {r2_score(y_test, y_test_pred)}")

# Save the best model
joblib.dump(best_model, 'best_model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

# Load the best model and preprocessor
best_model_loaded = joblib.load('best_model.joblib')
preprocessor_loaded = joblib.load('preprocessor.joblib')

# Function to predict Body Fat based on new input
def predict_body_fat(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    input_processed = preprocessor_loaded.transform(input_df)
    prediction = best_model_loaded.predict(input_processed)
    return prediction[0]

# Example usage
input_data = {
    'Age': 30,
    'Weight': 70,
    'Height': 175,
    'Neck': '40',
    'Chest': 100,
    'Abdomen': 90,
    'Hip': 100,
    'Thigh': 50,
    'Knee': 40,
    'Ankle': 30,
    'Biceps': 30,
    'Forearm': 25,
    'Wrist': 20
}

predicted_body_fat = predict_body_fat(input_data)
print(f"Predicted Body Fat Percentage: {predicted_body_fat:.2f}")

Best Model: Random Forest
Best Model (Random Forest) Test MSE: 19.831183769160816
Best Model (Random Forest) Test R^2: 0.6560852597796731
Predicted Body Fat Percentage: 33.48


