In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

In [15]:
# Location of the body-fat CSV (relative path) and the DataFrame loaded from it.
file_path = "../datasets/Bodyfat.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

In [16]:
# Features are every column except the regression target "BodyFat".
X = data.drop("BodyFat", axis=1)
y = data["BodyFat"]

numerical_cols = X.columns.tolist()
preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Split the RAW data into training, validation, and test sets (70%, 15%, 15%)
# before any fitting. The split depends only on the number of rows and
# random_state, so the same rows land in each subset as when splitting the
# scaled array.
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_temp_raw, y_temp, test_size=0.5, random_state=42)

# Fit the scaler on the training rows only, then apply those training
# statistics to validation/test. Fitting on the full dataset would leak
# validation/test means and variances into the features used for training.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept for backward compatibility with code that still reads these names;
# both are scaled with the training-set statistics.
X_temp = preprocessor.transform(X_temp_raw)
X_processed = preprocessor.transform(X)

In [17]:
# Define TensorFlow/Keras models
def create_keras_model(hidden_layer_sizes=(100,), activation='relu', learning_rate=0.001):
    """Build and compile a fully connected Keras regressor.

    Args:
        hidden_layer_sizes: Iterable of unit counts, one Dense layer per entry.
        activation: Activation applied to every hidden layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model whose last layer is a single linear
        unit and whose loss is mean squared error.
    """
    hidden_stack = [Dense(width, activation=activation) for width in hidden_layer_sizes]
    # A single unit with no activation produces the regression output.
    network = Sequential(hidden_stack + [Dense(1)])
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(loss='mean_squared_error', optimizer=adam)
    return network

# Train TensorFlow model
def train_keras_model(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', learning_rate=0.001, epochs=100, batch_size=10):
    """Create a Keras regressor and fit it with early stopping.

    Args:
        X_train: Training features passed to ``fit``.
        y_train: Training targets passed to ``fit``.
        hidden_layer_sizes: Forwarded to ``create_keras_model``.
        activation: Forwarded to ``create_keras_model``.
        learning_rate: Forwarded to ``create_keras_model``.
        epochs: Upper bound on the number of training epochs.
        batch_size: Mini-batch size used by ``fit``.

    Returns:
        The fitted Keras model.
    """
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    network = create_keras_model(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        learning_rate=learning_rate,
    )
    fit_options = {
        'validation_split': 0.2,  # part of the supplied data is held out to compute val_loss
        'epochs': epochs,
        'batch_size': batch_size,
        'verbose': 0,
        'callbacks': [stopper],
    }
    network.fit(X_train, y_train, **fit_options)
    return network

In [18]:
# Define parameter grids for hyperparameter tuning
# Search space for the hand-rolled neural-network grid search.
param_grid_nn = dict(
    hidden_layer_sizes=[(50,), (100,), (100, 50), (100, 100)],
    activation=['relu', 'tanh'],
    learning_rate=[0.001, 0.01],
)

# Search space for the random forest; the "regressor__" prefix routes each
# parameter to the 'regressor' step of the pipeline.
param_grid_rf = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__max_depth=[None, 10, 20, 30],
    regressor__min_samples_split=[2, 5, 10],
)

# Scikit-learn pipelines for the non-neural models; each starts with the
# shared `preprocessor` step.
pipeline_lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

pipeline_poly = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),  # degree-2 polynomial feature expansion
    ('regressor', LinearRegression()),
])

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [19]:
# Set up cross-validation strategy for regression
# Shuffled 5-fold cross-validation with a fixed seed, so folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Grid search for the random forest. The grid has 3 * 4 * 3 = 36 candidates,
# each fitted on 5 folds; n_jobs=-1 spreads those fits over all CPU cores.
# Scores are unaffected because the folds and the forest are both seeded.
grid_search_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the baseline models and run the random-forest search on the training set.
pipeline_lr.fit(X_train, y_train)
pipeline_poly.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)

# Pipeline refitted on the full training set with the best parameters found.
best_rf = grid_search_rf.best_estimator_

# Train and evaluate TensorFlow models
def evaluate_keras_model(model, X_val, y_val):
    """Score a fitted Keras regressor on a held-out set.

    Args:
        model: Object exposing ``predict``; its output is flattened to 1-D.
        X_val: Features to predict on.
        y_val: True targets to compare against.

    Returns:
        Tuple ``(mse, r2)`` of mean squared error and R^2.
    """
    predictions = model.predict(X_val).flatten()
    return mean_squared_error(y_val, predictions), r2_score(y_val, predictions)

KeyboardInterrupt: 

In [None]:
# Perform grid search for neural network hyperparameters
# Track the configuration with the lowest validation MSE seen so far.
best_nn_mse = float('inf')
best_nn_model = None

for hidden_layer_sizes in param_grid_nn['hidden_layer_sizes']:
    for activation in param_grid_nn['activation']:
        for learning_rate in param_grid_nn['learning_rate']:
            print(f"Training NN with hidden_layer_sizes={hidden_layer_sizes}, activation={activation}, learning_rate={learning_rate}")
            # Reseed the Python, NumPy and TensorFlow RNGs before every run so
            # weight initialisation and batch shuffling are reproducible and
            # do not depend on the order in which configurations are tried.
            tf.keras.utils.set_random_seed(42)
            model = train_keras_model(X_train, y_train, hidden_layer_sizes, activation, learning_rate)
            mse, r2 = evaluate_keras_model(model, X_val, y_val)
            print(f"Validation MSE: {mse}, R^2: {r2}")
            # Strict '<' keeps the earliest configuration on ties.
            if mse < best_nn_mse:
                best_nn_mse = mse
                best_nn_model = model

Training NN with hidden_layer_sizes=(50,), activation=relu, learning_rate=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Validation MSE: 3.9078127631896784, R^2: 0.9143105371998335
Training NN with hidden_layer_sizes=(50,), activation=relu, learning_rate=0.01
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Validation MSE: 0.3606522390850298, R^2: 0.9920917151108211
Training NN with hidden_layer_sizes=(50,), activation=tanh, learning_rate=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Validation MSE: 2.124701377989305, R^2: 0.9534101220494325
Training NN with hidden_layer_sizes=(50,), activation=tanh, learning_rate=0.01
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Validation MSE: 1.332623130647213, R^2: 0.9707785999227279
Training NN with hidden_layer_sizes=(100,), activation=relu, learning_rate=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Val

In [20]:
# Define ensemble models
# Named base estimators, declared once and reused by both ensembles.
base_estimators = [
    ('Linear Regression', pipeline_lr),
    ('Polynomial Regression', pipeline_poly),
    ('Random Forest', best_rf),
]
models = list(base_estimators)

# Stacking: a linear regression combines the base estimators' predictions.
stacking_reg = StackingRegressor(
    estimators=list(base_estimators),
    final_estimator=LinearRegression(),
)

# Voting: combines the same base estimators without a meta-learner.
voting_reg = VotingRegressor(estimators=list(base_estimators))

# Fit both ensembles on the training set.
stacking_reg.fit(X_train, y_train)
voting_reg.fit(X_train, y_train)

In [21]:
# Evaluate models on validation set
# Validation R^2 per model name, filled while reporting so that model
# selection below does not have to call predict() a second time.
val_r2_by_name = {}

for name, model in models:
    y_pred = model.predict(X_val)
    val_r2_by_name[name] = r2_score(y_val, y_pred)
    print(f"{name} Validation MSE: {mean_squared_error(y_val, y_pred)}")
    print(f"{name} Validation R^2: {val_r2_by_name[name]}\n")

# Evaluate TensorFlow model (explicit None check rather than object truthiness)
if best_nn_model is not None:
    nn_mse, nn_r2 = evaluate_keras_model(best_nn_model, X_val, y_val)
    print(f"Neural Network Validation MSE: {nn_mse}")
    print(f"Neural Network Validation R^2: {nn_r2}")

# Evaluate ensemble models on validation set
ensembles = [('Stacking Regressor', stacking_reg), ('Voting Regressor', voting_reg)]
for name, model in ensembles:
    y_pred = model.predict(X_val)
    val_r2_by_name[name] = r2_score(y_val, y_pred)
    print(f"{name} Validation MSE: {mean_squared_error(y_val, y_pred)}")
    print(f"{name} Validation R^2: {val_r2_by_name[name]}\n")

# Test set evaluation: pick the scikit-learn model with the highest validation
# R^2 (first one wins on ties). The neural network is reported separately
# below and is not a candidate here.
best_model_name, best_model = max(
    models + ensembles,
    key=lambda item: val_r2_by_name[item[0]]
)

y_test_pred = best_model.predict(X_test)
print(f"Best Model ({best_model_name}) Test MSE: {mean_squared_error(y_test, y_test_pred)}")
print(f"Best Model ({best_model_name}) Test R^2: {r2_score(y_test, y_test_pred)}")

# Evaluate TensorFlow model on test set
if best_nn_model is not None:
    nn_test_mse, nn_test_r2 = evaluate_keras_model(best_nn_model, X_test, y_test)
    print(f"Neural Network Test MSE: {nn_test_mse}")
    print(f"Neural Network Test R^2: {nn_test_r2}")

Linear Regression Validation MSE: 0.29217483051916504
Linear Regression Validation R^2: 0.9935932692306165

Polynomial Regression Validation MSE: 2.689785469456723
Polynomial Regression Validation R^2: 0.94101911071673

Random Forest Validation MSE: 0.28979628947368397
Random Forest Validation R^2: 0.9936454252362356

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Neural Network Validation MSE: 0.2793110019439732
Neural Network Validation R^2: 0.9938753437891892
Stacking Regressor Validation MSE: 0.1829756976978857
Stacking Regressor Validation R^2: 0.9959877583212509

Voting Regressor Validation MSE: 0.4242559138975792
Voting Regressor Validation R^2: 0.9906970309084093

Best Model (Stacking Regressor) Test MSE: 1.2707531074186196
Best Model (Stacking Regressor) Test R^2: 0.9779624489435841
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Neural Network Test MSE: 2.408809882765494
Neural Network Test R^2: 0.9582261334111722
