In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
data = pd.read_csv("Dataset.csv")

In [48]:
data.dropna(inplace=True)
data = pd.get_dummies(data)

In [49]:
X = data.drop(columns=["BMI"])
y = data["BMR"]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming you have train and test data X_train_scaled, X_test_scaled, y_train, y_test

models = {
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor()
}

param_grids = {
    "Random Forest": {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5, 10]
    },
    "Gradient Boosting": {
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.5],
        "max_depth": [3, 5, 7]
    }
}

best_models = {name: GridSearchCV(model, param_grid=params, cv=5, scoring='neg_mean_squared_error').fit(X_train_scaled, y_train).best_estimator_
               for name, model, params in zip(models.keys(), models.values(), param_grids.values())}

# Calculate RMSE scores for each model
rmse_scores = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores[name] = rmse

# Find the best model based on the minimum RMSE score
best_model_name = min(rmse_scores, key=rmse_scores.get)
best_model = best_models[best_model_name]

print("Best Model:", best_model_name)
print("RMSE Score:", rmse_scores[best_model_name])

In [45]:
for name, model in best_models.items():
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = [(-score)**0.5 for score in cv_scores]
    print(f"Cross-Validation RMSE for {name}:", rmse_scores)

Cross-Validation RMSE for Random Forest: [0.500388893285714, 0.7113973744661957, 0.3298156455297856, 0.644112014495397, 0.31744411380829757]
Cross-Validation RMSE for Gradient Boosting: [0.3230512879137702, 0.525854684320972, 0.7552958164273694, 0.41619179573286824, 0.33709587159946985]


In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual Daily Calorie Limit")
plt.ylabel("Predicted Daily Calorie Limit")
plt.title("Actual vs. Predicted Daily Calorie Limit")
plt.show()

In [None]:
if best_model_name == "Random Forest":
    feature_importance = best_model.feature_importances_
    feature_names = X.columns
    importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importance})
    importance_df = importance_df.sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=importance_df)
    plt.title("Random Forest Feature Importance")
    plt.show()
elif best_model_name == "Gradient Boosting":
    feature_importance = best_model.feature_importances_
    feature_names = X.columns
    importance_df = pd.DataFrame({"Feature": feature_names, "Importance": feature_importance})
    importance_df = importance_df.sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=importance_df)
    plt.title("Gradient Boosting Feature Importance")
    plt.show()