In [None]:
import pandas as pd

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact directory exists.
train_csv_path = "C:/Users/ML Projects/predict_calorie_expenditure_flask_app/datasets/train.csv"
train_df = pd.read_csv(train_csv_path)

In [None]:
# (row count, column count) of the training frame as loaded.
train_df.shape

In [None]:
# Preview the first two rows to check the column layout.
train_df.head(n=2)

In [None]:
# Preview the last two rows of the training frame.
train_df.tail(n=2)

In [None]:
# Number of missing values in each column.
train_df.isna().sum()

In [None]:
# Encode the categorical "Sex" column as integers (male -> 0, female -> 1).
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# column would turn every value into NaN without raising any error.
sex_codes = {"male": 0, "female": 1}
if not pd.api.types.is_numeric_dtype(train_df["Sex"]):
    # Fail loudly on labels the mapping does not cover instead of letting
    # them become NaN silently.
    unexpected_labels = set(train_df["Sex"].dropna().unique()) - set(sex_codes)
    if unexpected_labels:
        raise ValueError(f"Unexpected values in 'Sex' column: {unexpected_labels}")
    train_df["Sex"] = train_df["Sex"].map(sex_codes)
train_df.head(2)

In [None]:
# Remove the "id" column so it is not part of the frame used for modelling.
train_df = train_df.drop(columns=["id"])
train_df.head(n=2)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise correlations of every column currently in the frame, drawn as an
# annotated heatmap.
corr_matrix = train_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Before Feature Engineering)")
ax.set_xlabel("features")
ax.set_ylabel("features")
plt.show()

In [None]:
# Engineered features, built in a single step so the cell can be re-run
# safely and the whole feature set is visible in one place:
#   - Duration multiplied by heart rate, body temperature, age, weight and height
#   - heart rate divided by weight
# Every new column is computed from the raw columns only, and the columns are
# appended in the order listed below.
train_df = train_df.assign(
    Duration_Heart=train_df["Duration"] * train_df["Heart_Rate"],
    Duration_Temp=train_df["Duration"] * train_df["Body_Temp"],
    Age_Duration=train_df["Age"] * train_df["Duration"],
    Weight_Duration=train_df["Weight"] * train_df["Duration"],
    Height_Duration=train_df["Height"] * train_df["Duration"],
    HR_per_Weight=train_df["Heart_Rate"] / train_df["Weight"],
)
train_df.head(2)

In [None]:
# Recompute the correlation matrix on the current frame and draw it on a
# larger canvas.
corr_matrix = train_df.corr()
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (After Feature Engineering)")
ax.set_xlabel("Features + New Engineering Features")
ax.set_ylabel("Features + New Engineering Features")
plt.show()

In [None]:
# Columns whose value ranges are inspected: raw numeric inputs, the target
# and the engineered features.
num_cols = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
    'Calories',
    'Duration_Heart',
    'Duration_Temp',
    'Age_Duration',
    'Weight_Duration',
    'Height_Duration',
    'HR_per_Weight',
]
# One row per column, with its minimum and maximum side by side.
range_summary = train_df[num_cols].describe().loc[["min", "max"]].T
print(range_summary)

In [None]:
# One box plot per entry of num_cols, laid out on a 5x3 grid; only as many
# axes as there are columns are created.
fig = plt.figure(figsize=(20, 18))
for position, col in enumerate(num_cols, start=1):
    ax = fig.add_subplot(5, 3, position)
    sns.boxplot(data=train_df, x=col, ax=ax)
    ax.set_title(col)
fig.tight_layout()
plt.show()

In [None]:
# Separate the target ("Calories") from the model inputs (every other column).
labels = train_df["Calories"]
features = train_df.drop(columns=["Calories"])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)
print("Training size:", len(X_train))
print("Testing size:", len(X_test))

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, make_scorer

In [None]:
def rmsle_clipped(y_true, y_pred):
    """Return the root mean squared logarithmic error after flooring both inputs.

    Both ``y_true`` and ``y_pred`` are clipped from below at 1e-6 (no upper
    bound) before being passed to ``mean_squared_log_error``; the square root
    of that result is returned.
    """
    floor = 1e-6
    clipped_true, clipped_pred = (
        np.clip(values, floor, None) for values in (y_true, y_pred)
    )
    msle = mean_squared_log_error(clipped_true, clipped_pred)
    return np.sqrt(msle)

def calculate_performance(y_true, y_pred):
    """Score predictions with both metrics used in this notebook.

    Returns:
        A ``(rmsle, r2)`` tuple: ``rmsle_clipped(y_true, y_pred)`` followed by
        ``r2_score(y_true, y_pred)``, the latter computed on the inputs as given.
    """
    # Imported here because the notebook only imports r2_score in a later
    # cell; without this the function depends on cell execution order.
    from sklearn.metrics import r2_score

    return rmsle_clipped(y_true, y_pred), r2_score(y_true, y_pred)

# Scorer wrapper around rmsle_clipped for use in model selection;
# greater_is_better=False declares the metric as a loss (lower is better).
rmsle_scorer = make_scorer(score_func=rmsle_clipped, greater_is_better=False)

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
# Baseline comparison: fit each regressor with its default settings on the
# training split and score it on the held-out split.
model_dict = {
    "XGB": XGBRegressor(objective='reg:squarederror', random_state=42),
    "LGBM": LGBMRegressor(random_state=42),
    "RF":  RandomForestRegressor(random_state=42),
    "LR":  LinearRegression()
}

results = []
for name, model in model_dict.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmsle, r2 = calculate_performance(y_test, preds)
    # The key becomes the column header of the results table, so it must be
    # the correctly encoded "R²".
    results.append({"Model": name, "RMSLE": rmsle, "R² Score": r2})

# Best (lowest RMSLE) model first.
results_df = pd.DataFrame(results).sort_values("RMSLE")
print("Default Model Performance Without Tuning")
print(results_df)

In [None]:
# Horizontal bar chart of RMSLE per model, in the order of results_df.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=results_df,
    x="RMSLE",
    y="Model",
    hue="Model",
    legend=False,
    palette="viridis",
    ax=ax,
)
ax.set_title("Model Comparison (Lower RMSLE is Better)")
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space: two candidate values for each of five XGBoost hyperparameters
# (32 combinations), each evaluated with 3-fold cross-validation.
param_grid = dict(
    learning_rate=[0.05, 0.1],
    max_depth=[6, 10],
    n_estimators=[100, 200],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

xgb_base = XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring=rmsle_scorer,
    cv=3,
    n_jobs=-1,  # use all available cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

In [None]:
# Build the final model from the hyperparameters the grid search selected,
# rather than from values copied by hand out of its printed output; this
# keeps the model in sync with the search whenever the grid or data changes.
best_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    **grid_search.best_params_,
)
best_model.fit(X_train, y_train)

In [None]:
# Importance of each training column as reported by the fitted model, plotted
# in the column order of X_train.
feature_names = X_train.columns
importances = best_model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title("Best Model - Feature Importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Features")
fig.tight_layout()
plt.show()

In [None]:
# Score the final model on the held-out split. Predictions are floored at 0
# before the logarithmic error is computed.
raw_val_preds = best_model.predict(X_test)
y_pred = np.clip(raw_val_preds, 0, None)

val_msle = mean_squared_log_error(y_test, y_pred)
rmsle = np.sqrt(val_msle)
print(f"Validation RMSLE: {rmsle:.4f}")

In [None]:
# Load the test CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact directory exists.
test_csv_path = "C:/Users/ML Projects/predict_calorie_expenditure_flask_app/datasets/test.csv"
test_df = pd.read_csv(test_csv_path)

In [None]:
# Preview the first two rows of the test frame.
test_df.head(n=2)

In [None]:
# Preview the last two rows of the test frame.
test_df.tail(n=2)

In [None]:
# (row count, column count) of the test frame as loaded.
test_df.shape

In [None]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

In [None]:
# Encode "Sex" with the same integer codes used for the training frame
# (male -> 0, female -> 1).
# The dtype guard makes this cell safe to re-run: mapping an already-encoded
# column would turn every value into NaN without raising any error.
sex_codes = {"male": 0, "female": 1}
if not pd.api.types.is_numeric_dtype(test_df["Sex"]):
    # Fail loudly on labels the mapping does not cover instead of letting
    # them become NaN silently.
    unexpected_labels = set(test_df["Sex"].dropna().unique()) - set(sex_codes)
    if unexpected_labels:
        raise ValueError(f"Unexpected values in 'Sex' column: {unexpected_labels}")
    test_df["Sex"] = test_df["Sex"].map(sex_codes)
test_df.head(3)

In [None]:
# Rebuild the engineered features on the test frame, with the same formulas
# and in the same column order as for the training frame:
#   - Duration multiplied by heart rate, body temperature, age, weight and height
#   - heart rate divided by weight
# Done in a single step so the cell can be re-run safely.
test_df = test_df.assign(
    Duration_Heart=test_df["Duration"] * test_df["Heart_Rate"],
    Duration_Temp=test_df["Duration"] * test_df["Body_Temp"],
    Age_Duration=test_df["Age"] * test_df["Duration"],
    Weight_Duration=test_df["Weight"] * test_df["Duration"],
    Height_Duration=test_df["Height"] * test_df["Duration"],
    HR_per_Weight=test_df["Heart_Rate"] / test_df["Weight"],
)
test_df.head(2)

In [None]:
# Keep the row identifiers aside; they are needed for the submission file
# after "id" is dropped from the feature frame.
test_ids = test_df.loc[:, "id"]

In [None]:
# Drop "id" from the test frame, as was done for the training frame.
test_df = test_df.drop(columns=["id"])
test_df.head(n=2)

In [None]:
# Predict on the test frame and floor the predictions at 0.
raw_test_preds = best_model.predict(test_df)
test_preds = np.clip(raw_test_preds, a_min=0, a_max=None)

In [None]:
# Two-column submission table: the saved ids next to the floored predictions,
# written without the DataFrame index.
submission = test_ids.to_frame(name="id").assign(Calories=test_preds)
submission.to_csv("final_submission_6.csv", index=False)
print("final_submission_6.csv created")

In [None]:
import pickle

In [None]:
# Serialize the fitted model to disk with pickle.
with open("best_xgbr_model_final.pkl", mode="wb") as model_file:
    pickle.dump(best_model, model_file)
print("Model saved to best_xgbr_model_final.pkl")