In [None]:
import kagglehub

# Fetch the latest version of the calorie-expenditure dataset via kagglehub.
dataset_handle = "adilshamim8/predict-calorie-expenditure"
path = kagglehub.dataset_download(dataset_handle)

# Report the directory the files were downloaded to.
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/predict-calorie-expenditure


In [None]:
import pandas as pd

In [None]:
# Read the training file from the directory kagglehub reported (`path`)
# instead of a hard-coded Kaggle location, so the notebook also runs when
# the dataset was downloaded somewhere other than /kaggle/input.
dataframe = pd.read_csv(f"{path}/train.csv", sep=',')

# Preview the first rows as the cell output.
dataframe.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [None]:
# Count the missing values in every column.
missing_per_column = dataframe.isna().sum()
print(missing_per_column)

id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64


In [None]:
# Show the dtype pandas assigned to each column.
column_dtypes = dataframe.dtypes
print(column_dtypes)

id              int64
Sex            object
Age             int64
Height        float64
Weight        float64
Duration      float64
Heart_Rate    float64
Body_Temp     float64
Calories      float64
dtype: object


**Reasoning**:
Based on the data types, list the numerical columns suitable for normalization. The 'id' column is a row identifier rather than a measurement, so it is excluded from normalization.



In [None]:
# Numeric columns selected for normalization ('id' and 'Sex' are left out).
numerical_cols = [
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'Body_Temp',
    'Calories',
]
print("Numerical columns for normalization:", numerical_cols)

Numerical columns for normalization: ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']


## Apply normalization

### Subtask:
Implement the chosen normalization method to the selected numerical features.


**Reasoning**:
Implement the chosen normalization method (Standardization) using StandardScaler on the selected numerical features and update the dataframe with the normalized values.



In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize.
# NOTE(review): this list includes the target ('Calories'), so the models
# trained later learn and report a standardized target, not kcal.
numerical_cols = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'Calories']
numerical_data = dataframe[numerical_cols]

# Instantiate StandardScaler
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on the full dataframe, i.e. before the
# train/validation/test split done later in the notebook.
normalized_data = scaler.fit_transform(numerical_data)

print(normalized_data)

# Reuse the source index so the column assignment below aligns row-for-row
# even when `dataframe` does not carry a default RangeIndex.
normalized_dataframe = pd.DataFrame(
    normalized_data,
    columns=numerical_cols,
    index=dataframe.index,
)

# Replace the original numerical columns in the main dataframe.
# This mutates `dataframe` in place, so the cell is meant to run once per kernel.
dataframe[numerical_cols] = normalized_dataframe

# Display the first few rows of the updated dataframe to verify
display(dataframe.head())

[[-0.3571921   1.11523482  0.49020109 ...  0.58371421  1.23577241
   0.98913238]
 [ 1.48794322 -0.91213671 -1.08317237 ... -1.10943632 -0.43116345
  -0.86998179]
 [ 0.63127325 -1.06808836 -0.79710447 ... -1.21525823 -0.30293761
  -0.95011602]
 ...
 [ 1.22435246 -0.99011254 -0.58255354 ...  1.85357711  1.10754658
   2.12703846]
 [ 0.23588711  0.56940402  1.13385388 ...  0.68953612  0.33819156
   0.33203168]
 [-0.15949903 -0.28833009 -0.72558749 ...  0.16042658  0.72286907
   0.2358706 ]]


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,-0.357192,1.115235,0.490201,1.266324,0.583714,1.235772,0.989132
1,1,female,1.487943,-0.912137,-1.083172,-0.888309,-1.109436,-0.431163,-0.869982
2,2,female,0.631273,-1.068088,-0.797104,-1.008011,-1.215258,-0.302938,-0.950116
3,3,male,-1.411555,1.349162,1.062337,1.146622,1.007002,0.851095,0.828864
4,4,female,-0.225397,-0.678209,-1.011655,1.146622,0.689536,0.722869,0.925025


**Reasoning**:
Verify that the normalization was applied correctly by calculating the mean and standard deviation of the normalized numerical columns.



In [None]:
# Summary statistics of the standardized columns (mean should be ~0, std ~1).
standardized_summary = dataframe.loc[:, numerical_cols].describe()
print(standardized_summary)

                Age        Height        Weight      Duration    Heart_Rate  \
count  7.500000e+05  7.500000e+05  7.500000e+05  7.500000e+05  7.500000e+05   
mean   1.374284e-16  8.861984e-16 -3.410605e-17  3.999882e-17  3.481470e-16   
std    1.000001e+00  1.000001e+00  1.000001e+00  1.000001e+00  1.000001e+00   
min   -1.411555e+00 -3.797242e+00 -2.799580e+00 -1.726222e+00 -3.014231e+00   
25%   -8.843736e-01 -8.341609e-01 -8.686214e-01 -8.883092e-01 -7.919706e-01   
50%   -9.360134e-02 -5.440260e-02 -8.193471e-02 -5.039623e-02 -5.121724e-02   
75%    6.971709e-01  8.033315e-01  8.477860e-01  9.072186e-01  7.953580e-01   
max    2.476409e+00  3.688437e+00  4.066050e+00  1.745131e+00  3.440906e+00   

          Body_Temp      Calories  
count  7.500000e+05  7.500000e+05  
mean   2.005819e-14  1.273293e-17  
std    1.000001e+00  1.000001e+00  
min   -3.765035e+00 -1.398868e+00  
25%   -5.593893e-01 -8.699818e-01  
50%    3.381916e-01 -1.808274e-01  
75%    8.510949e-01  7.647565e-01  


## Verify normalization

### Subtask:
Check the distribution and range of the normalized data to ensure the transformation was successful.


**Reasoning**:
Calculate and print the mean, standard deviation, minimum, and maximum for each numerical column in the normalized_dataframe to verify the standardization process.



In [None]:
# Each entry pairs a heading with the aggregations printed beneath it:
# first mean/std, then min/max of the normalized columns.
summary_sections = (
    ("Mean and Standard Deviation of Normalized Data:", ['mean', 'std']),
    ("\nMinimum and Maximum Values of Normalized Data:", ['min', 'max']),
)
for heading, statistics in summary_sections:
    print(heading)
    print(normalized_dataframe.agg(statistics))

# Explain what the values indicate
print("\nExplanation:")
print("After Standardization (Z-score normalization), the data is transformed to have a mean of approximately 0 and a standard deviation of approximately 1. The minimum and maximum values will vary depending on the original data's distribution and outliers, but they represent the scaled range of the data.")

Mean and Standard Deviation of Normalized Data:
               Age        Height        Weight      Duration    Heart_Rate  \
mean  1.374284e-16  8.861984e-16 -3.410605e-17  3.999882e-17  3.481470e-16   
std   1.000001e+00  1.000001e+00  1.000001e+00  1.000001e+00  1.000001e+00   

         Body_Temp      Calories  
mean  2.005819e-14  1.273293e-17  
std   1.000001e+00  1.000001e+00  

Minimum and Maximum Values of Normalized Data:
          Age    Height   Weight  Duration  Heart_Rate  Body_Temp  Calories
min -1.411555 -3.797242 -2.79958 -1.726222   -3.014231  -3.765035 -1.398868
max  2.476409  3.688437  4.06605  1.745131    3.440906   1.876902  3.617535

Explanation:
After Standardization (Z-score normalization), the data is transformed to have a mean of approximately 0 and a standard deviation of approximately 1. The minimum and maximum values will vary depending on the original data's distribution and outliers, but they represent the scaled range of the data.


In [None]:
# Rows whose standardized Height lies more than 3 standard deviations from the mean.
height_is_extreme = normalized_dataframe['Height'].abs().gt(3)
outliers_height = normalized_dataframe.loc[height_is_extreme]

# Cell output: preview of the normalized frame (the outlier rows are only stored).
normalized_dataframe.head()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,-0.357192,1.115235,0.490201,1.266324,0.583714,1.235772,0.989132
1,1.487943,-0.912137,-1.083172,-0.888309,-1.109436,-0.431163,-0.869982
2,0.631273,-1.068088,-0.797104,-1.008011,-1.215258,-0.302938,-0.950116
3,-1.411555,1.349162,1.062337,1.146622,1.007002,0.851095,0.828864
4,-0.225397,-0.678209,-1.011655,1.146622,0.689536,0.722869,0.925025


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Model inputs and the column to predict.
features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
target = ['Calories']

X, Y = dataframe[features], dataframe[target]

# First hold out 20% of the rows as the test set ...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# ... then carve 25% of the remainder off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42
)

In [None]:
# Apply XGBoost
xgb_params = dict(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
)
model = xgb.XGBRegressor(**xgb_params)

# The training and validation sets are both passed as evaluation sets;
# per-iteration logging is switched off.
evaluation_sets = [(X_train, y_train), (X_val, y_val)]
model.fit(X_train, y_train, eval_set=evaluation_sets, verbose=False)

In [None]:
# Feature-only view of the dataframe.
data_test = dataframe[features]

# Show the first rows as the cell output. The cell used to end with
# `dataframe.get(0)`, which returns None (no column is labelled 0), so
# nothing was displayed.
data_test.head()

In [None]:
# Inspect the first row of the feature matrix.
X.iloc[0, :]

Unnamed: 0,0
Age,-0.357192
Height,1.115235
Weight,0.490201
Duration,1.266324
Heart_Rate,0.583714
Body_Temp,1.235772


In [None]:
# Predict for the first row. The model was trained on the standardized
# 'Calories' column, so its raw output is a z-score, not kcal.
data_to_predicted = X[features].iloc[0:1]
kcal_predicted_scaled = model.predict(data_to_predicted)

# Undo the StandardScaler transform for the 'Calories' column to recover kcal.
# Assumes `scaler` was fitted once on the raw (unscaled) `numerical_cols`.
calories_idx = numerical_cols.index('Calories')
kcal_predicted = (
    kcal_predicted_scaled * scaler.scale_[calories_idx] + scaler.mean_[calories_idx]
)
kcal_predicted

array([0.9735421], dtype=float32)

**Reasoning**:
Utilizar as métricas MAE, MSE, RMSE e R-squared para avaliar o desempenho do modelo XGBoost treinado no conjunto de teste.

In [None]:
# Predict on the held-out test set
y_pred = model.predict(X_test)

# Compute the evaluation metrics.
# They are in the units of the standardized target, since 'Calories' was scaled.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE; it used to be a second copy of the MSE.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Absolute Error (MAE): 0.0395
Mean Squared Error (MSE): 0.0045
Root Mean Squared Error (RMSE): 0.0045
R-squared (R2): 0.9955


In [None]:
# 5-fold cross-validation of the XGBoost regressor, scored with R².
cv_scores = cross_val_score(estimator=model, X=X, y=Y, cv=5, scoring='r2')

In [None]:
# Report the mean and spread of the cross-validated R² scores.
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"R² médio na validação cruzada: {cv_mean:.4f} ± {cv_std:.4f}")

R² médio na validação cruzada: 0.9955 ± 0.0001


In [None]:
# Apply linear regression: fit an ordinary least-squares model on the training split.
model_linear_regression = LinearRegression()
model_linear_regression.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out test set
y_pred = model_linear_regression.predict(X_test)

# Compute the evaluation metrics.
# They are in the units of the standardized target, since 'Calories' was scaled.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE; it used to be a second copy of the MSE.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Absolute Error (MAE): 0.1297
Mean Squared Error (MSE): 0.0315
Root Mean Squared Error (RMSE): 0.0315
R-squared (R2): 0.9684


In [None]:
#Cross-validation - Linear Regression
# 5-fold cross-validation of the linear model, scored with R².
cv_scores = cross_val_score(
    estimator=model_linear_regression, X=X, y=Y, cv=5, scoring='r2'
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"R² médio na validação cruzada: {cv_mean:.4f} ± {cv_std:.4f}")

R² médio na validação cruzada: 0.9683 ± 0.0001


In [None]:
# Apply the decision tree.
# Depth is capped at 10; the seed is fixed so repeated runs build the same
# tree, consistent with the random_state=42 used for the splits and XGBoost.
model_tree_decision = DecisionTreeRegressor(max_depth=10, random_state=42)

In [None]:
# Fit the decision tree on the training split.
model_tree_decision.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out test set
y_pred = model_tree_decision.predict(X_test)

# Compute the evaluation metrics.
# They are in the units of the standardized target, since 'Calories' was scaled.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE; it used to be a second copy of the MSE.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Absolute Error (MAE): 0.0621
Mean Squared Error (MSE): 0.0090
Root Mean Squared Error (RMSE): 0.0090
R-squared (R2): 0.9910


In [None]:
#Cross-validation - Àrvore de Decisão
# 5-fold cross-validation of the decision tree, scored with R².
cv_scores = cross_val_score(
    estimator=model_tree_decision, X=X, y=Y, cv=5, scoring='r2'
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"R² médio na validação cruzada: {cv_mean:.4f} ± {cv_std:.4f}")

R² médio na validação cruzada: 0.9909 ± 0.0001
