In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and print each one on its own line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> * Utilizing Linear Regression, Support Vector Regression, Adaboost and Random Forest algorithms to forecast calorie expenditure.
> * Conducting Exploratory Data Analysis for dataset visualization. Employing label encoding for categorical columns. 
> * Projecting calorie burn predictions on the test dataset

Importing Dependencies

In [None]:
# Plotting libraries used by the exploratory-analysis cells below.
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries in use.
warnings.filterwarnings('ignore')

In [None]:
# Load the two raw CSV files from the Kaggle input dataset.
# NOTE(review): the variable names are swapped relative to the files they hold:
# `cal_df` is read from exercise.csv and `exercise_df` from calories.csv.
cal_df = pd.read_csv("/kaggle/input/fmendesdat263xdemos/exercise.csv")
exercise_df = pd.read_csv("/kaggle/input/fmendesdat263xdemos/calories.csv")

In [None]:
# Preview the first rows of the frame read from exercise.csv.
cal_df.head()

In [None]:
# Preview the first rows of the frame read from calories.csv.
exercise_df.head()

In [None]:
# Combine both frames into one table by joining on the shared User_ID column,
# then preview the merged result.
master_df = cal_df.merge(exercise_df, on='User_ID')
master_df.head()

In [None]:
# (rows, columns) of the merged frame.
master_df.shape

In [None]:
# Basic data-quality checks: number of fully duplicated rows and the
# per-column count of missing values.
duplicate_rows = master_df.duplicated().sum()
missing_per_column = master_df.isnull().sum()
print("Duplicated values: ", duplicate_rows)
print("Missing values:\n", missing_per_column)

In [None]:
# The User_ID column is not required for the analysis, so drop it.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError because the column is already gone.
master_df = master_df.drop(columns=['User_ID'], errors='ignore')

# Exploratory Data Analysis

In [None]:
# Scatter plot of Calories (y-axis) against Duration (x-axis) to inspect how
# the length of an activity relates to the calories burned.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=master_df, x='Duration', y='Calories', ax=ax)
ax.set_xlabel("Duration of exercise in terms of minutes")
ax.set_ylabel("Calories Burned")
ax.set_title("Relationship between the duration of an activity and the calories burned")
plt.show()

**It's clearly observable that as the duration increases, there's a noticeable rise in calories burned, indicating a robust and positive correlation.**

In [None]:
# Bar chart of the number of rows per Gender value, giving an overview of how
# many data points the dataset holds for each gender.
gender_counts = master_df['Gender'].value_counts()

fig, ax = plt.subplots()
ax.bar(gender_counts.index, gender_counts.values, color='y')
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
ax.set_title("Distribution of Gender in the Dataset")
plt.show()

In [None]:
# Histogram (10 bins) with a KDE overlay to visualise the distribution of Age.
# NOTE(review): `age_counts` is computed here but not used anywhere in this cell.
age_counts = master_df['Age'].value_counts()

# Bars are drawn in sea green with dashed black edges and near-opaque fill.
sns.displot(master_df['Age'], bins=10, kde = True, color = 'seagreen',edgecolor='k',linewidth=2,linestyle='--',
             alpha=.9,hue_norm=None )
plt.xlabel("Age")
plt.ylabel("Count")
plt.title("Distribution of Age in the Dataset")
plt.show()

Correlation heatmap to visualize the correlations between numeric variables (Calories, Age, Height, Weight, Duration, Heart_Rate, Body_Temp). 
*  This will give us insight into which variables are most strongly correlated with each other.
*  Selecting only the numeric columns for correlation

In [None]:
# Restrict the frame to its numeric columns and compute their pairwise
# correlation matrix.
numeric_columns = [
    "Calories",
    "Age",
    "Height",
    "Weight",
    "Duration",
    "Heart_Rate",
    "Body_Temp",
]
corr_df = master_df[numeric_columns]
correlation_matrix = corr_df.corr()

# Render the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="Blues", center=0, ax=ax)
ax.set_title("Correlation Heatmap of Numeric Variables")
plt.show()

**The correlation between Calories and Duration indicates that longer durations of activity go together with higher calorie burn,
along with increased Heart Rate and Body Temperature.**

In [None]:
# One facet per distinct Duration value, wrapped at three facets per row;
# sharey=False gives every facet its own y-axis scale.
g = sns.FacetGrid(master_df, col='Duration', col_wrap=3, height=4, sharey=False)
# Within each facet, draw Body_Temp against Heart_Rate as a line with point markers.
g.map(sns.lineplot, 'Heart_Rate', 'Body_Temp', marker='o')

g.set_titles(col_template="Duration: {col_name}")
g.set_axis_labels('Heart Rate', 'Body Temperature')
# y=1.02 places the overall title slightly above the top of the figure.
g.fig.suptitle('Relationship between Heart Rate and Body Temperature by Duration', y=1.02)
plt.tight_layout()
plt.show()

**Duration of an Exercise is directly proportional to Heart Rate and Body Temperature of an individual**

In [None]:
import matplotlib.patches as mpatches

# Weight (x) against Height (y), with every point coloured by its Calories value.
plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(data=master_df, x='Weight', y='Height', hue='Calories')

# Keep the legend seaborn derives from the hue mapping, so that its entries
# really correspond to the plotted colours, and only relocate it outside the
# axes. Passing a bare list of "value (count)" strings to plt.legend() would
# attach those strings to plot artists purely by position and yield labels
# unrelated to the colour scale.
sns.move_legend(scatter, 'upper left', bbox_to_anchor=(1.05, 1), title='Calories')

plt.title('Scatter Plot of Weight vs Height with Calories Color Mapping')
plt.xlabel('Weight')
plt.ylabel('Height')
plt.show()

Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the categorical Gender column as integer codes in a new Sex column,
# then remove the original text column and preview the result.
label_enc = LabelEncoder()
sex_codes = label_enc.fit_transform(master_df['Gender'])
master_df = master_df.assign(Sex=sex_codes).drop(columns=['Gender'])
master_df.head()

In [None]:
# Summary statistics for the columns of the encoded frame.
master_df.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target (Calories) from the remaining feature columns
# and report the shape of each.
y = master_df['Calories']
X = master_df.drop(columns=['Calories'])
print(X.shape)
print(y.shape)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression on the training split and predict the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the held-out data: MSE and its square root (RMSE).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print('Test data R2_Score ',r2_score(y_test,y_pred))

# Support Vector Regressor - SVR

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Learn the scaling parameters from the training features only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel support vector regressor trained on the scaled features.
model = SVR(kernel='rbf').fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Mean squared error and R2 on the held-out data.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

print('Test data R2_Score ',r2_score(y_test,y_pred))

# Balancing the Imbalanced Data

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
# Counter tallies how often each distinct value of y occurs, so every distinct
# Calories value is treated here as its own "class".
# NOTE(review): y is the same Series used as the regression target above —
# confirm that class-style oversampling is really intended for it.
print("Class distribution before oversampling:", Counter(y))
class_distribution = Counter(y)

In [None]:
# Bar chart of how often each distinct Calories value occurs before oversampling.
labels, counts = class_distribution.keys(), class_distribution.values()

fig, ax = plt.subplots()
ax.bar(labels, counts, color='g')
ax.set_xlabel("Calories")
ax.set_ylabel("Count")
ax.set_title("Class Distribution Before Oversampling")

# Tilt the x tick labels for readability.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Random oversampler seeded with random_state=42 so that resampling is repeatable.
oversampler = RandomOverSampler(random_state=42)

In [None]:
# Oversample the full feature matrix / target pair and tally how often each
# target value occurs afterwards.
X_resampled, y_resampled = oversampler.fit_resample(X, y)
resampled_class_distribution = Counter(y_resampled)
print("Class distribution after oversampling:", resampled_class_distribution)

In [None]:
# Bar chart of how often each distinct Calories value occurs after oversampling.
labels, counts = (
    resampled_class_distribution.keys(),
    resampled_class_distribution.values(),
)

fig, ax = plt.subplots()
ax.bar(labels, counts, color='g')
ax.set_xlabel("Calories")
ax.set_ylabel("Count")
ax.set_title("Class Distribution After Oversampling")

# Tilt the x tick labels for readability.
plt.xticks(rotation=45)
plt.show()

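Applying ML Algorithms on Resampled data

> Note: the model cells below fit on `X_train` / `y_train` from the earlier `train_test_split(X, y, ...)`; `X_resampled` / `y_resampled` are not passed to any model in this notebook, so the reported metrics describe the original (non-resampled) split.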

# Support Vector Regressor

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# NOTE(review): X_train / X_test / y_train / y_test are the names produced by
# the earlier train_test_split(X, y, ...) call; X_resampled / y_resampled are
# not referenced in this cell.

# Learn the scaling parameters from the training features only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel support vector regressor trained on the scaled features.
model = SVR(kernel='rbf').fit(X_train_scaled, y_train)
y_pred_svr = model.predict(X_test_scaled)

# Error metrics on the held-out data.
mse = mean_squared_error(y_test, y_pred_svr)
rmse = np.sqrt(mse)

print("Support Vector Regressor Metrics:")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print('R2_Score ',r2_score(y_test,y_pred_svr))

In [None]:
# Side-by-side table of the SVR predictions and the true test targets.
# The discarded np.transpose(...) call and the sort_index() on a freshly reset
# index were no-ops and have been removed; all variable names are unchanged.
svr_df = pd.DataFrame(y_pred_svr)

# Reset the index of y_test to 0..n-1 so it lines up row-for-row with the
# prediction frame when the two are concatenated column-wise.
result = y_test.to_frame().reset_index(drop=True)
result_svr = result.copy()

y_both = pd.concat([svr_df, result_svr], axis=1)
y_both.columns = ['Predicted calories', 'Original calories']
print(y_both)

# Linear Regression

In [None]:
# Fit a linear regression on the training split and predict the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred_lin_reg = model.predict(X_test)

# Error metrics on the held-out data: MSE and its square root (RMSE).
mse = mean_squared_error(y_test, y_pred_lin_reg)
rmse = np.sqrt(mse)

print("Linear Regresssion Metrics:")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print('R2_Score ',r2_score(y_test,y_pred_lin_reg))

In [None]:
# Side-by-side table of the linear-regression predictions and the true test targets.
# The discarded np.transpose(...) call and the sort_index() on a freshly reset
# index were no-ops and have been removed; all variable names are unchanged.
lin_reg_df = pd.DataFrame(y_pred_lin_reg)

# Reset the index of y_test to 0..n-1 so it lines up row-for-row with the
# prediction frame when the two are concatenated column-wise.
result = y_test.to_frame().reset_index(drop=True)
result_lin_reg = result.copy()

y_both = pd.concat([lin_reg_df, result_lin_reg], axis=1)
y_both.columns = ['Predicted calories', 'Original calories']
print(y_both)

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic (bootstrap sampling and feature sub-sampling),
# so fix the seed — matching the random_state=42 used for the train/test split —
# to make the reported metrics reproducible across runs.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Calculate metrics for Random Forest Regressor on the held-out data
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print("\nRandom Forest Regressor Metrics:")
print("Mean Squared Error:", mse_rf)
print("Root Mean Squared Error:", rmse_rf)
print("R2 Score:", r2_rf)

In [None]:
# Side-by-side table of the random-forest predictions and the true test targets.
# The discarded np.transpose(...) call and the sort_index() on a freshly reset
# index were no-ops and have been removed; all variable names are unchanged.
rf_df = pd.DataFrame(y_pred_rf)

# Reset the index of y_test to 0..n-1 so it lines up row-for-row with the
# prediction frame when the two are concatenated column-wise.
result = y_test.to_frame().reset_index(drop=True)
result_rf = result.copy()

y_both = pd.concat([rf_df, result_rf], axis=1)
y_both.columns = ['Predicted calories', 'Original calories']
print(y_both)

 # Adaboost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost resamples the training data at each boosting round, so fix the seed —
# matching the random_state=42 used for the train/test split — to make the
# reported metrics reproducible across runs.
adaboost_model = AdaBoostRegressor(random_state=42)
adaboost_model.fit(X_train, y_train)
y_pred_adaboost = adaboost_model.predict(X_test)

# Calculate metrics for AdaBoostRegressor on the held-out data
mse_adaboost = mean_squared_error(y_test, y_pred_adaboost)
rmse_adaboost = np.sqrt(mse_adaboost)
r2_adaboost = r2_score(y_test, y_pred_adaboost)
print("AdaBoostRegressor Metrics:")
print("Mean Squared Error:", mse_adaboost)
print("Root Mean Squared Error:", rmse_adaboost)
print("R2 Score:", r2_adaboost)

In [None]:
# Side-by-side table of the AdaBoost predictions and the true test targets.
# The discarded np.transpose(...) call and the sort_index() on a freshly reset
# index were no-ops and have been removed; all variable names are unchanged.
adaboost_df = pd.DataFrame(y_pred_adaboost)

# Reset the index of y_test to 0..n-1 so it lines up row-for-row with the
# prediction frame when the two are concatenated column-wise.
result = y_test.to_frame().reset_index(drop=True)
result_adaboost = result.copy()

y_both = pd.concat([adaboost_df, result_adaboost], axis=1)
y_both.columns = ['Predicted calories', 'Original calories']
print(y_both)

In [None]:
# Gather every model's predictions next to the true targets in one wide table
# and show its first 20 rows.
prediction_frames = [lin_reg_df, svr_df, rf_df, adaboost_df, result_adaboost]
prediction_labels = [
    'Linear Regression Prediction',
    'Support Vector Regressor Prediction',
    'Random Forest Regressor Prediction',
    'Adaboost Regressor Prediction',
    'Original calories',
]

master_prediction = pd.concat(prediction_frames, axis=1)
master_prediction.columns = prediction_labels
master_prediction.head(20)

> Considering the evaluation metrics :

> The Random Forest Regressor demonstrates the lowest values for both Mean Squared Error and Root Mean Squared Error, indicating its potential for delivering more accurate predictions compared to the other algorithms.

> Furthermore, the Random Forest Regressor achieves the highest R2 Score, meaning it explains the largest share of the variance in the calorie values of the test set.

> While the Support Vector Regressor and AdaBoost Regressor exhibit favorable performance, the Random Forest Regressor emerges as the standout choice in this context. 