In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler,OrdinalEncoder

from sklearn.metrics import r2_score

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

In [None]:
def read_csv(file_path):
    """Load the CSV at ``file_path`` into a pandas DataFrame and return it."""
    frame = pd.read_csv(file_path)
    return frame
def dataset_info_statistics(data):
    """Print a structural summary and descriptive statistics for ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Dataset Information:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    data.info()
    print("\n")

    print("Basic Statistics for Numerical Columns:")
    print(data.describe())
    print("\n")

def check_null(data):
    """Print a header line and return the number of missing values per column."""
    missing_per_column = data.isna().sum()
    print("Null Values in the Dataset:")
    return missing_per_column

def check_duplicates(data):
    """Return a truthy value when at least one row of ``data`` is a duplicate."""
    duplicate_mask = data.duplicated()
    return duplicate_mask.any()

def plot_graph(data):
    """Show one distribution plot per column of ``data``.

    Numeric columns get a histogram with a KDE overlay; object-dtype columns
    get a count plot. Columns of any other dtype are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted.

    Returns
    -------
    None
        Figures are displayed with ``plt.show()`` and then closed.
    """
    numerical_columns = data.select_dtypes(include=np.number).columns
    for column in numerical_columns:
        fig, ax = plt.subplots(figsize=(5, 3))
        # sns.distplot is deprecated; histplot(kde=True) is its replacement.
        # histplot's y-axis shows counts, which matches the "Frequency" label.
        sns.histplot(x=data[column], kde=True, ax=ax)
        ax.set_title(f"Histogram for {column}")
        ax.set_xlabel(column)
        ax.set_ylabel("Frequency")
        plt.show()
        # Release the figure so plotting many columns does not pile up memory.
        plt.close(fig)

    categorical_columns = data.select_dtypes(include='object').columns
    for column in categorical_columns:
        fig, ax = plt.subplots(figsize=(5, 3))
        # Pass the column as x= explicitly: recent seaborn versions treat the
        # first positional argument as `data`, not as the variable to count.
        sns.countplot(x=data[column], ax=ax)
        ax.set_title(f'Countplot for {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', labelrotation=45)
        plt.show()
        plt.close(fig)

def seperate_features_target(data,target_column):
    """Split ``data`` into features and target.

    Returns a ``(features, target)`` pair: ``data`` without ``target_column``,
    and the ``target_column`` series itself.
    """
    features = data.drop(columns=[target_column])
    target = data[target_column]
    return features, target

def perform_train_test_split(X, y, test_size=0.20, random_state=42):
    """Split ``X`` and ``y`` into train and test parts.

    Returns the tuple ``(X_train, X_test, y_train, y_test)`` produced by
    ``train_test_split`` with the given ``test_size`` and ``random_state``.
    """
    splits = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(splits)

In [None]:
# Load the two raw tables from CSV files in the working directory.
calories, exercise = (
    read_csv(path) for path in ('calories.csv', 'exercise.csv')
)

In [None]:
# Join the two tables on their shared User_ID key (inner join by default).
data = calories.merge(exercise, on='User_ID')

In [None]:
data.head()

Unnamed: 0,User_ID,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,231.0,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,66.0,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,26.0,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,71.0,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,35.0,female,27,154.0,58.0,10.0,81.0,39.8


In [None]:
dataset_info_statistics(data)

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Calories    15000 non-null  float64
 2   Gender      15000 non-null  object 
 3   Age         15000 non-null  int64  
 4   Height      15000 non-null  float64
 5   Weight      15000 non-null  float64
 6   Duration    15000 non-null  float64
 7   Heart_Rate  15000 non-null  float64
 8   Body_Temp   15000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.0+ MB
None


Basic Statistics for Numerical Columns:
            User_ID      Calories           Age        Height        Weight  \
count  1.500000e+04  15000.000000  15000.000000  15000.000000  15000.000000   
mean   1.497736e+07     89.539533     42.789800    174.465133     74.966867   
std    2.872851e+06     62.456978     16.980264     14.258114     15.03565

In [None]:
check_null(data)

Null Values in the Dataset:


Unnamed: 0,0
User_ID,0
Calories,0
Gender,0
Age,0
Height,0
Weight,0
Duration,0
Heart_Rate,0
Body_Temp,0


In [None]:
#plot_graph(data)

In [None]:
data.columns

Index(['User_ID', 'Calories', 'Gender', 'Age', 'Height', 'Weight', 'Duration',
       'Heart_Rate', 'Body_Temp'],
      dtype='object')

In [None]:
X,y = seperate_features_target(data,'Calories')

In [None]:
# User_ID is presumably a row identifier rather than a predictor, so it is
# removed from the features. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
X = X.drop(columns=['User_ID'], errors='ignore')

In [None]:
X_train,X_test,y_train,y_test = perform_train_test_split(X, y, test_size=0.20, random_state=42)

### Column Transformer and Pipeline

In [None]:
# Gender is ordinal-encoded and the numeric columns are standardised; any
# column not listed is passed through unchanged (remainder='passthrough').
numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), ['Gender']),
        ('num', StandardScaler(), numeric_features),
    ],
    remainder='passthrough',
)

In [None]:
# Baseline: preprocessing followed by a plain linear regression.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

In [None]:
from sklearn import set_config

In [None]:
set_config(display='diagram')

In [None]:
pipeline

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_test,y_pred)

0.9672937151257295

In [None]:
from sklearn.model_selection import KFold

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
cv_results = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')

In [None]:
cv_results.mean()

np.float64(0.9671402283675841)

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
mean_absolute_error(y_test,y_pred)

8.441513553849704

In [None]:
def model_scorer(model_name,model):
    """Score ``model`` on a hold-out split and with 5-fold cross-validation.

    Reads the notebook-level ``X``, ``y`` and ``preprocessor``.

    Parameters
    ----------
    model_name : str
        Label placed first in the returned row.
    model : estimator
        scikit-learn compatible regressor. It is cloned before use, so the
        object passed in is left unfitted and unchanged.

    Returns
    -------
    list
        ``[model_name, holdout_r2, holdout_mae, mean_cv_r2]``.
    """
    # Local import: the notebook's import cell does not bring in clone.
    from sklearn.base import clone

    # Pipeline fits its steps in place, so without cloning every call would
    # refit the shared notebook-level preprocessor and the caller's model.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', clone(model)),
    ])

    # Hold-out evaluation on a fixed 80/20 split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Cross-validated R^2 over the full data set.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')

    return [
        model_name,
        r2_score(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        cv_results.mean(),
    ]

In [None]:
# Candidate regressors keyed by the short label used in the results table.
# NOTE: the 'log' label refers to plain LinearRegression.
# Seeds are fixed so the reported scores are reproducible across runs;
# RandomForestRegressor in particular is randomised when unseeded.
model_dict={
    'log':LinearRegression(),
    'RF':RandomForestRegressor(random_state=42),
    'XGBR':XGBRegressor(random_state=42),
}

In [None]:
# One result row per candidate model, in the order they appear in model_dict.
model_output = [
    model_scorer(label, estimator)
    for label, estimator in model_dict.items()
]

In [None]:
model_output

[['log',
  0.9672937151257295,
  8.441513553849704,
  np.float64(0.9671402283675841)],
 ['RF',
  0.9982908632655064,
  1.6792266666666669,
  np.float64(0.9979278216048867)],
 ['XGBR',
  0.9988678909361673,
  1.4981198125282924,
  np.float64(0.9988510864545181)]]

In [None]:
# Fresh, unfitted preprocessor for the final model: Gender is ordinal-encoded
# and the numeric columns are standardised.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('ordinal', OrdinalEncoder(), ['Gender']),
        ('num', StandardScaler(),
         ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']),
    ],
)

In [None]:
# Final model: preprocessing followed by XGBRegressor with default settings.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor()),
])

In [None]:
pipeline.fit(X,y)

In [None]:
# A single hand-written row using the same feature columns as X.
sample = pd.DataFrame(
    [
        {
            'Gender': 'male',
            'Age': 68,
            'Height': 190.0,
            'Weight': 94.0,
            'Duration': 29.0,
            'Heart_Rate': 105.0,
            'Body_Temp': 40.8,
        }
    ],
    index=[0],
)

In [None]:
pipeline.predict(sample)

array([231.0721], dtype=float32)

### Save The Model

In [None]:
import pickle

In [None]:
# Persist the fitted pipeline (preprocessing + model) to pipeline.pkl.
with open('pipeline.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
# Read the pipeline back from disk. Only unpickle files you trust:
# pickle.load can execute arbitrary code.
with open('pipeline.pkl', 'rb') as model_file:
    pipeline_saved = pickle.load(model_file)

In [None]:
result = pipeline_saved.predict(sample)

In [None]:
result

array([231.0721], dtype=float32)

### GUI

In [None]:
import pickle
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# Load the trained pipeline
# This rebinds the notebook-level `pipeline` to the object stored on disk.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

# Create widgets
# One input per feature column. The FloatText fields start at 0.0 until the
# user edits them; `output` is the area where the prediction is printed.
gender = widgets.Dropdown(options=['male', 'female'], description='Gender:')
age = widgets.FloatText(description='Age:')
height = widgets.FloatText(description='Height:')
weight = widgets.FloatText(description='Weight:')
duration = widgets.FloatText(description='Duration:')
heart_rate = widgets.FloatText(description='Heart Rate:')
body_temp = widgets.FloatText(description='Body Temp:')
output = widgets.Output()

# Prediction function
def predict_calories(b):
    """Button callback: predict calories from the current widget values.

    ``b`` is the argument supplied by ``on_click`` and is not used. The
    prediction, or a message listing the inputs that still need a value, is
    written to the ``output`` widget.
    """
    numeric_inputs = {
        'Age': age.value,
        'Height': height.value,
        'Weight': weight.value,
        'Duration': duration.value,
        'Heart_Rate': heart_rate.value,
        'Body_Temp': body_temp.value,
    }

    # The FloatText fields start at 0.0, so an untouched form would otherwise
    # be scored as a person with zero height, weight and heart rate.
    # `not value > 0` also rejects NaN.
    invalid = [name for name, value in numeric_inputs.items() if not value > 0]
    if invalid:
        output.clear_output()
        with output:
            print(f"Please enter a positive value for: {', '.join(invalid)}")
        return

    # Single-row frame with the column names the pipeline was fitted on.
    sample = pd.DataFrame({
        'Gender': [gender.value],
        **{name: [value] for name, value in numeric_inputs.items()},
    })

    result = pipeline.predict(sample)
    output.clear_output()
    with output:
        print(f'Predicted Calories Burnt: {result[0]}')

# Predict button
predict_button = widgets.Button(description='Predict')
# Register the callback that runs each time the button is clicked.
predict_button.on_click(predict_calories)

# Display widgets
# Inputs, the button and the result area are stacked in one vertical box.
display(widgets.VBox([gender, age, height, weight, duration, heart_rate, body_temp, predict_button, output]))


VBox(children=(Dropdown(description='Gender:', options=('male', 'female'), value='male'), FloatText(value=0.0,…