<a href="https://colab.research.google.com/github/Jules-gatete/ML_Summative/blob/main/Daily_Water_Intake_Prediction_Model_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor

**Loading the data**

In [None]:
# Read the exercise-tracking CSV into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks Colab-specific — confirm
# before running this notebook anywhere else.
file_path = "/content/gym_members_exercise_tracking.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows (head()'s default).
data.head(n=5)

In [None]:
# Column the models learn to predict.
target = 'Water_Intake (liters)'

# The five input columns used for training.
features = ['Age', 'Gender', 'Height (m)', 'Weight (kg)', 'Workout_Type']


In [None]:
# Separate the predictors from the value being predicted.
y = data[target]

# Selecting a list of columns already yields a DataFrame; the explicit
# constructor call just makes that type (and the column order) unmistakable.
X = pd.DataFrame(data[features], columns=features)


In [None]:
# Summary statistics for the dataset (pandas' default describe() output).
data.describe()


In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
data.info()

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_values = data.isna().sum()
print(missing_values)

In [None]:
# Draw one histogram per numeric column to inspect each distribution.
data.hist()
plt.show()

In [None]:
# Split the columns by dtype: numeric (float64 / int64) versus text (object).
numerical_data = data.select_dtypes(include=['float64', 'int64'])
categorical_data = data.select_dtypes(include='object')

# Show a few rows of the text columns that will need encoding.
print(categorical_data.head())

In [None]:
# Generic label-encoder instance.
# NOTE(review): this object is not referenced again in the cells visible here;
# the per-column encoders created further down are the ones that get fitted —
# confirm whether this cell can be removed.
label_encoder = LabelEncoder()

In [None]:
# Preview the first rows again.
# NOTE(review): no column of `data` has been modified between the first preview
# and this cell, so this shows the same rows as before.
data.head()

In [None]:
# One encoder per categorical column, so each keeps its own class list
# (available afterwards as `.classes_`).
label_encoder_gender = LabelEncoder().fit(data['Gender'])
label_encoder_workout_type = LabelEncoder().fit(data['Workout_Type'])

# Overwrite the two columns in `data` with their integer codes.
data['Gender'] = label_encoder_gender.transform(data['Gender'])
data['Workout_Type'] = label_encoder_workout_type.transform(data['Workout_Type'])


In [None]:
# Show the encoded rows followed by the column dtypes, one after the other.
print(data.head(), data.dtypes, sep="\n")

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

In [None]:
# Get the features and target variables.
# X is restricted to the five columns listed in `features`: these are the only
# columns the final pipeline's ColumnTransformer reads and the only values the
# prediction cell collects from the user. Dropping just the target here would
# train the baseline models on every other column in the file, making their
# scores incomparable with the pipeline model.
# .copy() gives X its own data so later edits to `data` cannot leak into it.
X = data[features].copy()
y = data[target]

In [None]:
# Min-max scaling of every feature column (optional — use only if needed).
# Both `scaler` and `X_scaled` are rebound by the standard-scaling cell below.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Standard scaling (zero mean, unit variance per column), then persist the
# fitted scaler to disk.
# NOTE(review): the scaler is fitted on all of X before the train/test split,
# and the models in the cells visible here are trained on X rather than
# X_scaled — confirm whether the scaled data is meant to be used.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
joblib.dump(scaler, 'scaler.joblib')  # Save the scaler

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# (rows, columns) of the training portion.
print(X_train.shape)

In [None]:
# Preprocessing: mean-impute the numeric columns and one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time).
numeric_step = ('num', SimpleImputer(strategy='mean'), ['Age', 'Height (m)', 'Weight (kg)'])
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'Workout_Type'])

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [None]:
def calculate_metrics(y_test, y_pred, model_name):
    """Print and return the regression metrics for one model's predictions.

    Args:
        y_test: True target values.
        y_pred: Values predicted by the model, aligned with ``y_test``.
        model_name: Label printed in the report header.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``.
    """
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    # Use the directly imported r2_score, like the two metrics above, so the
    # function does not depend on a `metrics` module alias that is only
    # imported by a later cell.
    r2 = r2_score(y_test, y_pred)

    print(f"\n{model_name} Metrics:")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"R-Squared (R²): {r2:.4f}")
    return mae, mse, rmse, r2

In [None]:
# 1. Decision Tree Regressor
# Baseline: a single regression tree fitted on the training split.
# random_state is fixed because the tree permutes features at every split, so
# an unseeded tree can differ between runs; 42 matches the seed used for the
# train/test split and the pipeline's random forest.
# (DecisionTreeRegressor and `metrics` come from the import cell at the top.)
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train, y_train)
decision_pred = decision_tree.predict(X_test)

# R-squared score for Decision Tree
decision_r2 = metrics.r2_score(y_test, decision_pred)
print("Decision Tree - R squared error:", decision_r2)

# Mean Squared Error (MSE) for Decision Tree
decision_mse = metrics.mean_squared_error(y_test, decision_pred)
print("Decision Tree - Mean Squared Error (MSE):", decision_mse)

# Root Mean Squared Error (RMSE) for Decision Tree
decision_rmse = np.sqrt(decision_mse)
print("Decision Tree - Root Mean Squared Error (RMSE):", decision_rmse)

In [None]:
# 2. Linear Regression
# Fit on the training split, then predict the held-out rows.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_pred = linear_reg.predict(X_test)

# Score the predictions: R², MSE, and RMSE (the square root of the MSE).
linear_r2 = metrics.r2_score(y_test, linear_pred)
linear_mse = metrics.mean_squared_error(y_test, linear_pred)
linear_rmse = np.sqrt(linear_mse)

print("Linear Regression - R squared error:", linear_r2)
print("Linear Regression - Mean Squared Error (MSE):", linear_mse)
print("Linear Regression - Root Mean Squared Error (RMSE):", linear_rmse)

In [None]:
# 3. Random Forest Regressor
# random_state is fixed so the bootstrap samples and feature draws — and
# therefore the scores printed below — are the same on every run. 42 matches
# the forest used inside the final model pipeline.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train, y_train)
random_pred = random_forest.predict(X_test)

# R-squared score for Random Forest
random_r2 = metrics.r2_score(y_test, random_pred)
print("Random Forest - R squared error:", random_r2)

# Mean Squared Error (MSE) for Random Forest
random_mse = metrics.mean_squared_error(y_test, random_pred)
print("Random Forest - Mean Squared Error (MSE):", random_mse)

# Root Mean Squared Error (RMSE) for Random Forest
random_rmse = np.sqrt(random_mse)
print("Random Forest - Root Mean Squared Error (RMSE):", random_rmse)

In [None]:


# Preprocessing for the final pipeline: mean-impute the numeric columns and
# one-hot encode the categorical ones.
# This rebinds `preprocessor`, so it is the definition the pipeline actually
# uses. handle_unknown='ignore' is kept (as in the earlier definition) so a
# category that was absent from the training split encodes as all zeros
# instead of raising at prediction time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), ['Age', 'Height (m)', 'Weight (kg)']),  # Handle numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'Workout_Type'])  # One-hot encode categorical features
    ])

In [None]:
# Chain preprocessing and the regressor so both are fitted, saved and applied
# as a single object. The forest is seeded for reproducible results.
forest = RandomForestRegressor(random_state=42)
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', forest),
    ]
)

In [None]:
# Fit the whole pipeline (preprocessing step, then the random forest) on the
# training split.
model_pipeline.fit(X_train, y_train)

In [None]:
# Persist the fitted pipeline so the prediction cell can reload it.
# (joblib is already imported in the notebook's import cell.)
joblib.dump(value=model_pipeline, filename='random_forest_model.joblib')

print("Model trained and saved successfully.")

In [None]:
# Interactive prediction: collect the five model inputs from the user, encode
# the two categorical answers with the SAME encoders that encoded the training
# data, and run the saved pipeline on the result.


def get_valid_input(prompt, min_val, max_val, dtype=float):
    """Prompt repeatedly until the user enters a number within a range.

    Args:
        prompt: Text shown to the user by input().
        min_val: Smallest accepted value (inclusive).
        max_val: Largest accepted value (inclusive).
        dtype: Callable used to convert the typed text (default ``float``).

    Returns:
        The converted value, once it parses and lies in [min_val, max_val].
    """
    while True:
        try:
            value = dtype(input(prompt))
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue
        if min_val <= value <= max_val:
            return value
        print(f"Value must be between {min_val} and {max_val}. Try again.")


def get_valid_choice(label, choices):
    """Prompt repeatedly until the user types one of ``choices``.

    Matching ignores case and surrounding whitespace.

    Args:
        label: Field name shown in the prompt (e.g. "Gender").
        choices: Accepted values, in the exact form the encoder knows them.

    Returns:
        The matching element of ``choices`` (original spelling/type), so it
        can be passed straight to the encoder's transform().
    """
    options = "/".join(str(choice) for choice in choices)
    lookup = {str(choice).strip().lower(): choice for choice in choices}
    while True:
        answer = input(f"{label} ({options}): ").strip().lower()
        if answer in lookup:
            return lookup[answer]
        print(f"Invalid input. Please enter one of: {options}.")


# Reuse the encoders fitted in the encoding cell rather than refitting new ones
# on hand-typed lists. LabelEncoder numbers its classes in sorted order, so an
# encoder fitted on a different vocabulary yields codes that do not correspond
# to the categories the model was trained on.
try:
    gender_choices = list(label_encoder_gender.classes_)
    workout_choices = list(label_encoder_workout_type.classes_)
except (NameError, AttributeError) as exc:
    raise RuntimeError(
        "The fitted label encoders are not available. Run the cell that encodes "
        "'Gender' and 'Workout_Type' before this one."
    ) from exc

# Load the trained Random Forest pipeline saved by the training cell.
# joblib files are pickles: only load files you created yourself.
# If the file is missing, a message is printed and execution continues with
# whatever `model_pipeline` is already in memory (if any).
try:
    model_pipeline = joblib.load('random_forest_model.joblib')  # Load your trained RandomForest model
    print("Model loaded successfully!")
except FileNotFoundError:
    print("Error: Model file not found. Please ensure the model is saved correctly.")

# Prompt user for input
print("\nEnter the following details to predict your daily water intake:")

age = get_valid_input("Age (0-120 years): ", 0, 120, int)

# The accepted answers come from the encoder's own class list, so every valid
# answer is guaranteed to be encodable.
gender = get_valid_choice("Gender", gender_choices)
encoded_gender = label_encoder_gender.transform([gender])[0]

height = get_valid_input("Height (in meters, 0.5-2.5): ", 0.5, 2.5)

weight = get_valid_input("Weight (in kg, 10-300): ", 10, 300)

work_type = get_valid_choice("Workout Type", workout_choices)
encoded_work_type = label_encoder_workout_type.transform([work_type])[0]

# Prepare user input for prediction. The pipeline's ColumnTransformer selects
# columns by name, so the key order here does not matter.
input_data = {
    'Age': [age],
    'Height (m)': [height],
    'Weight (kg)': [weight],
    'Gender': [encoded_gender],  # Use encoded value for gender
    'Workout_Type': [encoded_work_type]  # Use encoded value for workout type
}

# Convert to DataFrame
input_df = pd.DataFrame(input_data)

# Predict the target value (daily water intake)
try:
    prediction = model_pipeline.predict(input_df)  # Use the Random Forest model to make predictions
    print(f"\nPredicted daily water intake: {prediction[0]:.2f} liters")
except Exception as e:
    # Best-effort: report the failure instead of leaving a traceback in the notebook.
    print(f"An error occurred during prediction: {e}")


Model loaded successfully!

Enter the following details to predict your daily water intake:
Age (0-120 years): 55
