In [None]:
# Nicholas Chludzinski

In [None]:
''' 
This machine learning model is designed to predict whether an individual is overweight or obese based on a range of personal, 
lifestyle, and health-related factors using Logistic Regression. It uses features such as age, height, weight, physical activity levels, 
eating habits, and transportation choices to make its prediction. The model simplifies the original multi-class obesity labels into a 
binary classification: it outputs a 1 if the person is likely overweight or obese, and a 0 if the person is of normal weight. Records labeled 
"Insufficient_Weight" are removed before training, so underweight individuals are outside the model's scope. This allows for quick, 
interpretable assessments that can support health awareness, early intervention, or personalized recommendations. 
'''

In [86]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [87]:
# I acquired my data from: https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition

In [88]:
# Load Data CSV
# NOTE: this is an absolute, machine-specific path; point DATA_PATH at your
# local copy of the dataset before running the notebook on another machine.
DATA_PATH = '/Users/nicho/Downloads/obesity/ObesityDataSet.csv'
df = pd.read_csv(DATA_PATH)

# Strip columns of whitespace. Index.str.strip() returns a new Index rather
# than modifying the frame, so the result must be assigned back to take effect.
df.columns = df.columns.str.strip()

# See a sample of our file
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [89]:
# Show the distinct labels present in the original (multi-class) target column
pd.unique(df["NObeyesdad"])

array(['Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II',
       'Obesity_Type_I', 'Insufficient_Weight', 'Obesity_Type_II',
       'Obesity_Type_III'], dtype=object)

In [90]:
# Keep only the classes needed for the binary task: normal weight (0) versus
# overweight/obese (1). Rows labeled "Insufficient_Weight" are removed.
OVERWEIGHT_OR_OBESE_LABELS = [
    "Overweight_Level_I", "Overweight_Level_II",
    "Obesity_Type_I", "Obesity_Type_II",
    "Obesity_Type_III",
]

# .assign() builds a new DataFrame, so the target column is never written into
# a filtered slice of the original frame (the pattern that triggers pandas'
# SettingWithCopyWarning). Re-running this cell simply recomputes the column.
df = df.loc[df["NObeyesdad"] != "Insufficient_Weight"].assign(
    # Convert the values in our target column into binary values (1s and 0s)
    BinaryTarget=lambda frame: frame["NObeyesdad"]
    .isin(OVERWEIGHT_OR_OBESE_LABELS)
    .astype(int)
)

# Now see our chart with the new column
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BinaryTarget
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,0
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,0
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,0
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,1
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,1


In [91]:
# Name of the binary label column the model is trained to predict
TargetColumn = 'BinaryTarget'

# Target vector: the variable we are trying to predict
y = df[TargetColumn]

# Feature matrix: every column except the binary target and the original
# multi-class label it was derived from (keeping either would leak the answer).
# get_dummies then expands each categorical column (e.g. Male/Female, yes/no)
# into one indicator column per category.
X = pd.get_dummies(df.drop(columns=[TargetColumn, "NObeyesdad"]))

# Preview the encoded feature matrix
X.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Female,Gender_Male,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.0,1.62,64.0,2.0,3.0,2.0,0.0,1.0,True,False,...,False,False,False,False,True,False,False,False,True,False
1,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0,True,False,...,True,False,False,True,False,False,False,False,True,False
2,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0,False,True,...,False,False,True,False,False,False,False,False,True,False
3,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0,False,True,...,False,False,True,False,False,False,False,False,False,True
4,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0,False,True,...,False,False,False,True,False,False,False,False,True,False


In [92]:
# Hold out 20% of the rows for testing (80% for training). The split is
# stratified on y and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [93]:
# Standardize the features.
# The scaler's statistics are learned from the training split only and then
# reused, unchanged, for the test split. Calling fit_transform on the test split
# would refit the scaler on test data: train and test would be scaled with
# different statistics, and any later scaler.transform(...) call (e.g. for a new
# user) would no longer match the scaling the model was trained with.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [94]:
# Choose model: logistic regression for the binary target, constructed with
# no arguments (i.e. the library's default settings)
model = LogisticRegression()

In [95]:
# Train on the training split, then predict labels for the held-out test split
# (fit() returns the fitted estimator itself, so the two calls can be chained)
y_pred = model.fit(X_train, y_train).predict(X_test)

In [96]:
# Analyze model: print each evaluation metric for the test-set predictions
# under its own heading
report_sections = (
    ("Accuracy:\n", accuracy_score),
    ("\nClassification:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
)
for heading, metric in report_sections:
    print(heading, metric(y_test, y_pred))

Accuracy:
 0.9836956521739131

Classification:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95        57
           1       0.99      0.99      0.99       311

    accuracy                           0.98       368
   macro avg       0.96      0.98      0.97       368
weighted avg       0.98      0.98      0.98       368


Confusion Matrix:
 [[ 55   2]
 [  4 307]]


In [97]:
# Prediction part below

In [98]:
# Example input for a single new person, using the same raw column names as
# the training data (before one-hot encoding)

new_user = dict(
    Age=25,
    Height=1.75,
    Weight=65,
    family_history_with_overweight="no",
    FAVC="yes",
    FCVC=2.0,
    NCP=3.0,
    CAEC="Sometimes",
    SMOKE="no",
    CH2O=2.0,
    SCC="no",
    FAF=1.0,
    TUE=1.0,
    CALC="Sometimes",
    MTRANS="Public_Transportation",
    Gender="Male",
)


In [99]:
# Convert to DataFrame (a single-row frame built from the new_user dict)
new_df = pd.DataFrame([new_user])

# One-hot encode (must match training columns!)
new_df_encoded = pd.get_dummies(new_df)

# The reindex() below silently discards any column the model was not trained
# on, so a misspelled key or an unseen category value (e.g. "male" instead of
# "Male") would quietly be scored as "all indicators off". Fail loudly instead.
unexpected_columns = sorted(set(new_df_encoded.columns) - set(X.columns))
if unexpected_columns:
    raise ValueError(
        f"new_user has fields/values not seen during training: {unexpected_columns}"
    )

# Align with training features (fill missing columns with 0)
new_df_encoded = new_df_encoded.reindex(columns=X.columns, fill_value=0)

# Scale (use the same scaler!)
new_df_scaled = scaler.transform(new_df_encoded)

# Predict: the hard class label, plus the second column of predict_proba for
# the single input row (used below as the probability of class 1)
prediction = model.predict(new_df_scaled)
probability = model.predict_proba(new_df_scaled)[0][1]


In [100]:
# Report the predicted class together with the probability of that class
# (probability is for class 1, so its complement is used for class 0)
if prediction[0] != 1:
    message = f"Prediction: Normal Weight (Probability: {1 - probability:.2f})"
else:
    message = f"Prediction: Obese/ Overweight (Probability: {probability:.2f})"
print(message)


Prediction: Normal Weight (Probability: 0.97)
