In [362]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [363]:
df = pd.read_csv('realistic_obesity_data_50000(in).csv')
df.head()

Unnamed: 0,Age,Gender,Height_cm,Weight_kg,BMI,Obesity_Status,Physical_Activity_Level,Diet_Type,Smoking_Habits,Alcohol_Consumption,Family_History_Obesity,Blood_Pressure,Cholesterol_Levels,Education_Level,Income_Level,Geographical_Region
0,56,0,168.131011,42.559879,15.055831,Underweight,3,1,0,0,0,Hypertension Stage 1,1,1,1,2
1,18,1,198.592722,110.917488,28.123759,Overweight,4,1,2,0,1,Elevated,2,3,2,1
2,43,0,165.658984,67.969225,24.767509,Normal weight,2,0,1,0,1,Hypertension Stage 2,2,2,3,2
3,43,1,154.234595,65.45675,27.516351,Overweight,3,1,0,0,0,Hypertension Stage 2,2,2,2,3
4,35,0,168.723152,60.8708,21.382557,Normal weight,1,0,0,1,1,Hypertension Stage 1,1,3,2,1


In [364]:
df.shape

(50000, 16)

In [365]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      50000 non-null  int64  
 1   Gender                   50000 non-null  int64  
 2   Height_cm                50000 non-null  float64
 3   Weight_kg                50000 non-null  float64
 4   BMI                      50000 non-null  float64
 5   Obesity_Status           50000 non-null  object 
 6   Physical_Activity_Level  50000 non-null  int64  
 7   Diet_Type                50000 non-null  int64  
 8   Smoking_Habits           50000 non-null  int64  
 9   Alcohol_Consumption      50000 non-null  int64  
 10  Family_History_Obesity   50000 non-null  int64  
 11  Blood_Pressure           50000 non-null  object 
 12  Cholesterol_Levels       50000 non-null  int64  
 13  Education_Level          50000 non-null  int64  
 14  Income_Level          

In [366]:
df.describe()

Unnamed: 0,Age,Gender,Height_cm,Weight_kg,BMI,Physical_Activity_Level,Diet_Type,Smoking_Habits,Alcohol_Consumption,Family_History_Obesity,Cholesterol_Levels,Education_Level,Income_Level,Geographical_Region
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,49.03762,0.49324,170.024662,74.380876,25.591904,2.55356,0.89886,0.49552,0.39862,0.5041,1.99432,3.01338,2.00118,2.00392
std,18.159998,0.499959,9.994981,14.639937,3.81425,1.026739,0.698313,0.670395,0.662337,0.499988,0.817326,1.411793,0.81809,0.815989
min,18.0,0.0,128.98372,40.0,13.342826,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,33.0,0.0,163.299024,64.280835,23.04364,2.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0
50%,49.0,0.0,170.018393,74.323274,25.609747,3.0,1.0,0.0,0.0,1.0,2.0,3.0,2.0,2.0
75%,65.0,1.0,176.690384,84.338574,28.142434,3.0,1.0,1.0,1.0,1.0,3.0,4.0,3.0,3.0
max,80.0,1.0,211.669013,137.01678,42.163885,4.0,2.0,2.0,2.0,1.0,3.0,5.0,3.0,3.0


In [367]:
df.isnull().sum()

Age                        0
Gender                     0
Height_cm                  0
Weight_kg                  0
BMI                        0
Obesity_Status             0
Physical_Activity_Level    0
Diet_Type                  0
Smoking_Habits             0
Alcohol_Consumption        0
Family_History_Obesity     0
Blood_Pressure             0
Cholesterol_Levels         0
Education_Level            0
Income_Level               0
Geographical_Region        0
dtype: int64

In [368]:
df = df.drop(columns=['BMI'])

In [369]:
label_encoder = LabelEncoder()

df['Obesity_Status'] = label_encoder.fit_transform(df['Obesity_Status'])
df['Blood_Pressure'] = label_encoder.fit_transform(df['Blood_Pressure'])

In [370]:
scaler = StandardScaler()

df[['Height_cm', 'Weight_kg']] = scaler.fit_transform(df[['Height_cm', 'Weight_kg']])

print(df.head())

   Age  Gender  Height_cm  Weight_kg  Obesity_Status  Physical_Activity_Level  \
0   56       0  -0.189462  -2.173596               3                        3   
1   18       1   2.858269   2.495706               2                        4   
2   43       0  -0.436791  -0.437961               0                        2   
3   43       1  -1.579815  -0.609580               2                        3   
4   35       0  -0.130218  -0.922833               0                        1   

   Diet_Type  Smoking_Habits  Alcohol_Consumption  Family_History_Obesity  \
0          1               0                    0                       0   
1          1               2                    0                       1   
2          0               1                    0                       1   
3          1               0                    0                       0   
4          0               0                    1                       1   

   Blood_Pressure  Cholesterol_Levels  Education_L

In [371]:
def remove_outliers_zscore(df, column_names, threshold=3):
    z_scores = np.abs((df[column_names] - df[column_names].mean()) / df[column_names].std())
    
    filtered_df = df[(z_scores < threshold).all(axis=1)]
    
    return filtered_df

continuous_columns = ['Height_cm', 'Weight_kg']

df_cleaned = remove_outliers_zscore(df, continuous_columns)

print(f"Original data shape: {df.shape}")
print(f"Data shape after outlier removal: {df_cleaned.shape}")

df_cleaned.head()

Original data shape: (50000, 15)
Data shape after outlier removal: (49805, 15)


Unnamed: 0,Age,Gender,Height_cm,Weight_kg,Obesity_Status,Physical_Activity_Level,Diet_Type,Smoking_Habits,Alcohol_Consumption,Family_History_Obesity,Blood_Pressure,Cholesterol_Levels,Education_Level,Income_Level,Geographical_Region
0,56,0,-0.189462,-2.173596,3,3,1,0,0,0,1,1,1,1,2
1,18,1,2.858269,2.495706,2,4,1,2,0,1,0,2,3,2,1
2,43,0,-0.436791,-0.437961,0,2,0,1,0,1,2,2,2,3,2
3,43,1,-1.579815,-0.60958,2,3,1,0,0,0,2,2,2,2,3
4,35,0,-0.130218,-0.922833,0,1,0,0,1,1,1,1,3,2,1


In [372]:
X = df_cleaned.drop('Obesity_Status', axis=1) 
y = df_cleaned['Obesity_Status'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Random Forest**

In [374]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [375]:
y_pred_rf = rf_model.predict(X_test)

In [376]:
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Accuracy: 0.9328380684670213


**Logistic Regression**

In [378]:
from sklearn.linear_model import LogisticRegression

model_slr = LogisticRegression()
model_slr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [379]:
y_pred_slr = model_slr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_slr))

Accuracy: 0.8607569521132417


**Decision Tree**

In [381]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)

In [382]:
y_pred_dt = model_dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_dt))

Accuracy: 0.9834353980524044


**SVM**

In [384]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', random_state=42)  
svm_classifier.fit(X_train, y_train)

In [385]:
y_pred_svm = svm_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_svm))

Accuracy: 0.98393735568718


In [386]:
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

In [387]:
y_pred_xg = xgb_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_xg))

Accuracy: 0.9846400963758659
