## Imagine you have a dataset with features such as `Age`, `Gender`, `Height`, `Weight`, `BMI`, and `Blood Pressure`, and you have to classify people into classes such as `Normal`, `Overweight`, `Obesity`, `Underweight`, and `Extreme Obesity` using any 4 different classification algorithms. Build a model that can classify people into these classes.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn import metrics

In [2]:
# Load the obesity dataset CSV (decoded as ISO-8859-2) and preview the first rows.
raw_csv_path = 'Data/ObesityDataSet_raw_and_data_sinthetic.csv'
df = pd.read_csv(raw_csv_path, encoding='ISO-8859-2')

df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2111, 17)

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [5]:
# Count missing values per column.
df.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [6]:
# Column dtypes and non-null counts of the raw frame, before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

### Data Preparation

In [7]:
# Cast every text (object) column, including the target NObeyesdad, to the category dtype.

columns = ["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS", "NObeyesdad"]

# A single astype call with a per-column dtype mapping replaces the column-by-column loop.
df = df.astype({name: 'category' for name in columns})

In [8]:
# Round these numeric columns to the nearest whole number and store them as
# integers (done to ease up later calculation).
columns = ["FCVC", "NCP", "CH2O", "TUE", "FAF"]

# Build all rounded columns at once; assign() keeps the existing column order.
df = df.assign(**{name: df[name].round().astype('int') for name in columns})

In [9]:
# Re-check the dtypes after the category and integer conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   Gender                          2111 non-null   category
 1   Age                             2111 non-null   float64 
 2   Height                          2111 non-null   float64 
 3   Weight                          2111 non-null   float64 
 4   family_history_with_overweight  2111 non-null   category
 5   FAVC                            2111 non-null   category
 6   FCVC                            2111 non-null   int32   
 7   NCP                             2111 non-null   int32   
 8   CAEC                            2111 non-null   category
 9   SMOKE                           2111 non-null   category
 10  CH2O                            2111 non-null   int32   
 11  SCC                             2111 non-null   category
 12  FAF                 

In [10]:
# Work on an independent (deep) copy so later changes to df_new leave df untouched.

df_new = df.copy(deep=True)

In [11]:
# One-hot encode the categorical feature columns; the target NObeyesdad is not
# listed here and is therefore left unchanged.
dummy_columns = [
    "Gender", "family_history_with_overweight",
    "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS",
]
df_new = pd.get_dummies(df_new, columns=dummy_columns)
df_new.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Female,...,SCC_yes,CALC_Always,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.0,1.62,64.0,2,3,2,0,1,Normal_Weight,1,...,0,0,0,0,1,0,0,0,1,0
1,21.0,1.52,56.0,3,3,3,3,0,Normal_Weight,1,...,1,0,0,1,0,0,0,0,1,0
2,23.0,1.8,77.0,2,3,2,2,1,Normal_Weight,0,...,0,0,1,0,0,0,0,0,1,0
3,27.0,1.8,87.0,3,3,2,2,0,Overweight_Level_I,0,...,0,0,1,0,0,0,0,0,0,1
4,22.0,1.78,89.8,2,1,2,0,0,Overweight_Level_II,0,...,0,0,0,1,0,0,0,0,1,0


In [12]:
# Mapping of classes: collapse the seven original labels into five target classes.
class_mapping = {
    "Normal_Weight": "Normal",
    "Overweight_Level_I": "Overweight",
    "Overweight_Level_II": "Overweight",
    "Obesity_Type_I": "Obesity",
    "Obesity_Type_II": "Obesity",
    "Obesity_Type_III": "Extreme Obesity",
    "Insufficient_Weight": "Underweight"
}

# Rename the classes.
# Map from the untouched labels in `df` (same index as `df_new`) rather than from
# `df_new` itself, so re-running this cell -- even after the target has been
# label-encoded -- does not turn every value into NaN.
mapped_target = df["NObeyesdad"].map(class_mapping)

# Series.map yields NaN for any label missing from the dict; fail loudly instead
# of silently handing NaN targets to the encoder and the models.
if mapped_target.isna().any():
    unmapped = sorted(set(df["NObeyesdad"].astype(str)) - set(class_mapping))
    raise ValueError(f"Labels without a class mapping: {unmapped}")

df_new["NObeyesdad"] = mapped_target

df_new['NObeyesdad'].value_counts()

Obesity            648
Overweight         580
Extreme Obesity    324
Normal             287
Underweight        272
Name: NObeyesdad, dtype: int64

In [13]:
# Encode the target variable as integer codes. LabelEncoder numbers the classes
# in sorted order; the fitted encoder stays in `le`, whose classes_ attribute
# holds the class names.
le = LabelEncoder()
le.fit(df_new["NObeyesdad"])
df_new["NObeyesdad"] = le.transform(df_new["NObeyesdad"])

### Model Training

In [14]:
# One seed shared by the split and every stochastic model, so the reported
# metrics are identical on each Restart & Run All.
RANDOM_STATE = 42

# Features are every column except the encoded target.
X = df_new.drop(columns=['NObeyesdad'])
y = df_new['NObeyesdad']

# Splitting data: hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling data: fit the scaler on the training split only, then apply the same
# transformation to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Classification models. The tree-based learners are randomised, so they are
# seeded explicitly; SVC and KNN are used with their defaults as before.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "XGB": XGBClassifier(random_state=RANDOM_STATE)
}

# Train and evaluate models: fit each on the scaled training split and report
# accuracy plus the per-class precision/recall/F1 on the held-out split.
for model_name, model in models.items():
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = round(metrics.accuracy_score(y_test, y_pred),5)
    # output
    print(model_name + ':')
    print("---------------")
    print("Accuracy:", accuracy)
    print("\nClassification Report:\n", metrics.classification_report(y_test, y_pred))
    print("\n")

Decision Tree:
---------------
Accuracy: 0.95508

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        63
           1       0.91      0.82      0.86        62
           2       0.99      0.99      0.99       136
           3       0.92      0.96      0.94       106
           4       0.93      0.96      0.95        56

    accuracy                           0.96       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.96      0.96      0.95       423



Random Forest:
---------------
Accuracy: 0.94326

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        63
           1       0.87      0.76      0.81        62
           2       1.00      0.98      0.99       136
           3       0.89      0.96      0.92       106
           4       0.93      0.96      0.95        56

    accuracy                       