In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import warnings

In [2]:
df = pd.read_csv('data/cleaned_dataset.csv')

In [3]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight,24.386526
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight,24.238227
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight,23.765432
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I,26.851852
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II,28.342381


#### Preparing X and Y variables

In [5]:
X=df.drop(columns=['NObeyesdad'],axis=1)

In [6]:
X

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,BMI
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,24.386526
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,24.238227
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,23.765432
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,26.851852
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,28.342381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,44.901475
2083,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,43.741923
2084,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,43.543817
2085,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,44.071535


In [7]:
y=df['NObeyesdad']

In [8]:
y


0             Normal_Weight
1             Normal_Weight
2             Normal_Weight
3        Overweight_Level_I
4       Overweight_Level_II
               ...         
2082       Obesity_Type_III
2083       Obesity_Type_III
2084       Obesity_Type_III
2085       Obesity_Type_III
2086       Obesity_Type_III
Name: NObeyesdad, Length: 2087, dtype: object

In [9]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
# One-hot Encoding
onehot_features = ['Gender','family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC','MTRANS']
#Ordinal Encoding
ordinal_features=['CAEC','CALC']
# Define the desired category order
categories = [['no', 'Sometimes', 'Frequently', 'Always'], ['no', 'Sometimes', 'Frequently', 'Always']]

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')
ord_transformer=OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, onehot_features),
        ("OrdinalEncoder", ord_transformer, ordinal_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [10]:
num_features

Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'BMI'], dtype='object')

In [11]:
X = preprocessor.fit_transform(X)


In [12]:
X.shape

(2087, 20)

In [13]:
# Get the transformed feature names
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame with the transformed data and feature names
X_transformed_df = pd.DataFrame(X, columns=transformed_feature_names)

# Display the DataFrame with transformed data and feature names
X_transformed_df

Unnamed: 0,OneHotEncoder__Gender_Male,OneHotEncoder__family_history_with_overweight_yes,OneHotEncoder__FAVC_yes,OneHotEncoder__SMOKE_yes,OneHotEncoder__SCC_yes,OneHotEncoder__MTRANS_Bike,OneHotEncoder__MTRANS_Motorbike,OneHotEncoder__MTRANS_Public_Transportation,OneHotEncoder__MTRANS_Walking,OrdinalEncoder__CAEC,OrdinalEncoder__CALC,StandardScaler__Age,StandardScaler__Height,StandardScaler__Weight,StandardScaler__FCVC,StandardScaler__NCP,StandardScaler__CH2O,StandardScaler__FAF,StandardScaler__TUE,StandardScaler__BMI
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,-0.526613,-0.887408,-0.872985,-0.788364,0.390906,-0.007810,-1.186977,0.554211,-0.670475
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,-0.526613,-1.960788,-1.178508,1.082164,0.390906,1.636552,2.328908,-1.090505,-0.688960
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,-0.212507,1.044677,-0.376509,-0.788364,0.390906,-0.007810,1.156947,0.554211,-0.747890
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.415705,1.044677,0.005395,1.082164,0.390906,-0.007810,1.156947,-1.090505,-0.363194
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,-0.369560,0.830001,0.112328,-0.788364,-2.225418,-0.007810,-1.186977,-1.090505,-0.177412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2082,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,-0.530250,0.086470,1.701376,1.082164,0.390906,-0.454848,0.777546,0.400014,1.886538
2083,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,-0.372239,0.492788,1.790528,1.082164,0.390906,0.000626,0.385081,-0.104876,1.742010
2084,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,-0.287258,0.531665,1.788482,1.082164,0.390906,0.081303,0.470422,-0.027545,1.717318
2085,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.001389,0.394745,1.775393,1.082164,0.390906,1.393744,0.148013,-0.126644,1.783093


In [14]:
X_transformed_df.shape

(2087, 20)

In [15]:
# Apply LabelEncoder separately to the target column
label_encoder = LabelEncoder()
y= label_encoder.fit_transform(y)


In [16]:
y

array([1, 1, 1, ..., 4, 4, 4])

In [17]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1669, 20), (418, 20))

#### Create an Evaluate Function to give all metrics after model Training

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')  # Use 'weighted' for multiclass classification
    recall = recall_score(true, predicted, average='weighted')  # Use 'weighted' for multiclass classification
    f1 = f1_score(true, predicted, average='weighted')  # Use 'weighted' for multiclass classification
    confusion_mat = confusion_matrix(true, predicted)
    
    return accuracy, precision, recall, f1, confusion_mat

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "SVC": SVC(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list = []
accuracy_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    accuracy_train, precision_train, recall_train, f1_train, _ = evaluate_model(y_train, y_train_pred)
    accuracy_test, precision_test, recall_test, f1_test, confusion_test = evaluate_model(y_test, y_test_pred)
    
    print(model_name)
    
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(accuracy_train))
    print("- Precision: {:.4f}".format(precision_train))
    print("- Recall: {:.4f}".format(recall_train))
    print("- F1 Score: {:.4f}".format(f1_train))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(accuracy_test))
    print("- Precision: {:.4f}".format(precision_test))
    print("- Recall: {:.4f}".format(recall_test))
    print("- F1 Score: {:.4f}".format(f1_test))
    
    model_list.append(model_name)
    accuracy_list.append(accuracy_test)

    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy: 0.9293
- Precision: 0.9300
- Recall: 0.9293
- F1 Score: 0.9289
----------------------------------
Model performance for Test set
- Accuracy: 0.9043
- Precision: 0.9059
- Recall: 0.9043
- F1 Score: 0.9033


Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9904
- Precision: 0.9906
- Recall: 0.9904
- F1 Score: 0.9904


Decision Tree Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9569
- Precision: 0.9584
- Recall: 0.9569
- F1 Score: 0.9569


K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.9143
- Precision: 0.9169
- Recall: 0.9143
- F1 Score: 0.9103
----------------------------------
Mod



AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.5842
- Precision: 0.4524
- Recall: 0.5842
- F1 Score: 0.4955
----------------------------------
Model performance for Test set
- Accuracy: 0.5478
- Precision: 0.4295
- Recall: 0.5478
- F1 Score: 0.4636




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
model_list

['Logistic Regression',
 'Random Forest Classifier',
 'Decision Tree Classifier',
 'K-Neighbors Classifier',
 'SVC',
 'XGBClassifier',
 'CatBoosting Classifier',
 'AdaBoost Classifier']

In [20]:
accuracy_list

[0.9043062200956937,
 0.9904306220095693,
 0.9569377990430622,
 0.8708133971291866,
 0.930622009569378,
 0.9760765550239234,
 0.9808612440191388,
 0.5478468899521531]

In [21]:
import pandas as pd

# Assuming model_list and accuracy_list are already defined in your code
data = {'Model': model_list, 'Accuracy': accuracy_list}

# Create DataFrame
accuracy_df = pd.DataFrame(data)

# Sort DataFrame by accuracy in descending order
accuracy_df = accuracy_df.sort_values(by='Accuracy', ascending=False)

# Display the DataFrame
accuracy_df


Unnamed: 0,Model,Accuracy
1,Random Forest Classifier,0.990431
6,CatBoosting Classifier,0.980861
5,XGBClassifier,0.976077
2,Decision Tree Classifier,0.956938
4,SVC,0.930622
0,Logistic Regression,0.904306
3,K-Neighbors Classifier,0.870813
7,AdaBoost Classifier,0.547847
