## Fields detials

1.  Gender - sexual representation ( Male or Female)
2.  Age - in years
3.  Height - in meters
4.  Weight - in Kgs
5.  family_history_with_overweight - YES or NO type
6.  FAVC - Frequent consumption of high caloric food
7.  FCVC -  Frequency of consumption of vegetables
8.  NCP - Number of Main Meals
9.  CAEC - Consumption of Food between Meals
10. SMOKE - YES or NO type
11. CH2O - Consumption of Water Daily
12. SCC - Calories Consumption monitoring
13. FAF - Physical Activity Frequency
14. TUE - Time Using Technology (hours)
15. CALC - Consumption of Alcohol
16. MTRANS - Mode of Transportation Used Daily
17. NObeyesdad - Targer Feature

### 1. Import required packages

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from dataclasses import dataclass
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

### 2. Read the dataframe

In [49]:
file_path = os.path.join(os.getcwd(), 'train.csv')
df = pd.read_csv(file_path)

## 3. EDA
### 3.1 Handling missing values
### 3.2 Handling out liers
### 3.3 Handling duplicates

In [50]:
df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


### 3.0.0 Identify the column types

In [51]:
df.drop(columns=['id'], axis=1, inplace=True)
columns = df.columns
cat_columns = [col for col in columns if df[col].dtype == 'O' or df[col].nunique() < (df.shape[0] // df[col].nunique())]
num_columns = [col for col in columns if df[col].dtype != 'O' and df[col].nunique() > (df.shape[0] // df[col].nunique())]
print('Number of columns present in the dataframe: ', len(columns))
print('Number of catagorical columns present in the dataframe: ', len(cat_columns))
print('Number of numarical columns present in the dataframe: ', len(num_columns))


Number of columns present in the dataframe:  17
Number of catagorical columns present in the dataframe:  9
Number of numarical columns present in the dataframe:  8


### 3.0.1 List out the number of unique values present in each columns

In [52]:
for col in columns:
  print(f"{col} : {df[col].nunique()}")

Gender : 2
Age : 1703
Height : 1833
Weight : 1979
family_history_with_overweight : 2
FAVC : 2
FCVC : 934
NCP : 689
CAEC : 4
SMOKE : 2
CH2O : 1506
SCC : 2
FAF : 1360
TUE : 1297
CALC : 3
MTRANS : 5
NObeyesdad : 7


### 3.0.2 List the unique values in catagorical columns

In [53]:
for col in cat_columns:
  print(f"{col} : {df[col].unique()}")

Gender : ['Male' 'Female']
family_history_with_overweight : ['yes' 'no']
FAVC : ['yes' 'no']
CAEC : ['Sometimes' 'Frequently' 'no' 'Always']
SMOKE : ['no' 'yes']
SCC : ['no' 'yes']
CALC : ['Sometimes' 'no' 'Frequently']
MTRANS : ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']
NObeyesdad : ['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']


In [54]:
df['MTRANS'].replace({
    'Bike': 'Bicycle',
    'Motorbike': '2 or 4 wheeler',
    'Public_Transportation': '2 or 4 wheeler',
    'Automobile': '2 or 4 wheeler',
}, inplace=True)
print(f"unique values in MTRANS is {df['MTRANS'].unique()}")

unique values in MTRANS is ['2 or 4 wheeler' 'Walking' 'Bicycle']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MTRANS'].replace({


### 3.1.0 List the number of missing values in the dataframe

In [55]:
df.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

### 3.2.0 Check the outliers in the dataframe

In [56]:
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


### 3.3.0 Check the duplicates

In [57]:
print(f"number duplicate records in the dataframe is {df.duplicated().sum()}")

number duplicate records in the dataframe is 5


In [58]:
print("duplicate recordes")
df[df.duplicated()]

duplicate recordes


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
8165,Female,19.0,1.52,42.0,no,yes,3.0,1.0,Sometimes,no,1.0,no,0.0,0.0,Sometimes,2 or 4 wheeler,Insufficient_Weight
9127,Male,23.0,1.7,75.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,Sometimes,2 or 4 wheeler,Overweight_Level_I
12780,Female,40.0,1.56,80.0,yes,yes,2.0,3.0,Sometimes,no,1.0,no,0.0,0.0,no,2 or 4 wheeler,Obesity_Type_I
14194,Male,19.0,1.82,72.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,2 or 4 wheeler,Normal_Weight
17376,Female,40.0,1.56,80.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Sometimes,2 or 4 wheeler,Obesity_Type_I


### 3.3.1 Delete the duplicate recordes

In [59]:
df = df.drop_duplicates()

## 4. Feature engineering
### 4.1 Create preprocesser pipeline

In [60]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        for column in X.columns:
            self.encoders[column] = LabelEncoder().fit(X[column])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        for column, encoder in self.encoders.items():
            X_encoded[column] = encoder.transform(X[column])
        return X_encoded

In [61]:
cat_pipeline = Pipeline(
    steps=[
        ('Encoder', MyLabelEncoder()),
        ('Scaler', StandardScaler())
    ]
)

preprocessor_pipeline = ColumnTransformer([
    ('cat_pipeline', cat_pipeline, cat_columns[:-1]),
    ('Scaler', StandardScaler(), num_columns)
], remainder='passthrough')

### split the dataset using train_test_split

In [62]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], test_size=0.2, random_state=42)

### encode the target

In [63]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [64]:
x_train[:5]

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
10901,Male,23.0,1.65,62.0,no,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,2 or 4 wheeler
15207,Male,21.0,1.65,70.0,no,yes,2.0,1.0,no,no,2.0,no,1.0,0.0,Sometimes,2 or 4 wheeler
12022,Female,25.919571,1.610225,102.249831,yes,yes,3.0,3.0,Sometimes,no,1.120213,no,1.999836,0.813235,Sometimes,2 or 4 wheeler
13780,Male,21.125836,1.65,80.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,2.70825,1.0,no,2 or 4 wheeler
14306,Female,18.0,1.716545,52.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,0.827502,1.817146,no,2 or 4 wheeler


### fit the data in preprocessor pipeline for save the preprocessor

In [65]:
preprocessor_pipeline.fit(x_train)

### save the preprocessor_pipeline

In [66]:
def save_object(file_path, object_to_save):
  try:
    joblib.dump(object_to_save, file_path)
  except Exception as e:
        raise

In [67]:
preprocessor_path = os.path.join(os.getcwd(), 'preprocessor.pkl')
save_object(
    file_path = preprocessor_path,
    object_to_save = preprocessor_pipeline
)

### Reload the model

In [68]:
preprocessor = joblib.load( preprocessor_path)

In [69]:
# preprocess the x_train and x_test
x_test = preprocessor.transform(x_test)
x_train = preprocessor.transform(x_train)

In [70]:
def evaluate_model(x_train, x_test, y_train, y_test, models, params):
    try:
        report = {
            'model': [],
            'train accuracy': [],
            'test accuracy': []
        }

        for i in range(len(list(models))):
            # initiate model
            model = list(models.values())[i]
            para = params[list(models.keys())[i]]

            gs = GridSearchCV(model, para, cv=3)
            gs.fit(x_train, y_train)

            model.set_params(**gs.best_params_)
            model.fit(x_train, y_train)

            # make predictions
            y_train_pred = model.predict(x_train)
            y_test_pred = model.predict(x_test)

            # calculate accuracy
            train_accuracy = accuracy_score(y_train, y_train_pred)
            test_accuracy = accuracy_score(y_test, y_test_pred)

            # append the accuracies to the report
            report['model'].append(list(models.keys())[i])
            report['train accuracy'].append(train_accuracy)
            report['test accuracy'].append(test_accuracy)

        return report

    except Exception as e:
        raise


In [71]:
@dataclass
class ModelTrainerConfig:
    trained_model_file_path = os.path.join(os.getcwd(), 'model.pkl')

class ModelTrainer:
    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, x_train, x_test, y_train, y_test):
        try:
            models = {
                'Decision Tree': DecisionTreeClassifier(),
                'Ada Boost': AdaBoostClassifier(),
                'Gradient Boost': GradientBoostingClassifier(),
                'Random Forest': RandomForestClassifier(),
                'KNeighbors': KNeighborsClassifier(),
                'Cat Boost': CatBoostClassifier(),
                'XGBoost': XGBClassifier()
            }

            model_params = {
                "Decision Tree": {
                    'criterion':['gini', 'entropy'],
                    'splitter':['best','random'],
                    'max_depth':[None, 5, 10, 15, 20],
                },
                "Ada Boost":{
                    'learning_rate':[0.1, 0.01, 0.5, 0.001],
                    'n_estimators': [50, 100, 200, 500]
                },
                "Gradient Boost":{
                    'learning_rate':[0.1, 0.01, 0.05, 0.001],
                    'subsample':[0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
                    'n_estimators': [50, 100, 200, 500]
                },
                "Random Forest":{
                    'n_estimators': [50, 100, 200, 500],
                    'max_depth':[None, 5, 10, 15, 20],
                    'criterion':['gini', 'entropy']
                },
                'KNeighbors': {
                    'n_neighbors': [3, 5, 7, 9],
                    'weights': ['uniform', 'distance'],
                    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                },
                "Cat Boost":{
                    'depth': [6, 8, 10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "XGBoost":{
                    'learning_rate':[0.1, 0.01, 0.05, 0.001],
                    'n_estimators': [50, 100, 200, 500]
                },
            }

            # get models evaluation report
            model_report: dict = evaluate_model(x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test, models=models, params=model_params)

            report_df = pd.DataFrame(model_report, index=None)
            print('model report: \n', report_df)

            best_train_record = report_df[report_df['train accuracy'] == report_df['train accuracy'].max()]
            print('best model based on train accuracy: \n', best_train_record)

            best_test_record = report_df[report_df['test accuracy'] == report_df['test accuracy'].max()]
            print('best model based on test accuracy: \n', best_test_record)

            report_df['over_all_score'] = (report_df['train accuracy'] + report_df['test accuracy']) / 2
            record_above_nine = report_df[(report_df['train accuracy'] > 0.9) & (report_df['test accuracy'] > 0.9)]
            best_model_record = record_above_nine[record_above_nine['over_all_score'] == record_above_nine['over_all_score'].max()]
            print('overall best model: \n', best_model_record)

            best_model_score = best_model_record['test accuracy'].iloc[0]
            best_model_name = best_model_record['model'].iloc[0]
            best_model = models[best_model_name]

            if best_model_score < 0.6:
                print('Best model not found')
                return

            print(f"best model score: {best_model_score}")
            print(f"best model name: {best_model_name}")

            save_object(
                file_path = self.model_trainer_config.trained_model_file_path,
                object_to_save = best_model
            )

            return self.model_trainer_config.trained_model_file_path

        except Exception as e:
            raise


In [72]:
model_trainer = ModelTrainer()
best_model_path = model_trainer.initiate_model_trainer(
    x_train = x_train,
    x_test = x_test,
    y_train = y_train,
    y_test = y_test
)



0:	learn: 1.9125911	total: 29.7ms	remaining: 861ms
1:	learn: 1.8813967	total: 57.5ms	remaining: 805ms
2:	learn: 1.8506369	total: 82.4ms	remaining: 742ms
3:	learn: 1.8216043	total: 109ms	remaining: 706ms
4:	learn: 1.7952445	total: 136ms	remaining: 678ms
5:	learn: 1.7685596	total: 162ms	remaining: 650ms
6:	learn: 1.7434215	total: 186ms	remaining: 610ms
7:	learn: 1.7196412	total: 212ms	remaining: 582ms
8:	learn: 1.6959484	total: 235ms	remaining: 549ms
9:	learn: 1.6736933	total: 278ms	remaining: 555ms
10:	learn: 1.6512999	total: 323ms	remaining: 559ms
11:	learn: 1.6304395	total: 366ms	remaining: 548ms
12:	learn: 1.6091619	total: 399ms	remaining: 521ms
13:	learn: 1.5877976	total: 426ms	remaining: 487ms
14:	learn: 1.5683373	total: 450ms	remaining: 450ms
15:	learn: 1.5503335	total: 475ms	remaining: 416ms
16:	learn: 1.5315801	total: 500ms	remaining: 382ms
17:	learn: 1.5140609	total: 538ms	remaining: 359ms
18:	learn: 1.4963695	total: 570ms	remaining: 330ms
19:	learn: 1.4793541	total: 594ms	rema

In [73]:
'''
model report: 
             model  train accuracy  test accuracy
0   Decision Tree        0.918945       0.869942
1       Ada Boost        0.592436       0.590800
2  Gradient Boost        0.946766       0.903179
3   Random Forest        0.990847       0.899326
4      KNeighbors        1.000000       0.744942
5       Cat Boost        0.914669       0.903902
6         XGBoost        0.970733       0.903420
best model based on train accuracy: 
         model  train accuracy  test accuracy
4  KNeighbors             1.0       0.744942
best model based on test accuracy: 
         model  train accuracy  test accuracy
4  KNeighbors             1.0       0.744942
'''

'\nmodel report: \n             model  train accuracy  test accuracy\n0   Decision Tree        0.918945       0.869942\n1       Ada Boost        0.592436       0.590800\n2  Gradient Boost        0.946766       0.903179\n3   Random Forest        0.990847       0.899326\n4      KNeighbors        1.000000       0.744942\n5       Cat Boost        0.914669       0.903902\n6         XGBoost        0.970733       0.903420\nbest model based on train accuracy: \n         model  train accuracy  test accuracy\n4  KNeighbors             1.0       0.744942\nbest model based on test accuracy: \n         model  train accuracy  test accuracy\n4  KNeighbors             1.0       0.744942\n'

In [74]:
val = [0.918945 / 0.869942, 0.592436 / 0.590800, 0.946766 / 0.903179, 0.990847 / 0.899326, 1.000000 / 0.744942, 0.914669 / 0.903902, 0.970733 / 0.903420]

In [75]:
val

[1.0563290426258303,
 1.0027691266079892,
 1.048259536592414,
 1.1017662115851206,
 1.3423863871281254,
 1.0119116895415654,
 1.074509087689004]

In [76]:
reportt = {
    'model': ['Decision Tree', 'Ada Boost', 'Gradient Boost', 'Random Forest', 'KNeighbors', 'Cat Boost', 'XGBoost',],
    'train accuracy': [0.918945, 0.592436, 0.946766, 0.990847, 1.000000, 0.914669, 0.970733],
    'test accuracy': [0.869942, 0.590800, 0.903179, 0.899326, 0.744942, 0.903902, 0.903420]
}

dff = pd.DataFrame(reportt)
dff['over_all_score'] = (dff['train accuracy'] + dff['test accuracy']) / 2

dff

Unnamed: 0,model,train accuracy,test accuracy,over_all_score
0,Decision Tree,0.918945,0.869942,0.894443
1,Ada Boost,0.592436,0.5908,0.591618
2,Gradient Boost,0.946766,0.903179,0.924972
3,Random Forest,0.990847,0.899326,0.945086
4,KNeighbors,1.0,0.744942,0.872471
5,Cat Boost,0.914669,0.903902,0.909285
6,XGBoost,0.970733,0.90342,0.937076


In [77]:

above_nine = dff[(dff['train accuracy'] > 0.9) & (dff['test accuracy'] > 0.9)]
best_model_record = above_nine[above_nine['over_all_score'] == above_nine['over_all_score'].max()]
best_model_record

Unnamed: 0,model,train accuracy,test accuracy,over_all_score
6,XGBoost,0.970733,0.90342,0.937076
