## Field details

1.  Gender - biological sex (Male or Female)
2.  Age - in years
3.  Height - in meters
4.  Weight - in kilograms
5.  family_history_with_overweight - family history of overweight (yes or no)
6.  FAVC - Frequent Consumption of High-Calorie Food (yes or no)
7.  FCVC - Frequency of Consumption of Vegetables
8.  NCP - Number of Main Meals
9.  CAEC - Consumption of Food between Meals
10. SMOKE - whether the person smokes (yes or no)
11. CH2O - Consumption of Water Daily
12. SCC - Calorie Consumption Monitoring (yes or no)
13. FAF - Physical Activity Frequency
14. TUE - Time Using Technology Devices (hours)
15. CALC - Consumption of Alcohol
16. MTRANS - Mode of Transportation Used Daily
17. NObeyesdad - Target feature (obesity level)

### 1. Import required packages

In [157]:
!pip install catboost



In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from dataclasses import dataclass
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import joblib

### 2. Read the dataframe

In [159]:
# Load the training data from train.csv in the current working directory.
file_path = os.path.join(os.getcwd(), 'train.csv')
df = pd.read_csv(filepath_or_buffer=file_path)

## 3. EDA
### 3.1 Handling missing values
### 3.2 Handling outliers
### 3.3 Handling duplicates

In [160]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


### 3.0.0 Identify the column types

In [161]:
# Drop the row identifier column. Reassigning (instead of inplace=True) and
# passing errors='ignore' keeps this cell re-runnable once 'id' is gone.
df = df.drop(columns=['id'], errors='ignore')
columns = df.columns

n_rows = df.shape[0]
unique_counts = df.nunique()  # distinct non-null values per column, computed once

# A column is treated as categorical when it is object-typed or has few
# distinct values relative to the number of rows.
cat_columns = [
    col for col in columns
    if df[col].dtype == 'O' or unique_counts[col] < n_rows // unique_counts[col]
]
# Numerical columns are the complement, so every column lands in exactly one
# list (a separate strict '>' test would leave a column with
# nunique == n_rows // nunique in neither list).
num_columns = [col for col in columns if col not in cat_columns]

print('Number of columns present in the dataframe: ', len(columns))
print('Number of catagorical columns present in the dataframe: ', len(cat_columns))
print('Number of numarical columns present in the dataframe: ', len(num_columns))


Number of columns present in the dataframe:  17
Number of catagorical columns present in the dataframe:  9
Number of numarical columns present in the dataframe:  8


### 3.0.1 List the number of unique values present in each column

In [162]:
# Report how many distinct values each column holds.
for col, n_unique in df[columns].nunique().items():
    print(f"{col} : {n_unique}")

Gender : 2
Age : 1703
Height : 1833
Weight : 1979
family_history_with_overweight : 2
FAVC : 2
FCVC : 934
NCP : 689
CAEC : 4
SMOKE : 2
CH2O : 1506
SCC : 2
FAF : 1360
TUE : 1297
CALC : 3
MTRANS : 5
NObeyesdad : 7


### 3.0.2 List the unique values in categorical columns

In [163]:
# Show the distinct labels of every categorical column.
for col in cat_columns:
    distinct_values = df[col].unique()
    print(f"{col} : {distinct_values}")

Gender : ['Male' 'Female']
family_history_with_overweight : ['yes' 'no']
FAVC : ['yes' 'no']
CAEC : ['Sometimes' 'Frequently' 'no' 'Always']
SMOKE : ['no' 'yes']
SCC : ['no' 'yes']
CALC : ['Sometimes' 'no' 'Frequently']
MTRANS : ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']
NObeyesdad : ['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']


In [164]:
# Rename 'Bike' and merge the motorised transport modes into a single bucket.
mtrans_mapping = {
    'Bike': 'Bicycle',
    'Motorbike': '2 or 4 wheeler',
    'Public_Transportation': '2 or 4 wheeler',
    'Automobile': '2 or 4 wheeler',
}
# Assign the result back to the column: an in-place replace on df['MTRANS']
# acts on an intermediate object and is not guaranteed to update df (it has
# no effect under pandas Copy-on-Write).
df['MTRANS'] = df['MTRANS'].replace(mtrans_mapping)
print(f"unique values in MTRANS is {df['MTRANS'].unique()}")

unique values in MTRANS is ['2 or 4 wheeler' 'Walking' 'Bicycle']


### 3.1.0 List the number of missing values in the dataframe

In [165]:
# Number of missing entries per column.
df.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

### 3.2.0 Check the outliers in the dataframe

In [166]:
# Summary statistics of the numeric columns (quartiles spelled out explicitly).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


### 3.3.0 Check the duplicates

In [167]:
# Count rows that exactly repeat an earlier row.
n_duplicates = df.duplicated().sum()
print(f"number duplicate records in the dataframe is {n_duplicates}")

number duplicate records in the dataframe is 5


In [168]:
# Display the rows flagged as repeats of an earlier row.
print("duplicate recordes")
duplicate_mask = df.duplicated()
df.loc[duplicate_mask]

duplicate recordes


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
8165,Female,19.0,1.52,42.0,no,yes,3.0,1.0,Sometimes,no,1.0,no,0.0,0.0,Sometimes,2 or 4 wheeler,Insufficient_Weight
9127,Male,23.0,1.7,75.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,Sometimes,2 or 4 wheeler,Overweight_Level_I
12780,Female,40.0,1.56,80.0,yes,yes,2.0,3.0,Sometimes,no,1.0,no,0.0,0.0,no,2 or 4 wheeler,Obesity_Type_I
14194,Male,19.0,1.82,72.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,2 or 4 wheeler,Normal_Weight
17376,Female,40.0,1.56,80.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Sometimes,2 or 4 wheeler,Obesity_Type_I


### 3.3.1 Delete the duplicate records

In [169]:
# Keep the first occurrence of every repeated row and discard the rest.
df = df.drop_duplicates(keep='first')

## 4. Feature engineering
### 4.1 Create preprocessor pipeline

In [170]:
from sklearn.base import BaseEstimator, TransformerMixin

class MyLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode every column of a DataFrame with its own ``LabelEncoder``.

    ``LabelEncoder`` handles a single 1-D array, so this transformer keeps one
    fitted encoder per column in ``self.encoders`` (column name -> encoder),
    which lets it be used as a step inside a ``Pipeline``.
    """

    def __init__(self):
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per column of ``X``.

        Args:
            X: DataFrame whose columns are to be encoded.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        # Rebuild the mapping from scratch: refitting on a frame with a
        # different set of columns must not leave stale encoders behind,
        # otherwise transform() would look up columns that no longer exist.
        self.encoders = {
            column: LabelEncoder().fit(X[column]) for column in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column replaced by integer codes.

        Args:
            X: DataFrame containing (at least) the columns seen during ``fit``.

        Returns:
            A new DataFrame; ``X`` itself is not modified.
        """
        X_encoded = X.copy()
        for column, encoder in self.encoders.items():
            X_encoded[column] = encoder.transform(X[column])
        return X_encoded

In [171]:
# Categorical features: integer-encode each column, then standardise the codes.
cat_pipeline = Pipeline(
    steps=[
        ('Encoder', MyLabelEncoder()),
        ('Scaler', StandardScaler())
    ]
)

# Exclude the target by name rather than by position, so the selection stays
# correct even if the target is not the last entry of cat_columns.
target_column = 'NObeyesdad'
cat_feature_columns = [col for col in cat_columns if col != target_column]

preprocessor_pipeline = ColumnTransformer([
    ('cat_pipeline', cat_pipeline, cat_feature_columns),
    ('Scaler', StandardScaler(), num_columns)
], remainder='passthrough')

### split the dataset using train_test_split

In [172]:
# The last column is the target; everything before it is a feature.
features, target = df.iloc[:, :-1], df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

### encode the target

In [173]:
# Learn the class -> integer mapping from the training labels only, then apply
# that same mapping to both splits.
encoder = LabelEncoder().fit(y_train)
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [174]:
# Peek at the first five training rows before preprocessing.
x_train.head(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
10901,Male,23.0,1.65,62.0,no,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Sometimes,2 or 4 wheeler
15207,Male,21.0,1.65,70.0,no,yes,2.0,1.0,no,no,2.0,no,1.0,0.0,Sometimes,2 or 4 wheeler
12022,Female,25.919571,1.610225,102.249831,yes,yes,3.0,3.0,Sometimes,no,1.120213,no,1.999836,0.813235,Sometimes,2 or 4 wheeler
13780,Male,21.125836,1.65,80.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,2.70825,1.0,no,2 or 4 wheeler
14306,Female,18.0,1.716545,52.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,0.827502,1.817146,no,2 or 4 wheeler


### Fit the preprocessor pipeline on the training data so that it can be saved

In [175]:
# Fit on the training split only, so the encoders and scalers learn nothing from x_test.
preprocessor_pipeline.fit(x_train)

### save the preprocessor_pipeline

In [176]:
def save_object(file_path, object_to_save):
    """Serialise ``object_to_save`` to ``file_path`` with joblib.

    When ``file_path`` is a path (string or ``os.PathLike``), missing parent
    directories are created first. Errors raised while creating directories
    or writing the file propagate unchanged to the caller.

    Args:
        file_path: Destination handed to ``joblib.dump``.
        object_to_save: Any object joblib can serialise.
    """
    if isinstance(file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(file_path)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(object_to_save, file_path)

In [177]:
# Persist the fitted preprocessor as preprocessor.pkl in the current working directory.
preprocessor_path = os.path.join(os.getcwd(), 'preprocessor.pkl')
save_object(file_path=preprocessor_path, object_to_save=preprocessor_pipeline)

### Reload the preprocessor

In [178]:
# Load the preprocessor back from the file written above.
preprocessor = joblib.load(preprocessor_path)

In [179]:
# Apply the fitted preprocessor to both splits. The raw frames are overwritten
# by the transformed output, so this cell is meant to run once per kernel session.
x_train = preprocessor.transform(x_train)
x_test = preprocessor.transform(x_test)

In [183]:
def evaluate_model(x_train, x_test, y_train, y_test, models, params):
    """Tune, fit and score every candidate model.

    For each entry of ``models`` a 3-fold ``GridSearchCV`` is run over the
    matching grid in ``params``. The estimator object itself is then updated
    with the best parameters and fitted on the full training data, so the
    objects held in ``models`` are fitted when this function returns.

    Args:
        x_train: Training features.
        x_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        models: Mapping of model name -> estimator instance.
        params: Mapping of model name -> parameter grid. A model without an
            entry is searched over an empty grid, i.e. its current settings.

    Returns:
        dict with the keys 'model', 'train accuracy' and 'test accuracy';
        the three lists are aligned by position and follow the iteration
        order of ``models``.
    """
    report = {
        'model': [],
        'train accuracy': [],
        'test accuracy': []
    }

    for name, model in models.items():
        para = params.get(name, {})

        # refit=False: only the winning parameters are needed from the search,
        # because the caller's estimator object is fitted explicitly below.
        gs = GridSearchCV(model, para, cv=3, refit=False)
        gs.fit(x_train, y_train)

        model.set_params(**gs.best_params_)
        model.fit(x_train, y_train)

        # make predictions and calculate accuracy on both splits
        train_accuracy = accuracy_score(y_train, model.predict(x_train))
        test_accuracy = accuracy_score(y_test, model.predict(x_test))

        # append the accuracies to the report
        report['model'].append(name)
        report['train accuracy'].append(train_accuracy)
        report['test accuracy'].append(test_accuracy)

    return report


In [181]:
@dataclass
class ModelTrainerConfig:
    """Settings for ``ModelTrainer``."""

    # Where the best model is written. The annotation is what makes @dataclass
    # register this as a field, so it can be overridden via the constructor.
    trained_model_file_path: str = os.path.join(os.getcwd(), 'model.pkl')

class ModelTrainer:
    """Trains a fixed set of classifiers, compares them and saves the best one."""

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, x_train, x_test, y_train, y_test):
        """Tune every candidate model, pick the best and persist it.

        The best model is the one with the highest test accuracy; ties are
        broken by train accuracy. It is saved only when its test accuracy is
        at least 0.6.

        Args:
            x_train: Preprocessed training features.
            x_test: Preprocessed test features.
            y_train: Encoded training labels.
            y_test: Encoded test labels.

        Returns:
            The path the best model was saved to, or ``None`` when no model
            reaches the 0.6 test-accuracy threshold.
        """
        models = {
            # max_iter is raised above the library default of 100 to give the
            # solver room to converge (the default run logged
            # "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings).
            'Logistic Regression': LogisticRegression(max_iter=1000),
            'Decision Tree': DecisionTreeClassifier(),
            'Ada Boost': AdaBoostClassifier(),
            'Gradient Boost': GradientBoostingClassifier(),
            'Random Forest': RandomForestClassifier(),
            'KNeighbors': KNeighborsClassifier(),
            'Cat Boost': CatBoostClassifier(),
            'XGBoost': XGBClassifier()
        }

        model_params = {
            "Logistic Regression":{},
            "Decision Tree": {
                'criterion':['gini', 'entropy'],
                'splitter':['best','random'],
                'max_depth':[None, 5, 10, 15, 20],
            },
            "Ada Boost":{
                'learning_rate':[0.1, 0.01, 0.5, 0.001],
                'n_estimators': [50, 100, 200, 500]
            },
            "Gradient Boost":{
                'learning_rate':[0.1, 0.01, 0.05, 0.001],
                'subsample':[0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
                'n_estimators': [50, 100, 200, 500]
            },
            "Random Forest":{
                'n_estimators': [50, 100, 200, 500],
                'max_depth':[None, 5, 10, 15, 20],
                'criterion':['gini', 'entropy']
            },
            'KNeighbors': {
                'n_neighbors': [3, 5, 7, 9],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            },
            "Cat Boost":{
                'depth': [6, 8, 10],
                'learning_rate': [0.01, 0.05, 0.1],
                'iterations': [30, 50, 100]
            },
            "XGBoost":{
                'learning_rate':[0.1, 0.01, 0.05, 0.001],
                'n_estimators': [50, 100, 200, 500]
            },
        }

        # get models evaluation report
        model_report: dict = evaluate_model(
            x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test,
            models=models, params=model_params
        )

        report_df = pd.DataFrame(model_report, index=None)
        print('model report: \n', report_df)

        best_train_record = report_df[report_df['train accuracy'] == report_df['train accuracy'].max()]
        print('best model based on train accuracy: \n', best_train_record)

        best_test_record = report_df[report_df['test accuracy'] == report_df['test accuracy'].max()]
        print('best model based on test accuracy: \n', best_test_record)

        # Rank by test accuracy first and use train accuracy as tie-breaker.
        # Requiring a single model to be the maximum on BOTH metrics would
        # usually select nothing, because the model that fits the training
        # data best is often not the one that generalises best.
        best_model_record = report_df.sort_values(
            by=['test accuracy', 'train accuracy'], ascending=False
        ).head(1)
        print('overall best model: \n', best_model_record)

        best_model_score = best_model_record['test accuracy'].iloc[0]
        best_model_name = best_model_record['model'].iloc[0]
        best_model = models[best_model_name]

        if best_model_score < 0.6:
            print('Best model not found')
            return

        print(f"best model score: {best_model_score}")
        print(f"best model name: {best_model_name}")

        save_object(
            file_path = self.model_trainer_config.trained_model_file_path,
            object_to_save = best_model
        )

        return self.model_trainer_config.trained_model_file_path


In [None]:
# Tune and compare all candidate models on the preprocessed splits; the result
# is the path of the saved best model as returned by the trainer.
model_trainer = ModelTrainer()
best_model_path = model_trainer.initiate_model_trainer(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt