In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from sklearn.model_selection import GridSearchCV
import pickle

In [None]:
# Quick look at the raw CSV: read it into a DataFrame and display the first five rows.
df = pd.read_csv(filepath_or_buffer="ObesityDataSet_raw_and_data_sinthetic.csv")
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [None]:
class DataHandler:
    """Load a CSV file and split it into a feature frame and a target column.

    Attributes:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the dataset.
        data: The full DataFrame; ``None`` until ``load_data`` has run.
        input_df: Feature columns (everything except the target); ``None`` until
            ``create_input_output`` has run.
        output_df: The target column(s); ``None`` until ``create_input_output`` has run.
    """

    def __init__(self, file_path):
        """Remember where the data lives; nothing is read from disk yet."""
        self.file_path = file_path
        self.data = None
        self.input_df = None
        self.output_df = None

    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.data``."""
        self.data = pd.read_csv(self.file_path)

    def create_input_output(self, target_column):
        """Split ``self.data`` into ``input_df`` (features) and ``output_df`` (target).

        Args:
            target_column: Name of the column to predict.

        Raises:
            KeyError: If ``target_column`` is not a column of the loaded data.
        """
        # Load on demand so that calling this before load_data() does not fail
        # with an opaque "'NoneType' object is not subscriptable" error.
        if self.data is None:
            self.load_data()
        self.output_df = self.data[target_column]
        self.input_df = self.data.drop(columns=target_column)

In [None]:
# Load the dataset and separate the feature columns from the "NObeyesdad" target.
filePath = "ObesityDataSet_raw_and_data_sinthetic.csv"
dataHandler = DataHandler(filePath)
dataHandler.load_data()
dataHandler.create_input_output(target_column="NObeyesdad")
inputDF, outputDF = dataHandler.input_df, dataHandler.output_df

In [None]:
class ModelHandler:
    """Split, clean, train, tune, evaluate and persist a random-forest classifier.

    Typical call order: ``split_data`` -> cleaning helpers
    (``dataConvertToNumeric`` / ``fillingNAWithNumbers``) -> ``train_model`` ->
    ``makePrediction`` -> ``evaluate_model`` / ``createReport``.

    Attributes:
        input_data: Feature data handed to ``train_test_split``.
        output_data: Target data handed to ``train_test_split``.
        model: The current ``RandomForestClassifier`` (replaced by ``createModel``).
        x_train, x_test, y_train, y_test: Set by ``split_data``; ``None`` before.
        y_predict: Predictions for ``x_test``, set by ``makePrediction``; ``None`` before.
    """

    def __init__(self, input_data, output_data):
        """Store the data and create an untrained model with default settings."""
        self.input_data = input_data
        self.output_data = output_data
        # Filled in later by split_data() and makePrediction().
        self.x_train, self.x_test, self.y_train, self.y_test, self.y_predict = [None] * 5
        self.createModel()

    def _resolve_columns(self, columns):
        """Normalise ``columns`` to a list and check each exists in both splits.

        A single (non list-like) label is treated as one column name, so a
        plain string is never iterated character by character.

        Raises:
            KeyError: If any requested column is missing from ``x_train`` or ``x_test``.
        """
        names = list(columns) if pd.api.types.is_list_like(columns) else [columns]
        missing = [
            name for name in names
            if name not in self.x_train.columns or name not in self.x_test.columns
        ]
        if missing:
            raise KeyError(f"Column(s) not found in input data: {missing}")
        return names

    def dataConvertToNumeric(self, columns):
        """Coerce one or more feature columns to numeric in both splits.

        Values that cannot be parsed become ``NaN`` (``errors="coerce"``).

        Args:
            columns: A column name or an iterable of column names.

        Raises:
            KeyError: If a column is missing; nothing is modified in that case.
        """
        # pd.to_numeric only accepts 1-d input, so convert column by column.
        for name in self._resolve_columns(columns):
            self.x_train[name] = pd.to_numeric(self.x_train[name], errors="coerce")
            self.x_test[name] = pd.to_numeric(self.x_test[name], errors="coerce")

    def checkOutlierWithBox(self, column):
        """Show a box plot of ``column`` (training split only)."""
        self.x_train.boxplot(column=column)
        plt.show()

    def createMeanFromColumn(self, column):
        """Return the mean of ``column`` in the training split (``NaN`` values are skipped)."""
        return self.x_train[column].mean()

    def createModel(self, criteria="gini", maxdepth=6, random_state=None):
        """Replace ``self.model`` with a fresh, unfitted random forest.

        Args:
            criteria: Split criterion passed as ``criterion``.
            maxdepth: Maximum tree depth passed as ``max_depth``.
            random_state: Optional seed for reproducible forests; ``None``
                keeps scikit-learn's default (unseeded) behaviour.
        """
        self.model = RandomForestClassifier(
            criterion=criteria, max_depth=maxdepth, random_state=random_state
        )

    def fillingNAWithNumbers(self, columns, number):
        """Replace missing values in the given feature columns with ``number``.

        Args:
            columns: A column name or an iterable of column names.
            number: Replacement value, applied to both the train and test split.

        Raises:
            KeyError: If a column is missing; nothing is modified in that case.
        """
        names = self._resolve_columns(columns)
        # Assign the result back: `df[cols].fillna(..., inplace=True)` fills a
        # temporary object and can leave the frame itself untouched.
        self.x_train[names] = self.x_train[names].fillna(number)
        self.x_test[names] = self.x_test[names].fillna(number)

    def makePrediction(self):
        """Predict labels for ``x_test`` with the current model and store them in ``y_predict``."""
        self.y_predict = self.model.predict(self.x_test)

    def createReport(self, target_names=None):
        """Print scikit-learn's classification report for the current predictions.

        Args:
            target_names: Optional display names, one per class. When omitted,
                classes are numbered "1", "2", ... to match however many
                distinct labels occur in ``y_test`` and ``y_predict``.
        """
        if target_names is None:
            # A fixed-length name list makes classification_report raise when
            # the number of classes differs, so size it from the data.
            class_count = len(set(self.y_test) | set(self.y_predict))
            target_names = [str(index) for index in range(1, class_count + 1)]
        print("\nClassification Report\n")
        print(classification_report(self.y_test, self.y_predict, target_names=target_names))

    def split_data(self, test_size = 0.2, random_state=42):
        """Split the stored data into train and test parts.

        Args:
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed forwarded to ``train_test_split``.
        """
        x_train, x_test, self.y_train, self.y_test = train_test_split(
            self.input_data, self.output_data, test_size=test_size, random_state=random_state
        )
        # Keep independent copies: the cleaning helpers write to these frames,
        # and writing to a slice of input_data triggers SettingWithCopyWarning.
        self.x_train = x_train.copy()
        self.x_test = x_test.copy()

    def train_model(self):
        """Fit the current model on the training split."""
        self.model.fit(self.x_train, self.y_train)

    def evaluate_model(self):
        """Return the accuracy of ``y_predict`` against ``y_test``.

        ``makePrediction`` must have been called with the model being evaluated;
        otherwise the score reflects older predictions.
        """
        return accuracy_score(self.y_test, self.y_predict)

    def tuningParameter(self):
        """Grid-search ``criterion`` and ``max_depth`` with 5-fold CV on the training split.

        Prints the best parameters and their cross-validated accuracy, then
        replaces ``self.model`` with a new *unfitted* forest using those
        parameters; call ``train_model`` again afterwards.
        """
        parameters = {
            "criterion": ["gini", "entropy", "log_loss"],
            "max_depth": [2, 4, 6, 8]
        }
        # Reuse whatever seed the current model was created with (None by default).
        seed = self.model.random_state
        search = GridSearchCV(
            RandomForestClassifier(random_state=seed),
            scoring="accuracy",
            param_grid=parameters,
            cv=5,
        )
        search.fit(self.x_train, self.y_train)
        print("Tuned Hyperparameters ", search.best_params_)
        print("Accuracy: ", search.best_score_)

        self.createModel(
            criteria=search.best_params_["criterion"],
            maxdepth=search.best_params_["max_depth"],
            random_state=seed,
        )

    def save_model_to_file(self, filename):
        """Pickle the current model to ``filename`` (overwrites an existing file)."""
        with open(filename, "wb") as file:
            pickle.dump(self.model, file)

In [None]:
modelHandler = ModelHandler(inputDF, outputDF)
modelHandler.split_data()

# Clean the "Age" feature. "NObeyesdad" is the target column and was removed
# from the features by create_input_output, so looking it up in the feature
# frames raises KeyError.
modelHandler.dataConvertToNumeric("Age")
modelHandler.checkOutlierWithBox("Age")
# The mean comes from the training split only and is then used to fill both splits.
ageReplaceNA = modelHandler.createMeanFromColumn("Age")
modelHandler.fillingNAWithNumbers("Age", ageReplaceNA)

KeyError: 'NObeyesdad'

In [None]:
# NOTE(review): the data preview shows string-valued feature columns (e.g. Gender,
# CAEC, MTRANS); these presumably need encoding before fitting - confirm.
print("Before Tuning Parameter")
modelHandler.train_model()
modelHandler.makePrediction()
print("Model Accuracy: ", modelHandler.evaluate_model())
modelHandler.createReport()

print("After Tuning Parameter")
modelHandler.tuningParameter()
modelHandler.train_model()

# Predict with the re-trained model *before* scoring; otherwise the accuracy
# below is computed from the pre-tuning predictions.
modelHandler.makePrediction()
print("Model Accuracy: ", modelHandler.evaluate_model())
modelHandler.createReport()
modelHandler.save_model_to_file("trained_model.pkl")

Before Tuning Parameter
Model Accuracy:  0.9864864864864865

Classification Report

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        31
           2       0.90      1.00      0.95         9
           3       1.00      1.00      1.00        13
           4       1.00      0.88      0.93         8
           5       1.00      1.00      1.00        10
           6       1.00      1.00      1.00         3

    accuracy                           0.99        74
   macro avg       0.98      0.98      0.98        74
weighted avg       0.99      0.99      0.99        74

After Tuning Parameter
Tuned Hyperparameters  {'criterion': 'entropy', 'max_depth': 6}
Accuracy:  0.9760958503798948
Model Accuracy:  0.9864864864864865

Classification Report

              precision    recall  f1-score   support

           1       1.00      1.00      1.00        31
           2       0.90      1.00      0.95         9
           3       1.00      1.00