In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')
df = data.copy()

In [3]:
df = df.drop_duplicates()
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
# train test split 
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [5]:
train_df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
280,Male,21.000000,1.750000,62.000000,no,yes,3.000000,4.000000,Frequently,yes,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
1197,Male,33.151905,1.685127,83.986895,yes,yes,2.000000,2.473911,Sometimes,no,2.452789,no,0.932792,0.000000,no,Automobile,Overweight_Level_II
102,Female,21.000000,1.550000,57.000000,no,yes,2.000000,4.000000,Frequently,no,2.000000,yes,2.000000,0.000000,Sometimes,Automobile,Normal_Weight
2050,Female,20.741442,1.694439,122.813033,yes,yes,3.000000,3.000000,Sometimes,no,1.409444,no,0.933595,0.840393,Sometimes,Public_Transportation,Obesity_Type_III
852,Female,21.016623,1.755427,78.300084,yes,yes,3.000000,1.000000,Sometimes,no,2.000000,no,2.877473,0.000000,Sometimes,Public_Transportation,Overweight_Level_I
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,Male,23.083621,1.848553,121.421121,yes,yes,3.000000,2.567567,Sometimes,no,2.011023,no,0.916478,0.000000,Sometimes,Public_Transportation,Obesity_Type_II
1119,Female,35.456326,1.651812,79.437921,yes,yes,2.156065,2.909117,Sometimes,no,1.221281,no,0.503279,1.796136,no,Automobile,Overweight_Level_II
1154,Male,22.882558,1.793451,89.909259,yes,yes,1.899116,2.375026,Sometimes,no,1.398540,no,0.000000,1.365793,Sometimes,Public_Transportation,Overweight_Level_II
1318,Male,23.237302,1.761008,97.829344,yes,yes,2.000000,3.000000,Sometimes,no,2.988771,no,2.429923,1.978043,no,Public_Transportation,Obesity_Type_I


In [6]:
categorical_columns = train_df.select_dtypes(include='object').columns
numerical_columns = train_df.select_dtypes(exclude='object').columns

In [7]:
from sklearn.impute import SimpleImputer #Handling missing values
from sklearn.preprocessing import StandardScaler #Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [8]:
encoder = LabelEncoder()
for cat_cols in train_df[categorical_columns]:
   
    train_df[cat_cols] = encoder.fit_transform(train_df[cat_cols])

In [9]:
encoder = LabelEncoder()
for cat_cols in test_df[categorical_columns]:
    test_df[cat_cols] = encoder.fit_transform(test_df[cat_cols])

In [10]:
X_train = train_df.drop(labels=['NObeyesdad'], axis=1)
y_train = train_df['NObeyesdad']

X_test = test_df.drop(labels=['NObeyesdad'], axis=1)
y_test = test_df['NObeyesdad']

In [11]:
### training the model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [13]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')

    return accuracy, precision, recall, f1


In [14]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "support vector machine": SVC()
}

In [15]:
model_list = list()
accuracy_list = list()

In [16]:
X_train

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
280,1,21.000000,1.750000,62.000000,0,1,3.000000,4.000000,1,1,2.000000,0,0.000000,0.000000,2,3
1197,1,33.151905,1.685127,83.986895,1,1,2.000000,2.473911,2,0,2.452789,0,0.932792,0.000000,3,0
102,0,21.000000,1.550000,57.000000,0,1,2.000000,4.000000,1,0,2.000000,1,2.000000,0.000000,2,0
2050,0,20.741442,1.694439,122.813033,1,1,3.000000,3.000000,2,0,1.409444,0,0.933595,0.840393,2,3
852,0,21.016623,1.755427,78.300084,1,1,3.000000,1.000000,2,0,2.000000,0,2.877473,0.000000,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,1,23.083621,1.848553,121.421121,1,1,3.000000,2.567567,2,0,2.011023,0,0.916478,0.000000,2,3
1119,0,35.456326,1.651812,79.437921,1,1,2.156065,2.909117,2,0,1.221281,0,0.503279,1.796136,3,0
1154,1,22.882558,1.793451,89.909259,1,1,1.899116,2.375026,2,0,1.398540,0,0.000000,1.365793,2,3
1318,1,23.237302,1.761008,97.829344,1,1,2.000000,3.000000,2,0,2.988771,0,2.429923,1.978043,3,3


In [17]:
print(y_train)

280     1
1197    6
102     1
2050    4
852     5
       ..
1662    3
1119    6
1154    6
1318    2
881     5
Name: NObeyesdad, Length: 1460, dtype: int32


In [18]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    accuracy, precision, recall, f1 = evaluate_model(y_train, y_train_pred)
    accuracy_test, precision_test, recall_test, f1_test = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(model)

    print('Model performance for Training set')
    print("- accuracy: {:.4f}".format(accuracy))
    print("- precision: {:.4f}".format(precision))
    print("- recall: {:.4f}".format(recall))
    print("- f1: {:.4f}".format(f1))

    # print('----------------------------------')
    
    print('Model performance for Testing set')
    print("- accuracy: {:.4f}".format(accuracy_test))
    print("- precision: {:.4f}".format(precision_test))
    print("- recall: {:.4f}".format(recall_test))
    print("- f1: {:.4f}".format(f1_test))

    accuracy_list.append(f1_test)
    
    print('='*35)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Logistic Regression
Model performance for Training set
- accuracy: 0.6733
- precision: 0.6564
- recall: 0.6733
- f1: 0.6618
Model performance for Testing set
- accuracy: 0.6077
- precision: 0.6283
- recall: 0.6077
- f1: 0.5902


Naive Bayes
Model performance for Training set
- accuracy: 0.5616
- precision: 0.6244
- recall: 0.5616
- f1: 0.5144
Model performance for Testing set
- accuracy: 0.4498
- precision: 0.4945
- recall: 0.4498
- f1: 0.3921


K-Neighbors Classifier
Model performance for Training set
- accuracy: 0.9123
- precision: 0.9124
- recall: 0.9123
- f1: 0.9104
Model performance for Testing set
- accuracy: 0.8549
- precision: 0.8578
- recall: 0.8549
- f1: 0.8470


Decision Tree
Model performance for Training set
- accuracy: 1.0000
- precision: 1.0000
- recall: 1.0000
- f1: 1.0000
Model performance for Testing set
- accuracy: 0.9346
- precision: 0.9359
- recall: 0.9346
- f1: 0.9344


Random Forest Classifier
Model performance for Training set
- accuracy: 1.0000
- precision: 1.0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


support vector machine
Model performance for Training set
- accuracy: 0.5705
- precision: 0.5811
- recall: 0.5705
- f1: 0.5650
Model performance for Testing set
- accuracy: 0.5311
- precision: 0.5572
- recall: 0.5311
- f1: 0.5270


