In [292]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [293]:
df = pd.read_csv('synthetic_health_lifestyle_dataset.csv')

In [294]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1000 non-null   int64  
 1   Age                  998 non-null    float64
 2   Gender               999 non-null    object 
 3   Height_cm            995 non-null    float64
 4   Weight_kg            1000 non-null   float64
 5   BMI                  996 non-null    float64
 6   Smoker               1000 non-null   object 
 7   Diet_Quality         998 non-null    object 
 8   Alcohol_Consumption  749 non-null    object 
 9   Chronic_Disease      996 non-null    object 
 10  Stress_Level         1000 non-null   int64  
 11  Sleep_Hours          999 non-null    float64
dtypes: float64(5), int64(2), object(5)
memory usage: 93.9+ KB


In [295]:
df['Chronic_Disease'] = df['Chronic_Disease'].map({'No': 0, 'Yes': 1})

In [296]:
class DataPreprocessing:
    def __init__(self, df):
        self.df = df.copy()
        self.encoder = LabelEncoder()
        self.scaler = MinMaxScaler()
    
    def fillMissingValues(self):
        for col in self.df.columns:
            if self.df[col].isnull().any():
                if self.df[col].dtype == 'object':
                    self.df[col].fillna(self.df[col].mode()[0], inplace=True)
                else:
                    self.df[col].fillna(self.df[col].mean(), inplace=True)
        return self
    
    def encode(self):
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                if self.df[col].nunique() <= 5:
                    dummies = pd.get_dummies(self.df[col], prefix=col, dtype=int)
                    self.df = pd.concat([self.df.drop(columns=[col]), dummies], axis=1)
                else:
                    self.df[col] = self.encoder.fit_transform(self.df[col])
        return self
    
    def scale(self):
        num_col = self.df.select_dtypes(include=['float64', 'int64']).columns.drop('Chronic_Disease')
        self.df[num_col] = self.scaler.fit_transform(self.df[num_col])
        return self
    
    def dataset(self):
        return self.df

In [297]:
preprocessing = DataPreprocessing(df)
df = preprocessing.fillMissingValues().encode().scale().dataset()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mode()[0], inplace=True)


In [298]:
df.head(10)

Unnamed: 0,ID,Age,Height_cm,Weight_kg,BMI,Chronic_Disease,Stress_Level,Sleep_Hours,Gender_Female,Gender_Male,Gender_Other,Smoker_No,Smoker_Yes,Diet_Quality_Average,Diet_Quality_Excellent,Diet_Quality_Good,Diet_Quality_Poor,Alcohol_Consumption_High,Alcohol_Consumption_Low,Alcohol_Consumption_Moderate
0,0.0,0.745098,0.718462,0.093049,0.05,0.0,0.888889,0.673684,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.001001,1.0,0.590769,0.467489,0.3725,0.0,0.111111,0.4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.002002,0.54902,0.433846,0.44843,0.4375,0.0,0.222222,0.284211,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.003003,0.27451,0.610769,0.53139,0.4125,0.0,0.888889,0.473684,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.004004,0.823529,0.423077,0.352018,0.3575,1.0,0.555556,0.421053,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,0.005005,0.137255,0.566154,0.369955,0.305,0.0,0.0,0.389474,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6,0.006006,0.392157,0.332308,0.661435,0.7,1.0,0.111111,0.389474,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7,0.007007,0.505992,0.529231,0.403587,0.35,0.0,0.666667,0.494737,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8,0.008008,0.352941,0.707692,0.137892,0.085,0.0,0.555556,0.442105,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.009009,0.431373,0.478462,0.542601,0.4925,1.0,0.0,0.368421,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Feature Transforming

In [299]:
import numpy as np

In [300]:
skew_check = df.select_dtypes(include=['float64'])

In [301]:
skew_check.skew()

ID                              0.000000
Age                            -0.040218
Height_cm                       0.034965
Weight_kg                       0.012734
BMI                             0.307322
Chronic_Disease                 1.514746
Stress_Level                   -0.065347
Sleep_Hours                     0.065550
Gender_Female                   0.552712
Gender_Male                     0.704985
Gender_Other                    0.879394
Smoker_No                      -0.975908
Smoker_Yes                      0.975908
Diet_Quality_Average            1.150281
Diet_Quality_Excellent          1.298599
Diet_Quality_Good               1.049173
Diet_Quality_Poor               1.138043
Alcohol_Consumption_High       -0.028045
Alcohol_Consumption_Low         1.119865
Alcohol_Consumption_Moderate    1.238801
dtype: float64

In [302]:
skewness = skew_check.skew()

In [303]:
log_transformation = skewness[(skewness>=0.5)].index.tolist()

In [304]:
log_transformation

['Chronic_Disease',
 'Gender_Female',
 'Gender_Male',
 'Gender_Other',
 'Smoker_Yes',
 'Diet_Quality_Average',
 'Diet_Quality_Excellent',
 'Diet_Quality_Good',
 'Diet_Quality_Poor',
 'Alcohol_Consumption_Low',
 'Alcohol_Consumption_Moderate']

In [305]:
for col in log_transformation:
    df[col+'_log']=np.log1p(df[col])

# Import algorithms

In [306]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Import metrics

In [307]:
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error

# Train/Test split

In [308]:
from sklearn.model_selection import train_test_split

x = df.drop('Chronic_Disease', axis=1)
y = df['Chronic_Disease']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Logistic Regression

In [309]:
lr = LogisticRegression()

lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

In [310]:
lr_score = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

In [311]:
lr_score

1.0

In [312]:
print(lr_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       165
           1       1.00      1.00      1.00        35

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [313]:
lr_mae

0.0

# Decision Tree

In [314]:
dt = DecisionTreeClassifier(random_state=42)

dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

In [315]:
dt_score = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [316]:
dt_score

1.0

In [317]:
print(dt_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       165
           1       1.00      1.00      1.00        35

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [318]:
dt_mae

0.0

# Random Forest

In [319]:
rf = RandomForestClassifier(random_state=42)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

In [320]:
rf_score = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [321]:
rf_score

1.0

In [322]:
print(rf_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       165
           1       1.00      1.00      1.00        35

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [323]:
rf_mae

0.0

# SVM

In [324]:
svc = SVC(kernel='linear', C=0.5)

svc.fit(x_train, y_train)

y_pred = svc.predict(x_test)

In [325]:
svc_score = accuracy_score(y_test, y_pred)
svc_report = classification_report(y_test, y_pred)
svc_mae = mean_absolute_error(y_test, y_pred)

In [326]:
svc_score

1.0

In [327]:
print(svc_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       165
           1       1.00      1.00      1.00        35

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [328]:
svc_mae

0.0

# KNN

In [329]:
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

In [330]:
knn_score = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [331]:
knn_score

0.92

In [332]:
print(knn_report)

              precision    recall  f1-score   support

           0       0.92      0.99      0.95       165
           1       0.95      0.57      0.71        35

    accuracy                           0.92       200
   macro avg       0.93      0.78      0.83       200
weighted avg       0.92      0.92      0.91       200



In [333]:
knn_mae

0.08

# Tabulate

In [334]:
from tabulate import tabulate

In [335]:
result = [
    ['Logistic Regression', lr_score, lr_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['SVM', svc_score, svc_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'Accuracy_score', 'mean absolute error']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

In [336]:
print(table)

+---------------------+------------------+-----------------------+
| Algorithm           |   Accuracy_score |   mean absolute error |
| Logistic Regression |             1.00 |                  0.00 |
+---------------------+------------------+-----------------------+
| Decision Tree       |             1.00 |                  0.00 |
+---------------------+------------------+-----------------------+
| Random Forest       |             1.00 |                  0.00 |
+---------------------+------------------+-----------------------+
| SVM                 |             1.00 |                  0.00 |
+---------------------+------------------+-----------------------+
| KNN                 |             0.92 |                  0.08 |
+---------------------+------------------+-----------------------+


# Joblib

In [337]:
from joblib import dump
import os

In [338]:
def JoblibSave(algo):
    os.makedirs("model", exist_ok=True)
    algorithm = str(algo).split("(")[0]
    return dump(algo, f'model/{algorithm}_prediction.joblib')

In [339]:
JoblibSave(lr)
JoblibSave(dt)
JoblibSave(rf)
JoblibSave(svc)
JoblibSave(knn)

['model/KNeighborsClassifier_prediction.joblib']