In [127]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [128]:
df = pd.read_csv('synthetic_health_lifestyle_dataset.csv')

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1000 non-null   int64  
 1   Age                  998 non-null    float64
 2   Gender               999 non-null    object 
 3   Height_cm            995 non-null    float64
 4   Weight_kg            1000 non-null   float64
 5   BMI                  996 non-null    float64
 6   Smoker               1000 non-null   object 
 7   Diet_Quality         998 non-null    object 
 8   Alcohol_Consumption  749 non-null    object 
 9   Chronic_Disease      996 non-null    object 
 10  Stress_Level         1000 non-null   int64  
 11  Sleep_Hours          999 non-null    float64
dtypes: float64(5), int64(2), object(5)
memory usage: 93.9+ KB


In [130]:
df['Chronic_Disease'] = df['Chronic_Disease'].map({'No': 0, 'Yes': 1})

In [131]:
class DataPreprocessing:
    def __init__(self, df):
        self.df = df.copy()
        self.encoder = LabelEncoder()
        self.scaler = MinMaxScaler()
    
    def fillMissingValues(self):
        for col in self.df.columns:
            if self.df[col].isnull().any():
                if self.df[col].dtype == 'object':
                    self.df[col].fillna(self.df[col].mode()[0], inplace=True)
                else:
                    self.df[col].fillna(self.df[col].mean(), inplace=True)
        return self
    
    def encode(self):
        for col in self.df.columns:
            if self.df[col].dtype == 'object':
                if self.df[col].nunique() <= 5:
                    dummies = pd.get_dummies(self.df[col], prefix=col, dtype=int)
                    self.df = pd.concat([self.df.drop(columns=[col]), dummies], axis=1)
                else:
                    self.df[col] = self.encoder.fit_transform(self.df[col])
        return self
    
    def scale(self):
        num_col = self.df.select_dtypes(include=['float64', 'int64']).columns.drop('Chronic_Disease')
        self.df[num_col] = self.scaler.fit_transform(self.df[num_col])
        return self
    
    def dataset(self):
        return self.df

In [132]:
preprocessing = DataPreprocessing(df)
df = preprocessing.fillMissingValues().encode().scale().dataset()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[col].fillna(self.df[col].mode()[0], inplace=True)


In [133]:
df.head(10)

Unnamed: 0,ID,Age,Height_cm,Weight_kg,BMI,Chronic_Disease,Stress_Level,Sleep_Hours,Gender_Female,Gender_Male,Gender_Other,Smoker_No,Smoker_Yes,Diet_Quality_Average,Diet_Quality_Excellent,Diet_Quality_Good,Diet_Quality_Poor,Alcohol_Consumption_High,Alcohol_Consumption_Low,Alcohol_Consumption_Moderate
0,0.0,0.745098,0.718462,0.093049,0.05,0.0,0.888889,0.673684,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.001001,1.0,0.590769,0.467489,0.3725,0.0,0.111111,0.4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.002002,0.54902,0.433846,0.44843,0.4375,0.0,0.222222,0.284211,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.003003,0.27451,0.610769,0.53139,0.4125,0.0,0.888889,0.473684,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.004004,0.823529,0.423077,0.352018,0.3575,1.0,0.555556,0.421053,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,0.005005,0.137255,0.566154,0.369955,0.305,0.0,0.0,0.389474,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6,0.006006,0.392157,0.332308,0.661435,0.7,1.0,0.111111,0.389474,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7,0.007007,0.505992,0.529231,0.403587,0.35,0.0,0.666667,0.494737,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8,0.008008,0.352941,0.707692,0.137892,0.085,0.0,0.555556,0.442105,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.009009,0.431373,0.478462,0.542601,0.4925,1.0,0.0,0.368421,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


# Import algorithms

In [134]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Import metrics

In [135]:
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error

# Train/Test split

In [136]:
from sklearn.model_selection import train_test_split

x = df.drop('Chronic_Disease', axis=1)
y = df['Chronic_Disease']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Logistic Regression

In [137]:
lr = LogisticRegression()

lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

In [138]:
lr_score = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [139]:
lr_score

0.825

In [140]:
print(lr_report)

              precision    recall  f1-score   support

           0       0.82      1.00      0.90       165
           1       0.00      0.00      0.00        35

    accuracy                           0.82       200
   macro avg       0.41      0.50      0.45       200
weighted avg       0.68      0.82      0.75       200



In [141]:
lr_mae

0.175

# Decision Tree

In [142]:
dt = DecisionTreeClassifier(random_state=42)

dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

In [143]:
dt_score = accuracy_score(y_test, y_pred)
dt_report = classification_report(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

In [144]:
dt_score

0.695

In [145]:
print(dt_report)

              precision    recall  f1-score   support

           0       0.82      0.81      0.81       165
           1       0.16      0.17      0.16        35

    accuracy                           0.69       200
   macro avg       0.49      0.49      0.49       200
weighted avg       0.70      0.69      0.70       200



In [146]:
dt_mae

0.305

# Random Forest

In [147]:
rf = RandomForestClassifier(random_state=42)

rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

In [148]:
rf_score = accuracy_score(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

In [149]:
rf_score

0.82

In [150]:
print(rf_report)

              precision    recall  f1-score   support

           0       0.82      0.99      0.90       165
           1       0.00      0.00      0.00        35

    accuracy                           0.82       200
   macro avg       0.41      0.50      0.45       200
weighted avg       0.68      0.82      0.74       200



In [151]:
rf_mae

0.18

# SVM

In [152]:
svc = SVC(kernel='linear', C=0.5)

svc.fit(x_train, y_train)

y_pred = svc.predict(x_test)

In [153]:
svc_score = accuracy_score(y_test, y_pred)
svc_report = classification_report(y_test, y_pred)
svc_mae = mean_absolute_error(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [154]:
svc_score

0.825

In [155]:
print(svc_report)

              precision    recall  f1-score   support

           0       0.82      1.00      0.90       165
           1       0.00      0.00      0.00        35

    accuracy                           0.82       200
   macro avg       0.41      0.50      0.45       200
weighted avg       0.68      0.82      0.75       200



In [156]:
svc_mae

0.175

# KNN

In [157]:
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

In [158]:
knn_score = accuracy_score(y_test, y_pred)
knn_report = classification_report(y_test, y_pred)
knn_mae = mean_absolute_error(y_test, y_pred)

In [159]:
knn_score

0.78

In [160]:
print(knn_report)

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       165
           1       0.24      0.11      0.15        35

    accuracy                           0.78       200
   macro avg       0.53      0.52      0.51       200
weighted avg       0.73      0.78      0.75       200



In [161]:
knn_mae

0.22

# Tabulate

In [162]:
from tabulate import tabulate

In [163]:
result = [
    ['Logistic Regression', lr_score, lr_mae],
    ['Decision Tree', dt_score, dt_mae],
    ['Random Forest', rf_score, rf_mae],
    ['SVM', svc_score, svc_mae],
    ['KNN', knn_score, knn_mae],
]

headers = ['Algorithm', 'Accuracy_score', 'mean absolute error']

table = tabulate(result, headers=headers, tablefmt='grid', floatfmt='.2f')

In [164]:
print(table)

+---------------------+------------------+-----------------------+
| Algorithm           |   Accuracy_score |   mean absolute error |
| Logistic Regression |             0.82 |                  0.17 |
+---------------------+------------------+-----------------------+
| Decision Tree       |             0.69 |                  0.30 |
+---------------------+------------------+-----------------------+
| Random Forest       |             0.82 |                  0.18 |
+---------------------+------------------+-----------------------+
| SVM                 |             0.82 |                  0.17 |
+---------------------+------------------+-----------------------+
| KNN                 |             0.78 |                  0.22 |
+---------------------+------------------+-----------------------+
