In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_train = pd.read_csv("../data/cleaned_train.csv")
df_test = pd.read_csv("../data/cleaned_test.csv")


In [3]:
print(df_train.describe())

              HighBP       HighChol      CholCheck            BMI  \
count  202944.000000  202944.000000  202944.000000  202944.000000   
mean        0.428700       0.424344       0.962655      28.379824   
std         0.494891       0.494244       0.189607       6.612738   
min         0.000000       0.000000       0.000000      12.000000   
25%         0.000000       0.000000       1.000000      24.000000   
50%         0.000000       0.000000       1.000000      27.000000   
75%         1.000000       1.000000       1.000000      31.000000   
max         1.000000       1.000000       1.000000      98.000000   

              Smoker         Stroke  HeartDiseaseorAttack   PhysActivity  \
count  202944.000000  202944.000000         202944.000000  202944.000000   
mean        0.442634       0.040844              0.094391       0.756302   
std         0.496700       0.197929              0.292372       0.429313   
min         0.000000       0.000000              0.000000       0.000000  

In [4]:
print(df_test.describe())

             HighBP      HighChol     CholCheck           BMI        Smoker  \
count  50736.000000  50736.000000  50736.000000  50736.000000  50736.000000   
mean       0.430207      0.423230      0.962729     28.392522      0.445305   
std        0.495110      0.494076      0.189428      6.592550      0.497004   
min        0.000000      0.000000      0.000000     12.000000      0.000000   
25%        0.000000      0.000000      1.000000     24.000000      0.000000   
50%        0.000000      0.000000      1.000000     27.000000      0.000000   
75%        1.000000      1.000000      1.000000     31.000000      1.000000   
max        1.000000      1.000000      1.000000     95.000000      1.000000   

             Stroke  HeartDiseaseorAttack  PhysActivity        Fruits  \
count  50736.000000          50736.000000  50736.000000  50736.000000   
mean       0.039479              0.093366      0.757509      0.629789   
std        0.194733              0.290947      0.428594      0.482866

## Baseline Model

We willl use Logistic Regression model with linear features as our baseline model for simplicity when comparing the differences with the model after data preprocessing steps by using metrics from [classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) function: precision, recall and f1 score

**Note:** random_state will automatically set to 0 to ensure reproducable result

In [5]:
df_X = df_train.drop(["Status"], axis=1)
df_y = df_train[["Status"]]

In [6]:
train_X, test_X, train_y, test_y = train_test_split(
    df_X, df_y,
    shuffle=True, 
    test_size=0.2,
    random_state=0
)

baseline_clf = LogisticRegression(
    penalty=None,           
    class_weight="balanced",  
    max_iter=1_000,
    random_state=0          
)

baseline_clf.fit(train_X, train_y.to_numpy().ravel())
pred_y = baseline_clf.predict(test_X)
target_name = ['no diabetes', 'diabetes']

In [7]:
baseline_performance_report = classification_report(test_y, pred_y, target_names=target_name)
print(f"Baseline model performance\n{baseline_performance_report}")

BASELINE MODEL PERFORMANCE
              precision    recall  f1-score   support

 no diabetes       0.97      0.89      0.93     33494
    diabetes       0.62      0.88      0.73      7095

    accuracy                           0.88     40589
   macro avg       0.80      0.88      0.83     40589
weighted avg       0.91      0.88      0.89     40589



The model perform better at predicting No diabetes over Has diabetes like stated in [1. EDA](1.EDA.ipynb)

## Data Processing

Handling outliers: The outliers will be capped using the IQR method

In [8]:
for col in df_train.columns:
    if col == "Status":
        continue

    q1 = df_train[col].quantile(0.25)
    q3 = df_train[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Cap outliers
    df_train[col] = np.where(df_train[col] < lower_bound, lower_bound, df_train[col])
    df_train[col] = np.where(df_train[col] > upper_bound, upper_bound, df_train[col])

    

We will normalize our dataset using MinMaxScaler on the numerical features to turn them in (0, 1) scale like the rest of the dataset

In [9]:
numerical_features = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income',
                      'ExtraMedTest', 'ExtraAlcoholTest']

scaler = MinMaxScaler()
df_train[numerical_features] = scaler.fit_transform(df_train[numerical_features])
df_test[numerical_features] = scaler.fit_transform(df_test[numerical_features])

Since our dataset is imbalanced with a ratio of 1:4 between 0 and 1 (No diabetes and Has diabetes), I will use [resample](https://scikit-learn.org/stable/modules/generated/sklearn.utils.resample.html) to perform oversampling on cases with Status of 0

In [10]:
# Separate majority and minority classes
no_diabetes = df_train[df_train["Status"] == 0]
diabetes = df_train[df_train["Status"] == 1]

diabetes_upsampled = resample(
    diabetes,
    replace=True,
    n_samples=len(no_diabetes),
    random_state=0
)
    
# Combine majority class with upsampled minority class
df_train = pd.concat([no_diabetes, diabetes_upsampled])


In [11]:
status_counts = df_train['Status'].value_counts()

# Display the counts
print("Count of 0:", status_counts[0])
print("Count of 1:", status_counts[1])

Count of 0: 167313
Count of 1: 167313


In [12]:
print(df_train.shape)

(334626, 24)


Our training dataset now contains > 330k rows 

Since our dataset is already standardized with the categorical features, we dont need to do encoding 

In [13]:
df_X = df_train.drop(["Status"], axis=1)
df_y = df_train[["Status"]]

train_X, test_X, train_y, test_y = train_test_split(
    df_X, df_y,
    shuffle=True,
    random_state=0, 
    test_size=0.2
)

baseline_clf.fit(train_X, train_y.to_numpy().ravel())
pred_y = baseline_clf.predict(test_X)

target_name = ['no diabetes', 'diabetes']

print(f"Baseline model performance\n{baseline_performance_report}")
print(f"Model performance after preprocessing\n{classification_report(test_y, pred_y, target_names=target_name)}")

Baseline model performance
              precision    recall  f1-score   support

 no diabetes       0.97      0.89      0.93     33494
    diabetes       0.62      0.88      0.73      7095

    accuracy                           0.88     40589
   macro avg       0.80      0.88      0.83     40589
weighted avg       0.91      0.88      0.89     40589

Model performance after preprocessing
              precision    recall  f1-score   support

 no diabetes       0.87      0.88      0.88     33547
    diabetes       0.88      0.87      0.88     33379

    accuracy                           0.88     66926
   macro avg       0.88      0.88      0.88     66926
weighted avg       0.88      0.88      0.88     66926



The model now perform better on average on predicting both No diabetes and Has diabetes

In [14]:
df_train.to_csv("../data/transformed_train.csv", index=False)
df_test.to_csv("../data/transformed_test.csv", index=False)
