In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

In [2]:
df_train = pd.read_csv("../data/cleaned_train.csv")
df_test = pd.read_csv("../data/cleaned_test.csv")


In [3]:
print(df_train.shape)

(202944, 23)


In [4]:
df_X = df_train.drop(["Status"], axis=1)
df_y = df_train[["Status"]]

In [5]:
train_X, test_X, train_y, test_y = train_test_split(
    df_X, df_y,
    shuffle=True,
    random_state=0,  
    test_size=0.2
)

train_y = train_y["Status"].values
test_y = test_y["Status"].values

print(f"train_X shape: {train_X.shape}")
print(f"train_y shape: {train_y.shape}")
print(f"test_X shape: {test_X.shape}")
print(f"test_y shape: {test_y.shape}")

train_X shape: (162355, 22)
train_y shape: (162355,)
test_X shape: (40589, 22)
test_y shape: (40589,)


In [6]:
baseline_clf = LogisticRegression(
    penalty=None,             # No regularization
    class_weight="balanced",  # Because our training data is imbalanced
    max_iter=1_000,
    random_state=99           # Reproducible result
)

baseline_clf.fit(train_X, train_y)
pred_y = baseline_clf.predict(test_X)

In [8]:
baseline_performance_report = classification_report(test_y, pred_y)
print(f"'BASELINE MODEL PERFORMANCE'\n{baseline_performance_report}")

'BASELINE MODEL PERFORMANCE'
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     33494
           1       0.62      0.88      0.73      7095

    accuracy                           0.88     40589
   macro avg       0.80      0.88      0.83     40589
weighted avg       0.91      0.88      0.89     40589



In [9]:
for col in df_train.columns:
    if col == "Status":
        continue

    q1 = df_train[col].quantile(0.25)
    q3 = df_train[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Cap outliers
    df_train[col] = np.where(df_train[col] < lower_bound, lower_bound, df_train[col])
    df_train[col] = np.where(df_train[col] > upper_bound, upper_bound, df_train[col])

    

In [11]:

scaler = StandardScaler()
for col in df_train.columns:
    if col == "Status":
        continue

    df_train[[col]] = scaler.fit_transform(df_train[[col]])
    df_test[[col]] = scaler.fit_transform(df_test[[col]])


In [12]:
print(df_train.isna().sum())

HighBP                  0
HighChol                0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
ExtraMedTest            0
ExtraAlcoholTest        0
Status                  0
dtype: int64


In [13]:
# Separate majority and minority classes
no_diabetes = df_train[df_train["Status"] == 0]
diabetes = df_train[df_train["Status"] == 1]

diabetes_upsampled = resample(
    diabetes,
    replace=True,
    n_samples=len(no_diabetes),
    random_state=0
)
    
# Combine majority class with upsampled minority class
df_train = pd.concat([no_diabetes, diabetes_upsampled])


In [14]:
status_counts = df_train['Status'].value_counts()

# Display the counts
print("Count of 0:", status_counts[0])
print("Count of 1:", status_counts[1])

Count of 0: 167313
Count of 1: 167313


In [15]:
print(df_train.shape)

(334626, 23)


In [16]:
print(df_train.isna().sum())

HighBP                  0
HighChol                0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
ExtraMedTest            0
ExtraAlcoholTest        0
Status                  0
dtype: int64


In [17]:
df_X = df_train.drop(["Status"], axis=1)
df_y = df_train[["Status"]]

train_X, test_X, train_y, test_y = train_test_split(
    df_X, df_y,
    shuffle=True,
    random_state=0,  # Ensure reproducible results
    test_size=0.2
)

train_y = train_y["Status"].values
test_y = test_y["Status"].values

baseline_clf.fit(train_X, train_y)
pred_y = baseline_clf.predict(test_X)

print(f"'BASELINE MODEL PERFORMANCE'\n{baseline_performance_report}")
print(f"'NEW MODEL PERFORMANCE\n{classification_report(test_y, pred_y)}")

'BASELINE MODEL PERFORMANCE'
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     33494
           1       0.62      0.88      0.73      7095

    accuracy                           0.88     40589
   macro avg       0.80      0.88      0.83     40589
weighted avg       0.91      0.88      0.89     40589

'NEW MODEL PERFORMANCE
              precision    recall  f1-score   support

           0       0.86      0.87      0.86     33547
           1       0.86      0.85      0.86     33379

    accuracy                           0.86     66926
   macro avg       0.86      0.86      0.86     66926
weighted avg       0.86      0.86      0.86     66926



In [18]:
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 334626 entries, 0 to 93340
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   HighBP                334626 non-null  float64
 1   HighChol              334626 non-null  float64
 2   BMI                   334626 non-null  float64
 3   Smoker                334626 non-null  float64
 4   Stroke                334626 non-null  float64
 5   HeartDiseaseorAttack  334626 non-null  float64
 6   PhysActivity          334626 non-null  float64
 7   Fruits                334626 non-null  float64
 8   Veggies               334626 non-null  float64
 9   HvyAlcoholConsump     334626 non-null  float64
 10  AnyHealthcare         334626 non-null  float64
 11  NoDocbcCost           334626 non-null  float64
 12  GenHlth               334626 non-null  float64
 13  MentHlth              334626 non-null  float64
 14  PhysHlth              334626 non-null  float64
 15  DiffWa

In [19]:
df_train.to_csv("../data/transformed_train.csv", index=False)
df_test.to_csv("../data/transformed_test.csv", index=False)
