In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

In [16]:
df_train = pd.read_csv("../data/cleaned_train.csv")
df_test = pd.read_csv("../data/cleaned_test.csv")


In [17]:
print(df_train.shape)

(202944, 23)


In [18]:
df_X = df_train.drop(["Status"], axis=1)
df_y = df_train[["Status"]]

In [19]:
train_X, test_X, train_y, test_y = train_test_split(
    df_X, df_y,
    shuffle=True,
    random_state=0,  
    test_size=0.2
)

train_y = train_y["Status"].values
test_y = test_y["Status"].values

print(f"train_X shape: {train_X.shape}")
print(f"train_y shape: {train_y.shape}")
print(f"test_X shape: {test_X.shape}")
print(f"test_y shape: {test_y.shape}")

train_X shape: (162355, 22)
train_y shape: (162355,)
test_X shape: (40589, 22)
test_y shape: (40589,)


In [20]:
baseline_clf = LogisticRegression(
    penalty=None,             # No regularization
    class_weight="balanced",  # Because our training data is imbalanced
    max_iter=1_000,
    random_state=99           # Reproducible result
)

baseline_clf.fit(train_X, train_y)
pred_y = baseline_clf.predict(test_X)

In [21]:
baseline_performance_report = classification_report(test_y, pred_y)
print(f"'BASELINE MODEL PERFORMANCE'\n{baseline_performance_report}")

'BASELINE MODEL PERFORMANCE'
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     33494
           1       0.62      0.88      0.73      7095

    accuracy                           0.88     40589
   macro avg       0.80      0.88      0.83     40589
weighted avg       0.91      0.88      0.89     40589



In [22]:
for col in df_train.columns:
    if col == "Status":
        continue

    q1 = df_train[col].quantile(0.25)
    q3 = df_train[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Cap outliers
    df_train[col] = np.where(df_train[col] < lower_bound, lower_bound, df_train[col])
    df_train[col] = np.where(df_train[col] > upper_bound, upper_bound, df_train[col])

In [23]:
print(df_train.shape)

(202944, 23)


In [24]:
# Define categorical and numerical features
categorical_features = ['Sex', 'HighBP', 'HighChol', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
                        'DiffWalk']
numerical_features = ['BMI', 'ExtraMedTest', 'ExtraAlcoholTest']

# Define the data transformation pipeline
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features),
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
#     ])

# # Fit and transform the training data
# transformed_train_data = preprocessor.fit_transform(df_train)

# # Transform the test data (without fitting)
# transformed_test_data = preprocessor.transform(df_test)

# # Get the names of the categorical features after one-hot encoding
# cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out()

# # Combine numerical and categorical features for both train and test datasets
# transformed_train_df = pd.DataFrame(transformed_train_data, columns=numerical_features + list(cat_feature_names))
# transformed_train_df['Status'] = df_train['Status']  # Add Status column back

# transformed_test_df = pd.DataFrame(transformed_test_data, columns=numerical_features + list(cat_feature_names))

# # Check the transformed DataFrames
# print("Transformed Train DataFrame:")
# print(transformed_train_df.head())

# print("\nTransformed Test DataFrame:")
# print(transformed_test_df.head())

scaler = StandardScaler()
for col in numerical_features:

    df_train[[col]] = scaler.fit_transform(df_train[[col]])
    df_test[[col]] = scaler.fit_transform(df_test[[col]])


In [25]:
no_diabetes = df_train[df_train["Status"] == 0]
diabetes = df_train[df_train["Status"] == 1]

diabetes_up_sampled = resample(
    diabetes,
    replace=True,              # sample with replacement
    n_samples=len(no_diabetes),  # 1:1 balanced
    random_state=0             # reproducible results
)

df_train = pd.concat([no_diabetes, diabetes_up_sampled])

In [26]:
status_counts = df_train['Status'].value_counts()

# Display the counts
print("Count of 0:", status_counts[0])
print("Count of 1:", status_counts[1])

Count of 0: 167313
Count of 1: 167313


In [27]:
print(df_train.shape)

(334626, 23)


In [28]:
df_X = df_train.drop(["Status"], axis=1)
df_y = df_train[["Status"]]

train_X, test_X, train_y, test_y = train_test_split(
    df_X, df_y,
    shuffle=True,
    random_state=0,  # Ensure reproducible results
    test_size=0.2
)

train_y = train_y["Status"].values
test_y = test_y["Status"].values

baseline_clf.fit(train_X, train_y)
pred_y = baseline_clf.predict(test_X)

print(f"'BASELINE MODEL PERFORMANCE'\n{baseline_performance_report}")
print(f"'NEW MODEL PERFORMANCE\n{classification_report(test_y, pred_y)}")

'BASELINE MODEL PERFORMANCE'
              precision    recall  f1-score   support

           0       0.97      0.89      0.93     33494
           1       0.62      0.88      0.73      7095

    accuracy                           0.88     40589
   macro avg       0.80      0.88      0.83     40589
weighted avg       0.91      0.88      0.89     40589

'NEW MODEL PERFORMANCE
              precision    recall  f1-score   support

           0       0.87      0.88      0.88     33547
           1       0.88      0.87      0.88     33379

    accuracy                           0.88     66926
   macro avg       0.88      0.88      0.88     66926
weighted avg       0.88      0.88      0.88     66926



In [29]:
df_train.to_csv("../data/transformed_train.csv", index=False)
df_test.to_csv("../data/transformed_test.csv", index=False)
