In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the lifestyle dataset; the CSV path is relative to the notebook's working directory.
df = pd.read_csv("health_lifestyle_dataset.csv")

# Predictor columns, in the order they occupy in the feature matrix.
feature_cols = [
    "age",
    "bmi",
    "daily_steps",
    "sleep_hours",
    "water_intake_l",
    "calories_consumed",
    "smoker",
    "alcohol",
    "resting_hr",
    "systolic_bp",
    "diastolic_bp",
    "cholesterol",
    "family_history",
]

# Feature matrix and target vector as NumPy arrays.
y = df["disease_risk"].values
X = df[feature_cols].values

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions so no test information leaks into the fit.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Logistic-regression baseline trained on the standardized training split.
log_model = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)

# Predict the held-out rows and report the confusion matrix and plain accuracy.
# NOTE(review): the output saved with this cell has an all-zero second column,
# i.e. the model never predicts the second class, so the reported accuracy is
# just the majority-class share. Per-class metrics would be more informative.
log_pred = log_model.predict(X_test)
print(confusion_matrix(y_test, log_pred))
print("Accuracy (Logistic):", accuracy_score(y_test, log_pred))

[[18740     0]
 [ 6260     0]]
Accuracy (Logistic): 0.7496


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2
# (i.e. Euclidean distance), trained on the standardized training split.
knn_model = KNeighborsClassifier(p=2, metric="minkowski", n_neighbors=5).fit(
    X_train, y_train
)

# Predict the held-out rows and report the confusion matrix and plain accuracy.
knn_pred = knn_model.predict(X_test)
print(confusion_matrix(y_test, knn_pred))
print("Accuracy (KNN):", accuracy_score(y_test, knn_pred))


[[16813  1927]
 [ 5644   616]]
Accuracy (KNN): 0.69716


In [4]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Gaussian naive Bayes with default settings, trained on the standardized training split.
nb_model = GaussianNB().fit(X_train, y_train)

# Predict the held-out rows and report the confusion matrix and plain accuracy.
# NOTE(review): the output saved with this cell shows no predictions of the
# second class, so the accuracy equals the majority-class share.
nb_pred = nb_model.predict(X_test)
print(confusion_matrix(y_test, nb_pred))
print("Accuracy (Naive Bayes):", accuracy_score(y_test, nb_pred))


[[18740     0]
 [ 6260     0]]
Accuracy (Naive Bayes): 0.7496


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Unpruned decision tree using the entropy split criterion; the seed fixes
# tie-breaking so repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=0, criterion="entropy").fit(
    X_train, y_train
)

# Predict the held-out rows and report the confusion matrix and plain accuracy.
dt_pred = dt_model.predict(X_test)
print(confusion_matrix(y_test, dt_pred))
print("Accuracy (Decision Tree):", accuracy_score(y_test, dt_pred))


[[13910  4830]
 [ 4700  1560]]
Accuracy (Decision Tree): 0.6188


In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Fitted classifiers keyed by the label used in the printed summary.
models = {
    "Logistic Regression": log_model,
    "KNN": knn_model,
    "Naive Bayes": nb_model,
    "Decision Tree": dt_model,
}

# Re-score every classifier on the same held-out split and print one
# summary block (accuracy line, confusion matrix, blank line) per model.
for name in models:
    model = models[name]
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    print(f"{name} | Accuracy = {acc:.4f}\n{cm}\n")


Logistic Regression | Accuracy = 0.7496
[[18740     0]
 [ 6260     0]]

KNN | Accuracy = 0.6972
[[16813  1927]
 [ 5644   616]]

Naive Bayes | Accuracy = 0.7496
[[18740     0]
 [ 6260     0]]

Decision Tree | Accuracy = 0.6188
[[13910  4830]
 [ 4700  1560]]



In [8]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


# Standardize the full feature matrix (all rows, no train/test split) for clustering.
Xc = StandardScaler().fit_transform(df.loc[:, feature_cols].values)

# Three-cluster k-means with 10 random restarts and a fixed seed; the
# per-row cluster assignments are read back from the fitted estimator.
kmeans = KMeans(n_init=10, random_state=0, n_clusters=3).fit(Xc)
labels_km = kmeans.labels_

# Show the centroids (in standardized units) and the first 20 assignments.
print("KMeans centers (scaled):\n", kmeans.cluster_centers_)
print("KMeans labels (sample):", labels_km[:20])


KMeans centers (scaled):
 [[ 1.59140960e-03  6.85582173e-03  3.05641725e-03 -1.18148799e-02
   6.52986832e-03  1.01209691e-02 -5.01468322e-01  1.52745250e+00
   1.15401921e-02 -8.66443943e-03  4.18773708e-03 -5.02871365e-03
  -4.75171880e-03]
 [-4.89474138e-03 -6.79225542e-03  9.02791520e-04  2.10425504e-03
   3.10102792e-03  4.43359698e-03  1.99414391e+00  4.93007529e-03
  -2.86858241e-03  7.28294894e-03 -1.46192624e-02  1.99183791e-03
   5.42128369e-03]
 [ 1.07677452e-03 -4.92381329e-04 -1.63054494e-03  4.29496489e-03
  -3.90436858e-03 -5.91773995e-03 -5.01468322e-01 -6.54684845e-01
  -3.90318378e-03  1.08934101e-03  3.45770279e-03  1.43454695e-03
   8.51022376e-05]]
KMeans labels (sample): [2 0 0 2 0 0 1 0 0 0 1 2 2 0 2 1 2 2 2 2]


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Number of rows to cluster. Presumably a subset is used to keep agglomerative
# clustering tractable on the full dataset; confirm against the data size.
subset_size = 5000

# Draw a class-stratified random subset by reusing train_test_split and keeping
# only the "test" side. test_size is passed as an integer so exactly
# subset_size rows are selected; a float fraction (subset_size / len(df)) is
# rounded up internally and can return one extra row through floating-point error.
_, X_sub, _, y_sub = train_test_split(
    df[feature_cols].values,
    df["disease_risk"].values,
    test_size=subset_size,
    random_state=0,
    stratify=df["disease_risk"].values,
)

# Standardize the subset, then project it onto its first three principal components.
Xc_sub = StandardScaler().fit_transform(X_sub)
Xc_sub_pca = PCA(n_components=3, random_state=0).fit_transform(Xc_sub)

# Ward-linkage agglomerative clustering into three groups on the PCA projection.
agg = AgglomerativeClustering(n_clusters=3, linkage="ward")
labels_ag = agg.fit_predict(Xc_sub_pca)

# Show the first 20 cluster assignments.
print("Agglomerative labels (sample):", labels_ag[:20])


Agglomerative labels (sample): [0 0 0 1 1 2 1 0 1 2 2 1 0 2 1 1 2 0 2 0]
