#### Team members

1. Mostafa Allahmoradi - 9087818
2. Cemil Caglar Yapici – 9081058
3. Jarius Bedward - 8841640

# Import Libraries & Dependencies

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score


#### Problem statement

Area of Focus:
In today’s world, health conditions such as heart disease, obesity, and diabetes are rapidly increasing. Many of these illnesses are directly linked to preventable lifestyle factors such as poor diet, lack of physical activity, smoking, alcohol consumption, and inadequate sleep. Despite increased awareness about healthy living, many individuals are still at risk because they underestimate how daily habits affect their well-being in the long term.

Outcome:
Early detection and prevention could save lives and reduce medical costs, but traditional screening methods are often not effective, and they typically only identify problems after symptoms appear, missing critical opportunities for prevention.

This project aims to explore how machine learning can help predict the likelihood of developing a heart condition (or general illness) based on an individual’s diet, lifestyle, and fitness-related factors. By analyzing real-world health data, our goal is to identify key risk factors and build a predictive model that can help individuals take proactive measures toward healthier living.

#### Hypothesis

Null hypothesis:

H0: Individuals with a poor composite lifestyle score (poor diet + smoking + high alcohol consumption + low activity + poor sleep) are at high risk of heart disease.


Alternative hypothesis:

H1: Current smokers have higher odds of heart disease than non-smokers
Test: logistic regression coefficient for smoking; report odds ratio (O.R.) and 95% CI.

H2: Individuals with low physical activity (e.g., <150 min/week) have a higher chance of heart disease than those meeting activity guidelines.
Test: logistic regression or compare group prevalences; p-value for activity variable.

H3: Higher BMI is positively associated with heart disease risk (dose–response).
Test: regression coefficient per unit BMI and/or BMI category comparisons.

H4: A model using lifestyle and fitness variables (diet quality, physical activity, smoking, alcohol use, sleep, BMI) will predict the likelihood of heart disease occurring.
Test: compare model performance (ROC-AUC, F1) for (a) single best predictor vs (b) composite score vs (c) full model.

#### Data Source: Heart Disease Risk Factors

## 1. Load the Data sets

In [None]:
# Load the heart-disease risk-factor data from the project's CSV file.
heart_disease_dataset = pd.read_csv("data/heart_disease.csv")

# Preview the first five rows.
display(heart_disease_dataset.head(5))

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
heart_disease_dataset.info()

# Summary statistics, transposed so that each described column is one row.
display(heart_disease_dataset.describe().T)

## 2. Clean the data
  basic cleaning of the data

In [None]:
# Remove leading or trailing spaces from column names.
heart_disease_dataset.columns = heart_disease_dataset.columns.str.strip()

# Drop rows with a missing target variable (the method is `dropna`;
# `dropana` does not exist and raises AttributeError).
heart_disease_dataset = heart_disease_dataset.dropna(subset=["Heart Disease Status"])

# Fill the remaining gaps in two passes:
#   1. numeric columns with their median,
#   2. whatever is still missing (the non-numeric columns) with the
#      most frequent value of each column.
numeric_medians = heart_disease_dataset.median(numeric_only=True)
heart_disease_dataset = heart_disease_dataset.fillna(numeric_medians)

column_modes = heart_disease_dataset.mode().iloc[0]
heart_disease_dataset = heart_disease_dataset.fillna(column_modes)

# The final .sum() must be *called*; without the parentheses the bound method
# object is printed instead of the total number of missing cells.
print("Remaining missin values: \n", heart_disease_dataset.isna().sum().sum())


## 3. Standardize Numeric Variables

- To ensure the clustering isn't dominated by features with large scales

In [None]:
# Only the numeric columns are standardized.
numeric_cols = heart_disease_dataset.select_dtypes(include=np.number).columns.tolist()

scaler = StandardScaler()

# fit_transform returns a bare array, so the row labels have to be restored
# explicitly. Without `index=` the new frame gets a fresh 0..n-1 RangeIndex,
# which no longer matches `heart_disease_dataset` once rows have been dropped
# during cleaning, and any later index-aligned assignment would mismatch rows.
heart_disease_dataset_scaled = pd.DataFrame(
    scaler.fit_transform(heart_disease_dataset[numeric_cols]),
    columns=numeric_cols,
    index=heart_disease_dataset.index,
)

print("Data has been standardized")
heart_disease_dataset_scaled.head()

## 4. Select Lifestyle features
- Uses the standardized numeric values and selects the features for clustering

In [None]:


# Lifestyle features used for clustering.
lifestyle_features = ["Smoking", "Alcohol Consumption", "Exercise Habits", "Sleep Hours", "BMI", "Stress Level"]

# Work on a copy of the relevant columns so the cleaned dataset stays untouched.
heart_disease_dataset_life = heart_disease_dataset[lifestyle_features].copy()

# Encode the categorical variables only. The encoder is re-fitted on every
# column, so each column gets its own independent integer coding.
le = LabelEncoder()
for col in ['Smoking', 'Alcohol Consumption', 'Exercise Habits', 'Stress Level']:
    heart_disease_dataset_life[col] = le.fit_transform(heart_disease_dataset_life[col].astype(str))

# Swap the raw numeric columns for their standardized versions.
# The scaled frame is built row-for-row from the cleaned dataset, so the values
# are assigned by position (.to_numpy()). A plain Series assignment aligns on
# index labels and would yield NaN / wrong-row values whenever the two frames
# carry different indexes (e.g. after rows were dropped during cleaning).
heart_disease_dataset_life["BMI"] = heart_disease_dataset_scaled["BMI"].to_numpy()
heart_disease_dataset_life["Sleep Hours"] = heart_disease_dataset_scaled["Sleep Hours"].to_numpy()

# Check structure
heart_disease_dataset_life.head()

##### Talking Point: (Encoding Categorical)
-


## 5. Determine Optimal Number of Clusters
- Determine the optimal # of clusters using Elbow & Silhouette methods

In [None]:


# Evaluate k = 2..10 with two criteria: within-cluster inertia (elbow method)
# and the silhouette score.
inertia = []
sil_scores = []
K = range(2, 11)

# Cluster on the encoded/standardized lifestyle features only. The model must
# be fitted on the same matrix the silhouette score is computed on; fitting on
# the full raw dataset (which still holds string columns) is an error.
# A "Cluster" column left over from a previous run of the notebook is excluded
# so that old labels are never used as a feature.
cluster_features = heart_disease_dataset_life.drop(columns="Cluster", errors="ignore")

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(cluster_features)
    inertia.append(kmeans.inertia_)
    sil_scores.append(silhouette_score(cluster_features, kmeans.labels_))

plt.figure(figsize=(12,5))

# Left panel: inertia per k (look for the "elbow").
plt.subplot(1,2,1)
plt.plot(K, inertia, 'o-')
plt.title('Elbow Method (Inertia)')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')

# Right panel: silhouette score per k (higher is better).
plt.subplot(1,2,2)
plt.plot(K, sil_scores, 'o-', color='green')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.show()

##### Talking Point: (Elbow & Silhouette)

## 6. Apply K-Means Clusters with optimal k
- Choose the best k (in this case 3-5 for this data)

In [None]:
# Final model with the chosen number of clusters.
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit on the feature columns only. If this cell is re-run, the frame already
# contains a "Cluster" column from the previous run; dropping it (when present)
# keeps the old labels from being fed back in as a feature.
cluster_features = heart_disease_dataset_life.drop(columns="Cluster", errors="ignore")
heart_disease_dataset_life["Cluster"] = kmeans.fit_predict(cluster_features)

## 7. Apply Cluster Profiles


In [None]:
# Profile each cluster: mean of every feature per cluster, rounded to 2 decimals.
cluster_summary = (
    heart_disease_dataset_life
    .groupby("Cluster")
    .mean()
    .round(2)
)
display(cluster_summary)

# Number of rows assigned to each cluster, printed under a heading.
print(
    "\nCluster counts:",
    heart_disease_dataset_life["Cluster"].value_counts(),
    sep="\n",
)