# Heart Disease ML Portfolio Project
This notebook implements K-means clustering, PCA, and supervised learning on the Heart Disease UCI dataset (Cleveland subset).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the raw Heart Disease UCI CSV (path is relative to the notebook's
# working directory) and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/heart_disease_uci.csv')
df.head(n=5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


## Dataset Verification
This section confirms the structure of the Heart Disease UCI dataset, filters it to the Cleveland subset, and checks missing values and the target distribution.

In [2]:
# Report the full frame's (rows, columns) tuple, then its column labels
# as a plain Python list so they print on a single line.
frame_shape = df.shape
column_labels = df.columns.tolist()
print("Shape:", frame_shape)
print("\nColumns:", column_labels)

Shape: (920, 16)

Columns: ['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']


In [3]:
# List the distinct values of the 'dataset' column (one label per source),
# in order of first appearance.
source_labels = df['dataset'].unique()
print("\nDataset sources:", source_labels)


Dataset sources: ['Cleveland' 'Hungary' 'Switzerland' 'VA Long Beach']


In [4]:
# Keep only the rows whose 'dataset' label is 'Cleveland'.
# .copy() makes df_cleveland an independent frame, so later edits to it
# neither touch df nor trigger SettingWithCopyWarning.
is_cleveland = df['dataset'].eq('Cleveland')
df_cleveland = df.loc[is_cleveland].copy()

# Report the subset's dimensions and confirm the column set is unchanged.
print("\nCleveland Shape:", df_cleveland.shape)
print("\nCleveland Columns:", df_cleveland.columns.tolist())


Cleveland Shape: (304, 16)

Cleveland Columns: ['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']


In [5]:
# Count missing (NaN/None) entries per column of the Cleveland subset.
# sep="\n" puts the per-column counts on their own lines directly under
# the label; print's default sep=" " would prepend a stray space to the
# first row and push it out of alignment with the rest.
missing_per_column = df_cleveland.isnull().sum()
print("\nMissing Values:", missing_per_column, sep="\n")


Missing Values:
 id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       1
ca          5
thal        3
num         0
dtype: int64


In [6]:
# Count how many Cleveland rows fall under each value of the target
# column 'num' (value_counts sorts by descending frequency).
# sep="\n" starts the counts on a fresh line under the label; print's
# default sep=" " would indent the first line of the Series by one space.
target_counts = df_cleveland['num'].value_counts()
print("\nTarget Distribution:", target_counts, sep="\n")


Target Distribution:
 num
0    165
1     55
2     36
3     35
4     13
Name: count, dtype: int64
