In [1]:
#import dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans

In [2]:
# Load the heart-disease CSV that sits next to this notebook into a DataFrame
csv_path = "./heartdisease.csv"
heart_data = pd.read_csv(csv_path)

# Preview the first five rows
heart_data.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,2
2,2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [3]:
# Further clean data - remove the leftover "Unnamed: 0" column, which in the
# preview of the raw frame simply mirrors the row index.
# errors='ignore' keeps this cell re-runnable: heart_data is rebound here, so
# on a second run the column is already gone and a plain drop would raise
# KeyError.
heart_data = heart_data.drop(columns=['Unnamed: 0'], errors='ignore')

# Verify which columns remain
heart_data.columns

Index(['Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure',
       'Cholesterol', 'Fasting Blood Sugar', 'Resting EKG', 'Max Heart Rate',
       'Exercise Enduced Chest Pain', 'STDep Induced by Exercise',
       'Slope of Peak Exercise ST', 'Diagnosis'],
      dtype='object')

In [4]:
# Keep an independent copy of the frame in heart_new; heart_data itself is
# edited in a later cell when Diagnosis is reduced to 0 and 1.
heart_new = heart_data.copy(deep=True)

# Look at the first Diagnosis values before they are changed
heart_new.loc[:, 'Diagnosis'].head()

0    0
1    2
2    1
3    0
4    0
Name: Diagnosis, dtype: int64

In [9]:
# Change Diagnosis column so we only have 0 and 1 (absence and presence):
# every value greater than 0 becomes 1, zeros are left untouched.
# A single .loc assignment is used instead of writing through
# heart_data['Diagnosis'].values: mutating the array returned by .values only
# updates the frame when pandas happens to hand back a view, and under
# Copy-on-Write that array is read-only, so the write fails.
# (The former `dgHeartData = ... = 1` chained assignment merely bound the
# integer 1 to dgHeartData; that unused name is dropped.)
has_disease = heart_data['Diagnosis'] > 0
heart_data.loc[has_disease, 'Diagnosis'] = 1

# Verify
heart_data.head()

Unnamed: 0,Age,Sex,Chest Pain Type,Resting Blood Pressure,Cholesterol,Fasting Blood Sugar,Resting EKG,Max Heart Rate,Exercise Enduced Chest Pain,STDep Induced by Exercise,Slope of Peak Exercise ST,Diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0


In [10]:
#Step 1: Use PCA to reduce the dimensionality of the transformed heart data dataframe to 2 principal components
# Import the PCA module
from sklearn.decomposition import PCA

In [11]:
# Build the PCA estimator, keeping only the first two principal components
n_pca_components = 2
pca = PCA(n_components=n_pca_components)

In [12]:
# Fit the PCA model on the heart_data DataFrame and project every row onto
# the two principal components.
# NOTE(review): heart_data is passed as-is, so the Diagnosis column and the
# unscaled feature ranges both feed into the components - confirm that is intended.
heart_data_pca = pca.fit_transform(X=heart_data)

# Review the first 5 projected rows of the resulting array
heart_data_pca[0:5]

array([[ 23.69076086, -31.32989163],
       [ 70.23998239,   1.85865242],
       [ 12.33348475,  -0.72275236],
       [ 44.4660328 , -45.29261952],
       [ -2.90943885, -44.87849767]])

In [13]:
#Step 2: Using the explained_variance_ratio_ attribute of the fitted PCA,
#calculate the percentage of the total variance that is captured by the two PCA variables.
#Calculate the PCA explained variance ratio
#About 95.5% of the total variance (0.664 + 0.291) is condensed into the 2 PCA variables
pca.explained_variance_ratio_

array([0.66376382, 0.29120809])

In [15]:
#Step 3: Wrap the PCA output array in a Pandas DataFrame called heartData_pca_df.
# One column per principal component
pca_column_names = ["PCA1", "PCA2"]
heartData_pca_df = pd.DataFrame(data=heart_data_pca, columns=pca_column_names)

# Review the PCA DataFrame
heartData_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,23.690761,-31.329892
1,70.239982,1.858652
2,12.333485,-0.722752
3,44.466033,-45.29262
4,-2.909439,-44.878498


In [16]:
#Step 4: Use the Elbow method on heartData_pca_df to determine the optimal value of k
# Candidate cluster counts to try: 1 through 10
k = [*range(1, 11)]
# Accumulator for the inertia measured at each candidate k
inertia = list()

In [18]:
# Evaluate the K-means algorithm for every candidate value in k, fitting each
# model on the heartData_pca_df DataFrame and recording its `inertia_`.
# inertia is rebuilt from scratch here so that re-running this cell cannot
# append a second set of values; a longer inertia list would no longer line up
# with k and would break the DataFrame built for the elbow curve.
inertia = []
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(heartData_pca_df)
    inertia.append(k_model.inertia_)

In [19]:
# Pair each candidate k with the inertia measured for it
elbow_data = dict(k=k, inertia=inertia)

# Turn the elbow data into a DataFrame
df_elbow = pd.DataFrame(data=elbow_data)

# Review the first rows
df_elbow.head()

Unnamed: 0,k,inertia
0,1,6207460.0
1,2,3134392.0
2,3,1729674.0
3,4,1101454.0
4,5,805841.2


In [20]:
# Draw the elbow curve: inertia against k, with one x tick per candidate k
elbow_plot_options = dict(x="k", y="inertia", title="Elbow Curve", xticks=k)
df_elbow.hvplot.line(**elbow_plot_options)

In [21]:
#Step 5: Segment the heartData_pca_df DataFrame using the K-means algorithm
# Build the KMeans model with the chosen number of clusters and fit it in one
# step (fit() returns the fitted estimator itself).
model = KMeans(n_clusters=4, random_state=0).fit(heartData_pca_df)

# Predict the cluster label for every row of the PCA DataFrame
k_4 = model.predict(heartData_pca_df)

# New frame: a copy of heartData_pca_df plus a column holding the labels
heartData_pca_predictions_df = heartData_pca_df.assign(heartData_segments=k_4)

In [22]:
# Scatter the two principal components, one colour group per K-means segment
scatter_plot_options = dict(x="PCA1", y="PCA2", by="heartData_segments")
heartData_pca_predictions_df.hvplot.scatter(**scatter_plot_options)