In [22]:
# Import the required libraries and dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [23]:
# Load the heart-disease dataset CSV (one directory up) into a DataFrame
df_heart = pd.read_csv(Path("..") / "heart_disease_dataset.csv")

# Preview the first rows to confirm the file loaded as expected
df_heart.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4,0,Within past year (anytime less than 12 months ...,Yes,9,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0,0,Within past year (anytime less than 12 months ...,Yes,6,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0,0,Within past year (anytime less than 12 months ...,No,8,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5,0,Within past year (anytime less than 12 months ...,Yes,9,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3,15,Within past year (anytime less than 12 months ...,Yes,5,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [24]:
# One-hot encode the categorical columns; numeric columns are carried over as-is
df_heart_dummies = pd.get_dummies(data=df_heart)
df_heart_dummies.columns.tolist()

['PhysicalHealthDays',
 'MentalHealthDays',
 'SleepHours',
 'HeightInMeters',
 'WeightInKilograms',
 'BMI',
 'State_Alabama',
 'State_Alaska',
 'State_Arizona',
 'State_Arkansas',
 'State_California',
 'State_Colorado',
 'State_Connecticut',
 'State_Delaware',
 'State_District of Columbia',
 'State_Florida',
 'State_Georgia',
 'State_Guam',
 'State_Hawaii',
 'State_Idaho',
 'State_Illinois',
 'State_Indiana',
 'State_Iowa',
 'State_Kansas',
 'State_Kentucky',
 'State_Louisiana',
 'State_Maine',
 'State_Maryland',
 'State_Massachusetts',
 'State_Michigan',
 'State_Minnesota',
 'State_Mississippi',
 'State_Missouri',
 'State_Montana',
 'State_Nebraska',
 'State_Nevada',
 'State_New Hampshire',
 'State_New Jersey',
 'State_New Mexico',
 'State_New York',
 'State_North Carolina',
 'State_North Dakota',
 'State_Ohio',
 'State_Oklahoma',
 'State_Oregon',
 'State_Pennsylvania',
 'State_Puerto Rico',
 'State_Rhode Island',
 'State_South Carolina',
 'State_South Dakota',
 'State_Tennessee',
 'S

In [25]:
# Standardize every encoded feature (numeric and one-hot columns alike)
df_heart_scaled = StandardScaler().fit_transform(X=df_heart_dummies)

# Wrap the scaled array in a DataFrame, restoring the feature names
df_heart_scaled_df = pd.DataFrame(
    data=df_heart_scaled,
    columns=df_heart_dummies.columns.tolist(),
)

df_heart_scaled_df.head()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,...,PneumoVaxEver_Yes,"TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap",HighRiskLastYear_No,HighRiskLastYear_Yes,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes
0,-0.01416,-0.514292,1.373428,-0.985904,-0.560199,-0.104105,11.329126,-0.114888,-0.150683,-0.109976,...,1.209496,-0.705424,1.581233,-0.656634,-0.296414,0.211941,-0.211941,0.685924,-0.187924,-0.632657
1,-0.49002,-0.514292,-0.708924,0.701799,0.545644,0.22442,11.329126,-0.114888,-0.150683,-0.109976,...,1.209496,-0.705424,-0.632418,1.52292,-0.296414,0.211941,-0.211941,0.685924,-0.187924,-0.632657
2,-0.49002,-0.514292,0.679311,1.358128,1.183918,0.4593,11.329126,-0.114888,-0.150683,-0.109976,...,1.209496,1.417587,-0.632418,-0.656634,-0.296414,0.211941,-0.211941,-1.457888,-0.187924,1.580635
3,0.104805,-0.514292,1.373428,-0.048291,0.333198,0.407105,11.329126,-0.114888,-0.150683,-0.109976,...,1.209496,1.417587,-0.632418,-0.656634,-0.296414,0.211941,-0.211941,-1.457888,-0.187924,1.580635
4,-0.133125,1.336949,-1.403041,-1.45471,-0.198619,0.675759,11.329126,-0.114888,-0.150683,-0.109976,...,1.209496,1.417587,-0.632418,-0.656634,-0.296414,0.211941,-0.211941,0.685924,-0.187924,-0.632657


In [26]:
# Create the PCA model instance where n_components=20.
# random_state is pinned so that, if scikit-learn's automatic solver choice
# uses a randomized SVD, the components are identical on every
# Restart & Run All.
pca = PCA(n_components=20, random_state=42)

In [27]:
# Fit the PCA model on the scaled heart data and project it onto the components
heart_pca_data = pca.fit_transform(X=df_heart_scaled_df)

# Review the first five rows of the PCA data
# using bracket notation ([0:5])
heart_pca_data[0:5]

array([[-0.40567206, -2.94812821, -2.63154939, -0.51769133, -0.11245939,
         0.30518869, -0.76348258, -0.32222436,  0.6051501 , -1.05982663,
        -0.36136001, -0.73468745, -0.60299172,  1.11358399, -0.16293271,
         0.21662792, -0.69658765, -0.23042705,  0.90000621,  0.29536887],
       [ 0.3333674 , -3.23077543,  0.8141238 , -0.11987825,  1.32133081,
         1.77570812, -0.70295799,  0.42316602, -0.85942709, -1.19133901,
        -1.08929965, -1.00385034,  0.04822747,  0.86337251,  0.71488351,
         1.01488227, -1.13890514, -0.55631165,  1.78146319,  0.29944939],
       [ 3.11891143, -0.93191741,  1.95878873,  1.37087697, -0.26839434,
         1.9797124 , -1.96369862,  1.35860036,  0.53200891,  2.47762224,
         2.45803465,  0.11529167, -1.03317399,  3.26727073, -1.51569067,
         2.42370153, -2.66921796,  1.10481399, -2.51517556,  0.4103127 ],
       [ 2.33499727, -2.21092461, -2.93177912, -0.36098025, -0.48050758,
         1.81871903,  0.22449305, -1.09075784, -

In [28]:
# Add up the per-component explained variance ratios to get the total share
# of variance captured by all retained components
total_explained_variance = sum(pca.explained_variance_ratio_)
print(total_explained_variance)

0.3245331234655393


Even with 20 principal components, less than one-third (about 32%) of the variance in the dataset is explained.