In [12]:
# Import the required libraries and dependencies
import pandas as pd
import numpy as np
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
df_heart = pd.read_csv(
    Path("../heart_disease_dataset.csv")
)

# Review the DataFrame
df_heart.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4,0,Within past year (anytime less than 12 months ...,Yes,9,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0,0,Within past year (anytime less than 12 months ...,Yes,6,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0,0,Within past year (anytime less than 12 months ...,No,8,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5,0,Within past year (anytime less than 12 months ...,Yes,9,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3,15,Within past year (anytime less than 12 months ...,Yes,5,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [43]:
# get dummies for categorical variables
df_heart_dummies = pd.get_dummies(df_heart)
list(df_heart_dummies.columns)

['PhysicalHealthDays',
 'MentalHealthDays',
 'SleepHours',
 'HeightInMeters',
 'WeightInKilograms',
 'BMI',
 'State_Alabama',
 'State_Alaska',
 'State_Arizona',
 'State_Arkansas',
 'State_California',
 'State_Colorado',
 'State_Connecticut',
 'State_Delaware',
 'State_District of Columbia',
 'State_Florida',
 'State_Georgia',
 'State_Guam',
 'State_Hawaii',
 'State_Idaho',
 'State_Illinois',
 'State_Indiana',
 'State_Iowa',
 'State_Kansas',
 'State_Kentucky',
 'State_Louisiana',
 'State_Maine',
 'State_Maryland',
 'State_Massachusetts',
 'State_Michigan',
 'State_Minnesota',
 'State_Mississippi',
 'State_Missouri',
 'State_Montana',
 'State_Nebraska',
 'State_Nevada',
 'State_New Hampshire',
 'State_New Jersey',
 'State_New Mexico',
 'State_New York',
 'State_North Carolina',
 'State_North Dakota',
 'State_Ohio',
 'State_Oklahoma',
 'State_Oregon',
 'State_Pennsylvania',
 'State_Puerto Rico',
 'State_Rhode Island',
 'State_South Carolina',
 'State_South Dakota',
 'State_Tennessee',
 'S

In [46]:
# Remove over-defined columns

# df_heart_dummies = df_heart_dummies.drop(columns = ['State_Alabama','Sex_Female','GeneralHealth_Excellent','LastCheckupTime_5 or more years ago',
#                                                     'PhysicalActivities_No','RemovedTeeth_1 to 5','HadHeartAttack_No','HadAngina_No','HadStroke_No',
#                                                     'HadAsthma_No','HadSkinCancer_No','HadCOPD_No','HadDepressiveDisorder_No','HadKidneyDisease_No',
#                                                     'HadArthritis_No','HadDiabetes_No','DeafOrHardOfHearing_No','BlindOrVisionDifficulty_No',
#                                                     'DifficultyConcentrating_No','DifficultyWalking_No','DifficultyDressingBathing_No',
#                                                     'DifficultyErrands_No','SmokerStatus_Current smoker - now smokes every day',
#                                                     'ECigaretteUsage_Never used e-cigarettes in my entire life','ChestScan_No',
#                                                     'RaceEthnicityCategory_Black only, Non-Hispanic','AgeCategory_Age 18 to 24','AlcoholDrinkers_No',
#                                                     'HIVTesting_No','FluVaxLast12_No','PneumoVaxEver_No',
#                                                     'TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years','HighRiskLastYear_No',
#                                                     'CovidPos_No'])

df_heart_dummies = df_heart_dummies.drop(columns = ['State_Alabama'])

In [47]:
# Scale price data, return, and variance values
df_heart_scaled = StandardScaler().fit_transform(df_heart_dummies)

df_heart_scaled_df = pd.DataFrame(df_heart_scaled, 
                                  columns = list(df_heart_dummies.columns))

df_heart_scaled_df.head()

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,State_Alaska,State_Arizona,State_Arkansas,State_California,...,PneumoVaxEver_Yes,"TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap",HighRiskLastYear_No,HighRiskLastYear_Yes,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes
0,-0.01416,-0.514292,1.373428,-0.985904,-0.560199,-0.104105,-0.114888,-0.150683,-0.109976,-0.145436,...,1.209496,-0.705424,1.581233,-0.656634,-0.296414,0.211941,-0.211941,0.685924,-0.187924,-0.632657
1,-0.49002,-0.514292,-0.708924,0.701799,0.545644,0.22442,-0.114888,-0.150683,-0.109976,-0.145436,...,1.209496,-0.705424,-0.632418,1.52292,-0.296414,0.211941,-0.211941,0.685924,-0.187924,-0.632657
2,-0.49002,-0.514292,0.679311,1.358128,1.183918,0.4593,-0.114888,-0.150683,-0.109976,-0.145436,...,1.209496,1.417587,-0.632418,-0.656634,-0.296414,0.211941,-0.211941,-1.457888,-0.187924,1.580635
3,0.104805,-0.514292,1.373428,-0.048291,0.333198,0.407105,-0.114888,-0.150683,-0.109976,-0.145436,...,1.209496,1.417587,-0.632418,-0.656634,-0.296414,0.211941,-0.211941,-1.457888,-0.187924,1.580635
4,-0.133125,1.336949,-1.403041,-1.45471,-0.198619,0.675759,-0.114888,-0.150683,-0.109976,-0.145436,...,1.209496,1.417587,-0.632418,-0.656634,-0.296414,0.211941,-0.211941,0.685924,-0.187924,-0.632657


In [38]:
# Create the PCA model instance where n_components=2
pca = PCA(n_components=20)

In [39]:
# Fit the df_stocks_scaled data to the PCA
heart_pca_data = pca.fit_transform(df_heart_scaled_df)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
heart_pca_data[:5]

array([[-4.50438577e-01, -2.57342989e+00, -1.12689159e+00,
        -1.14672976e+00,  5.85987937e-01, -1.18488598e+00,
        -1.14350532e-01, -7.61797242e-01,  4.34820126e-01,
         2.09777085e+00, -5.46368111e-01, -5.24662082e-01,
         4.51850827e-01,  3.88343049e-02,  1.74664973e+00,
         3.35483684e-01, -1.49699616e-01,  2.88384653e-01,
         5.95519637e-02,  4.73445930e-01],
       [ 1.81439182e-01, -2.49495991e+00,  1.74853066e+00,
         4.83654787e-01, -1.59406511e-01, -5.86693983e-01,
         6.58429046e-01, -8.14942748e-01,  3.68363253e-01,
         6.12849541e-01, -1.44577474e+00, -4.16235880e-01,
        -5.80449269e-01, -2.06118490e-02, -1.08512203e+00,
        -1.84994935e-01, -2.98005657e-01,  1.05427477e+00,
        -6.50014220e-01,  1.76520220e-01],
       [ 2.41401863e+00, -1.46352749e+00,  2.04766772e+00,
         3.70325978e-01, -1.06580504e+00, -1.50095210e-01,
        -3.39263444e-01, -1.70136898e+00, -1.61049727e-01,
         5.26425386e-01, -9.7

In [40]:
# Calculate the explained variance
print(sum(pca.explained_variance_ratio_))
print(sum(pca.explained_variance_))

0.27240889047192146
33.23401972313476


In [48]:
pca = PCA()
X = df_heart_scaled_df
X_transformed = pca.fit_transform(X)
n_samples = X.shape[0]

# We center the data and compute the sample covariance matrix.
X_centered = X - np.mean(X, axis=0)
cov_matrix = np.dot(X_centered.T, X_centered) / n_samples
eigenvalues = pca.explained_variance_
print(sum(eigenvalues))
print(pca.components_)
for eigenvalue, eigenvector in zip(eigenvalues, pca.components_):    
    print(np.dot(eigenvector.T, np.dot(cov_matrix, eigenvector)))
    print(eigenvalue)

155.000630027518
[[ 1.92962735e-01  9.88589138e-02 -8.68171022e-03 ...  2.22756683e-02
  -2.06052687e-02 -1.47195962e-02]
 [ 9.56924975e-02  2.07359400e-01 -8.97186055e-02 ... -8.52937438e-02
   3.73111085e-05  8.80430987e-02]
 [-1.35821191e-02 -6.48241959e-02 -2.22581338e-02 ... -6.75698568e-03
  -1.26031188e-02  1.20391907e-02]
 ...
 [ 0.00000000e+00  1.44404894e-17 -1.89472942e-16 ...  5.79551845e-03
   2.25521732e-03  5.61358390e-03]
 [-0.00000000e+00 -1.57349398e-16 -1.52655431e-16 ...  7.82346339e-03
   3.04435406e-03  7.57786702e-03]
 [-0.00000000e+00 -1.67271301e-16  1.03621015e-16 ... -1.57367779e-03
  -6.12367198e-04 -1.52427645e-03]]
8.308834573531676
8.308868346398297
4.8544905195978405
4.854510251615708
3.413358611803831
3.41337248606086
3.0640531479560558
3.064065602394675
2.6394722420472903
2.639482970692953
2.53170833421336
2.53171862483126
2.4215042903837616
2.421514133055071
2.1688662921789272
2.1688751079572657
2.083481109074092
2.0834895777860276
2.000624331141837
2