In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [4]:
# Fetch the scikit-learn diabetes dataset as pandas objects and work on an
# independent copy of the combined frame (ten feature columns plus `target`).
data = load_diabetes(as_frame=True)
df = data["frame"].copy()
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


In [6]:
# Feature matrix: every column of df except the regression target.
X = df.drop("target", axis="columns")
# Keep the column order as a plain list for labelling later tables.
feature_names = list(X.columns)
print("Feature names:", feature_names)


Feature names: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [8]:
# Standardize features to zero mean and unit variance.
# StandardScaler divides by the population standard deviation (ddof=0), so
# pandas' sample-based .std() / .describe() will report values slightly above 1.
scaler = StandardScaler()
# Build the DataFrame in one step so X_std is never bound to a bare ndarray,
# and carry over X's index so rows stay aligned with X / df in later joins.
X_std = pd.DataFrame(
    scaler.fit_transform(X),
    columns=feature_names,
    index=X.index,
)
X_std

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.800500,1.065488,1.297088,0.459841,-0.929746,-0.732065,-0.912451,-0.054499,0.418531,-0.370989
1,-0.039567,-0.938537,-1.082180,-0.553505,-0.177624,-0.402886,1.564414,-0.830301,-1.436589,-1.938479
2,1.793307,1.065488,0.934533,-0.119214,-0.958674,-0.718897,-0.680245,-0.054499,0.060156,-0.545154
3,-1.872441,-0.938537,-0.243771,-0.770650,0.256292,0.525397,-0.757647,0.721302,0.476983,-0.196823
4,0.113172,-0.938537,-0.764944,0.459841,0.082726,0.327890,0.171178,-0.054499,-0.672502,-0.980568
...,...,...,...,...,...,...,...,...,...,...
437,0.876870,1.065488,0.413360,1.256040,-0.119769,-0.053957,-0.602843,-0.054499,0.655787,0.151508
438,-0.115937,1.065488,-0.334410,-1.422086,1.037341,1.664355,-0.602843,0.721302,-0.380819,0.935254
439,0.876870,1.065488,-0.334410,0.363573,-0.785107,-0.290965,-0.525441,-0.232934,-0.985649,0.325674
440,-0.956004,-0.938537,0.821235,0.025550,0.343075,0.321306,-0.602843,0.558384,0.936163,-0.545154


In [12]:
# Sanity check on the scaled features: dimensions, then per-column statistics
# with one row per feature (hence the transpose).
# The std column shows sqrt(n / (n - 1)) rather than exactly 1 because
# describe() uses the sample standard deviation (ddof=1).
print("X_std shape:", X_std.shape)
print(X_std.describe().transpose())

X_std shape: (442, 10)
     count          mean       std       min       25%       50%       75%  \
age  442.0  8.037814e-18  1.001133 -2.254290 -0.784172  0.113172  0.800500   
sex  442.0  1.607563e-17  1.001133 -0.938537 -0.938537 -0.938537  1.065488   
bmi  442.0 -2.813235e-17  1.001133 -1.897929 -0.719625 -0.153132  0.656952   
bp   442.0 -1.607563e-17  1.001133 -2.363050 -0.770650 -0.119214  0.749368   
s1   442.0 -8.037814e-18  1.001133 -2.665411 -0.720020 -0.090841  0.596193   
s2   442.0  0.000000e+00  1.001133 -2.430626 -0.638249 -0.080291  0.627442   
s3   442.0  0.000000e+00  1.001133 -2.150883 -0.738296 -0.138431  0.616239   
s4   442.0  1.607563e-17  1.001133 -1.606102 -0.830301 -0.054499  0.721302   
s5   442.0 -8.037814e-18  1.001133 -2.651040 -0.698949 -0.040937  0.681851   
s6   442.0 -1.607563e-17  1.001133 -2.896390 -0.697549 -0.022657  0.586922   

          max  
age  2.327895  
sex  1.065488  
bmi  3.585718  
bp   2.776058  
s1   3.235851  
s2   4.179278  
s3   3

**Apply PCA and compute the explained variance and cumulative explained variance**

In [18]:
# Fit PCA on the standardized features, keeping all components
# (n_components is left at its default).
# PCA is imported in the notebook's top import cell with the other dependencies.
pca = PCA()
pca.fit(X_std)


In [19]:
# Variance captured by each component, each component's share of the total,
# and the running total of those shares.
explained_var = pca.explained_variance_
explained_var_ratio = pca.explained_variance_ratio_
cumulative_var_ratio = np.cumsum(explained_var_ratio)

# Summary table, one row per principal component.
# PC numbers are derived from the number of fitted components instead of a
# hard-coded 10, so the table still builds (rather than raising a length
# mismatch) if the feature set or n_components changes.
pca_table = pd.DataFrame({
    "PC": np.arange(1, len(explained_var) + 1),
    "Explained Variance": explained_var,
    "Explained Variance Ratio": explained_var_ratio,
    "Cumulative Variance Ratio": cumulative_var_ratio
})

print("PCA Variance Summary:")
print(pca_table)

PCA Variance Summary:
   PC  Explained Variance  Explained Variance Ratio  Cumulative Variance Ratio
0   1            4.033336                  0.402421                   0.402421
1   2            1.495704                  0.149232                   0.551653
2   3            1.208701                  0.120597                   0.672250
3   4            0.957643                  0.095548                   0.767797
4   5            0.663683                  0.066218                   0.834015
5   6            0.604084                  0.060272                   0.894287
6   7            0.537782                  0.053657                   0.947944
7   8            0.434665                  0.043368                   0.991312
8   9            0.078498                  0.007832                   0.999144
9  10            0.008580                  0.000856                   1.000000


**Loadings: interpret how each original feature contributes to PCs**

In [20]:
# Feature-by-component weight table.
# pca.components_ has one row per component, so it is transposed to put the
# original features on the rows and the components on the columns.
# These are the unit-length principal axes as returned by scikit-learn; they
# are not rescaled by the square root of the explained variance.
# Column labels follow the fitted component count rather than a hard-coded 10,
# so the table stays valid if the feature set or n_components changes.
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f"PC{i}" for i in range(1, pca.n_components_ + 1)],
    index=feature_names
)

print("PCA Loadings:")
print(loadings)

PCA Loadings:
          PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
age  0.216431  0.044367  0.494668  0.414012  0.686876 -0.225815 -0.109530   
sex  0.186967 -0.386547 -0.106864  0.679860 -0.373464  0.041705 -0.067622   
bmi  0.303162 -0.156281  0.167527 -0.499825 -0.129333 -0.403151 -0.519874   
bp   0.271738 -0.138266  0.513571  0.019662 -0.486874 -0.272783  0.320655   
s1   0.343255  0.573027 -0.068579  0.068396 -0.129180  0.005398  0.073644   
s2   0.351861  0.455942 -0.269689  0.167774 -0.116722 -0.133262 -0.230535   
s3  -0.282437  0.506239  0.386032  0.076020 -0.245001  0.106354 -0.007548   
s4   0.428834 -0.068181 -0.380680 -0.007921  0.143646 -0.033936  0.071244   
s5   0.378618 -0.026187  0.063630 -0.264427  0.151639  0.178731  0.647302   
s6   0.322183 -0.084949  0.276842 -0.087085 -0.031423  0.805066 -0.357267   

          PC8       PC9      PC10  
age -0.014937  0.008100  0.003263  
sex -0.442933 -0.002106  0.003660  
bmi -0.392922  0.042377  0.008