In [13]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [14]:
#(a) using PCA() function.

# Fetch the USArrests table (R 'datasets' package) through statsmodels
data = sm.datasets.get_rdataset('USArrests', 'datasets').data

# Standardize the columns with StandardScaler before running PCA
scaler = StandardScaler()
X = scaler.fit_transform(data)

# Fit a PCA that keeps 4 components; fit() returns the fitted estimator
X_pca = PCA(n_components=4).fit(X)

# components_ stores one component per row, so transpose it to get one row
# per original variable and one column per component.
principal_loadings = pd.DataFrame(
    data=X_pca.components_.transpose(),
    index=data.columns,
)
principal_loadings

Unnamed: 0,0,1,2,3
Murder,0.535899,0.418181,-0.341233,0.649228
Assault,0.583184,0.187986,-0.268148,-0.743407
UrbanPop,0.278191,-0.872806,-0.378016,0.133878
Rape,0.543432,-0.167319,0.817778,0.089024


In [15]:
#(b) using np.linalg.eig() function.

# Covariance matrix of the standardized data (columns of X are the variables)
cov_matrix = np.cov(X, rowvar=False)

# Eigenvalues λ and eigenvectors v (the eigenvectors are the columns of v)
λ, v = np.linalg.eig(cov_matrix)

# eig() does not return the eigenvalues in any particular order, so reorder
# both outputs from largest to smallest eigenvalue.
i = np.argsort(λ)[::-1]
λ, v = λ[i], v[:, i]

loadings_eig = pd.DataFrame(data=v, index=data.columns)
loadings_eig

Unnamed: 0,0,1,2,3
Murder,0.535899,0.418181,-0.341233,0.649228
Assault,0.583184,0.187986,-0.268148,-0.743407
UrbanPop,0.278191,-0.872806,-0.378016,0.133878
Rape,0.543432,-0.167319,0.817778,0.089024


In [16]:
#(c) using np.linalg.svd() function.

# Singular value decomposition of the standardized data matrix X.
# The rows of VT are the right singular vectors, so its transpose holds one
# loading vector per column, with the original variables along the rows.
U, S, VT = np.linalg.svd(X)
svd = VT.T

# Build the table from `svd`. Passing `loadings_svd` to its own constructor
# raises NameError on a fresh kernel because the name is not defined yet.
loadings_svd = pd.DataFrame(svd, index=data.columns)
loadings_svd

Unnamed: 0,0,1,2,3
Murder,-0.535899,-0.418181,0.341233,0.649228
Assault,-0.583184,-0.187986,0.268148,-0.743407
UrbanPop,-0.278191,0.872806,0.378016,0.133878
Rape,-0.543432,0.167319,-0.817778,0.089024


In [17]:
#(d) Are those from (a), (b), and (c) exactly the same? why or why not?

The results are essentially the same, but there may be some minor differences:

(1) Sign Ambiguity
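    The vectors returned by (b) eigendecomposition and (c) singular value decomposition are each determined only up to sign: v and -v point along the same axis and represent the same principal component. Here (a) and (b) agree exactly, while in (c) the first three components have the opposite sign.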
    
(2) Numerical Precision
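    The three approaches take different computational routes: PCA() from sklearn is based on a singular value decomposition of the centered data, np.linalg.eig() decomposes the covariance matrix, and np.linalg.svd() decomposes the data matrix directly. Floating-point rounding can therefore introduce minor numerical differences in the last digits, especially when dealing with large datasets.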
    
(3) Data Preprocessing
    PCA() automatically mean-centers the data but does not scale it, so standardization (here done with StandardScaler) must be applied beforehand if it is wanted. With np.linalg.eig() and np.linalg.svd(), both centering and scaling must be done manually.
    
In most cases, these differences do not affect the interpretation or analysis of the principal components.