# Computing explained variance using scikit-learn and by hand using numpy 
## Christian Igel, 2021

In [14]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn import datasets

Load some data

In [15]:
# Only the feature matrix is needed; the regression targets are discarded
X = datasets.load_diabetes(return_X_y=True)[0]

Do computations using scikit-learn

In [12]:
# Fit a PCA with default settings to the data
pca = PCA().fit(X)

# Square the singular values reported by scikit-learn
eigenvalues = np.square(pca.singular_values_)
explained = pca.explained_variance_

print("Squared singular values:\n", eigenvalues)
print("'Explained variance' (not normalized):\n", explained)
# Normalizing by the total should reproduce explained_variance_ratio_
print("Explained variance per component (computed from previous result):\n", explained / explained.sum())
print("Explained variance per component:\n", pca.explained_variance_ratio_)

Squared singular values:
 [4.02421418 1.49231824 1.20596231 0.95547639 0.66218558 0.60271921
 0.53656046 0.4336832  0.07831991 0.00856053]
'Explained variance' (not normalized):
 [9.12520221e-03 3.38394158e-03 2.73460842e-03 2.16661312e-03
 1.50155460e-03 1.36671023e-03 1.21669038e-03 9.83408623e-04
 1.77596168e-04 1.94116324e-05]
Explained variance per component (computed from previous result):
 [0.40242142 0.14923182 0.12059623 0.09554764 0.06621856 0.06027192
 0.05365605 0.04336832 0.00783199 0.00085605]
Explained variance per component:
 [0.40242142 0.14923182 0.12059623 0.09554764 0.06621856 0.06027192
 0.05365605 0.04336832 0.00783199 0.00085605]


Do computations by hand

In [13]:
# Remove mean
Xmean = X.mean(axis=0)
Xcentered = X - Xmean

# Compute scatter matrix (sum of outer products of the centered samples)
N = Xcentered.shape[0]  # Number of samples
S = np.dot(Xcentered.T, Xcentered)  # Sum up outer products

# Eigenvalue decomposition of empirical covariance matrix.
# S is symmetric, so the symmetric solver eigh is used instead of the general
# eig: it guarantees real eigenvalues and returns them in ascending order.
decomp = np.linalg.eigh(S / N)  # Divide by number of samples
eigenvalues_by_hand = decomp[0][::-1]  # Reverse to get descending order
print("Eigenvalues (not Bessel corrected):\n", eigenvalues_by_hand)
print("Explained variance per component (not Bessel corrected):\n", eigenvalues_by_hand / np.sum(eigenvalues_by_hand))

# Eigenvalue decomposition of empirical covariance matrix using Bessel's correction
decomp = np.linalg.eigh(S / (N - 1))  # Divide by number of samples minus 1
eigenvalues_by_hand = decomp[0][::-1]  # Reverse to get descending order
print("Eigenvalues (Bessel corrected):\n", eigenvalues_by_hand)
print("Explained variance per component (Bessel corrected):\n", eigenvalues_by_hand / np.sum(eigenvalues_by_hand))

Eigenvalues (not Bessel corrected):
 [9.10455696e-03 3.37628560e-03 2.72842152e-03 2.16171128e-03
 1.49815742e-03 1.36361812e-03 1.21393769e-03 9.81183716e-04
 1.77194367e-04 1.93677147e-05]
Explained variance per component (not Bessel corrected):
 [0.40242142 0.14923182 0.12059623 0.09554764 0.06621856 0.06027192
 0.05365605 0.04336832 0.00783199 0.00085605]
Eigenvalues (Bessel corrected):
 [9.12520221e-03 3.38394158e-03 2.73460842e-03 2.16661312e-03
 1.50155460e-03 1.36671023e-03 1.21669038e-03 9.83408623e-04
 1.77596168e-04 1.94116324e-05]
Explained variance per component (Bessel corrected):
 [0.40242142 0.14923182 0.12059623 0.09554764 0.06621856 0.06027192
 0.05365605 0.04336832 0.00783199 0.00085605]


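**That is:** `pca.singular_values_` are the singular values from the decomposition of the centered data matrix `X`; their squares are the eigenvalues of the scatter matrix, i.e., the division by the number of training examples `N` (or `N-1`) is missing. The `pca.explained_variance_` corresponds to the eigenvalues of the empirical covariance matrix using Bessel's correction (i.e., using `N-1`). `pca.explained_variance_ratio_` is normalized such that the explained variances sum up to one; the normalization cancels the factor `1/N` or `1/(N-1)`, so the ratios are the same with and without Bessel's correction.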