# Physics 494/594

# Autoencoders and PCA

In [None]:
# %load ./include/header.py
import numpy as np
import matplotlib.pyplot as plt
import sys
from tqdm import trange,tqdm
sys.path.append('./include')
import ml4s

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('./include/notebook.mplstyle')
np.set_printoptions(linewidth=120)
ml4s.set_css_style('./include/bootstrap.css')
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
π = np.pi

## Last Time

- Principal Component Analysis
- Identifying the low-dimensional latent space which maximally explains the *variance* of the data
- Implementing PCA by hand and with `sklearn`

## Today

- Connection between PCA and autoencoders, a compressive deep neural network architecture
- Application of PCA for clustering

## Principal Component Analysis (PCA)

Recall that for a given set of unlabelled data $\{ \boldsymbol{x}^{(n)} \}_{n=1}^{N}$ with $\boldsymbol{x}^{(n)} \in \mathbb{R}^D$, our goal is to project the data onto a latent space having dimensionality $M < D$.  We did this by performing a spectral decomposition of the covariance matrix

\begin{equation}
\Sigma(\mathbf{X}) = \frac{1}{N-1} \mathbf{X}^{\top}\mathbf{X}
\end{equation}

where $\mathbf{X}$ is the (mean-centered) data design matrix:

\begin{equation}
\mathbf{X} = \left( \begin{array}{cccc}
        x_{1}^{(1)} & x_{2}^{(1)} & \cdots & x_{D}^{(1)} \\
\vdots        &      \vdots    & \ddots & \vdots \\
        x_{1}^{(N)} & x_{2}^{(N)} & \cdots & x_{D}^{(N)} \\
\end{array}
\right)\, .
\end{equation}

We determine:

\begin{equation}
\boldsymbol{V}^\top \Sigma(\mathbf{X}) \boldsymbol{V} = \Lambda
\end{equation}

where $\Lambda_{ij} = \lambda_i \delta_{ij}$ is the diagonal matrix of eigenvalues (the variances along the principal components) and the principal component vectors are encoded as the columns of the orthogonal matrix $\boldsymbol{V}$.

Also recall the *percentage of the explained variance*, defined as:

\begin{equation}
\text{PCA-j} = \frac{\lambda_j}{\sum_{i=1}^{D} \lambda_i}
\end{equation}

and the projector:

\begin{equation}
\boldsymbol{P} = \sum_{j=1}^M\boldsymbol{v}_j\boldsymbol{v}_j^\mathsf{T}\, .
\end{equation}


## Neural Networks and Linear Autoencoders

There is a very nice way to interpret PCA as a type of *linear autoencoder* whereby one trains a neural network with a hidden layer (with linear activation) that acts as an **information bottleneck.**  We want to minimize the least-squares error between input and output. The network calculates:

\begin{equation}
\boldsymbol{P} \mathbf{x}_n
\end{equation}

for each $\mathbf{x}_n$ and we minimize the cost:

\begin{equation}
\mathcal{C} = \left \langle \mathbf{x}_n^\top \mathbf{x}_n - \mathbf{x}_n^\top \boldsymbol{P}\mathbf{x}_n \right \rangle  = \frac{1}{N} \sum_{n=1}^{N}\left( \mathbf{x}_n^\top \mathbf{x}_n - \mathbf{x}_n^\top \boldsymbol{P}\mathbf{x}_n \right) \, .
\end{equation}

To obtain the 1st principal component for our example above we consider the linear autoencoder with a single hidden neuron.

In [None]:
# Sketch the 2 -> 1 -> 2 autoencoder; the single middle unit is the bottleneck.
# NOTE(review): draw_network is a helper from the course's ml4s module;
# presumably the list gives the number of nodes in each layer -- confirm.
ml4s.draw_network([2,1,2])

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Read the two-column data set from disk.
x = np.loadtxt('../data/scatter_2d_pca.dat')

# Linear autoencoder 2 -> 1 -> 2: a single linear unit (the bottleneck)
# followed by a two-unit linear layer that maps back to the input dimension.
model = keras.Sequential()
model.add(layers.Dense(1, input_shape=(2,), activation='linear'))
model.add(layers.Dense(2, activation='linear'))

# Train the network to reproduce its own input (the targets are x itself),
# then record the mean-squared reconstruction error on the same data.
model.compile(optimizer='adam', loss='mean_squared_error')
training_history = model.fit(x, x, epochs=100, verbose=0)
score = model.evaluate(x, x, verbose=0)

In [None]:
# Learning curve: the per-epoch loss recorded by fit(), with the cost returned
# by evaluate() after training shown in the legend.
plt.plot(training_history.history["loss"],color=colors[0], linestyle='-', 
             label=f'cost = {score:.2f}')
plt.ylabel("Cost")
plt.xlabel("Epoch")
plt.legend()

In [None]:
# Pull the trained parameters out of every layer as NumPy arrays: for a Dense
# layer, weights[0] is the kernel matrix and weights[1] is the bias vector.
weights = [layer.weights[0].numpy() for layer in model.layers]
biases = [layer.weights[1].numpy() for layer in model.layers]
# Redraw the 2 -> 1 -> 2 network, now passing the learned parameters.
# NOTE(review): the meaning of zero_index is defined in ml4s -- confirm there.
ml4s.draw_network([2,1,2], weights=weights, biases=biases, zero_index=True)

In [None]:
# Raw data cloud, with the direction learned by the autoencoder drawn on top.
plt.scatter(x[:,0],x[:,1], s=1, alpha=0.5, label='data')
_x = np.linspace(-4,4,100)

# Kernel of the second (decoder) layer; its first row holds the two output
# weights of the single hidden unit, i.e. a direction in the data plane.
weights = model.layers[1].weights[0].numpy()
# Line through the origin along that direction: slope = w_y / w_x.
plt.plot(_x,weights[0][1]/weights[0][0]*_x, '-', color=colors[0], label=r'$\mathbf{w}_1$')

# Equal axis scaling so that angles between directions are not distorted.
plt.axis('equal')
plt.xticks([])
plt.yticks([]);
plt.legend()

Can you get more principal components with this strategy? 

Unlike the eigenvector problem above, the issue is that there is no guarantee the components will be orthogonal.  See:

[E. Plaut, From Principal Subspaces to Principal Components with Linear Autoencoders, arXiv:1804.10253 (2018)](https://arxiv.org/abs/1804.10253)

for a discussion of how you can re-orthogonalize via a singular value decomposition of the weight matrix.

In [None]:
# Linear autoencoder 2 -> 2 -> 2: the hidden layer now has as many units as
# the input, so there is one decoder direction per input dimension.
model = keras.Sequential()
model.add(layers.Dense(2, input_shape=(2,), activation='linear'))
model.add(layers.Dense(2, activation='linear'))

# Train the network to reproduce its own input (the targets are x itself),
# then record the mean-squared reconstruction error on the same data.
model.compile(optimizer='adam', loss='mean_squared_error')
training_history = model.fit(x, x, epochs=100, verbose=0)
score = model.evaluate(x, x, verbose=0)

In [None]:
# Learning curve on a logarithmic cost axis: the per-epoch loss recorded by
# fit(), with the cost returned by evaluate() after training in the legend.
plt.semilogy(training_history.history["loss"],color=colors[0], linestyle='-', 
             label=f'cost = {score:.2f}')
plt.ylabel("Cost")
plt.xlabel("Epoch")
plt.legend()

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not reliably
# expose `scipy.linalg` on older SciPy releases.
import scipy.linalg
fig,ax = plt.subplots(1,2,figsize=(8,4))

# --- left panel: raw decoder directions (not necessarily orthogonal) --------
ax[0].scatter(x[:,0],x[:,1], s=1, alpha=0.5,label='data')

_x = np.linspace(-1,1,100)

# Kernel of the second (decoder) layer. Keras stores a Dense kernel with shape
# (inputs, units), so row k is the data-space direction written by hidden
# unit k; each is drawn as a line through the origin with slope w_y / w_x.
weights = model.layers[1].weights[0].numpy()
ax[0].plot(_x,weights[0][1]/weights[0][0]*_x, '-', color=colors[0], label=r'$\mathbf{w}_1$')
ax[0].plot(_x,weights[1][1]/weights[1][0]*_x, '-', color=colors[-2], label=r'$\mathbf{w}_2$')

ax[0].axis('equal')
ax[0].set_xticks([])
ax[0].set_yticks([])
ax[0].legend()

# --- right panel: directions re-orthogonalised with an SVD -------------------
ax[1].scatter(x[:,0],x[:,1], s=1, alpha=0.5)

# Decompose the decoder written as an (outputs x hidden) matrix, i.e. the
# transpose of the Keras kernel. Its left singular vectors (the columns of u)
# then live in data space, which is where they are drawn below. Decomposing
# the kernel itself would put hidden-space vectors in u; for this square
# 2 x 2 case the shapes agree, so that mistake would go unnoticed.
u, s, vh = scipy.linalg.svd(weights.T)

# svd returns the singular values in non-increasing order; reversing the
# columns means u_1 below is the direction with the smallest singular value.
# NOTE(review): ordering kept from the original cell -- confirm it is intended.
u = np.flip(u,axis=1)

ax[1].plot(_x,u[1,0]/u[0,0]*_x, '-', color=colors[0], label=r'$\mathbf{u}_1$')
ax[1].plot(_x,u[1,1]/u[0,1]*_x, '-', color=colors[-2], label=r'$\mathbf{u}_2$')

ax[1].axis('equal')
ax[1].legend()
ax[1].set_xticks([])
ax[1].set_yticks([]);

### Principal Component Analysis for MNIST

Let's try to understand this for a *real* dataset, MNIST.

We begin by loading the (now familiar) MNIST dataset, 60000 $28\times 28$ images of hand-written digits.

In [None]:
# Fetch the MNIST train/test split through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# image height and width, read off the trailing axes of the training array
rows, cols = x_train.shape[1:]

# flatten every image into a single row of rows*cols pixels, convert to
# float32 and rescale by 1/255
x_train = x_train.reshape(len(x_train), -1).astype('float32') / 255
x_test = x_test.reshape(len(x_test), -1).astype('float32') / 255

In [None]:
from sklearn.decomposition import PCA

# perform the PCA, keeping M components
# NOTE: this rebinds `model`, which referred to the Keras autoencoder in the
# earlier cells; from here on `model` is the fitted sklearn PCA object.
M = 100
model = PCA(n_components=M)
# fit on the flattened training images and keep their M PCA coordinates
XPCA = model.fit_transform(x_train)

# store the results:
#   λ    - variance explained by each retained component
#   PCAj - the same, as a fraction of the total variance
#   V    - the principal directions, one per row (M rows)
λ = model.explained_variance_
PCAj = model.explained_variance_ratio_
V = model.components_

### Look at the Explained Variance Ratio

Over 90% of the variance is explained by the first 100 principal components.

In [None]:
# Explained-variance ratio of each retained component, plotted against the
# (zero-based) component index.
fig,ax = plt.subplots()

ax.plot(PCAj)
ax.set_ylabel('PCA-j')
ax.set_xlabel('Component');

### Investigate the PCA eigenvectors

We can project and plot the PCA eigenvectors to get a sense of the **important** features in the data.

In [None]:
def plot_digit_array(x,y, show_prediction=False):
    '''Show a stack of flattened square images as a grid of small panels.

    Parameters
    ----------
    x : array holding one flattened image per row (``x.shape[0]`` images).
        Every row must contain a perfect-square number of pixels, e.g. 784
        for the 28 x 28 MNIST digits.
    y : indexable sequence of labels. ``y[i]`` is written in the top-right
        corner of image ``i`` when ``show_prediction`` is True; it is not
        used otherwise.
    show_prediction : if True, annotate every image with its label in red.

    Returns
    -------
    None. The figure is created with ``plt.subplots``; an empty ``x`` draws
    nothing.
    '''

    num_digits = x.shape[0]
    if num_digits == 0:
        # nothing to draw, and a grid with zero columns would divide by zero
        return

    # near-square grid: floor(sqrt(n)) columns and just enough rows to hold
    # every image (ceiling division, so a full last row adds no blank row)
    num_cols = int(np.sqrt(num_digits))
    num_rows = (num_digits + num_cols - 1)//num_cols

    # side length of each square image, inferred from the pixel count
    side = int(round(np.sqrt(x[0].size)))

    # squeeze=False always returns a 2D array of axes, even for a 1 x 1 grid,
    # so the flatten() below works for a single image as well
    fig,ax = plt.subplots(nrows=num_rows,ncols=num_cols,sharex=True,sharey=True,
                          figsize=(num_cols,num_rows), squeeze=False)

    # plot all the numbers
    for i,cax in enumerate(ax.flatten()):
        if i < num_digits:
            cax.matshow(x[i].reshape(side,side), cmap='binary')
            if show_prediction:
                cax.text(0.99,0.99,f'{y[i]}',horizontalalignment='right',verticalalignment='top', 
                         transform=cax.transAxes, fontsize=8, color='r')
        # hide frame and ticks on every panel, including unused trailing ones
        cax.axis('off')
            
# Display the first 20 PCA vectors (rows of V) as images; the labels passed
# as the second argument are not drawn because show_prediction is False.
plot_digit_array(V[:20],range(20), show_prediction=False)

## Dimensional Reduction & Clustering

We can also use the PCA to project the MNIST data set into a lower dimensional ($M=2$) space where we can visually inspect for patterns and clusters.

We first form the matrix of PCA vectors $V$ from which we can construct the projection operator $\boldsymbol{P}$.

In [None]:
# Coordinates of every training image along the first two principal
# directions (the first two rows of V). The PCA components are defined for
# mean-centred data, so the projection of the fitted mean (model.mean_) is
# subtracted; this matches what the fitted PCA object's transform() does.
# Subtracting after the projection avoids building a centred copy of x_train.
pX = x_train @ V[:2,:].T - model.mean_ @ V[:2,:].T

In [None]:
%config InlineBackend.figure_format = 'retina'

# Scatter the training images in the plane of the first two PCA coordinates,
# coloured by their digit label.
fig,ax = plt.subplots()
scatter = ax.scatter(pX[:,0],pX[:,1],s=2,c=y_train,cmap='Spectral_r')

# produce a legend with the unique colors corresponding to digits
legend = ax.legend(*scatter.legend_elements(),loc=(1,0), title="MNIST Digits")
ax.add_artist(legend)

# axis labels quote the explained-variance ratio of each of the two components
ax.set_xlabel(f'PCA-1 = {PCAj[0]:.2f}')
ax.set_ylabel(f'PCA-2 = {PCAj[1]:.2f}');

Clearly there are patterns, but there is also overlap. This is related to the curse of dimensionality, as it is very difficult to preserve distances between points when projecting into low dimensions.