# Autoencoder: toy example

Load the libraries

In [1]:
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import torch

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Autoencoder model 'bone structure'

Below is the minimal structure of the components of an Autoencoder. The full code is contained in `autoencoder_utils.py`.

In [None]:
# class Encoder(torch.nn.Module): # torch.nn.Module is the base class for NN
#     def __init__(self):
#         super().__init__()
    
#     def forward(self, x):
#         pass
    
# class Decoder(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
    
#     def forward(self, x):
#         pass
    
# class Autoencoder(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.encoder = Encoder()
#         self.decoder = Decoder()

#     def forward(self, x):
#         x = self.encoder(x)
#         x = self.decoder(x)
#         return x
    
#     def get_latent_space(self, x):
#         return self.encoder(x)
    

In [None]:
from autoencoder_utils import Autoencoder

### Generate data

We will consider very simple training data: a solid background and a higher-value diagonal. Random noise is added on top.

In [None]:
from autoencoder_utils import generate_dataset, create_example

In [None]:
# Problem size.
p = 5  # input dimension
N_train, N_test = 100, 10  # sizes of the training and the test set

# Two independent draws from the same generator: the first is used for
# fitting, the second only for evaluation.
X_train, X_test = (generate_dataset(n_examples, p) for n_examples in (N_train, N_test))

In [None]:
plt.figure(dpi=200, figsize=(2, 2))
plt.title('Example of the training data')
# Show an image from the training set so the picture matches its title
# (sample 2, index 0 along the second axis).
plt.imshow(X_train[2, 0, :, :])

Set up the model and the training attributes: cost function (the MSE), its optimizer (Adam), learning rate, batch size, etc.

In [None]:
# Per-epoch loss histories; the training loop appends to these.
train_cost, test_cost = [], []

# Number of training examples consumed per optimisation step.
batch = 4

# Autoencoder comes from autoencoder_utils; the meaning of the positional
# arguments after p is defined there and is not visible in this notebook.
model = Autoencoder(p, 3, 4, 5)

# Mean squared error between input and reconstruction, minimised with Adam.
cost_function = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)

Train the model.

In [None]:
# Train for a fixed number of epochs, recording the mean training cost and
# the test cost after every epoch.
for epoch in range(300):

    temp_train_cost = []
    # Walk over the WHOLE training set in steps of `batch`. The upper bound
    # is N_train (not N_train - batch), otherwise the last mini-batch is never
    # used; slicing simply yields a shorter final batch when N_train is not a
    # multiple of `batch`.
    for j in range(0, N_train, batch):

        optimizer.zero_grad()  # clear gradients left over from the previous step

        x_train_batch = X_train[j:j + batch, :, :, :]
        x_hat = model(x_train_batch)
        # An autoencoder is trained to reproduce its own input, so the batch
        # is both the input and the target: (prediction, target).
        cost = cost_function(x_hat, x_train_batch)

        cost.backward()
        optimizer.step()

        temp_train_cost.append(cost.item())

    train_cost.append(np.mean(temp_train_cost))

    # Evaluation only: no gradients are needed for the test set.
    with torch.no_grad():
        test_x_hat = model(X_test)
        test_cost.append(cost_function(test_x_hat, X_test).item())

Visualize the evolution of the cost functions for the training and test sets.

In [None]:
# Both per-epoch cost curves in one figure, on a logarithmic y-axis.
plt.figure(dpi=150)
plt.plot(train_cost, label='training')
plt.plot(test_cost, label='test')
plt.xlabel('epoch')
plt.yscale('log')
plt.title('Cost Function')
plt.legend()

### How normal inputs are reconstructed

In [None]:
# Draw one example and pass it through the model as a (1, 1, p, p) batch;
# the output is brought back to a p x p NumPy array for plotting.
y = create_example(p)
y_hat = model(y.reshape(1, 1, p, p)).detach().numpy().reshape((p, p))

# Input and reconstruction side by side, on the same colour scale.
fig, ax = plt.subplots(1, 2, dpi=200, figsize=(5, 10))
for panel, image, caption in zip(ax, (y, y_hat), ('input', 'output')):
    panel.imshow(image, vmin=0, vmax=1)
    panel.set_title(caption)

print(f'MSE of decoded image is {np.mean((y.detach().numpy()-y_hat)**2):0.4f}')

### How abnormal inputs are reconstructed

In [None]:
from autoencoder_utils import generate_abnormal_1, generate_abnormal_2

In [None]:
# Draw one noisy abnormal example of the first kind and pass it through the
# model as a (1, 1, p, p) batch; the output is brought back to a p x p array.
y = generate_abnormal_1(p, with_noise=True)
y_hat = model(y.reshape(1, 1, p, p)).detach().numpy().reshape((p, p))

# Input and reconstruction side by side, on the same colour scale.
fig, ax = plt.subplots(1, 2, dpi=200, figsize=(5, 10))
for panel, image, caption in zip(ax, (y, y_hat), ('input', 'output')):
    panel.imshow(image, vmin=0, vmax=1)
    panel.set_title(caption)

print(f'MSE of decoded image is {np.mean((y.detach().numpy()-y_hat)**2):0.4f}')

In [None]:
# Draw one noisy abnormal example of the second kind and pass it through the
# model as a (1, 1, p, p) batch; the output is brought back to a p x p array.
y = generate_abnormal_2(p, with_noise=True)
y_hat = model(y.reshape(1, 1, p, p)).detach().numpy().reshape((p, p))

# Input and reconstruction side by side, on the same colour scale.
fig, ax = plt.subplots(1, 2, dpi=200, figsize=(5, 10))
for panel, image, caption in zip(ax, (y, y_hat), ('input', 'output')):
    panel.imshow(image, vmin=0, vmax=1)
    panel.set_title(caption)

print(f'MSE of decoded image is {np.mean((y.detach().numpy()-y_hat)**2):0.4f}')

Now, let us look at the distributions of the latent coordinates and the reconstruction errors. We generate 1000 normal examples and record their latent coordinates and the MSE between the input and the output. Then we do the same for the abnormal examples.

In [None]:
LS = []            # latent coordinates, one flattened vector per example
MSE_normal = []    # reconstruction MSE of the normal examples
MSE_abnormal = []  # reconstruction MSE of the abnormal examples

# Inference only: gradients are never used here, so do not track them.
with torch.no_grad():
    for k in range(1000):
        y = create_example(p)
        y_in = y.reshape(1, 1, p, p)
        LS.append(model.get_latent_space(y_in).detach().numpy().flatten())
        MSE_normal.append(torch.mean((y - model(y_in)) ** 2).item())

    # ...and two anomalies, one of each kind. They are appended last and in
    # this order (abnormal_1, then abnormal_2); the plotting cell relabels
    # the final two rows of LS accordingly.
    for make_anomaly in (generate_abnormal_1, generate_abnormal_2):
        y = make_anomaly(p, with_noise=True)
        y_in = y.reshape(1, 1, p, p)
        LS.append(model.get_latent_space(y_in).detach().numpy().flatten())
        MSE_abnormal.append(torch.mean((y - model(y_in)) ** 2).item())

In [None]:
# Overlay the reconstruction-error histograms of normal and abnormal
# examples; density=True normalises each histogram separately.
plt.figure(figsize=(6, 3), dpi=200)
_ = plt.hist(MSE_normal, label='normal', bins=10, density=True)
_ = plt.hist(MSE_abnormal, label='abnormal', bins=2, density=True)
_ = plt.yticks([])  # hide the y tick marks
plt.title('Distribution of the MSE')
plt.xlabel('MSE')
plt.ylabel('Density of states')
plt.legend()

Let us look at the distribution of the latent coordinates using a pairwise plot.
It is not guaranteed that the abnormal examples would be outliers for every latent coordinate. But with a good autoencoder model, there should be at least one coordinate, along which the abnormal examples are far away from the normal ones.

In [None]:
# Construct a DataFrame for plotting: one row per example, one column per
# latent coordinate, plus a label column used to colour the points.
import pandas as pd
import seaborn as sns

df = pd.DataFrame(LS)
df['normal'] = 'normal'
# The two anomalies were appended to LS last (abnormal_1, then abnormal_2),
# so they occupy the final two rows; the label is the last column.
df.iloc[-2, -1] = 'abnormal_1'
df.iloc[-1, -1] = 'abnormal_2'

sns.set(font_scale=2)
g = sns.pairplot(df, hue='normal')

# Autoencoder for XPCS

In this section we look at the use of the latent space of an autoencoder model for the two-time correlation functions (2TCF) from XPCS experiments. The model is pre-trained and only the latent space coordinates are used here. The details of model training can be found at:
* https://doi.org/10.11578/dc.20210704.1
* https://doi.org/10.1038/s41598-021-93747-y
* https://doi.org/10.48550/arXiv.2201.07889

For anomaly detection, we consider two algorithms: DBSCAN clustering and Isolation Forest.
We consider two experiments in which instrumentation instabilities happened. For a researcher, it is easy to identify the anomalous 2TCFs. We will see how one can teach a computer to recognize such anomalies. 

For each experiment, the first two time series are used to generate (through down-sampling) many examples of stable 2TCFs. From all other time series, only two down-sampled examples are considered. Remember, the anomalies need to be rare to be recognized. 

Since it is easy for a researcher to recognize the anomalies, we have the true labels available to us to evaluate the algorithms' performance.

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

In [None]:
from tiled.client import from_uri
# Connect to the Tiled demo server; `c` is the client object that the cells
# below index into to read the 2TCF arrays and the latent-coordinate tables.
c = from_uri("https://tiled-demo.blueskyproject.io/api")

## Experiment #1

In [None]:
from tiled.client import show_logs
# show_logs()

In [None]:
# Collect the entry keys of the stable and the unstable series of experiment 1
# (iterating a catalog node yields its keys, which are used for lookup below).
files_stable, files_unstable = (
    list(c['um2022']['tatiana'][group]['experiment1'])
    for group in ('csx_stable', 'csx_unstable')
)

Plot the 2TCFs. The top row shows stable series and the bottom row shows unstable series.

In [None]:
fig, ax = plt.subplots(2, 3, figsize=(15, 8))

# Resolve each catalog node once instead of repeating the same chain of
# lookups on every loop iteration.
stable_node = c['um2022']['tatiana']['csx_stable']['experiment1']
unstable_node = c['um2022']['tatiana']['csx_unstable']['experiment1']

# Top row: the first three stable series.
for j in range(3):
    x = stable_node[files_stable[j]].read()
    # NOTE(review): assumes the array holds a single square w x w map
    # (e.g. shape (1, w, w)) so that reshape(w, w) is valid - confirm.
    w = x.shape[1]
    ax[0, j].imshow(x.reshape(w, w), vmin=1.05, vmax=1.20, origin='lower')
    ax[0, j].set(xlabel=r'$t_1$, frames', ylabel=r'$t_2$, frames')

# Bottom row: the first two unstable series (narrower colour range).
for j in range(2):
    x = unstable_node[files_unstable[j]].read()
    w = x.shape[1]
    ax[1, j].imshow(x.reshape((w, w)), vmin=1.0, vmax=1.1, origin='lower')
    ax[1, j].set(xlabel=r'$t_1$, frames', ylabel=r'$t_2$, frames')

# Only five panels are used; hide the empty bottom-right one.
ax[-1, -1].axis('off')
plt.tight_layout()

Now let us look at the latent coordinates of all time series (calculated separately).

In [None]:
# Read the two latent-coordinate tables of experiment 1 and stack them into
# a single frame with a fresh 0..n-1 index.
df_stable = pd.DataFrame(
    c['um2022']['tatiana']['tables']['experiment1_first_stable'].read()
)
df_unstable = pd.DataFrame(
    c['um2022']['tatiana']['tables']['experiment1_next_scans'].read()
)
df = pd.concat([df_stable, df_unstable], sort=False, ignore_index=True)

# Discard the first column of the combined table.
df = df.drop(columns=df.columns[0])
df.head()

In [None]:
sns.set(font_scale = 2)
g = sns.pairplot(df[:], plot_kws={'alpha': 0.5, 'edgecolor':'none'}, hue = 'stability')

for ax in g.axes.flatten():
    ax.get_yaxis().set_label_coords(-0.5,0.5)
    
for j in range(8):
    for i in range(8):
        g.axes[i,j].set_xlim((-10,10))
        g.axes[i,j].set_ylim((-10,10))

Now let us look closer at a particular plane.

In [None]:
# Scatter of two latent coordinates coloured by the known stability label;
# fit_reg=False turns off the regression fit.
sns.lmplot(data=df, x='lc7', y='lc8', hue='stability', fit_reg=False)

First, we see how DBSCAN assigns the data. One can vary `min_samples` and the reachability distance (`eps`).

In [None]:
# Feature matrix: the columns at positions 1..7 of df.
# NOTE(review): this selects seven columns - confirm that it covers exactly
# the latent coordinates intended for clustering.
X = df.iloc[:, 1:8].to_numpy()

# Cluster the samples and store the cluster label of each one next to its
# coordinates (DBSCAN marks noise points with the label -1).
cluster = DBSCAN(eps=3, min_samples=10)
labels = cluster.fit_predict(X)
df['dbscan_labels'] = labels
sns.lmplot(data=df, x='lc7', y='lc8', hue='dbscan_labels', fit_reg=False)

Then look at the same data with Isolation forest.

In [None]:
# Expected fraction of outliers: four samples out of the whole set.
contamination = 4 / len(X)

# Fit the forest (fixed seed for reproducibility) and label every sample;
# predict returns +1 for inliers and -1 for outliers.
clf = IsolationForest(contamination=contamination, max_features=2, random_state=0)
clf.fit(X)
labels = clf.predict(X)
df['iforest_labels'] = labels
sns.lmplot(data=df, x='lc7', y='lc8', hue='iforest_labels', fit_reg=False)

With Isolation Forest we can 'profile' the entire dataset by its `anomaly score`. The higher the score, the more likely the sample is an anomaly. Depending on the assumed contamination level, we can find a proper threshold for separating the normal data from outliers.

In [None]:
# score_samples is negated so that larger values mean "more anomalous".
plt.hist(np.negative(clf.score_samples(X)), bins=25)
plt.title('Anomaly Score')

## Experiment #2

In [None]:
# Collect the entry keys of the stable and the unstable series of experiment 2
# (iterating a catalog node yields its keys, which are used for lookup below).
files_stable, files_unstable = (
    list(c['um2022']['tatiana'][group]['experiment2'])
    for group in ('csx_stable', 'csx_unstable')
)

In [None]:
fig, ax = plt.subplots(2, 3, figsize=(15, 8))

# Resolve each catalog node once instead of repeating the same chain of
# lookups on every loop iteration.
stable_node = c['um2022']['tatiana']['csx_stable']['experiment2']
unstable_node = c['um2022']['tatiana']['csx_unstable']['experiment2']

# Top row: the first three stable series.
for j in range(3):
    x = stable_node[files_stable[j]].read()
    # NOTE(review): assumes the array holds a single square w x w map
    # (e.g. shape (1, w, w)) so that reshape(w, w) is valid - confirm.
    w = x.shape[1]
    ax[0, j].imshow(x.reshape(w, w), vmin=1.3, vmax=1.7, origin='lower')
    ax[0, j].set(xlabel=r'$t_1$, frames', ylabel=r'$t_2$, frames')

# Bottom row: the first two unstable series (narrower colour range).
for j in range(2):
    x = unstable_node[files_unstable[j]].read()
    w = x.shape[1]
    ax[1, j].imshow(x.reshape((w, w)), vmin=1.4, vmax=1.6, origin='lower')
    ax[1, j].set(xlabel=r'$t_1$, frames', ylabel=r'$t_2$, frames')

# Only five panels are used; hide the empty bottom-right one.
ax[-1, -1].axis('off')
plt.tight_layout()

In [None]:
# Read the two latent-coordinate tables of experiment 2 and stack them into
# a single frame with a fresh 0..n-1 index.
df_stable = pd.DataFrame(
    c['um2022']['tatiana']['tables']['experiment2_first_stable'].read()
)
df_unstable = pd.DataFrame(
    c['um2022']['tatiana']['tables']['experiment2_next_scans'].read()
)
df = pd.concat([df_stable, df_unstable], sort=False, ignore_index=True)

# Discard the first column of the combined table.
df = df.drop(columns=df.columns[0])
df.head()

In [None]:
sns.set(font_scale=2)
g = sns.pairplot(df[:], plot_kws={'alpha': 0.7, 'edgecolor': 'none'}, hue='stability')

# Apply the same label offset and axis limits to every panel. Iterating over
# the grid's own axes works for any number of plotted columns, instead of
# assuming an 8 x 8 grid.
for ax in g.axes.flatten():
    ax.get_yaxis().set_label_coords(-0.5, 0.5)
    ax.set_xlim((-25, 25))
    ax.set_ylim((-25, 25))

In [None]:
# Scatter of two latent coordinates coloured by the known stability label;
# fit_reg=False turns off the regression fit.
sns.lmplot(data=df, x='lc4', y='lc5', hue='stability', fit_reg=False)

In [None]:
# Cluster the samples on the columns at positions 1..7 of df and store each
# sample's cluster label (DBSCAN marks noise points with the label -1).
# NOTE(review): this selects seven columns - confirm that it covers exactly
# the latent coordinates intended for clustering.
cluster = DBSCAN(eps=3, min_samples=10)
labels = cluster.fit_predict(df.iloc[:, 1:8].to_numpy())
df['dbscan_labels'] = labels
sns.lmplot(data=df, x='lc4', y='lc5', hue='dbscan_labels', fit_reg=False)

In [None]:
# Feature matrix: the columns at positions 1..7 of df.
X = df.iloc[:, 1:8].to_numpy()

# Expected fraction of outliers: four samples out of the whole set.
contamination = 4 / len(X)

# Fit the forest (fixed seed for reproducibility) and label every sample;
# predict returns +1 for inliers and -1 for outliers.
clf = IsolationForest(contamination=contamination, max_features=2, random_state=0)
clf.fit(X)
labels = clf.predict(X)
df['iforest_labels'] = labels
sns.lmplot(data=df, x='lc4', y='lc5', hue='iforest_labels', fit_reg=False)

In [None]:
# score_samples is negated so that larger values mean "more anomalous".
plt.hist(np.negative(clf.score_samples(X)), bins=25)
plt.title('Anomaly Score')