In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *

In [None]:
# Notebook-wide configuration.
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import IPython.display as Disp
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

This tutorial will demonstrate how to visualize and interpret model memory and how to evaluate model performance on a number of metrics. Full scripts for training models, generating samples and calculating attention weights are provided and instructions on how to use those scripts are included in the README. The functions demonstrated in this tutorial do not have pre-written high throughput scripts but can be used on larger input sizes if desired. 

# Model Reconstruction Performance

A set of metrics on the reconstruction accuracy of the different models is presented below. Some parameters need to be selected:
<ul>
    <li>data_size: int -- how many samples from the data to load</li>
    <li>data_selection: string -- one of training, testing, full_no_shuffle</li>
    <li>model_src: string -- path to the model checkpoint</li>
    <li>models: RNN, WAE, AAE, RNNAttn, TransVAE -- model class, selected from those listed</li>
</ul>

In [None]:
# --- User-tunable parameters -------------------------------------------------
# Number of leading rows of the dataset to analyse. This cell used to declare
# data_size = 200 and then ignore it, always slicing the first 5000 rows. It is
# set to 5000 so the slice below is unchanged and stays in step with the later
# cells of this notebook, which slice data[:5000].
data_size = 5000
data_selection = "full_no_shuffle"
model_src = "checkpointz//trans_amp//1_16_2022//2000_trans1x-128_peptide.ckpt"
model = TransVAE(load_fn=model_src)
# Kept for any later cell that reads it. `data` is a NumPy array and has no
# `.cuda` attribute, so the former `if gpu: data.cuda` could only raise
# AttributeError and has been removed.
gpu = False

# --- Load the selected dataset -----------------------------------------------
# Keyword -> file. A keyword matches when it is a substring of data_selection,
# checked in the same order as the previous if/elif chain.
_data_files = {
    "full_no_shuffle": 'notebooks//example_data//peptide_combined_no_shuff.txt',
    "training": 'notebooks//example_data//train_test//peptide_train.txt',
    "testing": 'notebooks//example_data//train_test//peptide_test.txt',
}
for _keyword, _path in _data_files.items():
    if _keyword in data_selection:
        data = pd.read_csv(_path).to_numpy()
        break
else:
    # The previous fallback tried to read a file literally named ".txt", which
    # fails with an opaque file error; report the bad selection instead.
    raise ValueError(
        "Unknown data_selection {!r}; expected one of {}".format(
            data_selection, list(_data_files)
        )
    )

data_1D = data[:data_size, 0]  # first column only: drops the extra dimension

In [None]:
# Reconstruct the first 5000 inputs in batches of 200 and print every original
# next to its reconstruction.
model.params['BATCH_SIZE'] = 200
reconstructed_seq, props = model.reconstruct(data[:5000], log=False, return_mems=False)
for original, rebuilt in zip(data_1D, reconstructed_seq):
    # Three newline-separated items reproduce the former three print() calls.
    print(f'{original} <- Original', f'{rebuilt} <- Reconstruction', '\n', sep='\n')

<ul>MCC (Matthews correlation coefficient) info:
    <li>+1 represents a perfect prediction</li>
    <li>0 indicates a prediction no better than random</li>
    <li>−1 indicates total disagreement between prediction and observation</li>
</ul>

In [None]:
# Ground-truth property labels.
# NOTE(review): assumes this file lists labels in the same unshuffled row order
# as the sequence file loaded into `data` — confirm.
true_props_data = pd.read_csv('notebooks//example_data//function_full_no_shuff.txt').to_numpy()
# `props` was predicted for the leading rows of `data` (data[:5000]), so the
# labels must come from the same leading rows. The previous slice [3000:3200]
# selected 200 rows that do not correspond to the predictions.
true_props = true_props_data[:len(props), 0]
prop_acc = calc_property_accuracies(props, true_props, MCC=True)

Three kinds of reconstruction accuracy are reported:
<ul>
    <li>sequence accuracies are accuracies per sequence</li>
    <li>token accuracies are accuracies for each token averaged over all tokens in the input dataset</li>
    <li>position accuracies are per sequence position</li>
</ul>

In [None]:
# Tokenize the original peptide sequences and their reconstructions so they can
# be compared token by token.
input_sequences = [peptide_tokenizer(original) for original in data_1D]
output_sequences = [peptide_tokenizer(rebuilt) for rebuilt in reconstructed_seq]

In [None]:
# Compare each tokenized input with its tokenized reconstruction. The three
# results are unpacked as the sequence-level, token-level and per-position
# accuracies.
seq_accs, token_accs, position_accs = calc_reconstruction_accuracies(input_sequences, output_sequences)

In [None]:
# Show the sequence-level and token-level accuracies as this cell's output.
seq_accs, token_accs

Plotting the accuracy as a function of token position

In [None]:
# Accuracy at each sequence position, drawn with Matplotlib's object-oriented API.
fig, ax = plt.subplots()
ax.plot(position_accs)
ax.set_xlabel('Sequence Position')
ax.set_ylabel('Accuracy')
plt.show()

On these 25 SMILES, the `RNNAttn-256` model is above 95% accurate, showing only one significant drop between sequence positions 35 and 40. However, this is a small sample size, so it is not a good representation of how this model performs on molecules of this size in general. For this, you can read our analysis of model performance on the ZINC/PubChem datasets (shown below) or test your own model's reconstruction accuracy on a larger sample size.

# Visualizing Model Memory

The memory of a model is analogous to the probability distribution of molecular embeddings that it has learned during training. A single molecular embedding is the size 128 vector at the center of the variational bottleneck. Each model has a built-in method for calculating and returning the model memory for a set of input structures, `calc_mems()`. ***(note - we plot the mean vector rather than the reparameterized vector so we can identify and analyze the meaningful latent dimensions)***

In [None]:
# Compute the model memory for the first 5000 inputs; three values are returned.
memory_outputs = model.calc_mems(data[:5000], log=False, save=False)
if model.model_type in ('aae', 'wae'):
    # For AAE and WAE models only the first returned value is kept, so `mus`
    # and `logvars` are not bound by this cell.
    mems, _, _ = memory_outputs
else:
    mems, mus, logvars = memory_outputs

We can visualize the model memory by plotting a sample of molecular embeddings with `imshow()` — Plotly's `px.imshow()` for the animated views and `plt.imshow()` for the static one.

In [None]:
import plotly.express as px

# Arrange the memories as 50 frames of 100 rows x 128 columns; the animation
# steps through axis 0, one frame at a time.
mem_frames_shape = (50, 100, 128)
video_mem = np.reshape(mems, mem_frames_shape)
fig = px.imshow(video_mem, animation_frame=0)
fig.show()

In [None]:
# Animated heat-map of `mus`: 50 frames of 100 rows x 128 columns, with the
# animation stepping through axis 0.
# The stray `plt.figure(figsize=(12,8))` that used to open this cell has been
# removed. Its handle was overwritten straight away by the Plotly figure,
# leaving an unused, empty Matplotlib figure behind.
video_mus = np.reshape(mus, (50, 100, 128))
fig = px.imshow(video_mus, animation_frame=0)
fig.show()

In [None]:
# Print the first entry of `mus` (the raw values for the first input).
print(mus[0])

In [None]:
# Show the first two rows of `mus` as an image, bracketed by the two inputs
# they were computed from.
print(data[0])

fig, ax = plt.subplots(figsize=(20, 8))
ax.imshow(mus[0:2])
ax.set_yticks([])  # hide the y ticks; the x ticks keep their defaults
plt.show()

print(data[1])

In [None]:
# Animated heat-map of `logvars`: 50 frames of 100 rows x 128 columns, with the
# animation stepping through axis 0.
logvar_frames_shape = (50, 100, 128)
video_logvars = np.reshape(logvars, logvar_frames_shape)
fig = px.imshow(video_logvars, animation_frame=0)
fig.show()

For the `RNNAttn-256` model we see the selective memory structure. Some latent dimensions are more meaningful than others. We can calculate exactly how much information is stored in each dimension with the Shannon information entropy. Typically, you would want to calculate the entropy for a larger sample than the 25 SMILES we are using in this tutorial. 

In [None]:
# Per-dimension entropy of the memories, the means and the log-variances,
# computed in that order.
vae_entropy_mems, vae_entropy_mus, vae_entropy_logvars = (
    calc_entropy(latent) for latent in (mems, mus, logvars)
)

In [None]:
# Entropy of `mems` per latent dimension.
# NOTE(review): the axis label says bits, but the summary cell at the end of
# the notebook reports nats — confirm which unit calc_entropy returns.
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(np.arange(len(vae_entropy_mems)), vae_entropy_mems)
ax.set(xlabel='Latent Dimension', ylabel='Entropy (bits)')
plt.show()

In [None]:
# Entropy of `mus` per latent dimension.
# NOTE(review): the axis label says bits, but the summary cell at the end of
# the notebook reports nats — confirm which unit calc_entropy returns.
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(np.arange(len(vae_entropy_mus)), vae_entropy_mus)
ax.set(xlabel='Latent Dimension', ylabel='Entropy (bits)')
plt.show()

In [None]:
# Entropy of `logvars` per latent dimension.
# NOTE(review): the axis label says bits, but the summary cell at the end of
# the notebook reports nats — confirm which unit calc_entropy returns.
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(np.arange(len(vae_entropy_logvars)), vae_entropy_logvars)
ax.set(xlabel='Latent Dimension', ylabel='Entropy (bits)')
plt.show()

We can see that some dimensions have significantly more information contained across the 25 samples than others and they correspond with the selective memory visualization shown above. We can sum the entropy of all dimensions to find the full model entropy. Again, note that we would need a larger sample size to converge the model entropy.

In [None]:
# Total entropy = sum of the per-dimension entropies, computed separately for
# the memories, the means and the log-variances.
total_entropy_mems = np.sum(vae_entropy_mems)
total_entropy_mus = np.sum(vae_entropy_mus)
# nansum skips NaN entries, so NaN dimensions do not turn the total into NaN.
# NOTE(review): presumably some logvar dimensions produce NaN entropy — confirm.
total_entropy_logvars = np.nansum(vae_entropy_logvars)

# The three printed lines used to be word-for-word identical; each is now
# prefixed with the quantity it describes so the output can be told apart.
for label, total in (
    ('mems', total_entropy_mems),
    ('mus', total_entropy_mus),
    ('logvars', total_entropy_logvars),
):
    print('{}: the model contains {} nats of information'.format(label, round(total, 2)))