In [9]:
import numpy as np
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn import metrics
from sklearn.manifold import trustworthiness
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import plotly.express as px

import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

def loss_plots(loss_src):
    """Plot every loss component of a training log and save one PNG per component.

    Args:
        loss_src: path of the training log passed to
            ``analysis.plot_loss_by_type``. When the path contains ``'aae'``
            the discriminator loss is plotted as well; when it contains
            ``'wae'`` the MMD loss is plotted as well.

    Side effects:
        Saves ``<stem>.png`` files using the module-level ``save_dir`` string
        as the path prefix, then closes all matplotlib figures.
    """
    # (loss type requested from the log, output file stem).
    # The property-loss stem keeps its historical 'prob_' spelling so that
    # existing output file names do not change.
    plot_specs = [
        ('tot_loss', 'tot_loss'),
        ('recon_loss', 'recon_loss'),
        ('kld_loss', 'kld_loss'),
        ('prop_bce_loss', 'prob_bce_loss'),
    ]
    if 'aae' in loss_src:
        plot_specs.append(('disc_loss', 'disc_loss'))
    if 'wae' in loss_src:
        plot_specs.append(('mmd_loss', 'mmd_loss'))

    for loss_type, file_stem in plot_specs:
        # Use the argument rather than a module-level variable so the function
        # plots the log it was actually asked to plot.
        analysis.plot_loss_by_type(loss_src, loss_types=[loss_type])
        plt.savefig(save_dir+file_stem+'.png')
    plt.close('all')
    
    
def load_reconstructions(data,data_1D,latent_size, load_src, true_props=None,subset=None):
    """Load saved reconstructions and predicted properties for the current model.

    Relies on the module-level ``model`` (for ``model.name``) and
    ``num_sequences`` (maximum number of rows kept).

    Args:
        data: array of input sequences; compared element-wise against the
            first column of the subset file to locate subset rows.
        data_1D: indexable of input sequences, filtered in step with the
            reconstructions when ``subset`` is given.
        latent_size: sequence whose first item is a tag split on its 2-3 digit
            number (e.g. ``'latent128'`` -> ``'latent'``, ``'128'``) to build
            the reconstruction directory name.
        load_src: path prefix under which the reconstruction folders live.
        true_props: path of a CSV whose first column holds the true property
            labels, or ``None`` when there are no labels to load.
        subset: optional path of a CSV whose first column lists the sequences
            to keep.

    Returns:
        Tuple ``(data, data_1D, true_props, props, reconstructed_seq)``.
        ``true_props`` is ``None`` when no label file was given.

    Raises:
        IndexError: if a subset sequence does not occur in ``data``.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    latent_parts = re.split(r'(\d{2,3})',latent_size[0])
    recon_src = load_src+model.name+"_"+latent_parts[0]+"_"+latent_parts[1]+"//saved_info.csv"
    recon_df = pd.read_csv(recon_src)
    reconstructed_seq = recon_df['reconstructions'].to_list()[:num_sequences]
    props = torch.Tensor(recon_df['predicted properties'][:num_sequences])

    # Keep the label path and the loaded labels in separate names; the loaded
    # array must not be handed back to read_csv later on.
    true_labels = None
    if true_props is not None:
        true_labels = pd.read_csv(true_props).to_numpy()[0:num_sequences,0]

    if subset:
        testing = pd.read_csv(subset).to_numpy()
        batch_recon_len = len(reconstructed_seq)
        # Row of the first occurrence of every subset sequence inside data.
        first_rows = [np.where(data==testing[idx][0])[0][0] for idx in range(len(testing))]
        # Only rows that actually have a reconstruction can be kept.
        kept_rows = [row for row in first_rows if row<batch_recon_len]

        reconstructed_seq = [reconstructed_seq[row] for row in kept_rows]
        data_1D = [data_1D[row] for row in kept_rows]
        props = torch.Tensor([props[row] for row in kept_rows])
        # NOTE(review): testing[:][0] is the first ROW of the subset file, not
        # its first column (testing[:,0]) - confirm which one is intended.
        data = testing[:][0]
        if true_labels is not None:
            true_labels = [true_labels[row] for row in kept_rows]

    return data, data_1D, true_labels, props, reconstructed_seq

########################################################################################
# ---- run configuration -------------------------------------------------------
# free cached CUDA memory after reconstruction when True
gpu = True

# number of sequences to analyse
num_sequences = 500
# batch size used for reconstruction
batch_size = 200
example_data = 'slurm_analyses//data//sunistar//peptide_train.txt'
# folder in which to save outputs
save_dir_loc = 'slurm_analyses'
# appended to identify the data: train|test|other|etc...
save_dir_name = 'train'

# True: reconstruct the data here; False: load reconstructions from file
reconstruct = True
# directory in which all reconstructions are stored
recon_src = "checkpointz//analyses_ckpts//"
# true labels, loaded when the model has a property predictor
true_prop_src = "slurm_analyses//data//sunistar//function_train.txt"
# (optional) file with the true sequences for a subset of example_data
subset_src = ""

# grab every checkpoint below temp_ckpt
ckpt_list = glob.glob("temp_ckpt//**//*.ckpt", recursive=True)
print('current working directory: ', os.getcwd())


# Maps a substring of the checkpoint file name to the name of the model class.
# The FIRST matching key is used, so 'rnnattn' has to come before 'rnn'
# ('rnn' is a substring of 'rnnattn' and would otherwise load the wrong class).
model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}

# Analyse every checkpoint: loss curves, reconstruction/property accuracy,
# latent entropies, PCA plots and silhouette scores, saved as one CSV per model.
for model_src in ckpt_list:

    #search the checkpoint file name for the model type and load that model
    print('working on: ',model_src,'\n')
    ckpt_file = model_src.split('\\')[-1]
    model_name = [key for key in model_dic if key in ckpt_file]
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor

    #create save directory for the current model according to latent space size
    latent_size = re.findall(r'(latent[\d]{2,3})', model_src)
    # NOTE(review): no path separator is inserted between save_dir_loc,
    # model.name, the latent tag and save_dir_name, so the directory is created
    # next to (not inside) save_dir_loc - confirm this is intended.
    save_dir = save_dir_loc+model.name+latent_size[0]+save_dir_name
    os.makedirs(save_dir, exist_ok=True)
    save_dir = save_dir+"//"
    save_df = pd.DataFrame() #this will hold the number variables and save to CSV

    #load the sequences and the true labels (reloaded per model because data is permuted below)
    data = pd.read_csv(example_data).to_numpy()
    data_1D = data[:num_sequences,0] #gets rid of extra dimension
    true_props_data = pd.read_csv(true_prop_src).to_numpy()
    true_props = true_props_data[0:num_sequences,0]

    #get the log.txt file from the ckpt and model name then plot loss curves
    ckpt_fields = ckpt_file.split('_')
    loss_src = '_'.join( ("log",ckpt_fields[1],ckpt_fields[2][:-4]+"txt") ) #swap the 'ckpt' extension for 'txt'
    src = '\\'.join([str(part) for part in model_src.split('\\')[:-1]])+"\\"+loss_src
    print(loss_src, src)
    loss_plots(src)

    #set the batch size and reconstruct the data
    model.params['BATCH_SIZE'] = batch_size
    if reconstruct:
        reconstructed_seq, props = model.reconstruct(data[:num_sequences], log=False, return_mems=False)
    else:
        data, data_1D, true_props, props, reconstructed_seq = load_reconstructions(data, data_1D,latent_size,
                                                                                   load_src=recon_src,
                                                                                   true_props=true_prop_src)
    if gpu:torch.cuda.empty_cache() #free allocated CUDA memory

    #save the metrics to the dataframe
    n_recon = len(reconstructed_seq)
    save_df['reconstructions'] = reconstructed_seq #placing the saves on a line separate from the ops allows for editing
    save_df['predicted properties'] = [prop.item() for prop in props[:n_recon]]
    prop_acc, prop_conf, MCC = calc_property_accuracies(props[:n_recon],true_props[:n_recon], MCC=True)
    save_df['property prediction accuracy'] = prop_acc
    save_df['property prediction confidence'] = prop_conf
    save_df['MCC'] = MCC

    #tokenize the input and reconstructed sequences
    input_sequences = [peptide_tokenizer(seq) for seq in data_1D]
    output_sequences = [peptide_tokenizer(seq) for seq in reconstructed_seq]

    seq_accs, tok_accs, pos_accs, seq_conf, tok_conf, pos_conf = calc_reconstruction_accuracies(input_sequences, output_sequences)
    save_df['sequence accuracy'] = seq_accs
    save_df['sequence confidence'] = seq_conf
    save_df['token accuracy'] = tok_accs
    save_df['token confidence'] = tok_conf
    save_df = pd.concat([pd.DataFrame({'position_accs':pos_accs,'position_confidence':pos_conf }), save_df], axis=1)

    ##moving into memory and entropy
    has_variational_bottleneck = model.model_type not in ('aae', 'wae')
    if has_variational_bottleneck:
        mems, mus, logvars = model.calc_mems(data[:1_000], log=False, save=False) #subset size 1200*35=42000 would be ok
    else:
        mus, _, _ = model.calc_mems(data[:], log=False, save=False)

    ##calculate the entropies
    vae_entropy_mus = calc_entropy(mus)
    save_df = pd.concat([save_df,pd.DataFrame({'mu_entropies':vae_entropy_mus})], axis=1)
    if has_variational_bottleneck: #wae and aae don't have a variational type bottleneck
        vae_entropy_mems = calc_entropy(mems)
        save_df = pd.concat([save_df,pd.DataFrame({'mem_entropies':vae_entropy_mems})], axis=1)
        vae_entropy_logvars = calc_entropy(logvars)
        save_df = pd.concat([save_df,pd.DataFrame({'logvar_entropies':vae_entropy_logvars})], axis=1)

    #create random index and re-index ordered memory list creating n random sub-lists (ideally resulting in IID random lists)
    random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
    mus[:] = mus[random_idx]
    data = data[random_idx]

    #define the subset of the data to sample for PCA and silhouette/cluster metrics
    subsample_start=0
    subsample_length=mus.shape[0]
    subsample_stop = subsample_start+subsample_length

    #(for length based coloring): record all peptide lengths iterating through input
    pep_lengths = [len(pep[0]) for pep in data[subsample_start:subsample_stop]]

    #(for function based coloring): pull function from csv with peptide functions
    s_to_f = pd.read_csv(true_prop_src)
    function = s_to_f['peptides'][subsample_start:subsample_stop]
    function = function[random_idx] #account for random permutation

    pca = PCA(n_components=2)
    pca_batch = pca.fit_transform(X=mus[:])

    fig = px.scatter(pca_batch,color= pep_lengths ,opacity=0.7)
    fig.update_traces(marker=dict(size=3))
    fig.write_image(save_dir+'pca_length.png', width=1920, height=1080)

    fig = px.scatter_matrix(pca_batch, color= [str(itm) for itm in function], opacity=0.7)
    fig.update_traces(marker=dict(size=3))
    fig.write_image(save_dir+'pca_function.png', width=1920, height=1080)

    #create n subsamples and calculate silhouette score for each
    latent_mem_func_subsamples = []
    pca_func_subsamples = []
    n=250
    s_len = len(mus)//n #sample lengths (loop invariant)
    for s in range(n):
        lo, hi = s_len*s, s_len*(s+1)
        labels = function[lo:hi]
        latent_mem_func_subsamples.append(metrics.silhouette_score(mus[lo:hi], labels, metric='euclidean'))
        XY = list(zip(pca_batch[lo:hi,0], pca_batch[lo:hi,1]))
        pca_func_subsamples.append(metrics.silhouette_score(XY, labels, metric='euclidean'))
    save_df = pd.concat([save_df,pd.DataFrame({'latent_mem_func_silhouette':latent_mem_func_subsamples,
                                               'pca_func_silhouette':pca_func_subsamples})], axis=1)

    save_df.to_csv(save_dir+"saved_info.csv", index=False)

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  temp_ckpt\rnn_latent128\300_rnn-128_peptide.ckpt 

log_rnn-128_peptide.txt temp_ckpt\rnn_latent128\log_rnn-128_peptide.txt
rnn-128_peptide
cuda
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  10
decoding sequences of max length  125 current position:  20
decoding sequences of max length  125 current position:  30
decoding sequences of max length  125 current position:  40
decoding sequences of max length  125 current position:  50


KeyboardInterrupt: 

<H4>Since Compute Canada does not compute the dimensionality-reduction metrics, we compute them below.</H4>

In [1]:
import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re


# ---- run configuration -------------------------------------------------------
gpu = True

# sequences that are embedded with every checkpoint
example_data = 'data\\peptides\\datasets\\uniprot_v2\\peptide_test.txt'
# which split is analysed; selects the number of bootstraps further down
test_train = 'test'
# grab all checkpoints
ckpt_list = glob.glob("checkpointz\\to_slurm//**//*.ckpt", recursive=True)
# grab all analyses
analyses_list = glob.glob("model_analyses\\test//**/*.csv", recursive=True)
print('current working directory: ', os.getcwd())

# Maps a substring of the checkpoint path to the name of the model class.
# The FIRST matching key is used, so 'rnnattn' must stay ahead of 'rnn'.
model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}

#different number of bootstraps for train vs test
n = 15 if 'test' in test_train else 60
parameter = { "k": 50,"alpha": 0.1 } #for steadiness and cohesiveness

# For every checkpoint: embed the data, project the latent means with PCA and
# append dimensionality-reduction quality metrics to the model's analysis CSV.
for model_src in ckpt_list:

    #search the checkpoint path for the model name and load that model
    print('working on: ',model_src,'\n')
    model_name = [key for key in model_dic if key in model_src.split('//')[-1]]
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor

    #find the analysis file corresponding to the model from the CC outputs
    #the checkpoint folder is "<model>_<latent tag>"; both parts must occur in the analysis folder name
    ckpt_dir_tags = model_src.split("\\")[-2].split("_")
    save_dir = None #reset per model so a previous model's file is never reused
    for analysis_path in analyses_list:
        analysis_dir = analysis_path.split("\\")[-2]
        if ckpt_dir_tags[0] not in analysis_dir or ckpt_dir_tags[1] not in analysis_dir:
            continue
        #'rnn' also matches 'rnnattn' folders; skip those unless this is an rnnattn checkpoint
        if "rnnattn" in analysis_path and "rnnattn" not in model_src:
            continue
        save_dir = analysis_path #the last match wins
    if save_dir is None:
        #without this guard the metrics would be written into another model's CSV
        print('no analysis file found for: ',model_src,' - skipping')
        continue
    cur_analysis = pd.read_csv(save_dir)
    print(save_dir, model_src)
    save_df = cur_analysis #this will hold the number variables and save to CSV

    #load the input sequences
    data = pd.read_csv(example_data).to_numpy()
    data_1D = data[:,0] #gets rid of extra dimension

    #moving into memory and entropy
    if model.model_type in ('aae', 'wae'):
        mus, _, _ = model.calc_mems(data[:60_000], log=False, save=False)
    else:
        mems, mus, logvars = model.calc_mems(data[:60_000], log=False, save=False) #subset size 1200*35=42000 would be ok

    #create random index and re-index ordered memory list creating n random sub-lists (ideally resulting in IID random lists)
    random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
    mus[:] = mus[random_idx]
    data = data[random_idx]

    #need to perform PCA to be able to compare dimensionality reduction quality
    pca = PCA(n_components=2)
    pca_batch = pca.fit_transform(X=mus[:])

    #now ready to calculate dimensionality reduction accuracy with metrics
    trust_subsamples = []
    cont_subsamples = []
    lcmc_subsamples = []
    steadiness_subsamples = []
    cohesiveness_subsamples = []
    s_len = len(mus)//n #sample length (loop invariant)
    for s in range(n):
        latent_chunk = mus[s_len*s:s_len*(s+1)]
        pca_chunk = pca_batch[s_len*s:s_len*(s+1)]

        Q = coranking.coranking_matrix(latent_chunk, pca_chunk)
        trust_subsamples.append( np.mean(trustworthiness(Q, min_k=1, max_k=50)) )
        cont_subsamples.append( np.mean(continuity(Q, min_k=1, max_k=50)) )
        lcmc_subsamples.append( np.mean(LCMC(Q, min_k=1, max_k=50)) )
        print(s,trust_subsamples[s],cont_subsamples[s],lcmc_subsamples[s])

        #named 'snc' so the sklearn 'metrics' module imported by the first cell is not overwritten
        snc = SNC(raw=latent_chunk, emb=pca_chunk, iteration=300, dist_parameter=parameter)
        snc.fit() #solve for steadiness and cohesiveness
        steadiness = snc.steadiness()
        cohesiveness = snc.cohesiveness()
        steadiness_subsamples.append(steadiness)
        cohesiveness_subsamples.append(cohesiveness)
        print(steadiness,cohesiveness)

        del Q, snc #drop the large intermediates before the next subsample
        if gpu:torch.cuda.empty_cache() #free allocated CUDA memory

    save_df = pd.concat([save_df,pd.DataFrame({'latent_to_PCA_trustworthiness':trust_subsamples,
                                               'latent_to_PCA_continuity':cont_subsamples,
                                               'latent_to_PCA_lcmc':lcmc_subsamples,
                                               'latent_to_PCA_steadiness':steadiness_subsamples,
                                               'latent_to_PCA_cohesiveness':cohesiveness_subsamples})], axis=1)

    save_df.to_csv(save_dir, index=False)

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  checkpointz\to_slurm\rnn_latent128\300_rnn-128_peptide.ckpt 

model_analyses\test\rnn-128_peptide_latent128_test\saved_info.csv checkpointz\to_slurm\rnn_latent128\300_rnn-128_peptide.ckpt
loaded analysis:  model_analyses\test\rnn-128_peptide_latent128_test\saved_info.csv
rnn-128_peptide
0 0.7290241622955547 0.8009505173582531 0.06979944271197006
0.7209609380458292 0.6189992125703438
1 0.7262266883179355 0.802691288488399 0.07596036270898948
0.7239525488784908 0.5960984817442772
2 0.714668421353525 0.7914918621657833 0.07108957696859496
0.7061775159659129 0.6253074599537447
3 0.7303904333832639 0.801511383502429 0.07325587501188267
0.7079493231566878 0.6293727751774751
4 0.7262715133676234 0.8003046490777583 0.07273328827498193
0.7068473983953187 0.6331042224269483
5 0.7219381820473441 0.7969354533371528 0.07582470681946135
0.6993178266234523 0.6413384866206114
6 0.7237499934026759 0.797395

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


model_analyses\test\rnn-128_peptide_latent32_test\saved_info.csv checkpointz\to_slurm\rnn_latent32\300_rnn-128_peptide.ckpt
loaded analysis:  model_analyses\test\rnn-128_peptide_latent32_test\saved_info.csv
rnn-128_peptide
0 0.6911140121315561 0.779639941599065 0.06630761088669475
0.6115372445463161 0.7294107314997793
1 0.6880222438763247 0.7768100366192453 0.06256316484746736


KeyboardInterrupt: 