In [2]:
import numpy as np
import random
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn import metrics
from sklearn.manifold import trustworthiness
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import plotly.express as px
import plotly.graph_objects as go

import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

In [1]:
def loss_plots(loss_src):
    tot_loss = analysis.plot_loss_by_type(src,loss_types=['tot_loss'])
    plt.savefig(save_dir+'tot_loss.png')
    recon_loss = analysis.plot_loss_by_type(src,loss_types=['recon_loss'])
    plt.savefig(save_dir+'recon_loss.png')
    kld_loss = analysis.plot_loss_by_type(src,loss_types=['kld_loss'])
    plt.savefig(save_dir+'kld_loss.png')
    prob_bce_loss = analysis.plot_loss_by_type(src,loss_types=['prop_bce_loss'])
    plt.savefig(save_dir+'prob_bce_loss.png')
    if 'aae' in src:
        disc_loss = analysis.plot_loss_by_type(src,loss_types=['disc_loss'])
        plt.savefig(save_dir+'disc_loss.png')
    if 'wae' in src:
        mmd_loss = analysis.plot_loss_by_type(src,loss_types=['mmd_loss'])
        plt.savefig(save_dir+'mmd_loss.png')
    plt.close('all')
    
def load_reconstructions(data,data_1D,latent_size, load_src, true_props=None,subset=None):
    
    recon_src = load_src+model.name+"_"+re.split('(\d{2,3})',latent_size[0])[0]+"_"+re.split('(\d{2,3})',latent_size[0])[1]+"//saved_info.csv"
    recon_df = pd.read_csv(recon_src)
    reconstructed_seq = recon_df['reconstructions'].to_list()[:num_sequences]
    props = torch.Tensor(recon_df['predicted properties'][:num_sequences])
    true_props_data = pd.read_csv(true_props).to_numpy()
    true_props = true_props_data[0:num_sequences,0]
    
    if subset:
        testing = pd.read_csv(subset).to_numpy()
        test_idx_list = [np.where(data==testing[idx][0]) for idx in range(len(testing))]


        batch_recon_len = len(reconstructed_seq)
        reconstructed_seq = [reconstructed_seq[test_idx_list[i][0][0]] for i in range(len(test_idx_list)) if test_idx_list[i][0][0]<batch_recon_len]
        data_1D= [data_1D[test_idx_list[i][0][0]] for i in range(len(test_idx_list)) if test_idx_list[i][0][0]<batch_recon_len]
        props = [props[test_idx_list[i][0][0]] for i in range(len(test_idx_list)) if test_idx_list[i][0][0]<batch_recon_len]
        props=torch.Tensor(props)
        data = testing[:][0]
        true_props_data = pd.read_csv(true_props).to_numpy()
        true_props = true_props_data[0:num_sequences,0]
        true_props= [true_props[test_idx_list[i][0][0]] for i in range(len(test_idx_list)) if test_idx_list[i][0][0]<batch_recon_len]

    return data, data_1D, true_props, props, reconstructed_seq

########################################################################################
gpu = True

num_sequences = 500_000
batch_size = 200 #setting for reconstruction
example_data = 'data\\peptides\\datasets\\uniprot_v3\\peptide_test.txt'
save_dir_loc = 'model_analyses\\test\\' #folder in which to save outpts
save_dir_name = 'test' #appended to identify data: train|test|other|etc...

reconstruct=True #True:reconstruct data here; False:load reconstructions from file
recon_src = "checkpointz//analyses_ckpts//" #directory in which all reconstructions are stored
true_prop_src = 'data\\peptides\\datasets\\uniprot_v3\\function_test.txt' #if property predictor load the true labels
subset_src = "" #(optional) this file should have the true sequences for a subset of the "example data" above

ckpt_list = glob.glob(""+"checkpointz\\to_slurm//**//*.ckpt", recursive = True) #grab all checkpoint
print('current working directory: ',os.getcwd())


for i in range(len(ckpt_list)):
    #search the current directory for the model name and load that model
    model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}
    model_src = ckpt_list[i]
    print('working on: ',model_src,'\n')
    model_name = list(filter(None,[key for key in model_dic.keys() if key in model_src.split('\\')[-1]]))
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor
    
    #create save directory for the current model according to latent space size
    latent_size = re.findall('(latent[\d]{2,3})', model_src)
    save_dir= save_dir_loc+model.name+"_"+latent_size[0]+"_"+save_dir_name
    if not os.path.exists(save_dir):os.mkdir(save_dir) 
    save_dir= save_dir+"//" 
    save_df = pd.DataFrame() #this will hold the number variables and save to CSV
    
    #load the true labels
    data = pd.read_csv(example_data).to_numpy() 
    data_1D = data[:num_sequences,0] #gets rid of extra dimension
    true_props_data = pd.read_csv(true_prop_src).to_numpy()
    true_props = true_props_data[0:num_sequences,0]

    print("data loaded")
    #get the log.txt file from the ckpt and model name then plot loss curves
    loss_src = '_'.join( ("log",model_src.split('\\')[-1].split('_')[1],model_src.split('\\')[-1].split('_')[2][:-4]+"txt") )
    src= '\\'.join([str(i) for i in model_src.split('\\')[:-1]])+"\\"+loss_src
    print(loss_src, src)
    loss_plots(src)
    
#     #set the batch size and reconstruct the data
#     model.params['BATCH_SIZE'] = batch_size
#     if reconstruct:
#         reconstructed_seq, props = model.reconstruct(data[:num_sequences], log=False, return_mems=False)
#     else:
#         data, data_1D, true_props, props, reconstructed_seq = load_reconstructions(data, data_1D,latent_size,
#                                                                                    load_src=recon_src,
#                                                                                    true_props=true_prop_src)
#     if gpu:torch.cuda.empty_cache() #free allocated CUDA memory
    
#     #save the metrics to the dataframe
#     save_df['reconstructions'] = reconstructed_seq #placing the saves on a line separate from the ops allows for editing
#     save_df['predicted properties'] = [prop.item() for prop in props[:len(reconstructed_seq)]]
#     prop_acc, prop_conf, MCC=calc_property_accuracies(props[:len(reconstructed_seq)],true_props[:len(reconstructed_seq)], MCC=True)
#     save_df['property prediction accuracy'] = prop_acc
#     save_df['property prediction confidence'] = prop_conf
#     save_df['MCC'] = MCC
    

# #   First we tokenize the input and reconstructed smiles
#     input_sequences = []
#     for seq in data_1D:
#         input_sequences.append(peptide_tokenizer(seq))
#     output_sequences = []
#     for seq in reconstructed_seq:
#         output_sequences.append(peptide_tokenizer(seq))
    
#     seq_accs, tok_accs, pos_accs, seq_conf, tok_conf, pos_conf = calc_reconstruction_accuracies(input_sequences, output_sequences)
#     save_df['sequence accuracy'] = seq_accs
#     save_df['sequence confidence'] = seq_conf
#     save_df['token accuracy'] = tok_accs
#     save_df['token confidence'] = tok_conf
#     save_df = pd.concat([pd.DataFrame({'position_accs':pos_accs,'position_confidence':pos_conf }), save_df], axis=1)
    
    ##moving into memory and entropy
    if model.model_type =='aae':
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) #50_000
    elif model.model_type == 'wae':
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) 
    else:
        mems, mus, logvars = model.calc_mems(data[:65_000], log=False, save=False) #subset size 1200*35=42000 would be ok

#     ##calculate the entropies
#     vae_entropy_mus = calc_entropy(mus)
#     save_df = pd.concat([save_df,pd.DataFrame({'mu_entropies':vae_entropy_mus})], axis=1)
#     if model.model_type != 'wae' and model.model_type!= 'aae': #these don't have a variational type bottleneck
#         vae_entropy_mems  = calc_entropy(mems)
#         save_df = pd.concat([save_df,pd.DataFrame({'mem_entropies':vae_entropy_mems})], axis=1)
#         vae_entropy_logvars = calc_entropy(logvars)
#         save_df = pd.concat([save_df,pd.DataFrame({'logvar_entropies':vae_entropy_logvars})], axis=1)

    #create random index and re-index ordered memory list
    random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
    mus = mus[random_idx]
    shuf_data = data[random_idx]

    subsample_start=0
    subsample_length=mus.shape[0] #mus shape depends on batch size!

    #(for length based coloring): record all peptide lengths iterating through input
    pep_lengths = []
    for idx, pep in enumerate(shuf_data[subsample_start:(subsample_start+subsample_length)]):
        pep_lengths.append( len(pep[0]) )   
    #(for function based coloring): pull function from csv with peptide functions
    s_to_f =pd.read_csv(true_prop_src)    
    function = s_to_f['peptides'][subsample_start:(subsample_start+subsample_length)]
    function = function[random_idx] #account for random permutation

    pca = PCA(n_components=5,svd_solver='full')
    pca_batch =pca.fit_transform(X=mus[:])

#     #Calculate and plot the loading matrix from the PCA fit of the data
#     loadings = pca.components_.T*np.sqrt(pca.explained_variance_)
#     color=['tab:blue','tab:red','tab:green','tab:orange','tab:purple']
#     y_labels=['PC1 Correlation','PC2 Correlation','PC3 Correlation','PC4 Correlation','PC5 Correlation']
#     titles=['Latent Dimension Correlations to PC1','Latent Dimension Correlations to PC2',
#             'Latent Dimension Correlations to PC3','Latent Dimension Correlations to PC4',
#             'Latent Dimension Correlations to PC5']
#     for pc in range (loadings.shape[1]):
#         plt.figure(figsize=(10,6))
#         plt.title(titles[pc])
#         plt.ylabel(y_labels[pc])
#         plt.xlim(-1,loadings.shape[0]+1)
#         plt.xlabel('Latent Dimensions')
#         plt.bar(np.linspace(0,loadings.shape[0]-1,loadings.shape[0]),loadings[:,pc])
#         plt.savefig(save_dir+'latent_correlations_PC{}.png'.format(pc+1), transparent=None, dpi=600)
#         plt.close()

    
#     #plot format dictionnaries
#     titles={'text':'{}'.format(model.model_type.replace("_"," ").upper()),
#                           'x':0.5,'xanchor':'center','yanchor':'top','font_size':40}
#     general_fonts={'family':"Helvetica",'size':30,'color':"Black"}
#     colorbar_fmt={'title_font_size':30,'thickness':15,'ticks':'','title_text':'Lengths',
#                                'ticklabelposition':"outside bottom"}
    
#     fig = px.scatter(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1], "lengths":pep_lengths}),
#                 symbol_sequence=['hexagon2'],x='PC1', y='PC2', color="lengths",
#                 color_continuous_scale='Jet',template='simple_white', opacity=0.9)
#     fig.update_traces(marker=dict(size=9))
#     fig.update_layout(title=titles,xaxis_title="PC1", yaxis_title="PC2",font=general_fonts)
#     fig.update_coloraxes(colorbar=colorbar_fmt)
#     fig.write_image(save_dir+'pca_length.png', width=900, height=600)

#     fig = px.scatter(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1], 
#                                     "Function":list(map(lambda itm: "AMP" if itm==1 else "NON-AMP",function))}),
#                                     x='PC1', y='PC2', color="Function",symbol_sequence=['x-thin-open','circle'],
#                                     template='simple_white',symbol='Function', opacity=0.8) 
#     fig.update_traces(marker=dict(size=9))
#     fig.update_layout(title=titles,xaxis_title="PC1",yaxis_title="PC2",font=general_fonts)
#     fig.write_image(save_dir+'pca_function.png', width=900, height=600)
    
#     # Plot the explained variances
#     plt.bar(range(pca.n_components_), pca.explained_variance_ratio_*100, color='black')
#     plt.xlabel('PCA features')
#     plt.ylabel('variance %')
#     plt.xticks(range(pca.n_components_))
#     plt.savefig(save_dir+'variance_explained.png')

#     fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],"PC3":pca_batch[:,2],
#                                           "PC4":pca_batch[:,3],"PC5":pca_batch[:,4],"lengths":pep_lengths}),
#                                     dimensions=["PC1","PC2","PC3","PC4","PC5"],
#                                     symbol_sequence=['hexagon2'],template='simple_white',
#                                     color="lengths",color_continuous_scale='Jet', opacity=0.9)
#     fig.update_traces(diagonal_visible=False)
#     fig.update_layout(title=titles,font=general_fonts)
#     fig.write_image(save_dir+'pca_matrix_length.png', width=5_000, height=2500) 
    
#     fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],"PC3":pca_batch[:,2],
#                                           "PC4":pca_batch[:,3],"PC5":pca_batch[:,4],
#                                    "Function":list(map(lambda itm: "AMP" if itm==1 else "NON-AMP",function))}),
#                                     dimensions=["PC1","PC2","PC3","PC4","PC5"],template='simple_white',
#                                     color="Function",symbol_sequence=['x-thin','circle'],
#                                     symbol='Function', opacity=0.8) 
#     fig.update_traces(diagonal_visible=False)
#     fig.update_layout(title=titles,font=general_fonts)
#     fig.write_image(save_dir+'pca_matrix_function.png', width=5_000, height=2500) 
#     pearson = {} #dict to store the pearson coefficient between PCA vs AMP function or physicochem.props.
#     pearson.update({'amp'+'_spearmanr':[(spearmanr(pca_batch[:,pc],function).correlation,
#                                          spearmanr(pca_batch[:,pc],function).pvalue) for pc in range(5)]})
#     if 'train' in save_dir_name:
#         phys_props = pd.read_csv('data\\train_physicochem_props.csv')
#     else:
#         phys_props = pd.read_csv('data\\test_physicochem_props.csv')

    
#     for col in phys_props.columns:
#         functions = phys_props[col][:len(mus)].values
#         functions = functions[random_idx] #keeping track of data scrambling...
#         pearson.update({str(col)+'_pearsonr':[pearsonr(pca_batch[:,pc],functions) for pc in range(5)]})
#         fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],"PC3":pca_batch[:,2],
#                                                   "PC4":pca_batch[:,3],"PC5":pca_batch[:,4],
#                                            "Function":functions}),
#                                             dimensions=["PC1","PC2","PC3","PC4","PC5"],template='simple_white',
#                                             color="Function",opacity=0.9) 
#         colorbar_fmt={'title_font_size':30,'thickness':15,'ticks':'','title_text':str(col),
#                                'ticklabelposition':"outside bottom"}
#         fig.update_traces(diagonal_visible=False)
#         fig.update_layout(title=titles,font=general_fonts)
#         fig.update_coloraxes(colorbar=colorbar_fmt, 
#                              cmax=np.mean(functions)+np.std(functions),
#                              cmin=np.mean(functions)-np.std(functions),
#                              cmid=np.mean(functions))
#         fig.write_image(save_dir+col+'_PCA_matrix'+'.png', width=5_000, height=2500) 
#     df_pearson = pd.DataFrame.from_dict(pearson)
#     df_pearson.to_csv(save_dir+'pearsonr.csv', index=False)

#     #first calculate silhouette score on all latent space dims
#     n=15
#     latent_mem_func_subsamples = []
#     for s in range(n):
#         s_len = len(mus)//n #sample lengths
#         mem_func_sil = metrics.silhouette_score(mus[s_len*s:s_len*(s+1)], function[s_len*s:s_len*(s+1)], metric='euclidean')
#         latent_mem_func_subsamples.append(mem_func_sil)
#     save_df = pd.concat([save_df,pd.DataFrame({'latent_mem_func_silhouette':latent_mem_func_subsamples})], axis=1)
#     #then go over pairs of PC's from PCA and find max SS PC's
#     pc_pairs = [[0,1],[0,2],[0,3],[0,4],[1,2],[1,3],[1,4],[2,3],[2,4],[3,4]]
#     for pc_pair in pc_pairs:
#         print("working on PC[{},{}]".format(pc_pair[0],pc_pair[1]))
#         pca_func_subsamples = []
#         for s in range(n):
#             s_len = len(mus)//n #sample lengths
#             XY = [i for i in zip(pca_batch[s_len*s:s_len*(s+1),pc_pair[0]], pca_batch[s_len*s:s_len*(s+1),pc_pair[1]])]
#             pca_func_sil = metrics.silhouette_score(XY, function[s_len*s:s_len*(s+1)], metric='euclidean')
#             pca_func_subsamples.append(pca_func_sil)
#         save_df = pd.concat([save_df,pd.DataFrame({'pca_func_silhouette[{},{}]'.format(pc_pair[0],pc_pair[1]):pca_func_subsamples})], axis=1)
#     print( np.argmax(save_df.drop(columns=save_df.columns[0]).mean(axis=0)) )
#     save_df.to_csv(save_dir+"saved_info.csv", index=False)
    
    
    
    #Section dealing with sequence generation metrics and bootstrapping from the latent space
    #first randomly sample points within the latents space
    rnd_seq_count =1_000 
    rnd_latent_list=[] #generate N latent space vectors
    mem_min = np.min(mus)
    mem_max = np.max(mus)
    for seq in range(rnd_seq_count):
        rnd_latent_list.append( np.array([random.uniform(mem_min,mem_max) for i in range(model.params['d_latent'])]).astype(np.float32) )
    
    model.params['BATCH_SIZE'] = 25
    rnd_token_list=np.empty((rnd_seq_count,model.tgt_len)) #store N decoded latent vectors now in token(0-20) form max length 125
    
    #decode these points into predicted amino acid tokens (integers)
    for batch in range(0,rnd_seq_count,model.params['BATCH_SIZE']):
        rnd_token_list[batch:batch+model.params['BATCH_SIZE']] =  model.greedy_decode(torch.tensor(rnd_latent_list[batch:batch+model.params['BATCH_SIZE']]).cuda()).cpu()
    
    #turn the tokens into characters
    decoded_rnd_seqs = decode_mols(torch.tensor(rnd_token_list), model.params['ORG_DICT'])
    decoded_rnd_seqs[:]=[x for x in decoded_rnd_seqs if x] #removes the empty lists
    
    df_gen_scores = {} #dictionnary to store results
    #UNIQUENESS
    percent_unique, unique_conf = uniqueness(decoded_rnd_seqs)
    df_gen_scores.update({'percent_unique': percent_unique})
    df_gen_scores.update({'unique_confidence':unique_conf})
    
    #NOVELTY
    #sample N test/train set sequences randomly and compare to those created
    percent_novel, novel_conf = novelty(data, np.expand_dims(np.array(decoded_rnd_seqs),1))
    df_gen_scores.update({'percent_novel':percent_novel})
    df_gen_scores.update({'novel_confidence':novel_conf})
    
    #AMP SAMPLING
    peptides_to_probe=10
    sample_count=100
    best_pc = np.argmax([np.abs(pearsonr(pca_batch[:,pc],function)[0]) for pc in range(5)]) #find the best PCvsAMP correlation
    pca_min = np.min(pca_batch[:,best_pc])
    pca_max = np.max(pca_batch[:,best_pc])
    pca_scan = np.zeros((peptides_to_probe,5)) #create a reduced vector to be sent backwards to high-D
    pca_scan[:,best_pc]=np.linspace(start=pca_min, stop=pca_max, num=peptides_to_probe) #scan 1 dim evenly with best PC
    amp_sample_latents = pca.inverse_transform(pca_scan) #inverse to high-Dims for decoding
    all_gen_seqs = [] #stored in a text file for AMP prediction later
    for idx,amp in enumerate(amp_sample_latents):
        print("working on amp sample number: ",idx)
        mus=np.expand_dims(amp.astype(np.float32),0)
        nearby_samples = np.random.normal(loc=0,scale=1,size=(sample_count,1,model.params['d_latent'])).astype(np.float32)*0.3 + mus
        model.params['BATCH_SIZE'] = 25
        rnd_token_list=np.empty((sample_count,model.tgt_len)) #store N decoded latent vectors now in token(0-20) form max length 125
        for batch in range(0,sample_count,model.params['BATCH_SIZE']):
            rnd_token_list[batch:batch+model.params['BATCH_SIZE']] =  model.greedy_decode(torch.tensor(nearby_samples[batch:batch+model.params['BATCH_SIZE']]).squeeze().cuda()).cpu()
        decoded_rnd_seqs = decode_mols(torch.tensor(rnd_token_list), model.params['ORG_DICT'])
        
               
        for seq in decoded_rnd_seqs:
            if len(seq)<=50 and len(seq)>0: #save only sequences with length >=50
                all_gen_seqs.append(seq) #appending to list of all generated sequences
        decoded_rnd_seqs = [seq for seq in decoded_rnd_seqs if len(seq)>0 and len(seq)<=50] #keep constrained length seqs
        
        #SEQ SIMILARITY
        similarity_score = sequence_similarity(decoded_rnd_seqs)  
        df_gen_scores.update({'average_sequence_similarity_'+str(idx): np.average(similarity_score)})
        df_gen_scores.update({'std_on_similarity_score_'+str(idx): np.std(similarity_score)})
        
        #AMP UNIQUENESS
        amp_percent_unique, amp_unique_conf = uniqueness(decoded_rnd_seqs)
        df_gen_scores.update({'amp_uniqueness_'+str(idx): amp_percent_unique})
        df_gen_scores.update({'amp_uniqueness_std_'+str(idx): amp_unique_conf})
        
        #Jaccard Similarity Score
        jac_scores_2 = jaccard_similarity_score(decoded_rnd_seqs,2)
        jac_scores_3 = jaccard_similarity_score(decoded_rnd_seqs,3)
        df_gen_scores.update({'amp_jac_score_2_'+str(idx): np.average(jac_scores_2)})
        df_gen_scores.update({'amp_jac_score_std_2_'+str(idx): np.std(jac_scores_2)})
        df_gen_scores.update({'amp_jac_score_3_'+str(idx): np.average(jac_scores_3)})
        df_gen_scores.update({'amp_jac_score_std_3_'+str(idx): np.std(jac_scores_3)})
    
    #Store Output
    with open(save_dir+'all_gen_seqs.txt','w') as f:
        for seq in all_gen_seqs:
            f.write(str(seq)+"\n")
    f.close()
    with open(save_dir+'PC_minmax.txt','w') as f:
        f.write(str(pca_min))
        f.write('\t')
        f.write(str(pca_max))
    f.close()
    df = pd.DataFrame.from_dict([df_gen_scores])
    pd.DataFrame.from_dict([df_gen_scores]).to_csv(save_dir+"generation_metrics.csv", index=False)
    

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  checkpointz\to_slurm\aae_latent128\300_aae-128_peptide.ckpt 

data loaded
log_aae-128_peptide.txt checkpointz\to_slurm\aae_latent128\log_aae-128_peptide.txt


  rnd_token_list[batch:batch+model.params['BATCH_SIZE']] =  model.greedy_decode(torch.tensor(rnd_latent_list[batch:batch+model.params['BATCH_SIZE']]).cuda()).cpu()


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

working on amp sample number:  3
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  4
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  1

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

working on amp sample number:  3
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  4
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  1

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

working on amp sample number:  3
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  4
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  1

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

decoding sequences of max length  125 current position:  100
working on amp sample number:  3
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  4
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 cu

decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  7
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
working on amp sample number:  8
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
decoding sequences of max length 

<H4>Since Compute Canada does not do the dimensionality reduction metrics we need to do them below

In [1]:
import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re


gpu = True

example_data = 'data\\peptides\\datasets\\uniprot_v3\\peptide_train.txt'
test_train='train'
ckpt_list = glob.glob(""+"checkpointz\\to_slurm//**//*.ckpt", recursive = True) #grab all checkpoints
analyses_list = glob.glob("model_analyses\\train//**/*.csv", recursive=True) #grab all analyses
print('current working directory: ',os.getcwd())

for i in range(len(ckpt_list)):
    
    #search the current directory for the model name and load that model
    model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}
    model_src = ckpt_list[i]
    print('working on: ',model_src,'\n')
    model_name = list(filter(None,[key for key in model_dic.keys() if key in model_src.split('//')[-1]]))
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor
    
    #load the analysis file corresponding to the model from the CC outputs
    for idx in range(len(analyses_list)):
        if analyses_list[idx].split("\\")[-2].find(model_src.split("\\")[-2].split("_")[0]) != -1 and analyses_list[idx].split("\\")[-2].find(model_src.split("\\")[-2].split("_")[1]) != -1:
            if analyses_list[idx].find("rnnattn")  != -1 and model_src.find("rnnattn") == -1: continue
            save_dir = analyses_list[idx]
            cur_analysis = pd.read_csv(save_dir)
    print("analysis: ",save_dir, "checkpoint: ",model_src)
    save_df = cur_analysis #this will hold the number variables and save to CSV
    
    #load the true labels
    data = pd.read_csv(example_data).to_numpy() 
    data_1D = data[:,0] #gets rid of extra dimension
    
    #moving into memory and entropy
    if model.model_type =='aae':
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) 
    elif model.model_type == 'wae':
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) 
    else:
        mems, mus, logvars = model.calc_mems(data[:65_000], log=False, save=False) #subset size 1200*35=42000 would be ok

    #create random index and re-index ordered memory list creating n random sub-lists (ideally resulting in IID random lists)
    random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
    mus[:] = mus[random_idx]
    data = data[random_idx]
    mus = mus[:30_000]#limit the quantity of data to speed up
    data = data[:30_000]
    
    #need to perform PCA to be able to compare dimensionality reduction quality
    pca = PCA(n_components=2)
    pca_batch =pca.fit_transform(X=mus) 
    
    #now ready to calculation dimensionality reduction accuracy with metrics
    trust_subsamples = []
    cont_subsamples = []
    lcmc_subsamples = []
    steadiness_subsamples = []
    cohesiveness_subsamples = []
    if 'test' in test_train: #different number of bootsraps for train vs test
        n=15
    else:
        n=15
    parameter = { "k": 50,"alpha": 0.1 } #for steadiness and cohesiveness
    for s in range(n):
        s_len = len(mus)//n
        Q = coranking.coranking_matrix(mus[s_len*s:s_len*(s+1)], pca_batch[s_len*s:s_len*(s+1)])
        trust_subsamples.append( np.mean(trustworthiness(Q, min_k=1, max_k=50)) )
        cont_subsamples.append( np.mean(continuity(Q, min_k=1, max_k=50)) )
        lcmc_subsamples.append( np.mean(LCMC(Q, min_k=1, max_k=50)) )
        print(s,trust_subsamples[s],cont_subsamples[s],lcmc_subsamples[s])

        metrics = SNC(raw=mus[s_len*s:s_len*(s+1)], emb=pca_batch[s_len*s:s_len*(s+1)], iteration=300, dist_parameter=parameter)
        metrics.fit() #solve for steadiness and cohesiveness
        steadiness_subsamples.append(metrics.steadiness())
        cohesiveness_subsamples.append(metrics.cohesiveness())
        print(metrics.steadiness(),metrics.cohesiveness())
        Q=0 #trying to free RAM
        metrics=0
        torch.cuda.empty_cache() #free allocated CUDA memory

    save_df = pd.concat([save_df,pd.DataFrame({'latent_to_PCA_trustworthiness':trust_subsamples})], axis=1)
    save_df = pd.concat([save_df,pd.DataFrame({'latent_to_PCA_continuity':cont_subsamples})], axis=1)
    save_df = pd.concat([save_df,pd.DataFrame({'latent_to_PCA_lcmc':lcmc_subsamples})], axis=1)
    save_df = pd.concat([save_df,pd.DataFrame({'latent_to_PCA_steadiness':steadiness_subsamples})], axis=1)
    save_df = pd.concat([save_df,pd.DataFrame({'latent_to_PCA_cohesiveness':cohesiveness_subsamples})], axis=1)  
    
    save_df.to_csv(save_dir, index=False)

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  checkpointz\to_slurm\aae_latent128\300_aae-128_peptide.ckpt 

analysis:  model_analyses\train\aae-128_peptide_latent128_train\saved_info.csv checkpoint:  checkpointz\to_slurm\aae_latent128\300_aae-128_peptide.ckpt
0 0.8277429465902973 0.8862606809358823 0.07679406389722487
0.7330333156098998 0.7278493610671778
1 0.8310897881507834 0.8885384332591733 0.0792132540549302
0.7296862570948062 0.720482613371051
2 0.8293727596017993 0.888477456728557 0.0781438614379489
0.7267890454378874 0.7134277909330917
3 0.8297669058490494 0.8869309226153785 0.07456731890759731
0.7390826334477664 0.701123293447171
4 0.8249919675438919 0.8837750215021689 0.07435796890810012
0.7250924132822161 0.7271063953150014
5 0.8327085442189435 0.888954402408286 0.07626276342171674
0.7162625932531059 0.745518049566263
6 0.8286551832264427 0.8854890772596141 0.07426949480340445
0.7311025514986411 0.7072030458102468
7 0.82468

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.7481940328860448 0.8578322861069106 0.06238540547190069
0.638533820321017 0.7421123138169867
1 0.7540982544764399 0.8639034365452872 0.06697365073012655
0.6499585867437607 0.7398717745345145
2 0.7497432668340724 0.8606735457106293 0.06404135290715043
0.6669807642304794 0.7263953408970663
3 0.7547354526851814 0.8646541403353694 0.06498655738428781
0.6592386369384589 0.7352877075908086
4 0.74753170387042 0.8602958545790083 0.06334327039962487
0.6469802807563207 0.7521039523360136
5 0.7568007131815724 0.8666094579410504 0.06564757362584961
0.672205452057698 0.7124703947144828
6 0.746342257453966 0.861559503513567 0.06116416804122131
0.6545504119427492 0.7232462297114433
7 0.7521604724682577 0.8633922015823908 0.06327020683529494
0.6581628452365385 0.7365038035905196
8 0.748449644986718 0.8621951316091538 0.06527873372452045
0.650500441365782 0.7423354188225073
9 0.7503495746938681 0.8634379143684323 0.064803460825312
0.6452893891289653 0.7459374789777699
10 0.7531102568248569 0.861960

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.7817494427379845 0.8657976002367859 0.06489981987009424
0.6860279562681824 0.7240211489299258
1 0.7776529374834878 0.864904715187948 0.05952416294944897
0.6942339117462972 0.7199499848833129
2 0.779760438357809 0.8657382267044044 0.06547254248570376
0.7067912336718912 0.712113657660093
3 0.7807431056757561 0.8633723573638166 0.06390784811574202
0.6915587175708421 0.7319282148841706
4 0.7802298590701763 0.8653267931878473 0.06532910839386667
0.6794130213831057 0.7370212465351993
5 0.7815131944935706 0.8643849703292673 0.06424255520341124
0.6841581980722777 0.7519456257133945
6 0.7803777116961408 0.8661816422765857 0.06657125376729638
0.6856483731764518 0.7329440758325035
7 0.7770540179330642 0.8650499222851394 0.06374573953083276
0.6824706619341617 0.7348553229178791
8 0.7793036000145555 0.8630356576495695 0.0662257556158067
0.6774347070067037 0.7296132461443747
9 0.7753509459109489 0.8622182116359004 0.0638275951030358
0.6789963330104464 0.7318346721179267
10 0.7774100266649977 0.8

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.7370152024862489 0.8634798531553141 0.05438491841129625
0.6599594062114423 0.7575824907561008
1 0.7411408308729112 0.8677459396459396 0.05509249104771339
0.677052358441377 0.7292449245570558
2 0.7349718117406029 0.8625270411852072 0.05334805106198267
0.6619306771549778 0.7693799043011983
3 0.7356895988208894 0.8644465497862985 0.054062373340331364
0.659694401314592 0.7451748458819165
4 0.7338514366521357 0.8615954330371448 0.05322597831562191
0.6681604168464217 0.7417586167473771
5 0.7358988142566593 0.8623296728083439 0.05310954565024748
0.665037822012742 0.7207348443295716
6 0.735511429907948 0.8628942947421034 0.05365523364708812
0.6616822160540748 0.7517589179957713
7 0.7345331155245443 0.8644403988735728 0.05499980219741367
0.6718833592749228 0.7144300582522937
8 0.7356832221077624 0.8644243722554571 0.05568478357406243
0.6721963982746723 0.7200012770138551
9 0.7353888925064938 0.8634427521866973 0.05380539041626898
0.661869025792211 0.7299458463253474
10 0.7363696226465082 0.

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.7048295180346146 0.8435433644618213 0.056666953706836706
0.618546602036423 0.7289063797335309
1 0.7047551319817609 0.8417347545138849 0.05812562074591167
0.6355306099082833 0.6751488803409209
2 0.7103367225378365 0.847016338019426 0.060058642056503415
0.6427382187046151 0.6971648244395563
3 0.7061748649767138 0.846622273275638 0.05829721700083059
0.6225448191929621 0.6951381809587687
4 0.7059023271017153 0.8454431749638677 0.05558706982926562
0.6421531007437715 0.6995170928210982
5 0.7023545689993241 0.8422006038817529 0.05573566421432087
0.6312867194061164 0.6916453100801258
6 0.704945775278604 0.8440941823278963 0.05594142651521406
0.6287836587384819 0.6916323120772725
7 0.7063196671556927 0.8424553940775705 0.05781756902173512
0.6359905251733154 0.7066238795115285
8 0.7067649274666356 0.8435658699330959 0.05789351555257577
0.6311669533023905 0.6870498073008479
9 0.7049032603770718 0.8418258635373054 0.05614372756816056
0.6201228881599853 0.7259144198051992
10 0.6990685432479973 

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.7038607003735162 0.8202114156255954 0.04634985370820545
0.6070524926127104 0.7467925730163356
1 0.7120558903524263 0.824902405768977 0.05113286360477842
0.6058012770570621 0.7567935573433031
2 0.7064704779924463 0.8231374149539764 0.047437195138963335
0.6082248327625459 0.7625110678259335
3 0.7095121578341077 0.8223314195181133 0.048358298374462205
0.6040827613586748 0.7705288056644701
4 0.7050526733230048 0.817467967309887 0.04445018935032474
0.599107985645122 0.7708128020690734
5 0.7081179809593305 0.8218620666430814 0.04655993758721276
0.608784761405714 0.7546024495562659
6 0.7076433483480004 0.8212104581692216 0.046372178073512
0.6087074623288271 0.7678978181298376
7 0.7068372255065805 0.8230875355555914 0.04994284344813773
0.6109940494088862 0.7328813955165548
8 0.7045471826703718 0.818190318236446 0.04631808677705805
0.6142341530313588 0.7571988727513344
9 0.7067667142992241 0.8232477437192547 0.04812326931117625
0.6134692506108159 0.7602579585370444
10 0.707837688444789 0.81

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.7303331148724994 0.8250086729023492 0.04941967224060078
0.6380094487535801 0.7079615078674266
1 0.7328214178232443 0.8254545304817981 0.050723409193686245
0.6345025287660547 0.7097686031824371
2 0.7204775058502059 0.8193664915616927 0.050212004066216315
0.6272276195228377 0.7310539207666935
3 0.7317756288160424 0.823882800179238 0.04861755104743188
0.618212040626378 0.7264182100066221
4 0.7262540537663069 0.8179417535731206 0.048231873688452456
0.6301790729477404 0.7090796436114641
5 0.7311293925488106 0.8228570146579326 0.05211704233928238
0.6265267455639212 0.7277941900406311
6 0.7244291155869462 0.8176056860279202 0.04957175868407676
0.6309412221232009 0.6996425936663357
7 0.7248780495433655 0.819882312586311 0.049355800801785726
0.6171896058482067 0.72182484444936
8 0.7255663882938849 0.8193251092950953 0.050501338864550306
0.6236457240221633 0.6981362862947138
9 0.7230241474443765 0.8175129935837381 0.04669166132328648
0.6403676861895333 0.6910549680285497
10 0.725669486957588

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.6282432904444215 0.7558020286560526 0.027017069620606668
0.5889880918350345 0.5367218276823523
1 0.6322719414020851 0.760936936687637 0.02785653011797503
0.6026307225245164 0.5363876162714754
2 0.6316934394324627 0.7577437273143625 0.02737532624635946
0.6046527239256314 0.5886132211968325
3 0.6283145426313163 0.7508958423885975 0.025941641033130316
0.6013694766685304 0.5244442186353766
4 0.6278204849509972 0.7554050332624818 0.02784749898895549
0.5932613449237842 0.5487871615608625
5 0.6278184961295548 0.7536613083905495 0.02749455948320163
0.6053919175627682 0.5733835156921988
6 0.624883119679371 0.7513484718755988 0.025811762202408314
0.6057916524876883 0.5114239854779963
7 0.6300605528028838 0.75697036666267 0.027078372480382664
0.6002292370349307 0.5585538701514774
8 0.6322204656470758 0.7575466829846816 0.027166643175142294
0.5998613739141103 0.5326849704641035
9 0.6332485349229441 0.7584468852671034 0.026550747466288525
0.604431798474272 0.5242728182839997
10 0.63151649309545

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.592383768045427 0.7298055767104591 0.02110355276317513
0.710951803420244 0.3783058659231291
1 0.5965663683973031 0.7445771067009967 0.02095072265294144
0.7147238730698928 0.37807095199024277
2 0.5895639429090346 0.7315981561190327 0.02020247569379806
0.7160701350814327 0.3810565307752861
3 0.5943491864957727 0.7411003705508291 0.020556112719135787
0.7356113539524092 0.34679390300504975
4 0.5945894585130341 0.7401119282131148 0.02152200276065967
0.7310083287259836 0.378616359847917
5 0.5912539164199226 0.736203292843262 0.021204780192569335
0.7096672232857806 0.38518505027889305
6 0.5926805816760807 0.7384218884692142 0.02082895250811133
0.7111528926163948 0.4261443140393708
7 0.5973129882188606 0.742009352749593 0.022962196674393427
0.7220801528682634 0.3892252824513617
8 0.5893874622848304 0.7348121304567612 0.019428914489586616
0.7010800216984434 0.4014289931314957
9 0.5941425706948023 0.7360965843901847 0.02189541156182263
0.7138906337216956 0.3724765623242189
10 0.5962594987185

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.6497984240282964 0.7779607285686163 0.036984445082164995
0.5407131540083425 0.7774703074615357
1 0.6548181187125105 0.777323937816027 0.03682569540742687
0.5347070615216831 0.778350678446461
2 0.6504366287188931 0.769746055541196 0.035374356963423115
0.5401161961992752 0.7739089592498102
3 0.6532973366243604 0.7746432740539301 0.03624138769834786
0.5297989724133529 0.7671208283825148
4 0.6558031986192203 0.7784762126805997 0.037402154776814835
0.5525052327682445 0.755441520967257
5 0.6587278169134274 0.7808652910941538 0.03630959209219879
0.5432347145086189 0.7656539831306213
6 0.6472341267747849 0.7699367529883281 0.035545470263352676
0.5324846384541455 0.7723890744294652
7 0.6500645646928271 0.773566567430613 0.036727605666183286
0.5283213497047674 0.7816979018869251
8 0.6498752102823925 0.7755906591386966 0.035756832120518556
0.5364611696341848 0.769501928009676
9 0.654277782393981 0.7799535088458848 0.037580740663992966
0.5390788121464952 0.7698614209163291
10 0.657927361768634

  self.params['CHAR_WEIGHTS'] = torch.tensor(self.params['CHAR_WEIGHTS'], dtype=torch.float)


0 0.6546829584777252 0.767459071280191 0.035531048443111675
0.5643288670593534 0.7525286599262132
1 0.6597725531915507 0.7676425386977955 0.036852308515930014
0.5485122783580216 0.7772462924525856
2 0.6532309320142416 0.7672174557121237 0.03482694181400288
0.566083866802437 0.7659330532708306
3 0.6570049685463833 0.7672969032877276 0.03568715374440789
0.5610478312747906 0.7592733219331842
4 0.6564611653577856 0.7690054806148442 0.034987584637980296
0.5537973308464263 0.7572779580325617
5 0.6555180683046969 0.7639787901597419 0.036261866245881534
0.5483963515178953 0.7827161110891988
6 0.6555124901043692 0.7673299837852428 0.036737832379021285
0.5546362672420695 0.7574991094932784
7 0.6520465649775001 0.7628506453924815 0.03538795243347058
0.554430390681911 0.7532594512530436
8 0.6578285205116251 0.7726172697076231 0.03792526738351706
0.5633586439819098 0.7655955209314173
9 0.6514774155347451 0.7627141171555012 0.03315089522939714
0.5439933306560045 0.7501162179115215
10 0.6592407578586

<H3> This cell concatenates missing saved_info information (usually not necessary when ran in order)

In [2]:
import glob
import pandas as pd

analyses_list = glob.glob("model_analyses\\test//**/*o.csv", recursive=True) #grab all analyses
old_analyses_list = glob.glob("model_analyses\\old\\test//**/*o.csv", recursive=True)

for csv,old_csv in zip(analyses_list,old_analyses_list):
    print(csv)
    analysis = pd.read_csv(csv)
    old_analysis = pd.read_csv(old_csv)
    old_analysis = old_analysis.drop(columns=old_analysis.loc[:,'mu_entropies':'latent_to_PCA_cohesiveness'].columns)
    new_analysis = pd.concat([old_analysis,analysis], axis=1)
    new_analysis.to_csv(csv,index=False)


model_analyses\test\aae-128_peptide_latent128_test\saved_info.csv
model_analyses\test\aae-128_peptide_latent32_test\saved_info.csv
model_analyses\test\aae-128_peptide_latent64_test\saved_info.csv
model_analyses\test\rnn-128_peptide_latent128_test\saved_info.csv
model_analyses\test\rnn-128_peptide_latent32_test\saved_info.csv
model_analyses\test\rnn-128_peptide_latent64_test\saved_info.csv
model_analyses\test\rnnattn-128_peptide_latent128_test\saved_info.csv
model_analyses\test\rnnattn-128_peptide_latent32_test\saved_info.csv
model_analyses\test\rnnattn-128_peptide_latent64_test\saved_info.csv
model_analyses\test\trans1x-128_peptide_latent128_test\saved_info.csv
model_analyses\test\trans1x-128_peptide_latent32_test\saved_info.csv
model_analyses\test\trans1x-128_peptide_latent64_test\saved_info.csv
model_analyses\test\wae-128_peptide_latent128_test\saved_info.csv
model_analyses\test\wae-128_peptide_latent32_test\saved_info.csv
model_analyses\test\wae-128_peptide_latent64_test\saved_info.

<H3> This cell runs the python peptides package and finds physicochemical properties of peptide sequences

In [2]:
import peptides
dict_list=[]
for seq in data:
    pep = peptides.Peptide(seq[0])
    dict_list.append(
        {"aliphatic_index":pep.aliphatic_index(),
     "boman":pep.boman(),
     "charge_ph3":pep.charge(pH=3)/len(seq[0]),
     "charge_ph7":pep.charge(pH=7)/len(seq[0]),
     "charge_ph9":pep.charge(pH=11)/len(seq[0]),
    "hydrophobic_moment":pep.hydrophobic_moment()/len(seq[0]),
    "hydrophobicity":pep.hydrophobicity(),
    "instability_index":pep.instability_index(),
    "isoelectric_point":pep.isoelectric_point(),
    "molecular_weight":pep.molecular_weight()} )

In [3]:
df = pd.DataFrame(dict_list)
df.to_csv('data/train_physicochem_props.csv',index=False)

<H3> Special Extra section to perform particular analysis on select Peptides

In [13]:
gpu = True

num_sequences = 500_000
batch_size = 200 #setting for reconstruction
example_data = 'data\\peptides\\datasets\\uniprot_v3\\peptide_test.txt'
save_dir_loc = 'model_analyses\\sample\\' #folder in which to save outpts
save_dir_name = 'test' #appended to identify data: train|test|other|etc...

reconstruct=True #True:reconstruct data here; False:load reconstructions from file
recon_src = "checkpointz//analyses_ckpts//" #directory in which all reconstructions are stored
true_prop_src = 'data\\peptides\\datasets\\uniprot_v3\\function_test.txt' #if property predictor load the true labels
subset_src = "" #(optional) this file should have the true sequences for a subset of the "example data" above

ckpt_list = glob.glob(""+"checkpointz\\to_slurm//**//*.ckpt", recursive = True) #grab all checkpoint
print('current working directory: ',os.getcwd())


for i in range(len(ckpt_list)):
    #search the current directory for the model name and load that model
    model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}
    model_src = ckpt_list[i]
    print('working on: ',model_src,'\n')
    model_name = list(filter(None,[key for key in model_dic.keys() if key in model_src.split('\\')[-1]]))
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor
    
    #create save directory for the current model according to latent space size
    latent_size = re.findall('(latent[\d]{2,3})', model_src)
    save_dir= save_dir_loc+model.name+"_"+latent_size[0]+"_"+save_dir_name
    if not os.path.exists(save_dir):os.mkdir(save_dir) 
    save_dir= save_dir+"//" 
    
     #load the true labels
    data = pd.read_csv(example_data).to_numpy() 
    data_1D = data[:num_sequences,0] #gets rid of extra dimension
    true_props_data = pd.read_csv(true_prop_src).to_numpy()
    true_props = true_props_data[0:num_sequences,0]
    
    ##moving into memory and entropy
    if model.model_type =='aae':
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) #50_000
    elif model.model_type == 'wae':
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) 
    else:
        mems, mus, logvars = model.calc_mems(data[:65_000], log=False, save=False) #subset size 1200*35=42000 would be ok

    #create random index and re-index ordered memory list
    random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
    mus = mus[random_idx]
    shuf_data = data[random_idx]

    subsample_start=0
    subsample_length=mus.shape[0] #mus shape depends on batch size!

    #(for length based coloring): record all peptide lengths iterating through input
    pep_lengths = []
    for idx, pep in enumerate(shuf_data[subsample_start:(subsample_start+subsample_length)]):
        pep_lengths.append( len(pep[0]) )   
    #(for function based coloring): pull function from csv with peptide functions
    s_to_f =pd.read_csv(true_prop_src)    
    function = s_to_f['peptides'][subsample_start:(subsample_start+subsample_length)]
    function = function[random_idx] #account for random permutation

    pca = PCA(n_components=5,svd_solver='full')
    pca_batch =pca.fit_transform(X=mus[:])
    
    #list of AMPs of interest
    probing_amps=np.array(('GKIIKLKASLKLL','GAIIKLKASLKLL','GKIIKLAASLKLL','GKIIKLKASLALL','GKIIKLKAALALL','GKIIALKASLKLL'))
    probing_amps=np.reshape(probing_amps,(len(probing_amps),1))
    
    if model.model_type =='aae' or model.model_type =='wae':
        mus,_,_=probed_amp_mems=model.calc_mems(probing_amps,log=False,save=False)
    else:
        _,mus,_=probed_amp_mems=model.calc_mems(probing_amps,log=False,save=False)
    pca.transform(X=probed_amp_mems)
    
    #plotting
    titles={'text':'{}'.format(model.model_type.replace("_"," ").upper()),
            'x':0.5,'xanchor':'center','yanchor':'top','font_size':40}
    general_fonts={'family':"Helvetica",'size':30,'color':"Black"}
    colorbar_fmt={'title_font_size':30,'thickness':15,'ticks':'','title_text':'Lengths',
                  'ticklabelposition':"outside bottom"}
    
    fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],"PC3":pca_batch[:,2],
                                          "PC4":pca_batch[:,3],"PC5":pca_batch[:,4],
                                          "Function":list(map(lambda itm: "AMP" if itm==1 else "NON-AMP",function))}),
                                          dimensions=["PC1","PC2","PC3","PC4","PC5"],template='simple_white',
                                          color="Function",symbol_sequence=['x-thin','circle'],
                                          symbol='Function', opacity=0.8)
    fig.update_traces(diagonal_visible=False)
    fig.update_layout(title=titles,font=general_fonts)
    fig.write_image(save_dir+'pca_matrix_function.png', width=5_000, height=2500) 
    

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  checkpointz\to_slurm\aae_latent128\300_aae-128_peptide.ckpt 




To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



ValueError: Found array with dim 3. Estimator expected <= 2.