In [1]:
import numpy as np
import random
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn import metrics
from sklearn.manifold import trustworthiness
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import plotly.express as px
import plotly.graph_objects as go

import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

import Bio
from Bio import pairwise2
from Bio.Align import substitution_matrices

def loss_plots(loss_src):
    """Plot each logged loss curve of one training run and save it as a PNG.

    Args:
        loss_src: Path of the training log passed to
            ``analysis.plot_loss_by_type``. The adversarial ('disc_loss') and
            MMD ('mmd_loss') curves are only plotted when the path contains
            'aae' or 'wae' respectively.

    The figures are written to the module-level ``save_dir``; file names are
    appended to it directly, so it must already end with a path separator.
    All matplotlib figures are closed before returning.
    """
    # (loss type handed to the plotter, stem of the saved file).
    # NOTE(review): 'prob_bce_loss.png' does not match the 'prop_bce_loss'
    # loss type; the file name is kept so existing outputs stay comparable.
    loss_figures = [
        ('tot_loss', 'tot_loss'),
        ('recon_loss', 'recon_loss'),
        ('kld_loss', 'kld_loss'),
        ('prop_bce_loss', 'prob_bce_loss'),
    ]
    if 'aae' in loss_src:
        loss_figures.append(('disc_loss', 'disc_loss'))
    if 'wae' in loss_src:
        loss_figures.append(('mmd_loss', 'mmd_loss'))

    for loss_type, file_stem in loss_figures:
        # use the argument, not a global `src` that may not exist
        analysis.plot_loss_by_type(loss_src,loss_types=[loss_type])
        plt.savefig(save_dir+file_stem+'.png')
    plt.close('all')
    
def load_reconstructions(data,data_1D,latent_size, load_src, true_props=None,subset=None):
    """Load previously saved reconstructions and predicted properties from CSV.

    The file read is
    ``load_src + model.name + "_" + <prefix> + "_" + <digits> + "//saved_info.csv"``
    where prefix/digits come from splitting ``latent_size[0]`` at its first
    2-3 digit run. Relies on the module-level ``model`` and ``num_sequences``.

    Args:
        data: Array of input sequences; searched with ``data == sequence`` to
            locate subset members.
        data_1D: Indexable collection of the input sequences.
        latent_size: Sequence whose first item is a string containing a
            2-3 digit run (the caller passes regex matches such as 'latent128'
            -- presumably; verify against caller).
        load_src: Directory prefix in which the reconstructions are stored.
        true_props: Optional path of a CSV with the true labels in its first
            column. When omitted, ``None`` is returned for the labels.
        subset: Optional path of a CSV whose first column lists a subset of
            the sequences in ``data``; outputs are restricted to those rows.

    Returns:
        Tuple ``(data, data_1D, true_props, props, reconstructed_seq)``.

    Raises:
        IndexError: If a subset sequence does not occur in ``data``.
    """
    name_parts = re.split(r'(\d{2,3})',latent_size[0])
    recon_src = load_src+model.name+"_"+name_parts[0]+"_"+name_parts[1]+"//saved_info.csv"
    recon_df = pd.read_csv(recon_src)
    reconstructed_seq = recon_df['reconstructions'].to_list()[:num_sequences]
    props = torch.Tensor(recon_df['predicted properties'][:num_sequences])

    # Read the labels once; the original re-read the CSV through a variable
    # that had already been rebound to the loaded array.
    true_labels = None
    if true_props is not None:
        true_labels = pd.read_csv(true_props).to_numpy()[0:num_sequences,0]

    if subset:
        testing = pd.read_csv(subset).to_numpy()
        batch_recon_len = len(reconstructed_seq)
        # first row of `data` matching each subset sequence
        row_of_seq = [np.where(data==row[0])[0][0] for row in testing]
        # only rows that actually have a loaded reconstruction can be used
        keep = [row for row in row_of_seq if row<batch_recon_len]

        reconstructed_seq = [reconstructed_seq[row] for row in keep]
        data_1D = [data_1D[row] for row in keep]
        props = torch.Tensor([props[row] for row in keep])
        # NOTE(review): testing[:][0] is only the FIRST ROW of the subset file;
        # `testing` or `testing[:,0]` was probably intended -- confirm with caller.
        data = testing[:][0]
        if true_labels is not None:
            true_labels = [true_labels[row] for row in keep]

    return data, data_1D, true_labels, props, reconstructed_seq

########################################################################################
# ---- run configuration for the per-checkpoint analysis below ----

#True: free cached CUDA memory after reconstruction
gpu = True

#number of sequences taken from the example data / saved reconstructions
num_sequences = 500#_000
#setting for reconstruction
batch_size = 200

#input sequences and, if a property predictor is used, their true labels
example_data = 'data\\peptides\\datasets\\uniprot_v2\\peptide_train.txt'
true_prop_src = 'data\\peptides\\datasets\\uniprot_v2\\function_train.txt'
#(optional) this file should have the true sequences for a subset of the "example data" above
subset_src = ""

#folder in which to save outputs
save_dir_loc = 'model_analyses\\train\\'
#appended to identify data: train|test|other|etc...
save_dir_name = 'train'

#True:reconstruct data here; False:load reconstructions from file
reconstruct = True
#directory in which all reconstructions are stored
recon_src = "checkpointz//analyses_ckpts//"

#grab all checkpoints below the slurm output folder
ckpt_list = glob.glob("checkpointz\\to_slurm//**//*.ckpt", recursive=True)
print('current working directory: ',os.getcwd())


# Per-checkpoint analysis. For every checkpoint in `ckpt_list` this loop:
#   1. loads the model and creates a per-model output folder under `save_dir_loc`,
#   2. embeds the example data and decodes random latent vectors to score
#      uniqueness, novelty and sequence similarity of generated peptides,
#   3. samples around a reference AMP, scores those samples and writes
#      `generation_metrics.csv`,
#   4. saves a PCA scatter of data, generated samples and the reference AMP
#      as `amp_sample.png`.

#architecture tag found in the checkpoint file name -> name of the model class to construct
#('rnnattn' is listed before 'rnn' so the longer tag is picked when both match)
model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}

#GLIDTVKNMAINAAKSAGMSVLKTLSCKLSKEC is an AMP in the training set, used as the reference point in latent space
#(GLFDIWKKWRWRR is an AMP in the test set and can be used as a reference point instead)
reference_seq = 'GLIDTVKNMAINAAKSAGMSVLKTLSCKLSKEC'

for model_src in ckpt_list:

    #search the current directory for the model name and load that model
    print('working on: ',model_src,'\n')
    model_name = [key for key in model_dic if key in os.path.basename(model_src)]
    latent_size = re.findall(r'(latent[\d]{2,3})', model_src)
    if not model_name or not latent_size:
        #one badly named checkpoint should not abort the whole batch
        print('skipping (cannot identify model type or latent size): ',model_src)
        continue
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor

    #create save directory for the current model according to latent space size
    save_dir= save_dir_loc+model.name+"_"+latent_size[0]+"_"+save_dir_name
    os.makedirs(save_dir, exist_ok=True) #also creates a missing parent folder
    save_dir= save_dir+"//"
#     save_df = pd.DataFrame() #this will hold the number variables and save to CSV

    #load the true labels
    data = pd.read_csv(example_data).to_numpy()
    data_1D = data[:num_sequences,0] #gets rid of extra dimension
    true_props_data = pd.read_csv(true_prop_src).to_numpy()
    true_props = true_props_data[0:num_sequences,0]


#     #get the log.txt file from the ckpt and model name then plot loss curves
#     loss_src = '_'.join( ("log",model_src.split('\\')[-1].split('_')[1],model_src.split('\\')[-1].split('_')[2][:-4]+"txt") )
#     src= '\\'.join([str(i) for i in model_src.split('\\')[:-1]])+"\\"+loss_src
#     print(loss_src, src)
#     loss_plots(src)

#     #set the batch size and reconstruct the data
#     model.params['BATCH_SIZE'] = batch_size
#     if reconstruct:
#         reconstructed_seq, props = model.reconstruct(data[:num_sequences], log=False, return_mems=False)
#     else:
#         data, data_1D, true_props, props, reconstructed_seq = load_reconstructions(data, data_1D,latent_size,
#                                                                                    load_src=recon_src,
#                                                                                    true_props=true_prop_src)
#     if gpu:torch.cuda.empty_cache() #free allocated CUDA memory

#     #save the metrics to the dataframe
#     save_df['reconstructions'] = reconstructed_seq #placing the saves on a line separate from the ops allows for editing
#     save_df['predicted properties'] = [prop.item() for prop in props[:len(reconstructed_seq)]]
#     prop_acc, prop_conf, MCC=calc_property_accuracies(props[:len(reconstructed_seq)],true_props[:len(reconstructed_seq)], MCC=True)
#     save_df['property prediction accuracy'] = prop_acc
#     save_df['property prediction confidence'] = prop_conf
#     save_df['MCC'] = MCC


# #   First we tokenize the input and reconstructed smiles
#     input_sequences = []
#     for seq in data_1D:
#         input_sequences.append(peptide_tokenizer(seq))
#     output_sequences = []
#     for seq in reconstructed_seq:
#         output_sequences.append(peptide_tokenizer(seq))

#     seq_accs, tok_accs, pos_accs, seq_conf, tok_conf, pos_conf = calc_reconstruction_accuracies(input_sequences, output_sequences)
#     save_df['sequence accuracy'] = seq_accs
#     save_df['sequence confidence'] = seq_conf
#     save_df['token accuracy'] = tok_accs
#     save_df['token confidence'] = tok_conf
#     save_df = pd.concat([pd.DataFrame({'position_accs':pos_accs,'position_confidence':pos_conf }), save_df], axis=1)

    ##moving into memory and entropy
    #for aae/wae the FIRST value returned by calc_mems is used as `mus`
    if model.model_type in ('aae','wae'):
        mus, _, _ = model.calc_mems(data[:50_000], log=False, save=False)
    else:
        mems, mus, logvars = model.calc_mems(data[:50_000], log=False, save=False) #subset size 1200*35=42000 would be ok

#     ##calculate the entropies
#     vae_entropy_mus = calc_entropy(mus)
#     save_df = pd.concat([save_df,pd.DataFrame({'mu_entropies':vae_entropy_mus})], axis=1)
#     if model.model_type != 'wae' and model.model_type!= 'aae': #these don't have a variational type bottleneck
#         vae_entropy_mems  = calc_entropy(mems)
#         save_df = pd.concat([save_df,pd.DataFrame({'mem_entropies':vae_entropy_mems})], axis=1)
#         vae_entropy_logvars = calc_entropy(logvars)
#         save_df = pd.concat([save_df,pd.DataFrame({'logvar_entropies':vae_entropy_logvars})], axis=1)



#     #create random index and re-index ordered memory list creating n random sub-lists (ideally resulting in IID random lists)
#     random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
#     mus = mus[random_idx]
#     data = data[random_idx]

#     subsample_start=0
#     subsample_length=mus.shape[0] #this may change depending on batch size

#     #(for length based coloring): record all peptide lengths iterating through input
#     pep_lengths = []
#     for idx, pep in enumerate(data[subsample_start:(subsample_start+subsample_length)]):
#         pep_lengths.append( len(pep[0]) )   
#     #(for function based coloring): pull function from csv with peptide functions
#     s_to_f =pd.read_csv(true_prop_src)    
#     function = s_to_f['peptides'][subsample_start:(subsample_start+subsample_length)]
#     function = function[random_idx] #account for random permutation

#     pca = PCA(n_components=5)
#     pca_batch =pca.fit_transform(X=mus[:])

#     #plot format dictionnaries
#     titles={'text':'{}'.format(model.model_type.replace("_"," ").upper()),
#                           'x':0.5,'xanchor':'center','yanchor':'top','font_size':40}
#     general_fonts={'family':"Helvetica",'size':30,'color':"Black"}
#     colorbar_fmt={'title_font_size':30,'thickness':15,'ticks':'','title_text':'Lengths',
#                                'ticklabelposition':"outside bottom"}

#     fig = px.scatter(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1], "lengths":pep_lengths}),
#                 symbol_sequence=['hexagon2'],x='PC1', y='PC2', color="lengths",
#                 color_continuous_scale='Jet',template='simple_white', opacity=0.9)
#     fig.update_traces(marker=dict(size=9))
#     fig.update_layout(title=titles,xaxis_title="PC1", yaxis_title="PC2",font=general_fonts)
#     fig.update_coloraxes(colorbar=colorbar_fmt)
#     fig.write_image(save_dir+'pca_length.png', width=900, height=600)

#     fig = px.scatter(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1], 
#                                     "Function":list(map(lambda itm: "AMP" if itm==1 else "NON-AMP",function))}),
#                                     x='PC1', y='PC2', color="Function",symbol_sequence=['x-thin-open','circle'],
#                                     template='simple_white',symbol='Function', opacity=0.8) 
#     fig.update_traces(marker=dict(size=9))
#     fig.update_layout(title=titles,xaxis_title="PC1",yaxis_title="PC2",font=general_fonts)
#     fig.write_image(save_dir+'pca_function.png', width=900, height=600)

#     # Plot the explained variances
#     plt.bar(range(pca.n_components_), pca.explained_variance_ratio_*100, color='black')
#     plt.xlabel('PCA features')
#     plt.ylabel('variance %')
#     plt.xticks(features)
#     plt.savefig(save_dir+'variance_explained.png')

#     fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],"PC3":pca_batch[:,2],
#                                     "lengths":pep_lengths}), dimensions=["PC1","PC2","PC3"],
#                                     symbol_sequence=['hexagon2'],template='simple_white',
#                                     color="lengths",color_continuous_scale='Jet', opacity=0.9)
#     fig.update_traces(diagonal_visible=False)
#     fig.update_layout(title=titles,xaxis_title="PC1", yaxis_title="PC2",font=general_fonts)
#     fig.write_image(save_dir+'pca_matrix_length.png', width=1920, height=1080) 

#     fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],"PC3":pca_batch[:,2], 
#                                    "Function":list(map(lambda itm: "AMP" if itm==1 else "NON-AMP",function))}),
#                                     dimensions=["PC1","PC2","PC3"],template='simple_white',
#                                     color="Function",symbol_sequence=['x-thin','circle'],
#                                     symbol='Function', opacity=0.8) 
#     fig.update_traces(diagonal_visible=False)
#     fig.update_layout(title=titles,xaxis_title="PC1", yaxis_title="PC2",font=general_fonts)
#     fig.write_image(save_dir+'pca_matrix_function.png', width=1920, height=1080) 

#     #create n subsamples and calculate silhouette score for each
#     latent_mem_func_subsamples = []
#     pca_func_subsamples = []
#     n=250
#     for s in range(n):
#         s_len = len(mus)//n #sample lengths
#         mem_func_sil = metrics.silhouette_score(mus[s_len*s:s_len*(s+1)], function[s_len*s:s_len*(s+1)], metric='euclidean')
#         latent_mem_func_subsamples.append(mem_func_sil)
#         XY = [i for i in zip(pca_batch[s_len*s:s_len*(s+1),0], pca_batch[s_len*s:s_len*(s+1),1])]
#         pca_func_sil = metrics.silhouette_score(XY, function[s_len*s:s_len*(s+1)], metric='euclidean')
#         pca_func_subsamples.append(pca_func_sil)
#     save_df = pd.concat([save_df,pd.DataFrame({'latent_mem_func_silhouette':latent_mem_func_subsamples})], axis=1)
#     save_df = pd.concat([save_df,pd.DataFrame({'pca_func_silhouette':pca_func_subsamples})], axis=1)

#     save_df.to_csv(save_dir+"saved_info.csv", index=False)

#New section dealing with sequence generation metrics and bootstrapping from the latent space
    rnd_seq_count = 1_000
    #latent range is loop invariant: compute it once instead of once per drawn coordinate
    mu_min, mu_max = np.min(mus), np.max(mus)
    #N latent vectors, every coordinate drawn uniformly from [mu_min, mu_max]
    rnd_latent_list = np.array([[random.uniform(mu_min,mu_max) for _ in range(model.params['d_latent'])]
                                for _ in range(rnd_seq_count)], dtype=np.float32)

    model.params['BATCH_SIZE'] = 25
    decode_batch = model.params['BATCH_SIZE']
    rnd_token_list=np.empty((rnd_seq_count,model.tgt_len)) #store N decoded latent vectors now in token(0-20) form max length 125
    print('pass_check')
    for batch in range(0,rnd_seq_count,decode_batch):
        rnd_token_list[batch:batch+decode_batch] =  model.greedy_decode(torch.tensor(rnd_latent_list[batch:batch+decode_batch]).cuda()).cpu()

    decoded_rnd_seqs = decode_mols(torch.tensor(rnd_token_list), model.params['ORG_DICT'])
    decoded_rnd_seqs = [x for x in decoded_rnd_seqs if x] #removes the empty sequences
    z=1.96 #95% confidence interval
    #denominator is the number of latent vectors drawn, so empty decodes count against uniqueness
    percent_unique = len(set(decoded_rnd_seqs)) / rnd_seq_count
    unique_conf = z*math.sqrt(percent_unique*(1-percent_unique)/rnd_seq_count)

    df_gen_scores = {}
    df_gen_scores.update({'percent_unique': percent_unique})
    df_gen_scores.update({'unique_confidence':unique_conf})

    #sample as many data sequences (randomly) as there are decoded sequences
    shuffled_test = random.sample(data.tolist(),len(data))
    shuffled_test = np.array(shuffled_test[:len(decoded_rnd_seqs)])
    combined = np.concatenate(( shuffled_test,np.array(decoded_rnd_seqs).reshape(len(decoded_rnd_seqs),1)) )
    #fraction of distinct sequences in the pooled (data + generated) sample
    percent_novel = len(set(combined.flatten().tolist()))/(2*len(decoded_rnd_seqs))
    novel_conf =  z*math.sqrt(percent_novel*(1-percent_novel)/(2*len(decoded_rnd_seqs)))
    df_gen_scores.update({'percent_novel':percent_novel})
    df_gen_scores.update({'novel_confidence':novel_conf})

    shuffled_test = shuffled_test.flatten().tolist()
    similarity_score=[]
    matrix = substitution_matrices.load("BLOSUM62")
    #NOTE(review): shuffled_test holds at most rnd_seq_count sequences, so this
    #slice yields every 100th of them (about 10), not 100 -- confirm the intended sample size
    for seq in shuffled_test[:10_000:100]:
        for seq2 in decoded_rnd_seqs[::10]: #every 10th generated peptide
            #global alignment score normalised by the combined sequence length
            similarity_score.append( pairwise2.align.globaldx(seq,seq2, matrix, score_only=True)/(len(seq)+len(seq2)) )


    df_gen_scores.update({'average_sequence_similarity': np.average(similarity_score)})
    df_gen_scores.update({'std_on_similarity_score': np.std(similarity_score)})

    #embed the reference AMP and use it as the centre for sampling in the latent space
    model.params['BATCH_SIZE'] = 1
    if model.model_type in ('aae','wae'):
        mus, _, _ = model.calc_mems(np.array([[reference_seq]], dtype='O'), log=False, save=False)
    else:
        mems, mus, logvars = model.calc_mems(np.array([[reference_seq]], dtype='O'), log=False, save=False)

    if model.model_type in ('aae','wae'):
        #no logvars available for these models: use a fixed spread of 0.4
        nearby_samples = np.random.normal(loc=0,scale=1,size=(500,1,model.params['d_latent'])).astype(np.float32)*0.4 + mus
    else:
        #spread taken from the encoder: std = exp(0.5*logvar)
        nearby_samples = np.random.normal(loc=0,scale=1,size=(500,1,model.params['d_latent'])).astype(np.float32)*np.exp(0.5*logvars) + mus

    rnd_seq_count=500
    model.params['BATCH_SIZE'] = 50
    decode_batch = model.params['BATCH_SIZE']
    rnd_token_list=np.empty((rnd_seq_count,model.tgt_len)) #store N decoded latent vectors now in token(0-20) form max length 125
    for batch in range(0,rnd_seq_count,decode_batch):
        rnd_token_list[batch:batch+decode_batch] =  model.greedy_decode(torch.tensor(nearby_samples[batch:batch+decode_batch]).squeeze().cuda()).cpu()

    decoded_rnd_seqs = decode_mols(torch.tensor(rnd_token_list), model.params['ORG_DICT'])
    unique_rnd_seqs = list(set(decoded_rnd_seqs))

    z=1.96 #95% confidence interval
    amp_percent_unique = len(unique_rnd_seqs)/len(decoded_rnd_seqs)
    amp_unique_conf = z*math.sqrt(amp_percent_unique*(1-amp_percent_unique)/rnd_seq_count)
    df_gen_scores.update({'amp_uniqueness': amp_percent_unique})
    df_gen_scores.update({'amp_uniqueness_std': amp_unique_conf})

    #k-mer Jaccard similarity of every unique sample to the reference AMP
    k=2
    jac_scores = np.array([[jaccard_similarity(build_kmers(reference_seq,k), build_kmers(decoded_seq,k))]
                           for decoded_seq in unique_rnd_seqs])
    df_gen_scores.update({'amp_jac_score': np.average(jac_scores)})
    df_gen_scores.update({'amp_jac_score_std': np.std(jac_scores)})

    #run the unique samples back through the model to get their predicted property
    model.params['BATCH_SIZE'] = 2
    reconstructed_seq, props = model.reconstruct(np.array([[seq] for seq in unique_rnd_seqs]), log=False, return_mems=False)
    props[props>1]=1 #sometimes the model outputs a probability >1 for a class so threshold
    amp_percent_amp = props.round().sum().item()/len(props)
    amp_amp_conf = z*math.sqrt(amp_percent_amp*(1-amp_percent_amp)/len(props))
    df_gen_scores.update({'predicted_amps': amp_percent_amp})
    df_gen_scores.update({'predicted_amps_conf': amp_amp_conf})
    df = pd.DataFrame([df_gen_scores])
    df.to_csv(save_dir+"generation_metrics.csv", index=False)

    #embed a larger slice of the data as the background for the PCA plot
    model.params['BATCH_SIZE'] = 200
    if model.model_type in ('aae','wae'):
        mus, _, _ = model.calc_mems(data[:20_000], log=False, save=False)
    else:
        mems, mus, logvars = model.calc_mems(data[:20_000], log=False, save=False)


    model.params['BATCH_SIZE'] = 1
    subsample_start=0
    subsample_length=mus.shape[0] #this may change depending on batch size

    #(for length based coloring): record all peptide lengths iterating through input
    pep_lengths = [len(pep[0]) for pep in data[subsample_start:(subsample_start+subsample_length)]]
    #(for function based coloring): pull function from csv with peptide functions.
    #The labels come from the configured `true_prop_src` so they match `example_data`;
    #the path is no longer overwritten here, which used to leak into later iterations.
    s_to_f =pd.read_csv(true_prop_src)
    function = s_to_f['peptides'][subsample_start:(subsample_start+subsample_length)]

    pca = PCA(n_components=3)
    pca_batch =pca.fit_transform(X=mus[:])
    pca_generated = pca.transform(nearby_samples.squeeze())

    #project the reference AMP into the same PCA space
    if model.model_type in ('aae','wae'):
        amp_mus, _, _ = model.calc_mems(np.array([[reference_seq]], dtype='O'), log=False, save=False)
    else:
        amp_mems, amp_mus, amp_logvars = model.calc_mems(np.array([[reference_seq]], dtype='O'), log=False, save=False)
    pca_amp = pca.transform(amp_mus)

    #plot format dictionnaries
    titles={'text':'{}'.format(model.model_type.replace("_"," ").upper()),
                          'x':0.5,'xanchor':'center','yanchor':'top','font_size':40}
    general_fonts={'family':"Helvetica",'size':30,'color':"Black"}

    #background: embedded data coloured by function
    fig1 = px.scatter(pd.DataFrame({"PC1":pca_batch[:,0],"PC2":pca_batch[:,1],
                                    "Function":list(map(lambda itm: "AMP" if itm==1 else "NON-AMP",function))}),
                                    x='PC1', y='PC2', color="Function",symbol_sequence=['x-thin-open','circle'],
                                    template='simple_white',symbol='Function', opacity=0.3)
    #samples drawn around the reference AMP
    fig2 = px.scatter(pd.DataFrame({"PC1":pca_generated[:,0],"PC2":pca_generated[:,1],
                                    'color':["Generated" for i in pca_generated[:,0]],}),
                                    x='PC1', y='PC2',color='color',symbol_sequence=['asterisk-open'],
                                    template='simple_white', opacity=0.9)
    fig2.update_traces(marker=dict(color='red'))
    #the reference AMP itself
    fig3 = px.scatter(pd.DataFrame({"PC1":pca_amp[:,0],"PC2":pca_amp[:,1], 'color':['source AMP' for i in pca_amp[:,0]]}),
                                    x='PC1', y='PC2',color='color',symbol_sequence=['cross'],
                                    template='simple_white', opacity=1)
    fig3.update_traces(marker=dict(size=12, color='black'))
    fig1.update_traces(marker=dict(size=9))
    fig2.update_traces(marker=dict(size=9))
    fig = go.Figure(data= fig1.data+fig2.data+fig3.data)
    fig.update_coloraxes(showscale=False)
    fig.update_layout(title=titles,xaxis_title="PC1",yaxis_title="PC2",font=general_fonts)
    fig.write_image(save_dir+'amp_sample.png', width=1_000, height=700)

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  checkpointz\to_slurm\rnn_latent128\300_rnn-128_peptide.ckpt 

rnn-128_peptide


KeyboardInterrupt: 

<H4>Since the Compute Canada runs do not compute the dimensionality-reduction metrics, we compute them below.</H4>

In [None]:
import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re


# ---- run configuration for the dimensionality-reduction metrics below ----

#True: free cached CUDA memory between bootstraps
gpu = True

#sequences to embed
example_data = 'data\\peptides\\datasets\\uniprot_v2\\peptide_test.txt'
#which split is analysed; selects the number of bootstraps further down
test_train = 'test'

#grab all checkpoints
ckpt_list = glob.glob("checkpointz\\to_slurm//**//*.ckpt", recursive=True)
#grab all analyses
analyses_list = glob.glob("model_analyses\\test//**/*.csv", recursive=True)
print('current working directory: ',os.getcwd())

# Dimensionality-reduction quality per checkpoint. For every checkpoint in
# `ckpt_list` this loop finds the matching analysis CSV in `analyses_list`,
# embeds the example data, reduces the embedding to 2 PCA components, scores
# latent-vs-PCA neighbourhood preservation on n bootstraps (trustworthiness,
# continuity, LCMC, steadiness, cohesiveness) and writes the scores back into
# that CSV as extra columns.

#architecture tag found in the checkpoint file name -> name of the model class to construct
#('rnnattn' is listed before 'rnn' so the longer tag is picked when both match)
model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}
#columns written by this cell; removed from the loaded CSV first so a re-run does not duplicate them
dr_metric_columns = ['latent_to_PCA_trustworthiness','latent_to_PCA_continuity','latent_to_PCA_lcmc',
                     'latent_to_PCA_steadiness','latent_to_PCA_cohesiveness']

for model_src in ckpt_list:

    print('working on: ',model_src,'\n')
    model_name = [key for key in model_dic if key in os.path.basename(model_src)]
    if not model_name:
        print('skipping (cannot identify model type): ',model_src)
        continue

    #find the analysis file corresponding to the model from the CC outputs:
    #its folder name must contain both parts of the checkpoint folder name (e.g. 'rnn' and 'latent128')
    ckpt_dir_parts = os.path.basename(os.path.dirname(model_src)).split("_")
    arch_tag, latent_tag = ckpt_dir_parts[0], ckpt_dir_parts[1]
    save_dir = None #reset so a match from the previous checkpoint can never be reused
    for analysis_path in analyses_list:
        analysis_dir = os.path.basename(os.path.dirname(analysis_path))
        if arch_tag not in analysis_dir or latent_tag not in analysis_dir:
            continue
        #'rnn' is a substring of 'rnnattn': do not pair an rnn checkpoint with an rnnattn analysis
        if "rnnattn" in analysis_path and "rnnattn" not in model_src:
            continue
        save_dir = analysis_path #the last matching file wins
    if save_dir is None:
        print('skipping (no matching analysis file): ',model_src)
        continue

    #search the current directory for the model name and load that model
    model = locals()[model_dic[model_name[0]]](load_fn=model_src) #use locals to call model specific constructor
    cur_analysis = pd.read_csv(save_dir)
    print("analysis: ",save_dir, "checkpoint: ",model_src)
    save_df = cur_analysis.drop(columns=dr_metric_columns, errors='ignore') #this will hold the number variables and save to CSV

    #load the true labels
    data = pd.read_csv(example_data).to_numpy()
    data_1D = data[:,0] #gets rid of extra dimension

    #moving into memory and entropy
    #for aae/wae the FIRST value returned by calc_mems is used as `mus`
    if model.model_type in ('aae','wae'):
        mus, _, _ = model.calc_mems(data[:60_000], log=False, save=False)
    else:
        mems, mus, logvars = model.calc_mems(data[:60_000], log=False, save=False) #subset size 1200*35=42000 would be ok

    #create random index and re-index ordered memory list creating n random sub-lists (ideally resulting in IID random lists)
    random_idx = np.random.permutation(np.arange(stop=mus.shape[0]))
    mus[:] = mus[random_idx]
    data = data[random_idx]

    #need to perform PCA to be able to compare dimensionality reduction quality
    pca = PCA(n_components=2)
    pca_batch =pca.fit_transform(X=mus[:])

    #now ready to calculate dimensionality reduction accuracy with metrics
    trust_subsamples = []
    cont_subsamples = []
    lcmc_subsamples = []
    steadiness_subsamples = []
    cohesiveness_subsamples = []
    n = 15 if 'test' in test_train else 60 #different number of bootstraps for train vs test
    parameter = { "k": 50,"alpha": 0.1 } #for steadiness and cohesiveness
    s_len = len(mus)//n #bootstrap length
    for s in range(n):
        window = slice(s_len*s, s_len*(s+1))
        Q = coranking.coranking_matrix(mus[window], pca_batch[window])
        trust_subsamples.append( np.mean(trustworthiness(Q, min_k=1, max_k=50)) )
        cont_subsamples.append( np.mean(continuity(Q, min_k=1, max_k=50)) )
        lcmc_subsamples.append( np.mean(LCMC(Q, min_k=1, max_k=50)) )
        print(s,trust_subsamples[s],cont_subsamples[s],lcmc_subsamples[s])

        #named `snc`, not `metrics`, so sklearn's `metrics` module imported earlier is not overwritten
        snc = SNC(raw=mus[window], emb=pca_batch[window], iteration=300, dist_parameter=parameter)
        snc.fit() #solve for steadiness and cohesiveness
        steadiness, cohesiveness = snc.steadiness(), snc.cohesiveness()
        steadiness_subsamples.append(steadiness)
        cohesiveness_subsamples.append(cohesiveness)
        print(steadiness,cohesiveness)
        del Q, snc #trying to free RAM
        if gpu:torch.cuda.empty_cache() #free allocated CUDA memory

    dr_metrics = pd.DataFrame({'latent_to_PCA_trustworthiness':trust_subsamples,
                               'latent_to_PCA_continuity':cont_subsamples,
                               'latent_to_PCA_lcmc':lcmc_subsamples,
                               'latent_to_PCA_steadiness':steadiness_subsamples,
                               'latent_to_PCA_cohesiveness':cohesiveness_subsamples})
    save_df = pd.concat([save_df,dr_metrics], axis=1)

    save_df.to_csv(save_dir, index=False)

current working directory:  C:\Users\s_renaud\Documents\GitHub\MSCSAM_TBD\main_model
working on:  checkpointz\to_slurm\rnn_latent128\300_rnn-128_peptide.ckpt 

analysis:  model_analyses\test\rnn-128_peptide_latent128_test\saved_info.csv checkpoint:  checkpointz\to_slurm\rnn_latent128\300_rnn-128_peptide.ckpt
rnn-128_peptide
0 0.7263278906062263 0.8013199558729199 0.0733011277176656
0.7152629414570874 0.6150958533741794
1 0.7172063902859634 0.7914899201190343 0.0715827182435673
0.7058813278944015 0.6286144357018227
2 0.7155813759984018 0.7916761788451923 0.06936732373353666
0.7051496672660817 0.6336141042480539
3 0.7216968580864563 0.7970960963455368 0.06890977742513325
0.7165728671426881 0.6510426816726227
4 0.7273828882274378 0.8011807891448378 0.06981649325874331
0.7102687106527645 0.6370959883035602
5 0.724585498669805 0.7987928943939604 0.07576132908707749
0.7010568947270404 0.6579795792161849
6 0.7189782410502227 0.7998008640811263 0.07084000794804672
0.7221717835460162 0.63616783


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



analysis:  model_analyses\test\rnn-128_peptide_latent32_test\saved_info.csv checkpoint:  checkpointz\to_slurm\rnn_latent32\300_rnn-128_peptide.ckpt
rnn-128_peptide
0 0.6880380225019455 0.7798709712605748 0.06627739182811522
0.6135066966994913 0.7198686646522403
1 0.6857518235161038 0.7781790646514453 0.06219396814358677
0.6192396065750676 0.710346533661266
2 0.6989112341332542 0.7855606921847275 0.07296879467552649
0.6200832655942752 0.7294500587626398
3 0.6842890203442478 0.7741448419083942 0.0643408282248118
0.6162378511973263 0.717832203344827
4 0.6900583866749541 0.7771103011006475 0.06411888073001792
0.6106633344002548 0.725982461720017
5 0.6841661794522836 0.773414452418139 0.06857221431855808
0.6255874404546424 0.7011656968114361
6 0.6923723821967731 0.7804090504574986 0.06731805134180202
0.6072037878208381 0.7140680598341926
7 0.693119596561149 0.7833630706516134 0.06734564269736318
0.6200257261986079 0.71552681374486
8 0.6916022070316151 0.7785283598537547 0.07061099922748273



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



analysis:  model_analyses\test\rnn-128_peptide_latent64_test\saved_info.csv checkpoint:  checkpointz\to_slurm\rnn_latent64\300_rnn-128_peptide.ckpt
rnn-128_peptide
0 0.7300701591465508 0.8122864720256364 0.0749737024369066
0.7287871389634732 0.632182260405256
1 0.7319761107778988 0.8151575407746705 0.07841880388113903
0.7245793580074413 0.6258209668856803
2 0.7277143835908246 0.8117680071329 0.07541178340561856
0.7103801072166633 0.6238262977356805
3 0.7353295093172799 0.8194648627803142 0.07997016108468606
0.7328077185743048 0.617868866934086
4 0.7358159752004113 0.8142707857529893 0.07735419422574104
0.7174752841676452 0.6050244436865764
5 0.7298872849712784 0.8125320855531135 0.0730988239076381
0.7302507784883007 0.6087263573487184
6 0.7388238971789775 0.8202667355751466 0.08355252246671999
0.736200297330778 0.6132691948561331
7 0.7348557144671303 0.817013228621897 0.08115438636154078
0.724666156946523 0.6017808961286154
8 0.727138583320619 0.8074262516189208 0.07614840047692537
0.7