<h3> This notebook runs the total model analysis script and then computes the PCA benchmarks, which are run separately from that script

In [None]:
# Execute the total model analysis script from this notebook via the IPython `%run` line magic.
%run total_model_analysis.py 

In [None]:
import coranking #coranking.readthedocs.io
from coranking.metrics import trustworthiness, continuity, LCMC
from transvae.snc import SNC #github.com/hj-n/steadiness-cohesiveness

import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import Image
from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.rnn_models import RNN, RNNAttn
from transvae.wae_models import WAE
from transvae.aae_models import AAE
from transvae.tvae_util import *
from transvae import analysis
import glob
import re


gpu = True

example_data = 'data\\peptides\\datasets\\uniprot_v3\\peptide_train.txt'
test_train='train'
ckpt_list = glob.glob(""+"checkpointz\\to_slurm//**//*.ckpt", recursive = True) #grab all checkpoints
analyses_list = glob.glob("model_analyses\\train//**/*.csv", recursive=True) #grab all analyses
print('current working directory: ',os.getcwd())

#maps the tag found in a checkpoint file name to the name of the model class to build;
#'rnnattn' is listed before 'rnn' so that the more specific tag is found first
model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}

#columns written by this cell; any previous copy is replaced so the cell can be re-run safely
metric_columns = ['latent_to_PCA_trustworthiness','latent_to_PCA_continuity','latent_to_PCA_lcmc',
                  'latent_to_PCA_steadiness','latent_to_PCA_cohesiveness']


def find_model_key(ckpt_path, known_keys):
    """Return the first entry of ``known_keys`` contained in the checkpoint FILE name.

    Only the last path component is inspected (both ``\\`` and ``/`` are treated as
    separators), so a tag that merely appears in a directory name is ignored.
    Keys are tried in the order given. Returns ``None`` when no key matches.
    """
    file_name = re.split(r'[\\/]+', ckpt_path)[-1]
    for key in known_keys:
        if key in file_name:
            return key
    return None


def find_analysis_csv(ckpt_path, csv_paths):
    """Return the analysis CSV that belongs to the checkpoint at ``ckpt_path``.

    A CSV matches when the name of its parent directory contains the first two
    ``_``-separated tokens of the checkpoint's parent directory name. CSV paths
    mentioning "rnnattn" are ignored unless the checkpoint path mentions it too.
    When several CSVs match, the last one in ``csv_paths`` wins. Returns ``None``
    when nothing matches or when the paths are too short to be compared.
    """
    ckpt_parts = re.split(r'[\\/]+', ckpt_path)
    if len(ckpt_parts) < 2:
        return None
    tokens = ckpt_parts[-2].split("_")
    if len(tokens) < 2:
        return None
    match = None
    for csv_path in csv_paths:
        csv_parts = re.split(r'[\\/]+', csv_path)
        if len(csv_parts) < 2:
            continue
        csv_dir = csv_parts[-2]
        if tokens[0] not in csv_dir or tokens[1] not in csv_dir:
            continue
        if "rnnattn" in csv_path and "rnnattn" not in ckpt_path:
            continue
        match = csv_path
    return match


for model_src in ckpt_list:
    print('working on: ',model_src,'\n')

    #resolve the analysis file for THIS checkpoint before doing any expensive work;
    #without a match we skip instead of reusing the file of a previously processed model
    save_dir = find_analysis_csv(model_src, analyses_list)
    if save_dir is None:
        print('no analysis file found for: ',model_src,' -> skipping')
        continue

    #find the model tag in the checkpoint file name and load that model
    model_key = find_model_key(model_src, model_dic)
    if model_key is None:
        raise ValueError('cannot infer the model type from checkpoint name: '+model_src)
    model = globals()[model_dic[model_key]](load_fn=model_src) #model classes are imported at notebook level

    cur_analysis = pd.read_csv(save_dir)
    print("analysis: ",save_dir, "checkpoint: ",model_src)
    #this will hold the number variables and save to CSV (old copies of our columns are dropped)
    save_df = cur_analysis.drop(columns=metric_columns, errors='ignore')

    #load the true labels
    data = pd.read_csv(example_data).to_numpy()
    data_1D = data[:,0] #gets rid of extra dimension

    #moving into memory and entropy: AAE/WAE return the means first, the other models second
    if model.model_type in ('aae','wae'):
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False)
    else:
        _, mus, _ = model.calc_mems(data[:65_000], log=False, save=False) #subset size 1200*35=42000 would be ok

    #shuffle latent means and sequences with the same random index, then cap the size to speed up
    random_idx = np.random.permutation(mus.shape[0])
    mus = mus[random_idx][:30_000]
    data = data[random_idx][:30_000]

    #need to perform PCA to be able to compare dimensionality reduction quality
    pca = PCA(n_components=2)
    pca_batch = pca.fit_transform(X=mus)

    #now ready to calculate dimensionality reduction accuracy with metrics
    trust_subsamples = []
    cont_subsamples = []
    lcmc_subsamples = []
    steadiness_subsamples = []
    cohesiveness_subsamples = []
    n = 15 #number of subsamples (currently the same for train and test, see test_train)
    parameter = { "k": 50,"alpha": 0.1 } #for steadiness and cohesiveness
    s_len = len(mus)//n #size of each consecutive, non-overlapping subsample
    for s in range(n):
        raw_chunk = mus[s_len*s:s_len*(s+1)]
        emb_chunk = pca_batch[s_len*s:s_len*(s+1)]

        Q = coranking.coranking_matrix(raw_chunk, emb_chunk)
        trust_subsamples.append( np.mean(trustworthiness(Q, min_k=1, max_k=50)) )
        cont_subsamples.append( np.mean(continuity(Q, min_k=1, max_k=50)) )
        lcmc_subsamples.append( np.mean(LCMC(Q, min_k=1, max_k=50)) )
        print(s,trust_subsamples[-1],cont_subsamples[-1],lcmc_subsamples[-1])

        metrics = SNC(raw=raw_chunk, emb=emb_chunk, iteration=300, dist_parameter=parameter)
        metrics.fit() #solve for steadiness and cohesiveness
        steadiness_subsamples.append(metrics.steadiness())
        cohesiveness_subsamples.append(metrics.cohesiveness())
        print(steadiness_subsamples[-1],cohesiveness_subsamples[-1])
        del Q, metrics #release the large intermediates before the next subsample
        torch.cuda.empty_cache() #free allocated CUDA memory

    #append the five metric columns (one row per subsample) next to the existing analysis columns
    metric_df = pd.DataFrame({'latent_to_PCA_trustworthiness':trust_subsamples,
                              'latent_to_PCA_continuity':cont_subsamples,
                              'latent_to_PCA_lcmc':lcmc_subsamples,
                              'latent_to_PCA_steadiness':steadiness_subsamples,
                              'latent_to_PCA_cohesiveness':cohesiveness_subsamples})
    save_df = pd.concat([save_df, metric_df], axis=1)

    save_df.to_csv(save_dir, index=False)

<H3> This cell concatenates missing saved_info information (usually not necessary when the notebook is run in order)

In [None]:
import glob
import os
import pandas as pd

new_root = "model_analyses\\test"
old_root = "model_analyses\\old\\test"
analyses_list = glob.glob(new_root+"//**/*o.csv", recursive=True) #grab all analyses
old_analyses_list = glob.glob(old_root+"//**/*o.csv", recursive=True)


def pair_by_relative_path(new_paths, old_paths, new_dir, old_dir):
    """Pair every path in ``new_paths`` with the path in ``old_paths`` that has the
    same location relative to its root directory (``new_dir`` / ``old_dir``).

    glob results are not guaranteed to come back in matching order, so pairing by
    position can silently merge files of different models. New files without an old
    counterpart are left out. Pairs are returned sorted by the new path.
    """
    old_by_rel = {os.path.relpath(path, old_dir): path for path in old_paths}
    pairs = []
    for path in sorted(new_paths):
        rel = os.path.relpath(path, new_dir)
        if rel in old_by_rel:
            pairs.append((path, old_by_rel[rel]))
    return pairs


csv_pairs = pair_by_relative_path(analyses_list, old_analyses_list, new_root, old_root)
paired_new = {new_csv for new_csv, _ in csv_pairs}
for unmatched in sorted(set(analyses_list) - paired_new):
    print('no old analysis found for: ', unmatched, ' -> left untouched')

#NOTE: each file is overwritten in place, so running this cell twice concatenates the old columns twice
for new_csv, old_csv in csv_pairs:
    print(new_csv)
    #local names deliberately avoid `analysis`, which is also the name of a module imported from transvae
    new_df = pd.read_csv(new_csv)
    old_df = pd.read_csv(old_csv)
    #drop the column range that the new file supersedes, keep everything else from the old file
    superseded = old_df.loc[:,'mu_entropies':'latent_to_PCA_cohesiveness'].columns
    old_df = old_df.drop(columns=superseded)
    merged_df = pd.concat([old_df,new_df], axis=1)
    merged_df.to_csv(new_csv,index=False)


<H3> This cell runs the python peptides package and finds physicochemical properties of peptide sequences

In [None]:
import peptides

# Build one property record per sequence. `data` is not defined in this cell: it must
# already exist in the kernel, with the sequence string in the first column of each row.
dict_list = []
for row in data:
    sequence = row[0]
    n_residues = len(sequence)
    peptide = peptides.Peptide(sequence)
    # charges and the hydrophobic moment are divided by the sequence length
    record = {
        "aliphatic_index": peptide.aliphatic_index(),
        "boman": peptide.boman(),
        "charge_ph3": peptide.charge(pH=3) / n_residues,
        "charge_ph7": peptide.charge(pH=7) / n_residues,
        "charge_ph11": peptide.charge(pH=11) / n_residues,
        "hydrophobic_moment": peptide.hydrophobic_moment() / n_residues,
        "hydrophobicity": peptide.hydrophobicity(),
        "instability_index": peptide.instability_index(),
        "isoelectric_point": peptide.isoelectric_point(),
        "molecular_weight": peptide.molecular_weight(),
    }
    dict_list.append(record)

In [None]:
# Turn the per-sequence property records into a table (one row per record)
# and write it to disk as CSV without the index column.
df = pd.DataFrame(data=dict_list)
df.to_csv(path_or_buf='data/train_physicochem_props.csv', index=False)

<H3> Special Extra section to perform particular analysis on select Peptides

In [None]:
gpu = True

num_sequences = 500_000
batch_size = 200 #setting for reconstruction
example_data = 'data\\peptides\\datasets\\uniprot_v3\\peptide_test.txt'
save_dir_loc = 'model_analyses\\sample\\' #folder in which to save outpts
save_dir_name = 'test' #appended to identify data: train|test|other|etc...

reconstruct=True #True:reconstruct data here; False:load reconstructions from file
recon_src = "checkpointz//analyses_ckpts//" #directory in which all reconstructions are stored
true_prop_src = 'data\\peptides\\datasets\\uniprot_v3\\function_test.txt' #if property predictor load the true labels
subset_src = "" #(optional) this file should have the true sequences for a subset of the "example data" above

ckpt_list = glob.glob(""+"checkpointz\\to_slurm//**//*.ckpt", recursive = True) #grab all checkpoint
print('current working directory: ',os.getcwd())

#maps the tag found in a checkpoint file name to the name of the model class to build;
#'rnnattn' is listed before 'rnn' so that the more specific tag is found first
model_dic = {'trans':'TransVAE','aae':'AAE','rnnattn':'RNNAttn','rnn':'RNN','wae':'WAE'}

for model_src in ckpt_list:
    #search the checkpoint file name for the model tag and load that model
    print('working on: ',model_src,'\n')
    ckpt_file = re.split(r'[\\/]+', model_src)[-1] #file name only, whichever separator glob produced
    model_name = [key for key in model_dic if key in ckpt_file]
    if not model_name:
        raise ValueError('cannot infer the model type from checkpoint name: '+model_src)
    model = globals()[model_dic[model_name[0]]](load_fn=model_src) #model classes are imported at notebook level

    #create save directory for the current model according to latent space size
    latent_size = re.findall(r'(latent[\d]{2,3})', model_src)
    if not latent_size:
        raise ValueError('no "latent<size>" tag found in checkpoint path: '+model_src)
    save_dir= save_dir_loc+model.name+"_"+latent_size[0]+"_"+save_dir_name
    os.makedirs(save_dir, exist_ok=True) #also creates missing parent folders
    save_dir= save_dir+"//"

    #load the true labels
    data = pd.read_csv(example_data).to_numpy()
    data_1D = data[:num_sequences,0] #gets rid of extra dimension
    s_to_f = pd.read_csv(true_prop_src) #read once, reused below for the function labels
    true_props_data = s_to_f.to_numpy()
    true_props = true_props_data[0:num_sequences,0]

    ##moving into memory and entropy: AAE/WAE return the means first, the other models second
    if model.model_type in ('aae','wae'):
        mus, _, _ = model.calc_mems(data[:65_000], log=False, save=False) #50_000
    else:
        mems, mus, logvars = model.calc_mems(data[:65_000], log=False, save=False) #subset size 1200*35=42000 would be ok
    #create random index and re-index ordered memory list
    random_idx = np.random.permutation(mus.shape[0])
    mus = mus[random_idx]
    shuf_data = data[random_idx]

    subsample_start=0
    subsample_length=mus.shape[0] #mus shape depends on batch size!
    subsample_end = subsample_start+subsample_length

    #(for length based coloring): record all peptide lengths iterating through input
    pep_lengths = [len(pep[0]) for pep in shuf_data[subsample_start:subsample_end]]
    #(for function based coloring): pull function from csv with peptide functions;
    #positional indexing keeps this aligned with `mus` even if subsample_start is not 0
    function = s_to_f['peptides'].iloc[subsample_start:subsample_end]
    function = function.iloc[random_idx] #account for random permutation

    pca = PCA(n_components=5,svd_solver='full')
    pca_batch = pca.fit_transform(X=mus)

    #list of AMPs of interest, shaped (n_amps, 1) like the rows of the sequence files
    probing_amps=np.array(('GKIIKLKASLKLL','GAIIKLKASLKLL','GKIIKLAASLKLL','GKIIKLKASLALL',
                           'GKIIKLKAALALL','GKIIALKASLKLL','IGIKLLKSKLKAL'))
    probing_amps=np.reshape(probing_amps,(len(probing_amps),1))

    model.params['BATCH_SIZE'] = 7 #one batch holding exactly the seven probe sequences
    if model.model_type =='aae' or model.model_type =='wae':
        probed_mus,_,_=model.calc_mems(probing_amps,log=False,save=False)
    else:
        _,probed_mus,_=model.calc_mems(probing_amps,log=False,save=False)
    reduced_amp_probes=pca.transform(X=probed_mus)

    #plotting
    titles={'text':'{}'.format(model.model_type.replace("_"," ").upper()),
            'x':0.5,'xanchor':'center','yanchor':'top','font_size':40}
    general_fonts={'family':"Helvetica",'size':30,'color':"Black"}
    #NOTE: colorbar_fmt and pep_lengths are prepared for length-based coloring but are not used by the plot below
    colorbar_fmt={'title_font_size':30,'thickness':15,'ticks':'','title_text':'Lengths',
                  'ticklabelposition':"outside bottom"}

    #need to add the probed amps to the data: every probe is labelled with its own sequence
    converted_function = ["AMP" if itm==1 else "NON-AMP" for itm in function]
    function_w_probe_amps = np.append(converted_function, probing_amps.flatten(),axis=0)
    pca_w_probe_amps = np.append(pca_batch,reduced_amp_probes, axis=0)

    fig = px.scatter_matrix(pd.DataFrame({"PC1":pca_w_probe_amps[:,0],"PC2":pca_w_probe_amps[:,1],
                                            "PC3":pca_w_probe_amps[:,2],"PC4":pca_w_probe_amps[:,3],
                                            "PC5":pca_w_probe_amps[:,4],"Function":function_w_probe_amps}),
                                            dimensions=["PC1","PC2","PC3","PC4","PC5"],template='simple_white',
                                            color='Function',
                                            symbol_sequence=['x-thin','circle',
                                                             'square-dot','square-dot','square-dot',
                                                             'square-dot','square-dot','square-dot','square-dot'],
                                            symbol='Function', opacity=0.8)
    fig.update_traces(diagonal_visible=False)
    fig.update_layout(title=titles,font=general_fonts)
    fig.write_image(save_dir+'pca_matrix_function.png', width=5_000, height=2500)

    #now we sample near the existing amps
    sample_count=10
    model.params['BATCH_SIZE'] = 25
    decode_batch = model.params['BATCH_SIZE']
    amp_sample_list=[]
    for idx,amp in enumerate(probed_mus):
        print("working on amp sample number: ",idx)
        current_mu=np.expand_dims(amp.astype(np.float32),0)
        #Gaussian noise (std 0.3) around the latent mean of the current probe
        nearby_samples = np.random.normal(loc=0,scale=1,size=(sample_count,1,model.params['d_latent'])).astype(np.float32)*0.3 + current_mu
        rnd_token_list=np.empty((sample_count,model.tgt_len)) #store N decoded latent vectors now in token(0-20) form max length 125
        for batch in range(0,sample_count,decode_batch):
            #squeeze only the singleton middle axis so a batch holding one sample keeps its batch axis
            latent_batch = torch.tensor(nearby_samples[batch:batch+decode_batch]).squeeze(1).cuda()
            rnd_token_list[batch:batch+decode_batch] = model.greedy_decode(latent_batch).cpu()
        #decode once, after every row of rnd_token_list has been filled
        decoded_rnd_seqs = decode_mols(torch.tensor(rnd_token_list), model.params['ORG_DICT'])
        amp_sample_list.append(decoded_rnd_seqs)
    with open(save_dir+'amp_sample_list.txt','w') as f: #the with-block closes the file
        for amp in amp_sample_list:
            f.write(str(amp)+'\n')
    