In [1]:
import os
import pickle as pkl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from transvae import trans_models
from transvae.transformer_models import TransVAE
from transvae.tvae_util import *
from transvae import analysis
from scripts.parsers import model_init, train_parser

In [2]:
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'analysis_of_model_results.ipynb',
 'checkpointz',
 'data',
 'data_and_pca_embeddings',
 'input_data_analysis.ipynb',
 'LICENSE',
 'model_analyses',
 'notebook_model_training.ipynb',
 'output_graphing_notebook.ipynb',
 'pipeline.ipynb',
 'README.md',
 'scripts',
 'script_for_combined_model_analysis.ipynb',
 'structure-assessment.ipynb',
 'testing.ipynb',
 'total_model_analysis.py',
 'train_only_requirements.yml',
 'transvae',
 'trials']

In [3]:
os.listdir("data/")

['attn_weights_self_attn.npy',
 'attn_weights_src_attn.npy',
 'datasets',
 'dataset_construction',
 'function_test.txt',
 'function_train.txt',
 'peptide_test.txt',
 'peptide_train.txt',
 'sampled_physicochem_props.csv',
 'test_physicochem_props.csv',
 'train_physicochem_props.csv']

In [4]:
os.listdir("checkpointz/")

['amp_aae', 'amp_rnn', 'amp_rnnattn', 'amp_trans', 'amp_wae']

In [5]:
os.listdir("checkpointz/amp_trans/sunistarv4_emb128_latent64/")

['050_trans1x-128_peptide.ckpt',
 '100_trans1x-128_peptide.ckpt',
 '150_trans1x-128_peptide.ckpt',
 '200_trans1x-128_peptide.ckpt',
 '250_trans1x-128_peptide.ckpt',
 '300_trans1x-128_peptide.ckpt']

In [6]:
# Checkpoint to analyse (see the directory listing in the previous cell).
model_src = "checkpointz/amp_trans/sunistarv4_emb128_latent64/300_trans1x-128_peptide.ckpt"

# Raw checkpoint contents, mapped onto the CPU so that no GPU is required.
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints that come from a trusted source.
model_obj = torch.load(model_src, map_location=torch.device("cpu"))

# Build the model directly from the checkpoint file.
# NOTE(review): `workaround="cpu"` presumably forces CPU loading inside
# TransVAE -- confirm against the constructor.
model = TransVAE(load_fn=model_src, workaround="cpu")

# Make sure later calls on this model run on the CPU.
model.params['HARDWARE'] = 'cpu'


In [7]:
model.best_loss

inf

In [8]:
# Total number of scalar weights across every parameter tensor of the model.
total_parameters = sum(weight.numel() for weight in model.model.parameters())

In [9]:
total_parameters

1502007

## grab some data to see if model was loaded

In [10]:
os.listdir("data/")

['attn_weights_self_attn.npy',
 'attn_weights_src_attn.npy',
 'datasets',
 'dataset_construction',
 'function_test.txt',
 'function_train.txt',
 'peptide_test.txt',
 'peptide_train.txt',
 'sampled_physicochem_props.csv',
 'test_physicochem_props.csv',
 'train_physicochem_props.csv']

In [11]:
# Directory that holds the training sequences, their labels and the
# character-to-index dictionary.
data_fpath = "data/dataset_construction/final_dataset/"

train_seqs = pd.read_csv(os.path.join(data_fpath, "peptide_train.txt"))
train_fctn = pd.read_csv(os.path.join(data_fpath, "function_train.txt"))

# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open(os.path.join(data_fpath, "char_dict_peptide.pkl"), 'rb') as vocab_file:
    char_dict = pkl.load(vocab_file)

In [12]:
train_seqs.head()

Unnamed: 0,peptides
0,MKYCSQCGGTVALRIPDGDTRQRFVCGH
1,LTALCPPGRSYIRYSQICAQAVRAAMKPQYKAEAERAATATVKTVK...
2,MRGVVCLGLVLVHLL
3,SASEGPKFRETVTEFVEQIRALGPKSA
4,KINTINISYLVPGNRICIADSVNI


In [13]:
train_fctn.head()

Unnamed: 0,peptides
0,0
1,0
2,0
3,0
4,0


In [14]:
char_dict

{'<start>': 0,
 'M': 1,
 'K': 2,
 'Y': 3,
 'C': 4,
 'S': 5,
 'Q': 6,
 'G': 7,
 'T': 8,
 'V': 9,
 'A': 10,
 'L': 11,
 'R': 12,
 'I': 13,
 'P': 14,
 'D': 15,
 'F': 16,
 'H': 17,
 'E': 18,
 'N': 19,
 'W': 20,
 '_': 21,
 '<end>': 22}

In [15]:
def encode_seqs(df, stoi, max_len=50):
    """Turn peptide strings into fixed-layout lists of token ids.

    Each row becomes ``[<start>] + residues + [<end>] + padding`` where the
    padding brings the residue part up to ``max_len`` positions, i.e. every
    row has ``max_len + 2`` entries as long as the peptide is not longer
    than ``max_len``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'peptides'`` column of strings.
    stoi : dict
        Maps each residue character (and the special tokens ``'<start>'``,
        ``'<end>'`` and ``'_'``) to its integer id.
    max_len : int, default 50
        Number of residue positions to pad to.

    Returns
    -------
    pandas.DataFrame
        A new frame with a single ``'encoded_peptides'`` column and a fresh
        default index.

    Raises
    ------
    KeyError
        If a peptide contains a character that is missing from ``stoi``.

    Notes
    -----
    Peptides longer than ``max_len`` are not truncated; they receive no
    padding and therefore produce rows longer than ``max_len + 2``.
    """
    # Read the special-token ids from the vocabulary instead of hard-coding
    # them; the fallbacks are the ids that were previously hard-coded.
    start_tok = stoi.get("<start>", 0)
    end_tok = stoi.get("<end>", 22)
    pad_tok = stoi.get("_", 21)

    encoded_seqs = []
    for seq in df['peptides']:
        residues = [stoi[aa] for aa in seq]
        # A negative multiplier yields an empty list, so over-long peptides
        # simply get no padding.
        padding = [pad_tok] * (max_len - len(seq))
        encoded_seqs.append([start_tok] + residues + [end_tok] + padding)
    return pd.DataFrame({"encoded_peptides": encoded_seqs})
def decode_seq(encoded_seq, stoi):
    """Convert a list of token ids back into a peptide string.

    Leading and trailing special tokens (``'<start>'``, ``'<end>'`` and the
    padding token ``'_'``) are removed; anything between the first and the
    last ordinary token is kept as is.

    Parameters
    ----------
    encoded_seq : iterable of int
        Token ids, e.g. one entry of the ``'encoded_peptides'`` column.
    stoi : dict
        Token-to-id mapping; it is inverted internally.

    Returns
    -------
    str
        The decoded peptide, or ``""`` when only special tokens are present.

    Raises
    ------
    KeyError
        If an id in ``encoded_seq`` does not occur in ``stoi``.
    """
    itos = {idx: tok for tok, idx in stoi.items()}
    tokens = [itos[tok] for tok in encoded_seq]

    # Trim whole tokens rather than using str.strip("<start>"): str.strip
    # treats its argument as a *set of characters*, so it would also remove
    # residues such as 's', 't', 'a', 'r', 'e', 'n' or 'd' at either end.
    special = {"<start>", "<end>", "_"}
    first, last = 0, len(tokens)
    while first < last and tokens[first] in special:
        first += 1
    while last > first and tokens[last - 1] in special:
        last -= 1
    return "".join(tokens[first:last])

In [16]:
df = encode_seqs(train_seqs, char_dict)

In [17]:
df.shape

(243201, 1)

In [18]:
train_fctn.shape

(243201, 1)

In [19]:
# put them in same dataframe
df["amp_or_not"] = train_fctn['peptides']

In [20]:
print( (df['amp_or_not']==0).sum() )
print(df['amp_or_not'].sum())

234584
8617


In [21]:
print(train_seqs.iloc[0,0])
print(df.iloc[0,0])
print(decode_seq(df.iloc[0,0],char_dict))

MKYCSQCGGTVALRIPDGDTRQRFVCGH
[0, 1, 2, 3, 4, 5, 6, 4, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 7, 15, 8, 12, 6, 12, 16, 9, 4, 7, 17, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21]
MKYCSQCGGTVALRIPDGDTRQRFVCGH


## Need to grab some amps and non-amps and put into torch tensor

In [22]:
n_amps     = 500
n_non_amps = 500

# Fixed seed so that Restart & Run All reproduces the same sample.
SAMPLING_SEED = 42
sampling_rng = np.random.default_rng(SAMPLING_SEED)

all_amps     = df[df["amp_or_not"]==1]
all_non_amps = df[df["amp_or_not"]==0]

# Draw positions from the real size of each class (instead of hard-coded
# upper bounds) and without replacement, so every row is eligible and no
# peptide is picked twice.
indices_amps     = sampling_rng.choice(len(all_amps), size=n_amps, replace=False)
indices_non_amps = sampling_rng.choice(len(all_non_amps), size=n_non_amps, replace=False)
some_amps     = all_amps.iloc[indices_amps]
some_non_amps = all_non_amps.iloc[indices_non_amps]

In [23]:
# Decoded peptide strings, one single-element row per sequence:
# all sampled AMPs first, then all sampled non-AMPs.
sampled_peptides = [
    [decode_seq(encoded, char_dict)]
    for subset in (some_amps, some_non_amps)
    for encoded in subset["encoded_peptides"]
]

# Matching labels in the same order, read from the second column of each
# subset and likewise wrapped in single-element rows.
sampled_fctns = [
    [label]
    for subset in (some_amps, some_non_amps)
    for label in subset.iloc[:, 1].to_numpy()
]

In [24]:
sampled_peptides = np.array(sampled_peptides)
sampled_fctns = np.array(sampled_fctns)

In [25]:
sampled_peptides.shape

(1000, 1)

In [26]:
sampled_fctns.shape

(1000, 1)

In [27]:
some_amps.head()

Unnamed: 0,encoded_peptides,amp_or_not
235351,"[0, 2, 11, 2, 11, 2, 16, 2, 11, 2, 6, 22, 21, ...",1
75117,"[0, 4, 5, 4, 2, 19, 2, 9, 4, 3, 12, 19, 7, 9, ...",1
229443,"[0, 16, 11, 14, 13, 11, 7, 19, 11, 11, 5, 7, 1...",1
187571,"[0, 20, 13, 11, 10, 13, 14, 12, 12, 13, 12, 7,...",1
103721,"[0, 10, 5, 5, 7, 20, 9, 4, 8, 11, 8, 13, 18, 4...",1


In [28]:
def convert_to_torch(data_amps, data_non_amps):
    """Stack the encoded peptides of two frames into one 2-D tensor.

    Rows of ``data_amps`` come first, followed by the rows of
    ``data_non_amps``. The width of the tensor is the length of the first
    encoded sequence; longer sequences are cut to that width.

    Parameters
    ----------
    data_amps, data_non_amps : pandas.DataFrame
        Each must contain an ``'encoded_peptides'`` column whose entries are
        sequences of token ids.

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(len(data_amps) + len(data_non_amps), width)`` in
        torch's default floating-point dtype. An empty ``(0, 0)`` tensor is
        returned when both frames are empty.

    Raises
    ------
    ValueError
        If a sequence is shorter than the first one (ragged input).
    """
    # Concatenate first so that non-AMP rows land *after* all AMP rows.
    # Re-using the leftover loop index of a first loop as the offset would
    # be off by one and overwrite the last AMP row.
    rows = list(data_amps["encoded_peptides"]) + list(data_non_amps["encoded_peptides"])
    dtype = torch.get_default_dtype()
    if not rows:
        return torch.zeros(0, 0, dtype=dtype)
    max_len = len(rows[0])
    return torch.tensor([list(seq[:max_len]) for seq in rows], dtype=dtype)

In [29]:
tst=convert_to_torch(some_amps, some_non_amps)

In [30]:
import time

In [32]:
# Run the encoder and the reconstruction on all sampled peptides and time it.
# The batch size is set to the full sample (AMPs + non-AMPs).
model.params["BATCH_SIZE"] = n_amps + n_non_amps
t0 = time.time()
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # NOTE(review): presumably z / mu / logvar are the latent sample, mean and
    # log-variance per peptide -- confirm against TransVAE.calc_mems.
    z, mu, logvar = model.calc_mems(sampled_peptides, log=False,save=False)
    # reconstruct() receives peptides and labels side by side as two columns.
    # NOTE(review): np.c_ on a string array and a numeric array presumably
    # yields an all-string array -- confirm that reconstruct() expects this.
    decoded_seqs  = model.reconstruct(np.c_[sampled_peptides,sampled_fctns],log=False,return_mems=False)
# The recorded run of this cell on CPU reported roughly 767 s.
print(f"time elapsed = {round(time.time()-t0,5)}s")

cpu
decoding sequences of max length  125 current position:  0
decoding sequences of max length  125 current position:  100
time elapsed = 766.65501s


In [None]:
mu.shape

In [None]:
decoded_seqs[0][:5]

In [None]:
sampled_peptides[:5]

## project latent space using ...

### PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components=5)
pca.fit(mu)

In [None]:
pca_mu = pca.transform(mu)

In [None]:
# Scatter of the first two principal components, coloured by AMP label.
# One marker size per point (all identical).
marker_sizes = [8 for _ in range(len(pca_mu[:,0]))]

fig, ax = plt.subplots()
pca_points = ax.scatter(
    pca_mu[:,0],
    pca_mu[:,1],
    c=sampled_fctns.ravel(),  # 1-D label vector, one value per point
    s=marker_sizes,
)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_title("PCA projection of latent mu, coloured by amp-or-not")
fig.colorbar(pca_points, ax=ax)
plt.show()

### UMAP

In [None]:
import umap
import umap.plot

import matplotlib.cm as cm
import matplotlib.colors as colors

SAVE_FIGURES = False

In [None]:
# UMAP projections of `mu` for several neighbourhood sizes, coloured by the
# AMP label of each sampled peptide.

# Loop-invariant colouring set-up.
values = sampled_fctns[:,0]
cmap = "rainbow"
# create a scalar colour map for values
norm = colors.Normalize(values.min(), values.max())
scalar_map = cm.ScalarMappable(norm=norm, cmap=cmap)  # type: ignore

for nneighbours in [5,15,30,45,60]:
    mapper = umap.UMAP(n_neighbors=nneighbours)
    mapper = mapper.fit(mu)

    fig, ax = plt.subplots()
    # plot using umaps helper function
    ax = umap.plot.points(mapper, values=values, ax=ax, cmap=cmap)
    # create a colorbar
    cbar = fig.colorbar(scalar_map, ax=ax)  # type: ignore
    plt.title(f"64-dim'l latent space coloured by amp-or-not|{nneighbours}")

    # Save *before* showing and once per setting: after plt.show() the figure
    # is no longer current, and a single shared filename would be overwritten
    # on every iteration.
    # NOTE(review): figure_dir, IMAGE_FTYPE and DPI are not defined in the
    # cells visible here -- define them before enabling SAVE_FIGURES.
    if SAVE_FIGURES:
        fig.savefig(
            figure_dir + f"umap-amp-or-not-nn{nneighbours}." + IMAGE_FTYPE,
            dpi=DPI
        )
    plt.show()
    # Release the figure so that the loop does not accumulate open figures.
    plt.close(fig)

### t-SNE

In [38]:
os.cpu_count()

8