# Extract Unique Peptides

## Import Libraries

In [1]:
# import libraries
import os

import numpy as np
import pandas as pd

import torch

from transformers import BertTokenizerFast, BertModel


# import other files
from extract_features import load_features, load_pred_features
from model_training import PeptidesDataLoader
from models import BioNN

# constant: run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Load Model

In [2]:
# Reference feature table: everything except the first column, as a raw array.
# Only its column count is used below, to size the input of the classifier head.
# NOTE(review): the dropped first column is presumably the peptide sequence - confirm in load_features.
features_x = load_features().iloc[:, 1:].values

# Protein BERT encoder and its tokenizer, both taken from the same checkpoint name
checkpoint = 'unikei/bert-base-proteins'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

# Build the encoder from the hub checkpoint, then overwrite its weights with the locally saved ones
bert_model = BertModel.from_pretrained(checkpoint).to(device)
bert_state_dict = torch.load('./Model/bert_model.pth', map_location=device)
bert_model.load_state_dict(bert_state_dict)
bert_model.eval()  # inference mode (disables dropout)

# Fully connected head (BioNN). Its input is the flattened BERT embedding concatenated
# with the handcrafted features.
# NOTE(review): 768 * 10 presumably means hidden size 768 x 10 tokens per peptide - confirm.
bio_input_size = 768 * 10 + features_x.shape[1]
bio_model = BioNN(bio_input_size, labels_num=18).to(device)
bio_state_dict = torch.load('./Model/bio_model.pth', map_location=device)
bio_model.load_state_dict(bio_state_dict)
bio_model.eval()  # last expression of the cell: also displays the module structure

Some weights of BertModel were not initialized from the model checkpoint at unikei/bert-base-proteins and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BioNN(
  (nn1): Linear(in_features=10350, out_features=512, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (nn2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (nn3): Linear(in_features=256, out_features=18, bias=True)
)

## Load Data

In [3]:
def _build_prediction_inputs(mmp_id, batch_size=512):
    """Load the unique peptides of one MMP and wrap them in a prediction data loader.

    Parameters
    ----------
    mmp_id : int
        MMP number. Selects "./Data/MMP{mmp_id}_unique_sequence.csv" and is forwarded
        to ``load_pred_features``.
    batch_size : int, optional
        Batch size handed to ``PeptidesDataLoader`` (512 by default).

    Returns
    -------
    tuple
        ``(peptides, features, dataloader)``: the peptide sequences as a list, the
        feature matrix as a float tensor, and the loader built from both.
    """
    # the CSV has no header row; the sequences are taken from its first column
    peptides = pd.read_csv(f"./Data/MMP{mmp_id}_unique_sequence.csv", header=None).iloc[:, 0].values.tolist()

    # handcrafted features of the same peptides, with the peptide column dropped
    features = torch.from_numpy(load_pred_features(mmp_id).iloc[:, 1:].values).float()

    # the loader takes a labels argument, but labels are not used for prediction, so pass zeros
    placeholder_labels = np.zeros_like(features)
    dataloader = PeptidesDataLoader(peptides, features, placeholder_labels, batch_size=batch_size, shuffle=False)
    return peptides, features, dataloader


# mmp3 unique peptide sequences, their features and the matching data loader
mmp3_original_unique_peptides, mmp3_features_x, mmp3_dataloader = _build_prediction_inputs(3)

# mmp9 unique peptide sequences, their features and the matching data loader
mmp9_original_unique_peptides, mmp9_features_x, mmp9_dataloader = _build_prediction_inputs(9)

Extracting binary features: 1944it [00:00, 388902.36it/s]
Extracting cksaap features: 1944it [00:00, 49848.24it/s]
Extracting binary features: 134it [00:00, ?it/s]
Extracting cksaap features: 134it [00:00, 22333.18it/s]


In [4]:
def _predict_scores(dataloader):
    """Run the BERT encoder and the BioNN head over every batch of ``dataloader``.

    Parameters
    ----------
    dataloader
        Iterable yielding ``(peptides, features, labels)`` batches; the labels are ignored.

    Returns
    -------
    numpy.ndarray
        The BioNN outputs of all batches, concatenated along the first axis in loader order.
    """
    batch_outputs = []
    with torch.no_grad():  # inference only, no autograd graph is recorded
        for peptides_batch, features_batch, _ in dataloader:
            # No padding is requested from the tokenizer.
            # NOTE(review): this presumably relies on every peptide tokenizing to the same
            # length, matching the fixed input width of the BioNN head - confirm.
            tokens_batch = tokenizer(peptides_batch, return_tensors='pt').to(device)

            # flatten the per-token embeddings into one vector per peptide
            embedding = bert_model(**tokens_batch).last_hidden_state.view(len(peptides_batch), -1)

            # classifier input = flattened embedding followed by the handcrafted features
            bio_input = torch.cat([embedding, features_batch.to(device)], dim=1)
            batch_outputs.append(bio_model(bio_input).to("cpu"))

    # tensors created under no_grad do not require grad, so no detach() is needed before numpy()
    return torch.cat(batch_outputs, dim=0).numpy()


# predict mmp3 unique peptides' mmp cleavage efficiency scores
pred_mmp3 = _predict_scores(mmp3_dataloader)

# predict mmp9 unique peptides' mmp cleavage efficiency scores
pred_mmp9 = _predict_scores(mmp9_dataloader)

# save the prediction result (create the output directory first so a fresh checkout can run the cell)
os.makedirs("./Result", exist_ok=True)
np.save("./Result/pred_mmp3.npy", pred_mmp3)
np.save("./Result/pred_mmp9.npy", pred_mmp9)

In [5]:
# unique peptide sequences for mmp 3
# Drop column 2 (the score for mmp 3), because only the scores of the other mmps matter here.
# NOTE(review): column 2 is taken to be mmp 3 - confirm against the label order used in training.
pred_other_than_mmp3 = np.delete(pred_mmp3, 2, axis=1)

# number of unique peptides for mmp3 (cannot be cleaved by other mmps):
# rows in which every remaining score is below 0
int((pred_other_than_mmp3 < 0).all(axis=1).sum())

557

In [6]:
# unique peptide sequences for mmp 9
# Drop column 5 (the score for mmp 9), because only the scores of the other mmps matter here.
# NOTE(review): column 5 is taken to be mmp 9 - confirm against the label order used in training.
pred_other_than_mmp9 = np.delete(pred_mmp9, 5, axis=1)

# number of unique peptides for mmp9 (cannot be cleaved by other mmps):
# rows in which every remaining score is below 0
int((pred_other_than_mmp9 < 0).all(axis=1).sum())

58