In [1]:
from copy import deepcopy
import torch
from tape import ProteinBertModel, TAPETokenizer
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange

In [2]:
normal_df = pd.read_csv('normal_peptide_df.csv', index_col=0)
hans_df = pd.read_csv('hans_peptide_df.csv', index_col=0)

In [3]:
normal_peptides = normal_df.peptide_seq
normal_labels = normal_df.is_peptide

hans_peptides = hans_df.peptide_seq
hans_labels = hans_df.is_peptide

In [4]:
normal_protbert = ProteinBertModel.from_pretrained('bert-base', output_hidden_states = True, output_attentions=True)
hans_protbert = ProteinBertModel.from_pretrained('bert-base', output_hidden_states = True, output_attentions=True)
tokenizer = TAPETokenizer(vocab='iupac') 

In [5]:
def print_protein_output(model, sequence):
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    output = model(token_ids)
    print("Protein Sequence: ", sequence)
    print("\nEncoded Input Length: ", len(token_ids[0]))
    print("\nEncoded Input: ", token_ids)
    print("\nOutput Sequence Length: ", output[0].shape)
    print("\nPooled Output Size: ", output[1].shape)
    print("\nOutput Sequence: ", output[0])
    print("\nPooled Output: ", output[1])

def print_layers_and_return_weights(model):
    parameters = deepcopy(model.state_dict())
    print("Model's state_dict:")
    for param_tensor in parameters:
        print(param_tensor, "\t", parameters[param_tensor].size())
    print()
    return parameters

In [6]:
print_protein_output(normal_protbert, 'GCTVEDR')

Protein Sequence:  GCTVEDR

Encoded Input Length:  9

Encoded Input:  tensor([[ 2, 11,  7, 23, 25,  9,  8, 21,  3]])

Output Sequence Length:  torch.Size([1, 9, 768])

Pooled Output Size:  torch.Size([1, 768])

Output Sequence:  tensor([[[ 0.5911,  1.0075,  0.8251,  ...,  0.5309,  0.6056, -0.6089],
         [ 0.9178, -0.7448, -0.7677,  ...,  0.3638,  0.4572, -0.5632],
         [ 0.9718,  0.5218, -0.8213,  ...,  0.8715,  0.4985, -1.2027],
         ...,
         [ 0.0147, -0.7276, -1.7229,  ...,  0.6857, -0.6894, -1.2196],
         [-0.9114, -1.2325, -0.6967,  ...,  0.4279, -0.3354, -0.5608],
         [-0.5925,  0.2265, -0.6207,  ...,  0.6646, -0.7102, -0.3488]]],
       grad_fn=<AddBackward0>)

Pooled Output:  tensor([[-7.9848e-02, -6.5225e-01, -3.8316e-01, -4.3976e-01, -2.2729e-01,
         -2.8057e-01, -5.0535e-01, -2.0483e-01,  3.8906e-01,  6.7004e-01,
         -3.6246e-02,  1.1568e-02, -3.1143e-01,  3.2744e-01, -4.3844e-02,
          7.9236e-01, -7.8314e-01,  4.1429e-01, -7.2975e-01

  


In [7]:
protein_parameters = print_layers_and_return_weights(normal_protbert)

Model's state_dict:
embeddings.word_embeddings.weight 	 torch.Size([30, 768])
embeddings.position_embeddings.weight 	 torch.Size([8192, 768])
embeddings.token_type_embeddings.weight 	 torch.Size([1, 768])
embeddings.LayerNorm.weight 	 torch.Size([768])
embeddings.LayerNorm.bias 	 torch.Size([768])
encoder.layer.0.attention.self.query.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias 	 torch.Size([768])
encoder.layer.0.attention.self.key.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias 	 torch.Size([768])
encoder.layer.0.attention.self.value.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias 	 torch.Size([768])
encoder.layer.0.attention.output.dense.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias 	 torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight 	 torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias 	 torch.Size([768])
encoder.layer.0.intermediate.dense.weig

encoder.layer.8.output.dense.bias 	 torch.Size([768])
encoder.layer.8.output.LayerNorm.weight 	 torch.Size([768])
encoder.layer.8.output.LayerNorm.bias 	 torch.Size([768])
encoder.layer.9.attention.self.query.weight 	 torch.Size([768, 768])
encoder.layer.9.attention.self.query.bias 	 torch.Size([768])
encoder.layer.9.attention.self.key.weight 	 torch.Size([768, 768])
encoder.layer.9.attention.self.key.bias 	 torch.Size([768])
encoder.layer.9.attention.self.value.weight 	 torch.Size([768, 768])
encoder.layer.9.attention.self.value.bias 	 torch.Size([768])
encoder.layer.9.attention.output.dense.weight 	 torch.Size([768, 768])
encoder.layer.9.attention.output.dense.bias 	 torch.Size([768])
encoder.layer.9.attention.output.LayerNorm.weight 	 torch.Size([768])
encoder.layer.9.attention.output.LayerNorm.bias 	 torch.Size([768])
encoder.layer.9.intermediate.dense.weight 	 torch.Size([3072, 768])
encoder.layer.9.intermediate.dense.bias 	 torch.Size([3072])
encoder.layer.9.output.dense.weight 	

In [8]:
protein_parameters['pooler.dense.weight']

tensor([[ 0.0056, -0.0231, -0.0246,  ..., -0.0461, -0.0209,  0.0173],
        [-0.0294,  0.0147,  0.0025,  ...,  0.0099,  0.0012, -0.0167],
        [ 0.0018,  0.0007,  0.0062,  ...,  0.0042, -0.0200,  0.0193],
        ...,
        [-0.0159,  0.0119, -0.0263,  ...,  0.0081, -0.0236,  0.0112],
        [-0.0004, -0.0026,  0.0323,  ..., -0.0019,  0.0177, -0.0051],
        [ 0.0217, -0.0533, -0.0376,  ..., -0.0180, -0.0017,  0.0175]])

#### The above was a demo, let's now create two protein BERT models with a classifier layer at the end


In [9]:
import torch.nn as nn

In [10]:
class PeptideClassificationModel(nn.Module):
    def __init__(self):
        super(PeptideClassificationModel, self).__init__()
        
        self.base_model = ProteinBertModel.from_pretrained('bert-base', output_hidden_states = True, output_attentions=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(768, 2)
        
    def forward(self, input_ids):
        outputs = self.base_model(input_ids)
        outputs = self.dropout(outputs[1])
        outputs = self.linear(outputs)
        
        return outputs

    
normal_model = PeptideClassificationModel()
normal_model.to('cuda')
normal_optimizer = torch.optim.AdamW(normal_model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )




hans_model = PeptideClassificationModel()
hans_model.to('cuda')
hans_optimizer = torch.optim.AdamW(hans_model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

In [11]:
def padding_all(tensor_list, max_len):
    #max_len = max([len(t) for t in tensor_list])
    padded_tensors = []
    for t in tensor_list:
        pad_len = max_len - len(t)
        padded_tensor = torch.cat([t, torch.zeros(pad_len, dtype=t.dtype)])
        padded_tensors.append(padded_tensor)
    return padded_tensors

In [29]:
token_ids = torch.tensor([tokenizer.encode("QQDD")]).to("cuda")
print(normal_model(token_ids))

tensor([[0.0674, 0.2855]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [13]:
normal_token_id = []

for sample in normal_peptides:
    sample_token = torch.tensor(tokenizer.encode(sample))
    normal_token_id.append(sample_token)

    
normal_token_id = padding_all(normal_token_id, 32)
print(normal_token_id)
normal_token_id = torch.stack(normal_token_id, dim = 0)
normal_labels = torch.tensor(normal_labels)




hans_token_id = []

for sample in hans_peptides:
  sample_token = torch.tensor(tokenizer.encode(sample))
  hans_token_id.append(sample_token) 

    
hans_token_id = padding_all(hans_token_id, 32)    
hans_token_id = torch.stack(hans_token_id, dim = 0)
hans_labels = torch.tensor(hans_labels)

[tensor([ 2, 28, 25, 12,  8, 17,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 2, 19, 14, 19,  9, 19, 14, 19, 14,  3,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 2, 14, 20, 15, 11, 21, 13, 16, 15,  3,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 2,  5,  5, 15,  9,  5, 14, 13,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 2,  5,  5, 11,  5, 25, 16, 21, 11, 10, 25, 10, 23,  3,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 2,  5, 11, 21, 11, 21, 19, 11, 19, 11, 15,  3,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]), tensor([ 2, 12, 10, 12, 12, 19,  9, 23, 22, 21, 19,  8, 22, 17,  3,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,

In [14]:
print(normal_token_id[0])
print(normal_labels[0])

tensor([ 2, 28, 25, 12,  8, 17,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor(1)


### Normal Data DataLoader

In [15]:
seed_val = 42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)


val_ratio = 0.3
batch_size = 16


train_idx, val_idx = train_test_split(
    np.arange(len(normal_labels)),
    test_size = val_ratio,
    stratify = normal_labels,
    random_state = 42)

# Train and validation sets
normal_train_set = TensorDataset(normal_token_id[train_idx], 
                          normal_labels[train_idx])

normal_val_set = TensorDataset(normal_token_id[val_idx], 
                        normal_labels[val_idx])

# Prepare DataLoader
normal_train_dataloader = DataLoader(
            normal_train_set,
            sampler = RandomSampler(normal_train_set),
            batch_size = batch_size
        )

normal_validation_dataloader = DataLoader(
            normal_val_set,
            sampler = SequentialSampler(normal_val_set),
            batch_size = batch_size
        )

### Hans Data DataLoader

In [16]:
seed_val = 42
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)


val_ratio = 0.3
batch_size = 16


train_idx, val_idx = train_test_split(
    np.arange(len(hans_labels)),
    test_size = val_ratio,
    stratify = hans_labels,
    random_state = 42)

# Train and validation sets
hans_train_set = TensorDataset(hans_token_id[train_idx], 
                          hans_labels[train_idx])

hans_val_set = TensorDataset(hans_token_id[val_idx], 
                        hans_labels[val_idx])

# Prepare DataLoader
hans_train_dataloader = DataLoader(
            hans_train_set,
            sampler = RandomSampler(hans_train_set),
            batch_size = batch_size
        )

hans_validation_dataloader = DataLoader(
            hans_val_set,
            sampler = SequentialSampler(hans_val_set),
            batch_size = batch_size
        )

#### Define Loss Function

In [17]:
loss_fn = nn.CrossEntropyLoss()

#### Function that calculates accuracy

In [18]:
def calculate_accuracy(preds, labels):
  return sum([preds == labels for preds, labels in zip(preds, labels)])


def return_accuracy(preds, labels):
  preds = np.argmax(preds, axis = 1).flatten()
  labels = labels.flatten()  
  return calculate_accuracy(preds, labels) / len(labels)

## Fine-Tuning the Normal TAPE BERT on Normal Dataset

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    normal_model.train()
    
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(normal_train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        normal_optimizer.zero_grad()
        train_output = normal_model(b_input_ids)
        
        loss = loss_fn(train_output, b_labels)
        loss.backward()
        
        # Backward pass
        normal_optimizer.step()
        # Update tracking variables
        tr_loss += loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========
    normal_model.eval()

    # Tracking variables 
    val_accuracy = []


    for batch in normal_validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = normal_model(b_input_ids)
            
        logits = eval_output.cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        accuracy = return_accuracy(logits, label_ids)
        val_accuracy.append(accuracy)
        

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))


Epoch:  33%|███▎      | 1/3 [00:10<00:20, 10.08s/it]


	 - Train loss: 0.7147
	 - Validation Accuracy: 0.5025


Epoch:  67%|██████▋   | 2/3 [00:20<00:10, 10.10s/it]


	 - Train loss: 0.7074
	 - Validation Accuracy: 0.5554


Epoch: 100%|██████████| 3/3 [00:30<00:00, 10.10s/it]


	 - Train loss: 0.6887
	 - Validation Accuracy: 0.4954





In [20]:
torch.cuda.empty_cache()

## Fine-Tuning the Hans TAPE BERT on Hans Dataset

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

epochs = 3

for _ in trange(epochs, desc = 'Epoch'):
    hans_model.train()
    
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(hans_train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        hans_optimizer.zero_grad()
        train_output = hans_model(b_input_ids)
        
        loss = loss_fn(train_output, b_labels)
        loss.backward()
        
        # Backward pass
        hans_optimizer.step()
        # Update tracking variables
        tr_loss += loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ========== Validation ==========
    hans_model.eval()

    # Tracking variables 
    val_accuracy = []


    for batch in hans_validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
          # Forward pass
          eval_output = hans_model(b_input_ids)
            
        logits = eval_output.cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        accuracy = return_accuracy(logits, label_ids)
        val_accuracy.append(accuracy)
        

    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))

Epoch:  33%|███▎      | 1/3 [00:10<00:20, 10.11s/it]


	 - Train loss: 0.1053
	 - Validation Accuracy: 0.9720


Epoch:  67%|██████▋   | 2/3 [00:20<00:10, 10.13s/it]


	 - Train loss: 0.0208
	 - Validation Accuracy: 1.0000


Epoch: 100%|██████████| 3/3 [00:30<00:00, 10.14s/it]


	 - Train loss: 0.0131
	 - Validation Accuracy: 1.0000





### Save the models

In [46]:
torch.save(normal_model.state_dict(), "normal_model.pth")
torch.save(hans_model.state_dict(), "hans_model.pth")

### Test Time Comparison

In [40]:
def compare_models(sequence, model1, model2):
    token_ids = torch.tensor([tokenizer.encode(sequence)]).to("cuda")
    output1 = model1(token_ids)[0]
    output2 = model2(token_ids)[0]
    print("Sequence: ", sequence)
    print("Model 1 predicted: ", output1)
    print("Model 2 predicted: ", output2)

In [42]:
compare_models("KQLGRIML", normal_model, hans_model)

Sequence:  KQLGRIML
Model 1 predicted:  tensor([-0.0582,  0.4019], device='cuda:0', grad_fn=<SelectBackward0>)
Model 2 predicted:  tensor([ 3.0676, -2.8883], device='cuda:0', grad_fn=<SelectBackward0>)


In [43]:
compare_models("QQQQAPKL", normal_model, hans_model)

Sequence:  QQQQAPKL
Model 1 predicted:  tensor([-0.0048,  0.4109], device='cuda:0', grad_fn=<SelectBackward0>)
Model 2 predicted:  tensor([-2.4143,  2.2280], device='cuda:0', grad_fn=<SelectBackward0>)
