## Imports

In [1]:
!pip install transformers
!pip install sklearn
!pip install torch

from transformers import BertModel, BertTokenizer,TrainingArguments, Trainer, BatchEncoding

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import pandas as pd
from sklearn.model_selection import train_test_split
import random
import numpy as np
import re

# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Collecting sklearn
  Using cached sklearn-0.0.post9.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post9-py3-none-any.whl size=2951 sha256=d9a5e2d01c3bc62a5b1cc6ebfc2d3ecdcee81baed3693a1980722c363ff77f7e
  Stored in directory: /home/lferraz/.cache/pip/wheels/ef/63/d1/f1671e1e93b7ef4d35df483f9b2485e6dd21941da9a92296fb
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post9


## Preparação do dataset

In [2]:
class SequenceViabilityDataset:
  """Pairs each sequence with its label and provides index-based splits.

  Items are ``(sequence, label)`` tuples, so a list of items can be fed
  straight to ``torch.utils.data.DataLoader``.
  """

  def __init__(self, sequences, labels):
    """Store parallel collections of sequences and labels.

    Raises:
      ValueError: if ``sequences`` and ``labels`` differ in length.
    """
    if len(sequences) != len(labels):
      raise ValueError(
        f"sequences and labels must have the same length "
        f"({len(sequences)} != {len(labels)})"
      )
    self.sequences = sequences
    self.labels = labels

  def __getitem__(self, i):
    """Return the ``(sequence, label)`` pair stored at position ``i``."""
    return self.sequences[i], self.labels[i]

  def __len__(self):
    """Return the number of stored sequences."""
    return len(self.sequences)

  def get_idx_split(self, train_size=0.8, max_samples=None, random_state=None):
    """Randomly split the sample indices into train / test / validation sets.

    Args:
      train_size: fraction of the samples assigned to the training split.
        The remainder is divided evenly between test and validation.
      max_samples: optional cap on how many samples (taken from the start of
        the dataset) take part in the split. Useful for quick debugging runs
        on a small subset; ``None`` uses the whole dataset.
      random_state: optional seed forwarded to ``train_test_split`` so the
        split can be reproduced.

    Returns:
      dict with keys ``"train"``, ``"test"`` and ``"valid"``, each mapping to
      a list of integer indices into the dataset.
    """
    num_samples = len(self.sequences)
    if max_samples is not None:
      num_samples = min(num_samples, max_samples)
    all_ids = list(range(num_samples))

    # First carve out the training indices, then halve what is left.
    train_ids, holdout_ids = train_test_split(
      all_ids, train_size=train_size, random_state=random_state
    )
    test_ids, val_ids = train_test_split(
      holdout_ids, train_size=0.5, random_state=random_state
    )
    return {"train": train_ids, "test": test_ids, "valid": val_ids}

## Load dos dados

In [3]:
# Read the raw table; later cells use its "sequence" and "viability_CMV" columns.
df = pd.read_csv(filepath_or_buffer="./datasets/reconstructed_sequences_viability_f.csv")

## Preprocessing dos dados

In [4]:
# Build (spaced_sequence, label) pairs, skipping rows without a viability label.
all_data = []
for raw_sequence, viability in zip(df["sequence"], df["viability_CMV"]):
  if np.isnan(viability):
    continue

  # Upper-case the residues and separate them with single spaces.
  spaced = " ".join(raw_sequence.upper())
  # Replace the residue codes U, Z, O and B with the placeholder X.
  all_data.append((re.sub(r"[UZOB]", "X", spaced), int(viability)))
  

## Instanciação do dataset

In [5]:
# Split the (sequence, label) pairs into the two parallel lists the dataset expects.
dataset = SequenceViabilityDataset(
  sequences=[sequence for sequence, _ in all_data],
  labels=[label for _, label in all_data],
)

## Train test validation split

In [6]:
# Draw the index split once, then materialise each subset as a list of
# (sequence, label) items in the order train, test, valid.
split_idx = dataset.get_idx_split()
train_data, test_data, valid_data = (
  [dataset[i] for i in split_idx[subset]]
  for subset in ("train", "test", "valid")
)

## Criação de batches

In [7]:

# Number of sequences per batch.
# NOTE(review): the training run recorded in this notebook ends in a CUDA
# out-of-memory error; lowering this value is one of the first things to try.
BATCH_SIZE = 10

# Only the training loader is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)

# Sanity check: show the first few training batches. `zip` with a bounded range
# stops pulling from the loader once the preview is done instead of iterating
# over the whole training set.
NUM_PREVIEW_BATCHES = 5
for _, (sequences, labels) in zip(range(NUM_PREVIEW_BATCHES), train_loader):
  print(sequences, labels)

('M A A D G Y L P D W L E D T L S E G I R Q W W K L K P G P P P P K P A E R H K D D S R G L V L P G Y K Y L G P F N G L D K G E P V N E A D A A A L E H D K A Y D R Q L D S G D N P Y L K Y N H A D A E F Q E R L K E D T S F G G N L G R A V F Q A K K R V L E P L G L V E E P V K T A P G K K R P V E H S P V E P D S S S G T G K A G Q Q P A R K R L N F G Q T G D A D S V P D P Q P L G Q P P A A P S G L G T N T M A T G S G A P M A D N N E G A D G V G N S S G N W H C D S T W M G D R V I T T S T R T W A L P T Y N N H L Y K Q I S S Q S G A S N D N H Y F G Y S T P W G Y F D F N R F H C H F S P R D W Q R L I N N N W G F R P K R L N F K L F N I Q V K E V T Q N D G T T T I A N N L T S T V Q V F T D S E Y Q L P Y V L G S A H Q G C L P P F P A D V F M V P Q Y G Y L T L N N G S Q A V G R S S F Y C L E Y F P S Q M L R T G N N F T F S Y T F E D V P F H S S Y A H S Q S L D R L M N P L I D Q Y L Y Y L S R T N T P S G T T T Q S R L Q F S Q A G A S D I R D Q S R N W L P G P C Y R Q Q R V S K T S A D N N N S E 

## Definição do classifier

In [8]:
class ProtBERTClassifier(torch.nn.Module):
    """Sequence-level classifier: a BERT-style backbone plus a linear head.

    Args:
        model: backbone module. It must expose ``config.hidden_size`` and,
            when called with the tokenizer output as keyword arguments,
            return an object with a ``last_hidden_state`` tensor.
        num_classes: number of outputs produced by the linear head.
        freeze_backbone: when True, the backbone parameters are excluded from
            gradient computation so only the linear head is trained. This
            reduces the memory needed for training considerably.
    """

    def __init__(self, model, num_classes, freeze_backbone=False):
        super().__init__()
        self.model = model
        if freeze_backbone:
            for parameter in self.model.parameters():
                parameter.requires_grad_(False)
        self.classify = torch.nn.Linear(model.config.hidden_size, num_classes)

    def forward(self, sequences):
        """Return one set of logits per input sequence.

        Args:
            sequences: mapping of tokenizer outputs (e.g. ``input_ids``,
                ``attention_mask``) already placed on the model's device.

        Returns:
            Tensor of shape ``(batch, num_classes)``; when ``num_classes`` is 1
            the trailing dimension is dropped, giving shape ``(batch,)``.
        """
        hidden_states = self.model(**sequences).last_hidden_state

        # Summarise each sequence by its first token (the [CLS] token when the
        # tokenizer adds special tokens). Applying the head to every token
        # instead would yield one "logit" per position rather than per class.
        pooled = hidden_states[:, 0]

        # Squeeze only the class dimension so a batch of one keeps its batch axis.
        return self.classify(pooled).squeeze(-1)

## Instanciação do classifier

In [9]:
# ProtBERT tokenizer; do_lower_case=False keeps the upper-case residue letters.
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

# Load the pretrained backbone, wrap it with a single-output head and place
# everything on the selected device (Module.to returns the module itself).
bert_model = BertModel.from_pretrained("Rostlab/prot_bert").to(device)
model = ProtBERTClassifier(bert_model, 1).to(device)
model

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ProtBERTClassifier(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30, 1024, padding_idx=0)
      (position_embeddings): Embedding(40000, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-29): 30 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-1

## Definição do training loop

In [10]:
def test_model(model, tokenizer, test_dataloader):
    num_correct = 0
    num_tests = 0
    with torch.no_grad():
        for batched_sequences, labels in test_dataloader:
            #print(batched_sequences, labels)
            tokens = tokenizer(batched_sequences, return_tensors='pt', padding=True).to(device)
            #print(2)
            pred = model(tokens)
            num_correct += (pred.argmax(1) == torch.Tensor(labels).to(device)).sum().item()
            num_tests += len(labels)

    accuracy = num_correct / num_tests
    print("acc =", accuracy)
    return accuracy

def train_model(model, tokenizer, train_dataloader, valid_dataloader, epochs=10, lr=0.01):
  
  optimizer = torch.optim.Adam(model.parameters(), lr)

  patience = 3
  delta = 0.5
  c = 0
  min_so_far = np.inf

  for epoch in range(epochs):
      print("Epoch", epoch+1)

      # Training loss
      for batched_sequences, labels in train_dataloader:


          tokens = tokenizer(batched_sequences, return_tensors='pt', padding=True).to(device)

          pred = model(tokens)

          training_loss = F.cross_entropy(pred, torch.Tensor(labels).to(device))

          optimizer.zero_grad()

          training_loss.backward()

          optimizer.step()
      # Early stopping
      # Comparar validation loss com training loss
      
      test_model(model, tokenizer, test_loader)

      # Sem afetar o modelo
      total_validation_loss = 0
      with torch.no_grad():
        for batched_sequences, labels in valid_dataloader:

            tokens = tokenizer(batched_sequences, return_tensors='pt', padding=True).to(device)

            pred = model(tokens)

            valid_loss = F.cross_entropy(pred, torch.Tensor(labels).to(device))
            
            total_validation_loss += valid_loss.item()
            

      # Se a validation loss for a menor até agora, atualizar e dar reset ao counter
      if total_validation_loss < min_so_far:
        min_so_far = valid_loss.item()
        c = 0
      # Se a validation loss for maior que a menor até agora + delta, incrementar counter
      elif total_validation_loss > min_so_far + delta:
        c += 1

        # Se a validation_loss aumentou vezes suficientes de seguida, parar
        if c >= patience:
          break
  return model

In [11]:

# Fine-tune for at most 5 epochs; validation data drives early stopping.
trained_model = train_model(
    model=model,
    tokenizer=tokenizer,
    train_dataloader=train_loader,
    valid_dataloader=valid_loader,
    epochs=5,
)

Epoch 1


OutOfMemoryError: CUDA out of memory. Tried to allocate 332.00 MiB (GPU 0; 23.69 GiB total capacity; 21.90 GiB already allocated; 155.38 MiB free; 22.48 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF