In [56]:
#imports
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, Trainer, TrainingArguments, BertConfig, BertForSequenceClassification, AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback
from transformers import RobertaTokenizerFast, RobertaForMaskedLM
from scipy.stats import spearmanr
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error
from sklearn.utils import shuffle
import re
import tqdm

In [57]:
#useful constants
MAX_LENGTH = 512
EPOCHS = 50
LEARNING_RATE = 5e-7
BATCH_SIZE = 256
TOKENIZER_PATH = "../Tokenizers/Proberta512"
PRETRAINED_MODEL = "../Models/Proberta512/Best_Checkpoint"

In [58]:
class ProteinDegreeDataset(Dataset):

    def __init__(self, max_length, data_path, tokenizer):
        self.seqs, self.labels = self.load_dataset(data_path)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_dataset(self,path):
        df = pd.read_csv(path,names=['Sequence','Degree','Tokenized Sequence'],skiprows=1)
        df['Degree'] = np.log(df['Degree'])
        df['Degree'] = (df['Degree'] - np.mean(df['Degree']) )/ np.std(df['Degree'])
    
        seq = list(df['Sequence'])
        label = list(df['Degree'].astype(float))

        return seq, label

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        seq = " ".join("".join(self.seqs[idx].split()))
        seq = re.sub(r"[UZOB]", "X", seq)

        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [59]:
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH)

Didn't find file ../Tokenizers/Proberta512/tokenizer.json. We won't load it.
Didn't find file ../Tokenizers/Proberta512/added_tokens.json. We won't load it.
Didn't find file ../Tokenizers/Proberta512/special_tokens_map.json. We won't load it.
Didn't find file ../Tokenizers/Proberta512/tokenizer_config.json. We won't load it.
loading file ../Tokenizers/Proberta512/vocab.json
loading file ../Tokenizers/Proberta512/merges.txt
loading file None
loading file None
loading file None
loading file None
file ../Tokenizers/Proberta512/config.json not found
file ../Tokenizers/Proberta512/config.json not found


In [60]:
tokenizer

PreTrainedTokenizerFast(name_or_path='../Tokenizers/Proberta512', vocab_size=10000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

In [61]:
tokenizer('ABC')

{'input_ids': [0, 37, 38, 39, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [62]:
train_data_path = '../Datasets/Degree_tokenized_split_after_clustering/degree_train.csv'
val_data_path = '../Datasets/Degree_tokenized_split_after_clustering/degree_valid.csv'


In [63]:
train_dataset = ProteinDegreeDataset(MAX_LENGTH, train_data_path, tokenizer)
val_dataset = ProteinDegreeDataset(MAX_LENGTH, val_data_path, tokenizer)

In [64]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    mse = mean_squared_error(labels, preds)
    residuals = []
    for i in range(len(labels)):
        residuals.append(labels[i] - pred[i])
    res_std = np.std(residuals)
    spearman = spearmanr(labels, preds)
    return {
        'mse' : mse,
        'res_std' : res_std,
        'spearman' : spearman[0]
    }

In [65]:
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=1)
    return model

In [66]:
OUTPUT_DIR = f'../Models/DegreeRegressionProberta512EPOCHS{EPOCHS}BATCH_SIZE{BATCH_SIZE}LEARNING_RATE{LEARNING_RATE}'

In [67]:
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

In [68]:
training_args = TrainingArguments(
    output_dir = OUTPUT_DIR + '/Checkpoints',
    num_train_epochs = EPOCHS,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 16,
    warmup_steps = 1000,
    learning_rate = LEARNING_RATE,
    logging_dir = OUTPUT_DIR + '/Logs',
    logging_steps = 200,
    do_train = True,
    do_eval = True,
    evaluation_strategy = 'epoch',
    gradient_accumulation_steps = BATCH_SIZE,
    fp16 = True,
    fp16_opt_level = '02',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [69]:
class DegreeRegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = nn.MSELoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [70]:
trainer = DegreeRegressionTrainer(
    model_init = model_init,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics
)

loading configuration file ../Models/Proberta512/Best_Checkpoint/config.json
Model config RobertaConfig {
  "_name_or_path": "../Models/Proberta512/Best_Checkpoint",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 10000
}

loading weights file ../Models/Proberta512/Best_Checkpoint/pytorch_model.bin
Some weights of the model checkpoint 

In [71]:
trainer.train()

loading configuration file ../Models/Proberta512/Best_Checkpoint/config.json
Model config RobertaConfig {
  "_name_or_path": "../Models/Proberta512/Best_Checkpoint",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.16.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 10000
}

loading weights file ../Models/Proberta512/Best_Checkpoint/pytorch_model.bin
Some weights of the model checkpoint 

Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 810
  Batch size = 16
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


IndexError: tuple index out of range