In [1]:
#imports
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, Trainer, TrainingArguments, BertConfig, BertForSequenceClassification, AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers.integrations import TensorBoardCallback
from transformers import RobertaTokenizerFast, RobertaForMaskedLM
from scipy.stats import spearmanr
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error
from sklearn.utils import shuffle
import re
import tqdm



In [8]:
# Run configuration shared by the cells below.

# Tokenizer / sequence settings.
TOKENIZER_PATH = "../ProteinTransformersResearch/Tokenizers/Proberta512"  # relative path to the tokenizer directory
MAX_LENGTH = 512  # handed to ProteinDegreeDataset as its max_length

# Training settings.
# NOTE(review): the empty string and zeros look like unfilled placeholders —
# confirm real values are set before any training cell relies on them.
OUTPUT_DIR = ""
EPOCHS = 0
LEARNING_RATE = 0
BATCH_SIZE = 0

In [9]:
class ProteinDegreeDataset(Dataset):
    """Map-style dataset that pairs protein sequences with a regression target.

    The target is the ``Degree`` column of a CSV file after a natural-log
    transform followed by standardisation (subtract the mean, divide by the
    standard deviation). Both statistics are computed from the file being
    loaded. Tokenization happens lazily, once per ``__getitem__`` call.
    """

    def __init__(self, max_length, data_path, tokenizer):
        """Load the CSV at ``data_path`` and remember the tokenizer settings.

        Args:
            max_length: Value forwarded to the tokenizer as ``max_length``.
            data_path: Path of the CSV file handed to ``load_dataset``.
            tokenizer: Callable invoked as
                ``tokenizer(text, truncation=..., padding=..., max_length=...)``
                whose result exposes ``.items()``.
        """
        sequences, targets = self.load_dataset(data_path)
        self.seqs = sequences
        self.labels = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_dataset(self, path):
        """Read the CSV and return ``(sequences, standardised_log_degrees)``.

        The file's first row is skipped and the four columns are named
        explicitly; only ``Sequence`` and ``Degree`` are used afterwards.

        Returns:
            A pair of lists of equal length: the raw sequence values and the
            standardised log-degree values as floats.
        """
        column_names = ['Labels', 'Sequence', 'Degree', 'Tokenized Sequence']
        frame = pd.read_csv(path, names=column_names, skiprows=1)

        # Log-transform first, then z-score using statistics of the logged values.
        log_degree = np.log(frame['Degree'])
        centred = log_degree - np.mean(log_degree)
        standardised = centred / np.std(log_degree)

        return list(frame['Sequence']), list(standardised.astype(float))

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize one sequence and return it together with its label.

        Args:
            idx: Example index; a tensor index is converted with ``.tolist()``.

        Returns:
            A dict holding one tensor per field returned by the tokenizer,
            plus a float tensor under ``'labels'``.
        """
        index = idx.tolist() if torch.is_tensor(idx) else idx

        # Drop all whitespace, then put a single space between every character.
        compact = "".join(self.seqs[index].split())
        spaced = " ".join(compact)
        # Replace the residue codes U, Z, O and B with X.
        cleaned = re.sub(r"[UZOB]", "X", spaced)

        encoding = self.tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

        sample = {}
        for field, values in encoding.items():
            sample[field] = torch.tensor(values)
        sample['labels'] = torch.tensor(self.labels[index], dtype=torch.float)
        return sample

In [10]:
# Load the fast RoBERTa tokenizer from the directory named by TOKENIZER_PATH.
# TOKENIZER_PATH is a relative path; the recorded output of this cell shows the
# call raising OSError because the tokenizer could not be loaded from it.
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_PATH)

OSError: Can't load tokenizer for '../ProteinTransformersResearch/Tokenizers/Proberta512'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '../ProteinTransformersResearch/Tokenizers/Proberta512' is the correct path to a directory containing all relevant tokenizer files.

In [None]:
# Show the tokenizer object as the cell output to inspect what was loaded.
tokenizer

In [None]:
# Smoke test: tokenize a short literal string and display the returned encoding.
tokenizer('ABC')

In [None]:
# Build the training dataset from the CSV at '../degree_tokenized.csv'.
# MAX_LENGTH is forwarded to the tokenizer as max_length (with truncation and
# 'max_length' padding) each time an item is fetched.
train_dataset = ProteinDegreeDataset(MAX_LENGTH, '../degree_tokenized.csv',tokenizer)