In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import needed libraries

In [None]:
from dataclasses import dataclass, field
from transformers import RobertaForMaskedLM, RobertaTokenizerFast, RobertaModel, DefaultDataCollator, DataCollatorWithPadding, DataCollatorForLanguageModeling, Trainer, LongformerForMaskedLM,  LongformerTokenizerFast
from transformers import TrainingArguments, HfArgumentParser, AutoModelForMaskedLM, AutoModel, AutoTokenizer
from datasets import load_dataset, Dataset
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from accelerate import Accelerator, DeepSpeedPlugin
from torchmetrics import AUROC, AveragePrecision
from transformers import RobertaConfig
from tokenizers.implementations import CharBPETokenizer
from tokenizers.processors import BertProcessing
from safetensors.torch import load_file
import threading
import os
import torch.nn as nn
import logging
import math
import copy
import torch

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

## Data prepocessing

### Download and extract FASTA files

In [None]:
#!wget -O interaction.protein.gz  "https://rest.uniprot.org/idmapping/uniprotkb/results/stream/b36da80dd9f77ce4d631842f8d85fe80785b0702?compressed=true&format=fasta"
#!gzip -df "result.fasta.gz"

### Create directories for dataset

In [None]:
!mkdir dataset
!mkdir dataset/train
!mkdir dataset/val
!mkdir dataset/test
!mkdir aminobert

### Reading and extraction of DNA sequences from FASTA files

In [None]:
# Read sequences from fasta files
def preprocess_fasta(file_name):
    fastas = []
    with open(file_name, "r") as f:
        fasta_sequence = ""
        while(True):
            line1 = f.readline()
            if not line1:
                break
            if line1[0] == ">":
                fastas.append(fasta_sequence)
                fasta_sequence = ""
                continue
            fasta_sequence += line1.strip()
            
    fastas.append(fasta_sequence)
    return fastas[1:]

fastas = preprocess_fasta("result.fasta")
#fastas = fastas[:len(fastas)\\10]
train, val = train_test_split(fastas, train_size = 0.8)
train, test = train_test_split(train, train_size = 0.8)

### Write DNA sequences to files as datasets

In [None]:
# Write sequence data to file (dataset)
def write_to_file(file, sequences):
    with open(file, "w", encoding='utf-8') as f:
        for seq in sequences:
            f.write(seq)
            f.write("\n")

write_to_file("dataset/train/data.txt", train)
write_to_file("dataset/test/data.txt", test)
write_to_file("dataset/val/data.txt", val)

### Create interaction files

In [None]:
notfound = "data/9606.notfound.txt"
idfile = "data/9606.proteins.0.9.txt"
newids = "proteinids.txt"
positive_interact = "data/9606.links.0.9.txt"
negative_interact = "data/9606.negatives.0.9.txt"

with open(notfound) as f:
    notprotids = f.readlines()

with open(idfile) as f:
    protids = f.readlines()

for notid in notprotids:
    protids.remove(notid)

with open(newids, "w") as f:
    f.writelines(protids)

notprotids = list(map(lambda x:x.strip(), notprotids))
pos_inter = []
with open("positive_inter.txt", "w") as w:
    with open(positive_interact, "r") as f:
        for row in f.readlines():
            els = row.strip().split()[:2]
            if els[0] in notprotids:
                continue
            if els[1] in notprotids:
                continue
            pos_inter.append(row)
    w.writelines("".join(pos_inter))
        
neg_inter = []
with open("negative_inter.txt", "w") as w:
    with open(negative_interact, "r") as f:
        for row in f.readlines():
            els = row.strip().split()[:2]
            if els[0] in notprotids:
                continue
            if els[1] in notprotids:
                continue
            neg_inter.append(row)
        
    w.writelines("".join(neg_inter))

In [None]:

pos_interact = "positive_inter.txt"
neg_interact = "negative_inter.txt"

with open(newids, "r") as f:
    protids = list(map(lambda x: x.strip(), f.readlines()))

prot1 = preprocess_fasta("interaction.protein")
print(len(prot1))

data = {"ID": protids,
        "FASTA": prot1}
proteins = pd.DataFrame(data)
proteins = proteins.set_index("ID")

with open(pos_interact, "r") as f:
        pos_inter = np.array(list(map(lambda x: x.strip().split()[:2], f.readlines())))

with open(neg_interact, "r") as f:
        neg_inter = np.array(list(map(lambda x: x.strip().split()[:2], f.readlines())))

#prot2 = preprocess_fasta("/kaggle/working/target.fasta")

trainpos, valpos = train_test_split(pos_inter, train_size = 0.8)
trainpos, testpos = train_test_split(pos_inter, train_size = 0.8)
trainneg, valneg = train_test_split(neg_inter, train_size = 0.8)
trainneg, testneg = train_test_split(neg_inter, train_size = 0.8)

train = np.concatenate([trainpos, trainneg])
val = np.concatenate([valpos, valneg])
test = np.concatenate([testpos, testneg])
train_prot1 = proteins.loc[train[:,0], "FASTA"].values
train_prot2 = proteins.loc[train[:,1], "FASTA"].values
val_prot1 = proteins.loc[val[:,0], "FASTA"].values
val_prot2 = proteins.loc[val[:,1], "FASTA"].values
test_prot1 = proteins.loc[test[:,0], "FASTA"].values
test_prot2 = proteins.loc[test[:,1], "FASTA"].values

train_labels = [1]*len(trainpos) + [0]*len(trainneg)
val_labels = [1]*len(valpos) + [0]*len(valneg)
test_labels = [1]*len(testpos) + [0]*len(testneg)

with open("train_interactions.csv", "w") as f:
       f.write("fasta1,fasta2,labels\n")
       for el1, el2, y in zip(train_prot1, train_prot2, train_labels):
              f.write(f"{el1},{el2},{y}\n")

with open("val_interactions.csv", "w") as f:
       f.write("fasta1,fasta2,labels\n")
       for el1, el2, y in zip(val_prot1, val_prot2, val_labels):
              f.write(f"{el1},{el2},{y}\n")

with open("test_interactions.csv", "w") as f:
       f.write("fasta1,fasta2,labels\n")
       for el1, el2, y in zip(test_prot1, test_prot2, test_labels):
              f.write(f"{el1},{el2},{y}\n")


## Pretraining

### Enable CUDA

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Enable XLA

In [5]:
#import torch_xla
#import torch_xla.core.xla_model as xm
#device = xm.xla_device()
#print(device)

### Initialize an Argument Parser for transformers.TrainingArguments

In [11]:
@dataclass
class ModelArgs:
    attention_window: int = field(default=512, metadata={"help": "Size of attention window"})
    max_pos: int = field(default=4096, metadata={"help": "Maximum position"})

parser = HfArgumentParser((TrainingArguments, ModelArgs,))

### Define tokenize function and pretraining and evaluation function

In [12]:
def tokenize_function(tokenizer, examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    data_files = {"val": args.val_datapath}
    
    if eval_only:
        data_files["train"] = data_files["val"]
    else:
        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        data_files["train"] = args.train_datapath

    datasets = load_dataset("text", data_files=data_files)
    datasets = datasets.map(lambda x: tokenize_function(tokenizer, x), batched=True)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=data_collator,
                      train_dataset=datasets["train"], eval_dataset=datasets["val"],)
        
    print("I'm evaluating...")
    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    print(f"Initial eval bpc: {eval_loss/math.log(2)}")

    if not eval_only:
        print("I'm training...")
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
        print(f"Eval bpc after pretraining: {eval_loss/math.log(2)}")
        

### Define arguments for pretraining (TrainingArguments)
Temporary parameters. In the real model, parameters need to be "optimized", or in any case chosen using appropriate heuristics.<br>
Scaling laws will be used to calculate the optimal amount of training steps/parameters. Learning rate, cosine annealing cycle length and decay and other information on choosing hyperparameters can be found in this paper https://arxiv.org/pdf/2203.15556 and this paper https://openreview.net/pdf?id=Bx6qKuBM2AD.


In [13]:
training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[
    '--output_dir', 'tmp',
    '--learning_rate', '2e-4',
    '--weight_decay', '0.01',
    '--adam_epsilon', '1e-6',
    '--max_steps', '2000',
    '--logging_steps', '100',
    '--save_steps', '500',
    '--max_grad_norm', '5.0',
    '--per_device_eval_batch_size', '8',
    '--per_device_train_batch_size', '2',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '32',
    '--do_train',
    '--do_eval',
    #'--num_train_epochs', '1',
    '--tpu_num_cores', '8',                      # Number of TPU cores (typically 8)
    '--lr_scheduler_type', 'cosine',
    '--warmup_ratio', '0.1',
    # This cosine scheduler drops to min lr rate of zero, not of 10x less the initial lr like in the paper

    #'--lr_scheduler_kwargs', '{"num_cycles": 0.5}',           
    # This drops approximately 10x  
    '--lr_scheduler_kwargs', '{"num_cycles": 0.41}',            

    #'--warmup_steps', '500',

])

training_args.val_datapath = './dataset/val/data.txt'
training_args.train_datapath = './dataset/train/data.txt'
training_args.prediction_loss_only = True
print("Device:", training_args.device)


Device: cuda:0


### Define tokenizer and model

In [None]:
# ------------------------  Train tokenizer on DNA sequences --------------------------
files = ['./dataset/train/data.txt','./dataset/val/data.txt'] 
roberta_base_tokenizer = CharBPETokenizer()

# Customize training (change vocab size)
roberta_base_tokenizer.train(files=files, vocab_size=1000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

#roberta_base_tokenizer._tokenizer.post_processor = BertProcessing(
#    ("</s>", roberta_base_tokenizer.token_to_id("</s>")),
#    ("<s>", roberta_base_tokenizer.token_to_id("<s>")),
#)

roberta_base_tokenizer.enable_truncation(max_length=512)
roberta_base_tokenizer.save_model("./aminobert")

# -------------------------------------------------------------------------------------







KeyboardInterrupt: 

In [17]:

# ---------------------  Define model and resize token embeddings ---------------------

roberta_base_tokenizer = RobertaTokenizerFast.from_pretrained("./aminobert")

roberta_base = RobertaForMaskedLM.from_pretrained('roberta-base')
roberta_base.resize_token_embeddings(len(roberta_base_tokenizer))
print(f"Roberta parameters: {int(roberta_base.num_parameters()/1000000)}M")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


Roberta parameters: 86M


### Create a smaller version of RoBERTa (for testing) and pretrain it

In [18]:
def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full bert model
    oldModuleList = model.roberta.encoder.layer
    newModuleList = nn.ModuleList()

    # Now iterate over all layers, only keepign only the relevant layers.
    for i in range(0, num_layers_to_keep):
        newModuleList.append(oldModuleList[i])

    # create a copy of the model, modify it with the new list, and return
    copyOfModel = copy.deepcopy(model)
    copyOfModel.roberta.encoder.layer = newModuleList

    return copyOfModel
    
small_roberta = deleteEncodingLayers(roberta_base,6)
print(f"Small roberta parameters: {int(small_roberta.num_parameters()/1000000)}M")



Small roberta parameters: 43M


In [None]:
pretrain_and_evaluate(training_args, small_roberta, roberta_base_tokenizer, eval_only=False, model_path=None)

INFO:__main__:Loading and tokenizing training data is usually slow: ./dataset/train/data.txt


Map:   0%|          | 0/41010 [00:00<?, ? examples/s]

Map:   0%|          | 0/131228 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


I'm evaluating...
Trainer: Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 41010
})
Eval loop:  False
Prediction_loss_only: True
True
  Num examples = 41010


  0%|          | 0/5127 [00:00<?, ?it/s]

INFO:__main__:Initial eval bpc: 12.693354281302868


Output metrics: {'eval_loss': 8.798362731933594, 'eval_runtime': 851.7556, 'eval_samples_per_second': 48.148, 'eval_steps_per_second': 6.019}
Initial eval bpc: 12.693354281302868
I'm training...




  0%|          | 0/2000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Assuming `model` is your pretrained model and `tokenizer` is your tokenizer
#output_dir = "./pretrained"

# Save the model
#small_roberta.save_pretrained(output_dir)

# Save the tokenizer
#roberta_base_tokenizer.save_pretrained(output_dir)


('./pretrained/tokenizer_config.json',
 './pretrained/special_tokens_map.json',
 './pretrained/vocab.json',
 './pretrained/merges.txt',
 './pretrained/added_tokens.json',
 './pretrained/tokenizer.json')

## Finetuning

In [None]:
# Load the model for sequence classification
model_encoder = AutoModelForMaskedLM.from_pretrained("./pretrained")
model_encoder = deleteEncodingLayers(model_encoder,6).roberta

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at ./pretrained and are newly initialized: ['roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'roberta.encoder.layer.10.attention.output.LayerNorm.weight', 'roberta.encoder.layer.10.attention.output.dense.bias', 'roberta.encoder.layer.10.attention.output.dense.weight', 'roberta.encoder.layer.10.attention.self.key.bias', 'roberta.encoder.layer.10.attention.self.key.weight', 'roberta.encoder.layer.10.attention.self.query.bias', 'roberta.encoder.layer.10.attention.self.query.weight', 'roberta.encoder.layer.10.attention.self.value.bias', 'roberta.encoder.layer.10.attention.self.value.weight', 'roberta.encoder.layer.10.intermediate.dense.bias', 'roberta.encoder.layer.10.intermediate.dense.weight', 'roberta.encoder.layer.10.output.LayerNorm.bias', 'roberta.encoder.layer.10.output.LayerNorm.weight', 'roberta.encoder.layer.10.output.dense.bias', 'roberta.encoder.layer.10.output.dense.weight', 'roberta.e

"\nroberta_base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')\nroberta_base = RobertaForMaskedLM.from_pretrained('roberta-base')\nfiles = ['./dataset/train/data.txt','./dataset/val/data.txt'] \nroberta_base_tokenizer = roberta_base_tokenizer.train_new_from_iterator(files, 1000)\n\nroberta_base.resize_token_embeddings(len(roberta_base_tokenizer))\nsmall_roberta = deleteEncodingLayers(roberta_base,6)\n"

In [None]:

class InputDataset(Dataset):
    def __init__(self, interactions_file, tokenizer1, tokenizer2):
        interactions = pd.read_csv(interactions_file)
        
        if interactions.shape[1] == 3:
            self.targets = torch.tensor(interactions.iloc[:,2].values, dtype=torch.long)
        elif interactions.shape[1] != 2:
            raise Exception(f"Invalid input file: input should have shape (N, 3) or (N,2), but has shape {self.interactions.shape}.")
        
        self.inputs = interactions.iloc[:,:2].values
        self.tokenizer1 = tokenizer1
        self.tokenizer2 = tokenizer2


    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        input1 = self.tokenizer1(list(np.atleast_1d(self.inputs[idx, 0])),
                            padding="max_length", 
                            truncation=True, 
                            max_length=512,
                            return_tensors="pt")
        input2 = self.tokenizer2(list(np.atleast_1d(self.inputs[idx, 1])),
                            padding="max_length", 
                            truncation=True, 
                            max_length=512,
                            return_tensors="pt")
        
        input1["input_ids"] = input1["input_ids"].view(-1)
        input2["input_ids"] = input2["input_ids"].view(-1)


        return ((input1, input2), self.targets[idx])


In [23]:
class InteractionModel(nn.Module):
    def __init__(self, encoder, num_labels):
        super().__init__()
        self.encoder = encoder
        self.linear_gelu_stack = nn.Sequential(
            nn.Linear(1536, 1024),
            nn.Dropout(0.2),
            nn.GELU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.2),
            nn.GELU(),
            nn.Linear(512, num_labels),
        ) 
        
    def forward(self, x1, x2):
        
        y1 = self.encoder(**x1).last_hidden_state
        y2 = self.encoder(**x2).last_hidden_state
        z = torch.concatenate([y1.mean(dim=1), y2.mean(dim=1)], dim=1)
        y = self.linear_gelu_stack(z)
        return y


In [24]:

tokenizer1 = AutoTokenizer.from_pretrained("./pretrained")
tokenizer2 = AutoTokenizer.from_pretrained("./pretrained")
res = tokenizer1("ACGCGCAG", max_length=512, padding="max_length", return_tensors="pt")
res1 = tokenizer1(["ACGCGCAG", "DACDAFV"] , max_length=512,  padding="max_length", return_tensors="pt")
res2 = tokenizer2(["ACGCGCCDFAG", "CADCAEDF"] , max_length=512,  padding="max_length", return_tensors="pt")
model_encoder.eval()
p1 = model_encoder(**res1).last_hidden_state
p2 = model_encoder(**res2).last_hidden_state
z = torch.concatenate([p1.mean(dim=1), p2.mean(dim=1)], dim=1)

print(z.shape)
model_encoder.config


#datasets = datasets.map(lambda x: finetune_tokenize(tokenizer, x), batched=True)
#tokenizer(datasets["train"]["text"][:100])


torch.Size([2, 1536])


RobertaConfig {
  "_name_or_path": "./pretrained",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.41.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 1000
}

In [None]:
train_interactions_file = "train_interactions.csv"
val_interactions_file = "val_interactions.csv"

train_parameters = {"train_batch_size": 4,
                    "device": "cuda",
                    "learning_rate": 5e-5,
                    "adam_epsilon": 1e-6,
                    "gradient_accumulation_steps": 16,
                    "num_training_steps":500,
                    "log_performance_every":100,
                    }

model = InteractionModel(model_encoder, 2)
train_dataset = InputDataset(train_interactions_file, tokenizer1, tokenizer2)
val_dataset = InputDataset(val_interactions_file, tokenizer1, tokenizer2)

def evaluate_model(cpu_model, data_loader, thread_id):
    cpu_model.eval() 

    auroc_metric = AUROC(task="multiclass", num_classes=2)
    auprc_metric = AveragePrecision(task="multiclass", num_classes=2)
    
    eval_n = 0

    with torch.no_grad():
        for source, targets in data_loader:
      
            if eval_n > 100:
                break
                        
            output = cpu_model(source[0], source[1])
            auroc_metric.update(output, targets)
            auprc_metric.update(output, targets)

            eval_n += 1

    auroc_score = auroc_metric.compute()
    auprc_score = auprc_metric.compute()
    
    print(f"Step {thread_id} - AUROC val: {auroc_score:.4f}, AUPRC val: {auprc_score:.4f}")
      
#add weight decay
def finetune_and_evaluate(model):
    deepspeed_plugin = DeepSpeedPlugin(zero_stage=3, 
                                       offload_param_device = "cpu",
                                        offload_optimizer_device = "cpu"
                                       )
    accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin, gradient_accumulation_steps=train_parameters["gradient_accumulation_steps"])
    device = accelerator.device

    auroc_metric_train = AUROC(task="multiclass", num_classes=2).to(device)
    auprc_metric_train = AveragePrecision(task="multiclass", num_classes=2).to(device)

    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=train_parameters["learning_rate"], eps=train_parameters["adam_epsilon"], weight_decay=0.01)
    train_loader = DataLoader(train_dataset, batch_size=train_parameters["train_batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, shuffle=True)

    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)
    print("DEVICE USED:", accelerator.device)
    print("NUM PROCESSES:", accelerator.num_processes)

    model.train()

    step = 0  # Initialize the step variable
    max_steps = train_parameters["num_training_steps"]
    criterion = torch.nn.CrossEntropyLoss()
    while step < max_steps:

        for train_source, train_targets in train_loader:
            with accelerator.accumulate(model):
                output = model(train_source[0], train_source[1])
                loss = criterion(output, train_targets)
                grad_mean = []

                accelerator.backward(loss)
                auroc_metric_train.update(output, train_targets)
                auprc_metric_train.update(output, train_targets)

                # Compute metrics at a frequency of your choice (e.g., every 5 steps)
                if (step + 1) % train_parameters["log_performance_every"] == 0:
                    auroc_score_train = auroc_metric_train.compute()
                    auprc_score_train = auprc_metric_train.compute()
                    auroc_metric_train.reset()
                    auroc_metric_train.reset()
                    # compute metrics for validation set
                    cpu_model = copy.deepcopy(model).cpu()
                    eval_thread = threading.Thread(
                        target=evaluate_model, 
                        args=(cpu_model, val_loader, step+1)
                    )
                    eval_thread.start()

            
                optimizer.step()

                for param in model.parameters():
                    if param.grad is not None:
                        #print(param.grad.mean())
                        grad_mean.append(param.grad.mean().cpu().detach())
                
                #plt.plot(grad_mean)

                optimizer.zero_grad()  # Reset gradients

                if (step + 1) % train_parameters["log_performance_every"] == 0:
                    print(f"Step {step + 1} - Loss: {loss.item():.4f}, AUROC: {auroc_score_train:.4f}, AUPRC: {auprc_score_train:.4f}")

                step += 1

                if step >= max_steps:
                    break
                

finetune_and_evaluate(model)

DEVICE USED: cuda
NUM PROCESSES: 1
Step 100 - Loss: 0.6549, AUROC: 0.5196, AUPRC: 0.5222
Step 100 - AUROC val: 0.4994, AUPRC val: 0.5152
Step 200 - Loss: 0.6820, AUROC: 0.5094, AUPRC: 0.5214
Step 200 - AUROC val: 0.6207, AUPRC val: 0.6703
Step 300 - Loss: 0.6445, AUROC: 0.6034, AUPRC: 0.5461
Step 300 - AUROC val: 0.5646, AUPRC val: 0.6114
Step 400 - Loss: 0.6008, AUROC: 0.5900, AUPRC: 0.5513
Step 400 - AUROC val: 0.6143, AUPRC val: 0.6437
Step 500 - Loss: 0.7124, AUROC: 0.6133, AUPRC: 0.5637


Step 500 - AUROC val: 0.5619, AUPRC val: 0.5998


## Longformer

In [None]:
longformer = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
longformer_base_tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')



In [None]:
pretrain_and_evaluate(training_args, longformer, longformer_base_tokenizer, eval_only=False, model_path=None)
