In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training, AdaLoraConfig

import logging

from datasets import load_dataset

raw_datasets  = load_dataset("glue", 'rte')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


model_name = "microsoft/deberta-v3-base"

#config.num_labels=2
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [3]:
from transformers import AutoTokenizer, DataCollatorWithPadding


tokenizer.padding_side = 'left'
mask_token = tokenizer.mask_token



# col_to_delete = ['idx']
col_to_delete = ['sentence1','sentence2']

def preprocessing_function(examples):
    prompts = [
        f"Premise: {premise} Hypothesis: {hypothesis} "
        f"Does the premise imply the hypothesis? Answer:{mask_token}"
        for premise, hypothesis in zip(examples["sentence1"], examples["sentence2"])
    ]
    return tokenizer(prompts, padding = False, truncation=True, max_length=512)

tokenized_datasets = raw_datasets.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
tokenized_datasets.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 2490/2490 [00:00<00:00, 8811.77 examples/s]
Map: 100%|██████████| 277/277 [00:00<00:00, 9311.19 examples/s]
Map: 100%|██████████| 3000/3000 [00:00<00:00, 10156.31 examples/s]


In [4]:
tokenizer.decode(tokenized_datasets['validation']['input_ids'][10])

'[CLS] Premise: The international humanitarian aid organization, Doctors Without Borders/Medecins Sans Frontieres (MSF), continues to treat victims of violence in all locations where it is present in Darfur. Hypothesis: Doctors Without Borders is an international aid organization. Does the premise imply the hypothesis? Answer:[MASK][SEP]'

In [5]:
tokenizer.mask_token_id

128000

In [6]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers import AutoModelForSequenceClassification
from transformers.activations import ACT2FN
import random
# from modeling import MLMSequenceClassification

config = AutoConfig.from_pretrained(model_name)
config.mask_token_id=tokenizer.mask_token_id

model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config, )

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import RoCoFT

RoCoFT.PEFT(model, method='column', rank=3) 
#targets=['key', 'value', 'dense', 'query'])

In [8]:
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): column()
              (key_proj): column()
              (value_proj): column()
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): column()
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): DebertaV2Intermediat

In [9]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):


    logits, labels = eval_pred # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    
    precision = metrics.precision_score(labels, predictions, average="macro")
    recall = metrics.recall_score(labels, predictions, average="macro")
    f1 = metrics.f1_score(labels, predictions, average="macro")
    accuracy = metrics.accuracy_score(labels, predictions)
    
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [10]:
from transformers import TrainingArguments, Trainer

import time
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='dir',
    learning_rate=2e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.20,
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=10000000,
    logging_steps=100,
   
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],

    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [11]:
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
100,0.7005,0.682763,0.613458,0.533279,0.444673,0.555957
200,0.6919,0.687468,0.613423,0.52907,0.433201,0.552347
300,0.6965,0.691412,0.263538,0.5,0.345154,0.527076
400,0.6967,0.693924,0.236462,0.5,0.321078,0.472924
500,0.696,0.693107,0.764493,0.503817,0.353547,0.530686
600,0.6945,0.691806,0.263538,0.5,0.345154,0.527076
700,0.6941,0.693257,0.236462,0.5,0.321078,0.472924
800,0.696,0.691512,0.263538,0.5,0.345154,0.527076
900,0.6946,0.697006,0.236462,0.5,0.321078,0.472924
1000,0.6952,0.698315,0.386577,0.47093,0.339706,0.447653


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=3120, training_loss=0.4992836372974591, metrics={'train_runtime': 593.9058, 'train_samples_per_second': 83.852, 'train_steps_per_second': 5.253, 'total_flos': 19094072276928.0, 'train_loss': 0.4992836372974591, 'epoch': 20.0})