In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import numpy as np
import torch

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training, AdaLoraConfig

import logging

from datasets import load_dataset

raw_datasets  = load_dataset("glue", 'cola')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification


model_name = "microsoft/deberta-v3-base"

#config.num_labels=2
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding


tokenizer.padding_side = 'left'
mask_token = tokenizer.mask_token



# col_to_delete = ['idx']
col_to_delete = ['question','sentence']

def preprocessing_function(examples):
    prompts = [
        f"Sentence: {sentence} Is this sentence grammatically correct? Answer{mask_token}"
        for sentence in examples["sentence"]
    ]
    return tokenizer(prompts, truncation=True, max_length=512)

tokenized_datasets = raw_datasets.map(preprocessing_function, batched=True)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
tokenized_datasets.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 8551/8551 [00:00<00:00, 46201.75 examples/s]
Map: 100%|██████████| 1043/1043 [00:00<00:00, 41264.92 examples/s]
Map: 100%|██████████| 1063/1063 [00:00<00:00, 32608.63 examples/s]


In [5]:
tokenizer.decode(tokenized_datasets['validation']['input_ids'][10])

'[CLS] Sentence: The more Fred is obnoxious, the less attention you should pay to him. Is this sentence grammatically correct? Answer[MASK][SEP]'

In [6]:
tokenizer.mask_token_id

128000

In [7]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification
from transformers.activations import ACT2FN
import random
from modeling import MLMSequenceClassification

config = AutoConfig.from_pretrained(model_name, num_labels=2)
config.mask_token_id = 50264
model = MLMSequenceClassification.from_pretrained(model_name, config=config, mask_token_id=tokenizer.mask_token_id)


Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import RoCoFT

RoCoFT.PEFT(model, method='column', rank=3) 
#targets=['key', 'value', 'dense', 'query'])

In [9]:
model

MLMSequenceClassification(
  (transformer): DebertaV2ForMaskedLM(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 768, padding_idx=0)
        (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0-11): 12 x DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): column()
                (key_proj): column()
                (value_proj): column()
                (pos_dropout): Dropout(p=0.1, inplace=False)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): DebertaV2SelfOutput(
                (dense): column()
                (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)


In [10]:
import evaluate
import numpy as np
from sklearn import metrics
import torch
import numpy as np

def compute_metrics(eval_pred):


    logits, labels = eval_pred # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    
    precision = metrics.precision_score(labels, predictions, average="macro")
    recall = metrics.recall_score(labels, predictions, average="macro")
    f1 = metrics.f1_score(labels, predictions, average="macro")
    accuracy = metrics.accuracy_score(labels, predictions)
    
    return {"precision": precision, "recall": recall, "f1-score": f1, 'accuracy': accuracy}

In [11]:
from transformers import TrainingArguments, Trainer

import time
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='dir',
    learning_rate=2e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.20,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=10000000,
    logging_steps=100,
   
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],

    data_collator=data_collator,
    compute_metrics=compute_metrics
)



[2025-05-05 18:57:39,634] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/guangyu/anaconda3/envs/MD/comp

In [12]:
trainer.train()

Got mask position:  tensor(-2, device='cuda:0')


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
100,0.6114,0.445539,0.830405,0.709205,0.733262,0.808245
200,0.4464,0.422165,0.759765,0.79277,0.767815,0.787152
300,0.4124,0.453952,0.844032,0.72612,0.752051,0.819751
400,0.4251,0.468971,0.838287,0.73904,0.764087,0.824545
500,0.3898,0.455314,0.854831,0.701306,0.726313,0.809204
600,0.3534,0.336272,0.825785,0.826343,0.826063,0.85139
700,0.3231,0.401877,0.804344,0.82761,0.813354,0.834132
800,0.3377,0.379106,0.848686,0.791971,0.811879,0.85139
900,0.3669,0.366038,0.847504,0.781629,0.803376,0.846596
1000,0.3232,0.378865,0.852947,0.799735,0.818949,0.856184


TrainOutput(global_step=5350, training_loss=0.21168220707189256, metrics={'train_runtime': 712.578, 'train_samples_per_second': 120.001, 'train_steps_per_second': 7.508, 'total_flos': 16288147726200.0, 'train_loss': 0.21168220707189256, 'epoch': 10.0})