In [1]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import StratifiedKFold


from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

from sklearn.metrics import precision_recall_fscore_support

In [None]:
def tokenize_examples(examples, tokenizer, classes):
    text = f"Issue: {examples['issue']}.\nAnswer: {examples['post_text']}"
    labels = [examples[label] for label in classes]
    tokenized_inputs = tokenizer(text, truncation=True, max_length=700, padding=True)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels']).type(torch.float)
    return d


# define which metrics to compute for evaluation
def compute_metrics(p, id2class, classes):
    predictions, labels = p
    predictions_binary = predictions > 0
    
    metrics = {}
    for j, dim in enumerate(classes):
        scores = precision_recall_fscore_support(
            [x[j] for x in labels], [x[j] for x in predictions_binary], average="macro"
        )
        metrics["Macro-F1 " + dim] = scores[2]
        # print({"Macro-F1 " + dim + ": ", scores[2]})
    return metrics


# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights
    
    def compute_loss(self, model, inputs, num_items_in_batch=1000, return_outputs=False):
        labels = inputs.pop("labels")
        
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss


In [3]:
from datasets import load_dataset
    
ds = load_dataset('timonziegenbein/appropriateness-corpus')

classes = [
    'Toxic Emotions',
    'Missing Commitment',
    'Missing Intelligibility',
    'Other Reasons',
    'Inappropriateness',
    'Excessive Intensity',
    'Emotional Deception',
    'Missing Seriousness',
    'Missing Openness',
    'Unclear Meaning',
    'Missing Relevance',
    'Confusing Reasoning',
    'Detrimental Orthography',
    'Reason Unclassified'
]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}


model_name = 'microsoft/deberta-v3-large'
# model_name = 'multilabel_deberta_v3_large_peft'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token #= tokenizer.special_tokens_map['pad_token']
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer, classes=classes), batched=False)
tokenized_ds = tokenized_ds.with_format('torch')

labels = tokenized_ds['train']['labels']
# label_weights = torch.ones(len(classes))
label_weights = 1 / labels.mean(dim=0, dtype=torch.float32)
print(label_weights)



tensor([ 3.7029,  2.9652,  2.8077, 20.1711,  1.8403,  5.4362,  5.0262, 11.8837,
         3.3326,  4.7757,  4.3183, 12.5656, 27.8727, 69.6818])


In [4]:
tokenized_ds = tokenized_ds.shuffle()

In [5]:
# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 8, # the dimension of the low-rank matrices
    lora_alpha = 16, # scaling factor for LoRA activations vs pre-trained weight activations
    # target_modules="all-linear",
    # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    # target_modules = ['query_proj', 'value_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="cuda:0",
    quantization_config=quantization_config,
    num_labels=len(classes),
    problem_type="multi_label_classification",
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    logging_dir = 'multilabel_classification/logs',
    learning_rate = 1e-3,
    per_device_train_batch_size = 8, # tested with 16gb gpu ram
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    # weight_decay = 0.01,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

In [18]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['validation'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = functools.partial(compute_metrics, id2class=id2class, classes=classes),
    label_weights = torch.tensor(label_weights, device=model.device)
)

# trainer.train()

  label_weights = torch.tensor(label_weights, device=model.device)
  super().__init__(**kwargs)


In [14]:
from pathlib import Path
import pandas as pd


results_dir = Path("../results/deberta-v3-large")
if not results_dir.exists():
    results_dir.mkdir(parents=True)
    
val_metrics = trainer.evaluate(tokenized_ds['validation'], metric_key_prefix="validation")
print(f"{val_metrics=}")
# pd.DataFrame(val_metrics, index=[0]).to_csv(results_dir / "validation.csv")

  0%|          | 0/28 [00:00<?, ?it/s]

val_metrics={'validation_loss': 1.2480069398880005, 'validation_model_preparation_time': 0.0112, 'validation_Macro-F1 Toxic Emotions': 0.65625, 'validation_Macro-F1 Missing Commitment': 0.25647395868305023, 'validation_Macro-F1 Missing Intelligibility': 0.5214990798059227, 'validation_Macro-F1 Other Reasons': 0.0629393763956662, 'validation_Macro-F1 Inappropriateness': 0.36046511627906974, 'validation_Macro-F1 Excessive Intensity': 0.6024659863945578, 'validation_Macro-F1 Emotional Deception': 0.1634980988593156, 'validation_Macro-F1 Missing Seriousness': 0.16469132176795118, 'validation_Macro-F1 Missing Openness': 0.26538108356290174, 'validation_Macro-F1 Unclear Meaning': 0.17539824516568703, 'validation_Macro-F1 Missing Relevance': 0.18518518518518517, 'validation_Macro-F1 Confusing Reasoning': 0.27134741505998994, 'validation_Macro-F1 Detrimental Orthography': 0.5573440643863179, 'validation_Macro-F1 Reason Unclassified': 0.25621345029239767, 'validation_runtime': 8.7841, 'validati

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
test_metrics = trainer.evaluate(tokenized_ds['test'], metric_key_prefix="test")
print(f"{test_metrics=}")
# pd.DataFrame(test_metrics, index=[0]).to_csv(results_dir / "test.csv")

  0%|          | 0/55 [00:00<?, ?it/s]

test_metrics={'test_loss': 1.234268307685852, 'test_model_preparation_time': 0.0081, 'test_Macro-F1 Toxic Emotions': 0.7109288724801517, 'test_Macro-F1 Missing Commitment': 0.24871355060034306, 'test_Macro-F1 Missing Intelligibility': 0.6248984401876752, 'test_Macro-F1 Other Reasons': 0.2254663377588071, 'test_Macro-F1 Inappropriateness': 0.3443283004258614, 'test_Macro-F1 Excessive Intensity': 0.6537549407114625, 'test_Macro-F1 Emotional Deception': 0.2999518855872447, 'test_Macro-F1 Missing Seriousness': 0.6528500739027054, 'test_Macro-F1 Missing Openness': 0.2506196067562454, 'test_Macro-F1 Unclear Meaning': 0.3424657534246575, 'test_Macro-F1 Missing Relevance': 0.43410852713178294, 'test_Macro-F1 Confusing Reasoning': 0.5516405366090189, 'test_Macro-F1 Detrimental Orthography': 0.30250794637702305, 'test_Macro-F1 Reason Unclassified': 0.5184117171130428, 'test_runtime': 17.2191, 'test_samples_per_second': 25.437, 'test_steps_per_second': 3.194}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# save model
peft_model_id = 'multilabel_deberta_v3_large_peft'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

('multilabel_deberta_v3_large_peft/tokenizer_config.json',
 'multilabel_deberta_v3_large_peft/special_tokens_map.json',
 'multilabel_deberta_v3_large_peft/spm.model',
 'multilabel_deberta_v3_large_peft/added_tokens.json',
 'multilabel_deberta_v3_large_peft/tokenizer.json')

In [None]:
from huggingface_hub import login
token = "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"
login(token)

repository_id = 'anismk/' + peft_model_id
trainer.model.push_to_hub(repository_id)
tokenizer.push_to_hub(repository_id)

adapter_model.safetensors:   0%|          | 0.00/3.22M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/anismk/multilabel_deberta_v3_large_peft/commit/b15cd9dd7f43c674170a74ed8f71d7ba33f078a4', commit_message='Upload tokenizer', commit_description='', oid='b15cd9dd7f43c674170a74ed8f71d7ba33f078a4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/anismk/multilabel_deberta_v3_large_peft', endpoint='https://huggingface.co', repo_type='model', repo_id='anismk/multilabel_deberta_v3_large_peft'), pr_revision=None, pr_num=None)

In [None]:
# load model
repository_id = 'anismk/' + peft_model_id
model = repository_id
tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSequenceClassification.from_pretrained(
    model,
    device_map="cuda:0",
    num_labels=len(classes),
    problem_type="multi_label_classification",
)

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


adapter_model.safetensors:   0%|          | 0.00/3.22M [00:00<?, ?B/s]

In [8]:
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=1024, bias=False)
                )
        