In [1]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import StratifiedKFold


from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

In [8]:
def tokenize_examples(examples, tokenizer, classes):
    text = f"Issue: {examples['issue']}.\nAnswer: {examples['post_text']}"
    labels = [examples[label] for label in classes]
    tokenized_inputs = tokenizer(text, truncation=True, max_length=700, padding=True)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels']).type(torch.float)
    return d


# define which metrics to compute for evaluation
def compute_metrics(p):
    predictions, labels = p
    f1_micro = f1_score(labels, predictions > 0, average = 'micro')
    f1_macro = f1_score(labels, predictions > 0, average = 'macro')
    f1_weighted = f1_score(labels, predictions > 0, average = 'weighted')
    
    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        'predictions': predictions,
        'labels': labels
    }


# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights
    
    def compute_loss(self, model, inputs, num_items_in_batch=1000, return_outputs=False):
        labels = inputs.pop("labels")
        
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss


In [3]:
from datasets import load_dataset
    
ds = load_dataset('timonziegenbein/appropriateness-corpus')
# classes = [
#     'Excessive Intensity',
#     'Emotional Deception',
#     'Missing Seriousness',
#     'Missing Openness',
#     'Unclear Meaning',
#     'Missing Relevance',
#     'Confusing Reasoning',
#     'Detrimental Orthography',
#     'Reason Unclassified'
# ]
# classes = [
#     'Toxic Emotions',
#     'Missing Commitment',
#     'Missing Intelligibility',
#     'Other Reasons'
# ]
classes = [
    'Inappropriateness'
]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

# model name
# model_name = 'Linq-AI-Research/Linq-Embed-Mistral'
# model_name = 'dunzhang/stella_en_1.5B_v5'
# model_name = 'dunzhang/stella_en_400M_v5'
# model_name = 'HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1'
# model_name = 'textdetox/xlmr-large-toxicity-classifier'
# model_name = 'JungleLee/bert-toxic-comment-classification'
model_name = 'microsoft/deberta-v3-large'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token #= tokenizer.special_tokens_map['pad_token']
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer, classes=classes), batched=False)
tokenized_ds = tokenized_ds.with_format('torch')

labels = tokenized_ds['train']['labels']
label_weights = 1 - labels.sum(axis=0) / labels.sum()



Map:   0%|          | 0/438 [00:00<?, ? examples/s]

In [4]:
tokenized_ds['train']['input_ids']

[tensor([    1, 12217,   294,   273,   268,   262,   563,  6945,   266,   397,
           289,   966,   781,   294,   260, 10519,   294,   355,   295,   297,
           282,  2705,   264,  1929,   563, 16280,   261,   584,   952,   448,
           604,   303,   262,  5167,   451,  1437,   786,   306,   409,   264,
           289,  6421,   409,   264,  1929,   563, 16280,   260,   273,   428,
           448,  4891,   403,   428,   704,   541,  1712,   262, 16280,     2]),
 tensor([    1, 12217,   294, 11841,  2601,  1565, 19591,   294,   260, 10519,
           294,   512,   726,   265,  3585, 74161,   291,  3867,   261,   263,
           296,  1138,   262,  6025,   264,  1394,   264,   262,  4448,  1019,
         36108,   260,   329,  1180,   261,   307, 43864,   268, 14643,  1950,
           309,   261,   273,   333,   298,   428,   278,   666,   339,   274,
           428,   278,   490,   260,   273,  6544,   290,  1674,   292,   753,
           517,   261,   337,   942,   995,   260,

In [5]:
# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 8, # the dimension of the low-rank matrices
    lora_alpha = 16, # scaling factor for LoRA activations vs pre-trained weight activations
    # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    target_modules = ['query_proj', 'value_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# transfer learning, we're going to take another BertForSequenceClassification
# trained on a text toxicity dataset (has different number of classes), and
# remove the classifier layers and replace with our own

# model = AutoModelForSequenceClassification.from_pretrained(
#     'xlm-roberta-base',
#     # 'bert-base-uncased',
#     device_map='cuda:0',
#     # device_map="auto",
#     # quantization_config=quantization_config,
#     num_labels=len(classes),
# )

# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     device_map='cuda:0',
#     num_labels=len(classes),
#     quantization_config=quantization_config,
# )

# print(model.roberta)
# model.roberta = pretrained_classification_model.roberta

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="cuda:0",
    quantization_config=quantization_config,
    num_labels=len(classes),
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8, # tested with 16gb gpu ram
    per_device_eval_batch_size = 8,
    num_train_epochs = 1,
    # weight_decay = 0.01,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

In [9]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['validation'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    # label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()

  trainer = Trainer(
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,Predictions,Labels
1,No log,0.23866,0.563636,0.360465,0.406342,[0.65236676 0.6638213 0.57245964 0.5941531 0.56162465 0.56110007  0.535322 0.53109497 0.5251221 0.5739697 0.5184384 0.55450475  0.5725174 0.626038 0.65710133 0.57297593 0.6361707 0.6198534  0.56361747 0.57609624 0.6751767 0.7151615 0.5088541 0.5386518  0.5830058 0.5188453 0.54238355 0.540324 0.53128463 0.54355216  0.52359253 0.52956706 0.5330857 0.53347725 0.582632 0.68360513  0.57485414 0.53720874 0.59606105 0.5650881 0.5667899 0.53153354  0.56980824 0.5883501 0.58247226 0.6306316 0.5511114 0.5629229  0.62750083 0.7160303 0.73775274 0.5468491 0.5691342 0.5491274  0.5491348 0.5759843 0.61132437 0.5127854 0.71941257 0.54445225  0.5789981 0.5666719 0.6155562 0.71892524 0.7013577 0.61167336  0.54068446 0.5946757 0.5253277 0.53088653 0.56482905 0.53350204  0.52675533 0.5511075 0.56761247 0.5585371 0.72887665 0.59853774  0.7459256 0.60536104 0.5938397 0.75203925 0.5718512 0.5524869  0.5765202 0.5401058 0.5167723 0.5621033 0.5899311 0.5484026  0.6379381 0.6146622 0.582531 0.6452974 0.6078588 0.5783549  0.6837421 0.76254433 0.59956485 0.5864193 0.5701326 0.58853674  0.5875316 0.5498969 0.5355764 0.56350714 0.73417014 0.59084934  0.57033974 0.7042946 0.72765917 0.5580262 0.592692 0.5242223  0.5620859 0.54568547 0.57267386 0.51511073 0.5625551 0.5364627  0.56275064 0.5090695 0.70241195 0.59518844 0.5636174 0.6319038  0.6368566 0.5856157 0.6923082 0.6953593 0.5539807 0.562329  0.5177664 0.60318285 0.545175 0.5423509 0.5576578 0.52807194  0.60644907 0.6212999 0.6164852 0.5757513 0.5965602 0.5590798  0.55837435 0.71765065 0.64589816 0.5795442 0.5447995 0.54440767  0.5664652 0.53561974 0.5110367 0.5184536 0.5294214 0.69891185  0.5703286 0.59441835 0.6854164 0.6554501 0.5536508 0.6515526  0.64039475 0.51449704 0.54959875 0.5510102 0.5623514 0.578319  0.5449314 0.5467474 0.76831424 0.7173019 0.62669605 0.6196905  0.57892025 0.6515767 0.5723492 0.51760024 0.5111386 0.5707229  0.5491231 0.55036587 0.5282813 0.538579 0.5966385 0.5295961  0.5606307 0.5509612 0.51853335 0.54644364 0.54823774 0.62997556  0.55107164 0.5446828 0.5893359 0.58575255 0.56724787 0.5266154  0.5611971 0.5237957 0.54215735 0.5493188 0.53716785 0.53993803  0.53624046 0.5754669 0.5863097 0.58296466 0.50638616 0.69068205  0.52619 0.6643248 0.66197175 0.54432577 0.5547873 0.5751069  0.5532269 0.54142565 0.56644017 0.5486296 ],[[1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [1.]  [0.]  [0.]  [1.]  [0.]  [0.]  [0.]  [1.]  [0.]  [1.]  [1.]  [1.]  [0.]  [0.]  [1.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [0.]  [0.]  [0.]  [1.]  [0.]  [1.]  [0.]  [1.]  [0.]  [1.]  [1.]  [1.]  [1.]  [0.]  [0.]  [0.]  [0.]  [1.]  [1.]  [0.]  [1.]  [1.]  [1.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [0.]  [1.]  [1.]  [1.]  [0.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [1.]  [0.]  [0.]  [1.]  [0.]  [0.]  [1.]  [0.]  [1.]  [1.]  [1.]  [0.]  [1.]  [0.]  [0.]  [0.]  [1.]  [0.]  [1.]  [1.]  [1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [1.]  [1.]  [1.]  [1.]  [0.]  [1.]  [1.]  [0.]  [1.]  [1.]  [0.]  [0.]  [1.]  [0.]  [0.]  [0.]  [0.]  [0.]  [1.]  [1.]  [0.]  [0.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [1.]  [0.]  [0.]  [1.]  [1.]  [0.]  [0.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [1.]  [1.]  [1.]  [1.]  [1.]  [1.]  [0.]  [0.]  [0.]  [0.]  [1.]  [0.]  [0.]  [0.]  [1.]  [1.]  [1.]  [0.]  [1.]  [1.]  [1.]  [0.]  [0.]  [1.]  [1.]  [0.]  [1.]  [1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [0.]  [0.]  [1.]  [0.]  [0.]  [1.]  [1.]  [0.]  [1.]  [1.]  [0.]  [1.]  [1.]  [0.]  [1.]  [0.]  [1.]  [0.]  [0.]  [1.]  [1.]  [1.]  [1.]  [1.]  [1.]  [1.]  [0.]  [0.]  [0.]  [0.]  [0.]]


TypeError: Object of type ndarray is not JSON serializable

In [None]:
trainer.predict(tokenized_ds['test'])

In [None]:
# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

In [None]:
# load model
peft_model_id = 'multilabel_mistral'
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)