In [1]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import StratifiedKFold


from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

In [2]:
def tokenize_examples(examples, tokenizer, classes):
    text = f"Issue: {examples['issue']}.\nAnswer: {examples['post_text']}"
    labels = [examples[label] for label in classes]
    tokenized_inputs = tokenizer(text, truncation=True, max_length=700, padding=True)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs


# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels']).type(torch.float)
    return d


# define which metrics to compute for evaluation
def compute_metrics(p):
    predictions, labels = p
    predictions_binary = predictions > 0
    
    # Average metrics
    f1_micro = f1_score(labels, predictions_binary, average='micro')
    f1_macro = f1_score(labels, predictions_binary, average='macro')
    f1_weighted = f1_score(labels, predictions_binary, average='weighted')
    
    precision_micro = precision_score(labels, predictions_binary, average='micro')
    precision_macro = precision_score(labels, predictions_binary, average='macro')
    precision_weighted = precision_score(labels, predictions_binary, average='weighted')
    
    recall_micro = recall_score(labels, predictions_binary, average='micro')
    recall_macro = recall_score(labels, predictions_binary, average='macro')
    recall_weighted = recall_score(labels, predictions_binary, average='weighted')
    
    # Per-class metrics
    precision_per_class = precision_score(labels, predictions_binary, average=None)
    recall_per_class = recall_score(labels, predictions_binary, average=None)
    
    metrics = {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted,
        
        'precision_micro': precision_micro,
        'precision_macro': precision_macro,
        'precision_weighted': precision_weighted,
        
        'recall_micro': recall_micro,
        'recall_macro': recall_macro,
        'recall_weighted': recall_weighted,
        
        'precision_per_class': precision_per_class.tolist(),
        'recall_per_class': recall_per_class.tolist(),
    }
    
    return metrics


# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights
    
    def compute_loss(self, model, inputs, num_items_in_batch=1000, return_outputs=False):
        labels = inputs.pop("labels")
        
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss


In [None]:
from datasets import load_dataset
    
ds = load_dataset('timonziegenbein/appropriateness-corpus')
classes = [
    'Excessive Intensity',
    'Emotional Deception',
    'Missing Seriousness',
    'Missing Openness',
    'Unclear Meaning',
    'Missing Relevance',
    'Confusing Reasoning',
    'Detrimental Orthography',
    'Reason Unclassified'
]
# classes = [
#     'Toxic Emotions',
#     'Missing Commitment',
#     'Missing Intelligibility',
#     'Other Reasons'
# ]
# classes = [
#     'Inappropriateness'
# ]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

# model name
# model_name = 'Linq-AI-Research/Linq-Embed-Mistral'
# model_name = 'dunzhang/stella_en_1.5B_v5'
# model_name = 'dunzhang/stella_en_400M_v5'
# model_name = 'HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1'
# model_name = 'textdetox/xlmr-large-toxicity-classifier'
# model_name = 'JungleLee/bert-toxic-comment-classification'
model_name = 'microsoft/deberta-v3-large'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token #= tokenizer.special_tokens_map['pad_token']
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer, classes=classes), batched=False)
tokenized_ds = tokenized_ds.with_format('torch')

labels = tokenized_ds['train']['labels']
label_weights = torch.ones(len(classes))
print(label_weights)



Map:   0%|          | 0/220 [00:00<?, ? examples/s]

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1.])


In [None]:
# qunatization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 8, # the dimension of the low-rank matrices
    lora_alpha = 16, # scaling factor for LoRA activations vs pre-trained weight activations
    # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    target_modules = ['query_proj', 'value_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# transfer learning, we're going to take another BertForSequenceClassification
# trained on a text toxicity dataset (has different number of classes), and
# remove the classifier layers and replace with our own

# model = AutoModelForSequenceClassification.from_pretrained(
#     'xlm-roberta-base',
#     # 'bert-base-uncased',
#     device_map='cuda:0',
#     # device_map="auto",
#     # quantization_config=quantization_config,
#     num_labels=len(classes),
    # problem_type="multi_label_classification",
# )

# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     device_map='cuda:0',
#     num_labels=len(classes),
#     quantization_config=quantization_config,
# )

# print(model.roberta)
# model.roberta = pretrained_classification_model.roberta

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="cuda:0",
    quantization_config=quantization_config,
    num_labels=len(classes),
    problem_type="multi_label_classification",
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8, # tested with 16gb gpu ram
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    # weight_decay = 0.01,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

In [None]:
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['validation'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()

  label_weights = torch.tensor(label_weights, device=model.device)
  super().__init__(**kwargs)
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Weighted,Precision Micro,Precision Macro,Precision Weighted,Recall Micro,Recall Macro,Recall Weighted,Precision Per Class,Recall Per Class
1,No log,0.37692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2,No log,0.367009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
3,0.394600,0.353967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
4,0.394600,0.332923,0.243902,0.120601,0.206335,0.592105,0.196354,0.328435,0.153584,0.088761,0.153584,"[0.0, 0.0, 0.0, 0.6, 0.5789473684210527, 0.5882352941176471, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.36363636363636365, 0.2391304347826087, 0.19607843137254902, 0.0, 0.0, 0.0]"
5,0.394600,0.341375,0.275325,0.135936,0.232051,0.576087,0.24358,0.392105,0.180887,0.104556,0.180887,"[0.0, 0.5, 0.0, 0.6222222222222222, 0.55, 0.52, 0.0, 0.0, 0.0]","[0.0, 0.022727272727272728, 0.0, 0.42424242424242425, 0.2391304347826087, 0.2549019607843137, 0.0, 0.0, 0.0]"
6,0.314300,0.33494,0.308483,0.164062,0.272591,0.625,0.269644,0.433617,0.204778,0.121391,0.204778,"[0.0, 0.5833333333333334, 0.0, 0.6744186046511628, 0.55, 0.6190476190476191, 0.0, 0.0, 0.0]","[0.0, 0.1590909090909091, 0.0, 0.4393939393939394, 0.2391304347826087, 0.2549019607843137, 0.0, 0.0, 0.0]"
7,0.314300,0.339822,0.309645,0.175118,0.283055,0.60396,0.321212,0.493981,0.208191,0.126946,0.208191,"[0.5, 0.6, 0.0, 0.65, 0.55, 0.5909090909090909, 0.0, 0.0, 0.0]","[0.05, 0.20454545454545456, 0.0, 0.3939393939393939, 0.2391304347826087, 0.2549019607843137, 0.0, 0.0, 0.0]"


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return fn(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return fn(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return fn(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{met

In [1]:
m = trainer.predict(tokenized_ds['test']).metrics
del m['test_labels']
del m['test_predictions']
m

NameError: name 'trainer' is not defined

In [None]:
# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

In [None]:
# load model
peft_model_id = 'multilabel_mistral'
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)