In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import gc
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report, matthews_corrcoef
import pandas as pd
import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier passed to `from_pretrained`, and the local folder used as
# the Trainer output directory (its `logs` subfolder receives the logs).
modname = "MoritzLaurer/ModernBERT-base-zeroshot-v2.0"
training_directory = "training_ModernBase"

# Set the WANDB_PROJECT environment variable
# (presumably read by the Weights & Biases logging integration — confirm).
os.environ["WANDB_PROJECT"] = "DEBATE-ModernBERT-Base"

# Report which device torch can see; in the visible cells `device` is only printed.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

In [None]:
# Load the "mlburnham/Pol_NLI" dataset. Later cells read its 'train' and 'validation'
# splits and the 'premise', 'augmented_hypothesis' and 'entailment' columns.
ds = load_dataset("mlburnham/Pol_NLI")

In [4]:
# Class ids and their names for the binary task; label2id is the exact inverse
# of id2label, so the two mappings cannot drift apart.
id2label = {0: "entailment", 1: "not_entailment"}
label2id = {name: idx for idx, name in id2label.items()}

# Tokenizer loaded from the same checkpoint identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(modname)

def tokenize_function(docs):
    """Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        docs: Mapping providing 'premise' and 'augmented_hypothesis' entries.
            In this notebook it is called through ``ds.map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the pairs. Padding and truncation are
        both disabled here (padding is presumably applied per batch later by the
        Trainer — confirm).
    """
    premises = docs['premise']
    hypotheses = docs['augmented_hypothesis']
    return tokenizer(premises, hypotheses, truncation=False, padding=False)
def model_init():
    """Build a new two-label sequence-classification model from ``modname``.

    Passed to ``Trainer`` as ``model_init`` so the Trainer constructs the model itself.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``,
        configured with the notebook's ``label2id`` / ``id2label`` mappings.
    """
    head_config = dict(
        num_labels=2,
        # Do not raise if checkpoint weight shapes differ from the requested config.
        ignore_mismatched_sizes=True,
        label2id=label2id,
        id2label=id2label,
    )
    return AutoModelForSequenceClassification.from_pretrained(modname, **head_config)

# Tokenize the dataset in batches, then rename the gold column 'entailment' to
# 'label' (presumably the column name the Trainer reads targets from — confirm).
dstok = (
    ds.map(tokenize_function, batched=True)
    .rename_columns({'entailment': 'label'})
)

training_args = TrainingArguments(
    # --- where checkpoints and logs are written ---
    output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    # --- optimisation schedule ---
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    weight_decay=0.01,
    num_train_epochs=10,
    # --- batching / data loading ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    group_by_length=True,
    dataloader_num_workers=1,
    # --- half precision for both training and evaluation ---
    fp16=True,
    fp16_full_eval=True,
    # --- evaluate and checkpoint once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- reproducibility ---
    seed=1,
)

def compute_metrics_standard(eval_pred, label_text_alphabetical=list(id2label.values())):
    """Compute aggregate and per-class classification metrics for one evaluation pass.

    Args:
        eval_pred: Object exposing ``label_ids`` (gold class ids) and
            ``predictions`` (per-class scores; the argmax over axis 1 is taken
            as the predicted class id).
        label_text_alphabetical: Class names used as ``target_names`` in the
            detailed report. The default is the list of ``id2label`` values,
            evaluated once when this function is defined; it is never mutated here.

    Returns:
        dict: metric name -> value for MCC, macro/micro F1, balanced and plain
        accuracy, and macro/micro precision and recall.

    Side effects:
        Prints the aggregate metrics and the per-class ``classification_report``
        (as a dict) to stdout.
    """
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)
    mcc = matthews_corrcoef(labels, preds_max)

    metrics = {'MCC': mcc,
            'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            }
    print("Aggregate metrics: ", metrics)

    # pd.factorize is deprecated for plain Python lists (FutureWarning on every
    # evaluation with recent pandas); an ndarray yields the same integer codes.
    label_names = np.asarray(label_text_alphabetical, dtype=object)
    label_codes = np.sort(pd.factorize(label_names, sort=True)[0])

    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=label_codes,
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

def _eval_metrics(eval_pred):
    """Forward one evaluation result to compute_metrics_standard with the current id2label names."""
    return compute_metrics_standard(eval_pred, label_text_alphabetical=list(id2label.values()))


# The Trainer builds the model itself through `model_init`; it trains on the
# tokenized 'train' split and evaluates on the tokenized 'validation' split.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    processing_class=tokenizer,
    train_dataset=dstok['train'],
    eval_dataset=dstok['validation'],
    compute_metrics=_eval_metrics,
)

Device: cuda


You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in ModernBertForSequenceClassification is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`


In [5]:
# Run fine-tuning with the settings in `training_args`: 10 epochs, with evaluation
# and checkpoint saving configured per epoch.
# NOTE(review): in the recorded run below, validation loss rises from 0.216 (epoch 1)
# to 0.588 (epoch 10) while MCC only moves from 0.858 to 0.895 — consider fewer
# epochs or early stopping.
trainer.train()



Epoch,Training Loss,Validation Loss,Mcc,F1 Macro,F1 Micro,Accuracy Balanced,Accuracy,Precision Macro,Recall Macro,Precision Micro,Recall Micro
1,0.1818,0.216367,0.85773,0.928861,0.930567,0.929137,0.930567,0.928594,0.929137,0.930567,0.930567
2,0.1028,0.243936,0.876359,0.938165,0.939612,0.938709,0.939612,0.93765,0.938709,0.939612,0.939612
3,0.051,0.291324,0.883721,0.941852,0.943336,0.941437,0.943336,0.942285,0.941437,0.943336,0.943336
4,0.0219,0.336943,0.888035,0.944017,0.945398,0.943989,0.945398,0.944046,0.943989,0.945398,0.945398
5,0.0141,0.443264,0.884211,0.942106,0.943536,0.942058,0.943536,0.942153,0.942058,0.943536,0.943536
6,0.0118,0.373732,0.889789,0.944834,0.946329,0.943724,0.946329,0.946067,0.943724,0.946329,0.946329
7,0.0077,0.474585,0.888586,0.944265,0.94573,0.943506,0.94573,0.945081,0.943506,0.94573,0.94573
8,0.0017,0.508356,0.891517,0.945756,0.94706,0.946004,0.94706,0.945514,0.946004,0.94706,0.94706
9,0.0013,0.562439,0.890866,0.945377,0.946595,0.946457,0.946595,0.944412,0.946457,0.946595,0.946595
10,0.0,0.58783,0.894854,0.947427,0.948723,0.947398,0.948723,0.947456,0.947398,0.948723,0.948723


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8577303790562917), 'f1_macro': 0.9288613150975644, 'f1_micro': 0.9305666400638468, 'accuracy_balanced': np.float64(0.9291367504861167), 'accuracy': 0.9305666400638468, 'precision_macro': 0.9285938004159882, 'recall_macro': 0.9291367504861167, 'precision_micro': 0.9305666400638468, 'recall_micro': 0.9305666400638468}
Detailed metrics:  {'entailment': {'precision': 0.9156853509185116, 'recall': 0.9200189304306673, 'f1-score': 0.9178470254957507, 'support': 6339.0}, 'not_entailment': {'precision': 0.9415022499134649, 'recall': 0.938254570541566, 'f1-score': 0.939875604699378, 'support': 8697.0}, 'accuracy': 0.9305666400638468, 'macro avg': {'precision': 0.9285938004159882, 'recall': 0.9291367504861167, 'f1-score': 0.9288613150975644, 'support': 15036.0}, 'weighted avg': {'precision': 0.9306181502374202, 'recall': 0.9305666400638468, 'f1-score': 0.9305886159010412, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8763587009343081), 'f1_macro': 0.9381647363490271, 'f1_micro': 0.9396115988294759, 'accuracy_balanced': np.float64(0.9387091664303628), 'accuracy': 0.9396115988294759, 'precision_macro': 0.9376501743468671, 'recall_macro': 0.9387091664303628, 'precision_micro': 0.9396115988294759, 'recall_micro': 0.9396115988294759}
Detailed metrics:  {'entailment': {'precision': 0.9244958574331718, 'recall': 0.9329547247199874, 'f1-score': 0.9287060301507538, 'support': 6339.0}, 'not_entailment': {'precision': 0.9508044912605625, 'recall': 0.9444636081407382, 'f1-score': 0.9476234425473005, 'support': 8697.0}, 'accuracy': 0.9396115988294759, 'macro avg': {'precision': 0.9376501743468671, 'recall': 0.9387091664303628, 'f1-score': 0.9381647363490271, 'support': 15036.0}, 'weighted avg': {'precision': 0.9397130819873628, 'recall': 0.9396115988294759, 'f1-score': 0.9396480849268088, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8837210713960965), 'f1_macro': 0.9418520556775534, 'f1_micro': 0.9433359936153233, 'accuracy_balanced': np.float64(0.9414367961796968), 'accuracy': 0.9433359936153233, 'precision_macro': 0.9422846819681272, 'recall_macro': 0.9414367961796968, 'precision_micro': 0.9433359936153233, 'recall_micro': 0.9433359936153233}
Detailed metrics:  {'entailment': {'precision': 0.9358220810166799, 'recall': 0.929326392175422, 'f1-score': 0.9325629254392908, 'support': 6339.0}, 'not_entailment': {'precision': 0.9487472829195744, 'recall': 0.9535472001839714, 'f1-score': 0.951141185915816, 'support': 8697.0}, 'accuracy': 0.9433359936153233, 'macro avg': {'precision': 0.9422846819681272, 'recall': 0.9414367961796968, 'f1-score': 0.9418520556775534, 'support': 15036.0}, 'weighted avg': {'precision': 0.9432981704653014, 'recall': 0.9433359936153233, 'f1-score': 0.9433088107388611, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8880348587449364), 'f1_macro': 0.944017389926101, 'f1_micro': 0.9453977121574887, 'accuracy_balanced': np.float64(0.9439889053353853), 'accuracy': 0.9453977121574887, 'precision_macro': 0.9440459552420759, 'recall_macro': 0.9439889053353853, 'precision_micro': 0.9453977121574887, 'recall_micro': 0.9453977121574887}
Detailed metrics:  {'entailment': {'precision': 0.9354482323232324, 'recall': 0.9350055213756113, 'f1-score': 0.9352268244575936, 'support': 6339.0}, 'not_entailment': {'precision': 0.9526436781609195, 'recall': 0.9529722892951592, 'f1-score': 0.9528079553946083, 'support': 8697.0}, 'accuracy': 0.9453977121574887, 'macro avg': {'precision': 0.9440459552420759, 'recall': 0.9439889053353853, 'f1-score': 0.944017389926101, 'support': 15036.0}, 'weighted avg': {'precision': 0.9453942813023735, 'recall': 0.9453977121574887, 'f1-score': 0.9453959582537639, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8842114915186957), 'f1_macro': 0.9421056364047113, 'f1_micro': 0.9435355147645651, 'accuracy_balanced': np.float64(0.9420583692632233), 'accuracy': 0.9435355147645651, 'precision_macro': 0.9421531273329296, 'recall_macro': 0.9420583692632233, 'precision_micro': 0.9435355147645651, 'recall_micro': 0.9435355147645651}
Detailed metrics:  {'entailment': {'precision': 0.9333754341648247, 'recall': 0.9326392175421991, 'f1-score': 0.9330071806202163, 'support': 6339.0}, 'not_entailment': {'precision': 0.9509308205010343, 'recall': 0.9514775209842474, 'f1-score': 0.9512040921892063, 'support': 8697.0}, 'accuracy': 0.9435355147645651, 'macro avg': {'precision': 0.9421531273329296, 'recall': 0.9420583692632233, 'f1-score': 0.9421056364047113, 'support': 15036.0}, 'weighted avg': {'precision': 0.9435296769798032, 'recall': 0.9435355147645651, 'f1-score': 0.9435324892073077, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8897889011097754), 'f1_macro': 0.9448341025042268, 'f1_micro': 0.9463288108539505, 'accuracy_balanced': np.float64(0.9437244953014299), 'accuracy': 0.9463288108539505, 'precision_macro': 0.9460674905947604, 'recall_macro': 0.9437244953014299, 'precision_micro': 0.9463288108539505, 'recall_micro': 0.9463288108539505}
Detailed metrics:  {'entailment': {'precision': 0.9445515911282546, 'recall': 0.927117841930904, 'f1-score': 0.9357535228086936, 'support': 6339.0}, 'not_entailment': {'precision': 0.9475833900612661, 'recall': 0.9603311486719558, 'f1-score': 0.9539146821997602, 'support': 8697.0}, 'accuracy': 0.9463288108539505, 'macro avg': {'precision': 0.9460674905947604, 'recall': 0.9437244953014299, 'f1-score': 0.9448341025042268, 'support': 15036.0}, 'weighted avg': {'precision': 0.9463052194416625, 'recall': 0.9463288108539505, 'f1-score': 0.9462581519137817, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8885861641701017), 'f1_macro': 0.9442649313484377, 'f1_micro': 0.945730247406225, 'accuracy_balanced': np.float64(0.9435064753794208), 'accuracy': 0.945730247406225, 'precision_macro': 0.9450810839227561, 'recall_macro': 0.9435064753794208, 'precision_micro': 0.945730247406225, 'recall_micro': 0.945730247406225}
Detailed metrics:  {'entailment': {'precision': 0.941204665281994, 'recall': 0.929326392175422, 'f1-score': 0.9352278139387205, 'support': 6339.0}, 'not_entailment': {'precision': 0.9489575025635183, 'recall': 0.9576865585834196, 'f1-score': 0.9533020487581549, 'support': 8697.0}, 'accuracy': 0.945730247406225, 'macro avg': {'precision': 0.9450810839227561, 'recall': 0.9435064753794208, 'f1-score': 0.9442649313484377, 'support': 15036.0}, 'weighted avg': {'precision': 0.9456889979394439, 'recall': 0.945730247406225, 'f1-score': 0.9456821648448538, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8915170849942896), 'f1_macro': 0.9457555775873172, 'f1_micro': 0.9470603884011706, 'accuracy_balanced': np.float64(0.9460035966076938), 'accuracy': 0.9470603884011706, 'precision_macro': 0.9455136230301673, 'recall_macro': 0.9460035966076938, 'precision_micro': 0.9470603884011706, 'recall_micro': 0.9470603884011706}
Detailed metrics:  {'entailment': {'precision': 0.9354281225451689, 'recall': 0.9392648682757533, 'f1-score': 0.9373425692695214, 'support': 6339.0}, 'not_entailment': {'precision': 0.9555991235151655, 'recall': 0.9527423249396344, 'f1-score': 0.9541685859051129, 'support': 8697.0}, 'accuracy': 0.9470603884011706, 'macro avg': {'precision': 0.9455136230301673, 'recall': 0.9460035966076938, 'f1-score': 0.9457555775873172, 'support': 15036.0}, 'weighted avg': {'precision': 0.9470952677590596, 'recall': 0.9470603884011706, 'f1-score': 0.9470749360346011, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8908658430899224), 'f1_macro': 0.9453771327617819, 'f1_micro': 0.9465948390529396, 'accuracy_balanced': np.float64(0.9464565872081592), 'accuracy': 0.9465948390529396, 'precision_macro': 0.9444116030111819, 'recall_macro': 0.9464565872081592, 'precision_micro': 0.9465948390529396, 'recall_micro': 0.9465948390529396}
Detailed metrics:  {'entailment': {'precision': 0.9290142591444513, 'recall': 0.9455750118315192, 'f1-score': 0.9372214838558361, 'support': 6339.0}, 'not_entailment': {'precision': 0.9598089468779124, 'recall': 0.9473381625847993, 'f1-score': 0.9535327816677276, 'support': 8697.0}, 'accuracy': 0.9465948390529396, 'macro avg': {'precision': 0.9444116030111819, 'recall': 0.9464565872081592, 'f1-score': 0.9453771327617819, 'support': 15036.0}, 'weighted avg': {'precision': 0.9468262702656213, 'recall': 0.9465948390529396, 'f1-score': 0.9466561311736081, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Aggregate metrics:  {'MCC': np.float64(0.8948536909586216), 'f1_macro': 0.9474268058867525, 'f1_micro': 0.9487230646448523, 'accuracy_balanced': np.float64(0.9473981024185927), 'accuracy': 0.9487230646448523, 'precision_macro': 0.9474555903866249, 'recall_macro': 0.9473981024185927, 'precision_micro': 0.9487230646448523, 'recall_micro': 0.9487230646448523}
Detailed metrics:  {'entailment': {'precision': 0.9393939393939394, 'recall': 0.938949361097965, 'f1-score': 0.9391715976331361, 'support': 6339.0}, 'not_entailment': {'precision': 0.9555172413793104, 'recall': 0.9558468437392205, 'f1-score': 0.955682014140369, 'support': 8697.0}, 'accuracy': 0.9487230646448523, 'macro avg': {'precision': 0.9474555903866249, 'recall': 0.9473981024185927, 'f1-score': 0.9474268058867525, 'support': 15036.0}, 'weighted avg': {'precision': 0.9487198477051108, 'recall': 0.9487230646448523, 'f1-score': 0.9487214175562143, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


TrainOutput(global_step=107060, training_loss=0.045580504342477825, metrics={'train_runtime': 6446.2592, 'train_samples_per_second': 265.718, 'train_steps_per_second': 16.608, 'total_flos': 1.1278615761765728e+17, 'train_loss': 0.045580504342477825, 'epoch': 10.0})