<a href="https://colab.research.google.com/github/MoritzLaurer/zeroshot-classifier/blob/main/train_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install and setup

In [1]:
# Pinned dependencies for this notebook.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel;
# the extras spec is quoted so the shell never treats the brackets as a glob.
%pip install -qq "transformers[sentencepiece]~=4.33.0"
%pip install -qq datasets~=2.14.0
%pip install -qq accelerate~=0.23.0
# test mlflow for logging
%pip install -qq mlflow~=2.7.0
# mdutils is used further down to write the markdown results table
%pip install -qq mdutils~=1.6.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
## load packages
import transformers
import torch
from datasets import ClassLabel

import pandas as pd
import numpy as np
import os
from datasets import load_dataset
import re
import time
import random
import tqdm

from torch.utils.data import DataLoader
from datasets import load_dataset, load_metric, Dataset, DatasetDict, concatenate_datasets, list_metrics
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer

from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report



In [3]:
## set global seed for reproducibility and against seed hacking
SEED_GLOBAL = 42
# Seed every random number generator the notebook relies on, not only NumPy:
# the classification head of the model is randomly initialised by torch when
# the checkpoint is loaded, which happens before the Trainer applies its own seed.
random.seed(SEED_GLOBAL)
np.random.seed(SEED_GLOBAL)
torch.manual_seed(SEED_GLOBAL)

## for tests in Colab
USING_COLAB = True
# run date, used in output directory and experiment names
DATE = 20230928


In [4]:
if USING_COLAB:
    from psutil import virtual_memory

    # show which GPU the runtime was assigned
    !nvidia-smi
    # total RAM of the machine, converted from bytes to (decimal) gigabytes
    ram_gb = virtual_memory().total / 1e9
    print(f'\n\nYour runtime has {ram_gb:.1f} gigabytes of available RAM\n')

Thu Sep 28 15:13:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
if USING_COLAB:
    ## connect to google drive
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)

    # switch the working directory to the project folder on Drive; the
    # directory is printed before and after the change for inspection
    # NOTE(review): the path is specific to the author's Drive layout; adapt before running elsewhere
    print(os.getcwd())
    os.chdir("/content/drive/My Drive/PhD/zero-shot-models")

print(os.getcwd())

# local config.py file with tokens
# (imported only after the chdir above; presumably config.py lives in the project folder - confirm)
import config

Mounted at /content/drive
/content
/content/drive/My Drive/PhD/zero-shot-models


### Load data

In [6]:
# load from hub
# All three repositories are accessed with the token from the local config module.
# The training data and the concatenated NLI test data are each taken from their "train" split.
dataset_train = load_dataset("MoritzLaurer/dataset_train_nli", token=config.HF_ACCESS_TOKEN)["train"]
dataset_test_concat_nli = load_dataset("MoritzLaurer/dataset_test_concat_nli", token=config.HF_ACCESS_TOKEN)["train"]
# No split is selected here: the result keeps one split per test task (iterated by task name during evaluation).
dataset_test_disaggregated = load_dataset("MoritzLaurer/dataset_test_disaggregated_nli", token=config.HF_ACCESS_TOKEN)


Downloading readme:   0%|          | 0.00/682 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/261M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1286741 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/643 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.80M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/59140 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/4.71k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/34 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/239k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/247k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/303k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/505k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/364k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/109k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/97.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.00M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.53M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/665k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/947k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/166k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/316k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/19.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/387k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/334k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/317k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/575k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/71.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.88M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.44M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/34 [00:00<?, ?it/s]

Generating mnli_m split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating mnli_mm split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating fever split:   0%|          | 0/19652 [00:00<?, ? examples/s]

Generating anli_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating anli_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating anli_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating wanli split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating ling split:   0%|          | 0/4893 [00:00<?, ? examples/s]

Generating wellformedquery split:   0%|          | 0/5934 [00:00<?, ? examples/s]

Generating rottentomatoes split:   0%|          | 0/2132 [00:00<?, ? examples/s]

Generating amazonpolarity split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating imdb split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating yelpreviews split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating hatexplain split:   0%|          | 0/2922 [00:00<?, ? examples/s]

Generating massive split:   0%|          | 0/175466 [00:00<?, ? examples/s]

Generating banking77 split:   0%|          | 0/221760 [00:00<?, ? examples/s]

Generating emotiondair split:   0%|          | 0/12000 [00:00<?, ? examples/s]

Generating emocontext split:   0%|          | 0/22036 [00:00<?, ? examples/s]

Generating empathetic split:   0%|          | 0/81344 [00:00<?, ? examples/s]

Generating agnews split:   0%|          | 0/30400 [00:00<?, ? examples/s]

Generating yahootopics split:   0%|          | 0/500000 [00:00<?, ? examples/s]

Generating biasframes_sex split:   0%|          | 0/8808 [00:00<?, ? examples/s]

Generating biasframes_offensive split:   0%|          | 0/7676 [00:00<?, ? examples/s]

Generating biasframes_intent split:   0%|          | 0/7296 [00:00<?, ? examples/s]

Generating financialphrasebank split:   0%|          | 0/2070 [00:00<?, ? examples/s]

Generating appreviews split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating hateoffensive split:   0%|          | 0/2586 [00:00<?, ? examples/s]

Generating trueteacher split:   0%|          | 0/17910 [00:00<?, ? examples/s]

Generating spam split:   0%|          | 0/2070 [00:00<?, ? examples/s]

Generating wikitoxic_toxicaggregated split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating wikitoxic_obscene split:   0%|          | 0/17382 [00:00<?, ? examples/s]

Generating wikitoxic_identityhate split:   0%|          | 0/11424 [00:00<?, ? examples/s]

Generating wikitoxic_threat split:   0%|          | 0/10422 [00:00<?, ? examples/s]

Generating wikitoxic_insult split:   0%|          | 0/16854 [00:00<?, ? examples/s]

### Tokenize, train, and evaluate

In [7]:
### Load model and tokenizer

model_name = "microsoft/deberta-v3-base"
max_length = 512

## use the GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

# binary label scheme; id2label is simply the inverse of label2id
label2id = {"true": 0, "not_true": 1}  #{"entailment": 0, "neutral": 1, "contradiction": 2}
id2label = {label_id: label_text for label_text, label_id in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    model_max_length=max_length,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device)


Device: cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Tokenize

In [8]:
### tokenization
# Dynamic padding HF course: https://huggingface.co/course/chapter3/2?fw=pt

# without padding="max_length" & max_length=512, it should do dynamic padding.
def tokenize_func(examples):
    """Tokenize a batch of examples as (text, hypothesis) sentence pairs.

    Truncation is switched on but no padding is requested here, so sequences
    keep their own length at this stage.

    Args:
        examples: batch with a "text" and a "hypothesis" column.

    Returns:
        The output of the module-level `tokenizer` for the paired inputs.
    """
    premises = examples["text"]
    hypotheses = examples["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)

# training on:
encoded_dataset_train = dataset_train.map(tokenize_func, batched=True)
print(len(encoded_dataset_train))
# testing on:
encoded_dataset_test = dataset_test_concat_nli.map(tokenize_func, batched=True)
print(len(encoded_dataset_test))
# testing on individual datasets:
# (the map call is applied to the whole collection of per-task splits)
encoded_dataset_test_disaggregated = dataset_test_disaggregated.map(tokenize_func, batched=True)

# remove columns the library does not expect
# NOTE: the disaggregated test sets keep all their columns; the evaluation
# loop further down reads the "label_text" column from them.
encoded_dataset_train = encoded_dataset_train.remove_columns(["hypothesis", "text"])
encoded_dataset_test = encoded_dataset_test.remove_columns(["hypothesis", "text"])


Map:   0%|          | 0/1286741 [00:00<?, ? examples/s]

1286741


Map:   0%|          | 0/59140 [00:00<?, ? examples/s]

59140


Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/19652 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4893 [00:00<?, ? examples/s]

Map:   0%|          | 0/5934 [00:00<?, ? examples/s]

Map:   0%|          | 0/2132 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2922 [00:00<?, ? examples/s]

Map:   0%|          | 0/175466 [00:00<?, ? examples/s]

Map:   0%|          | 0/221760 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/22036 [00:00<?, ? examples/s]

Map:   0%|          | 0/81344 [00:00<?, ? examples/s]

Map:   0%|          | 0/30400 [00:00<?, ? examples/s]

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8808 [00:00<?, ? examples/s]

Map:   0%|          | 0/7676 [00:00<?, ? examples/s]

Map:   0%|          | 0/7296 [00:00<?, ? examples/s]

Map:   0%|          | 0/2070 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2586 [00:00<?, ? examples/s]

Map:   0%|          | 0/17910 [00:00<?, ? examples/s]

Map:   0%|          | 0/2070 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/17382 [00:00<?, ? examples/s]

Map:   0%|          | 0/11424 [00:00<?, ? examples/s]

Map:   0%|          | 0/10422 [00:00<?, ? examples/s]

Map:   0%|          | 0/16854 [00:00<?, ? examples/s]

#### Training

In [9]:
# release memory: https://huggingface.co/blog/optimize-llm
import gc
from accelerate.utils import release_memory

def flush():
    """Free memory that is no longer referenced.

    Runs the Python garbage collector and, when a CUDA device is available,
    empties the CUDA caching allocator and resets the peak-memory statistics.
    On a machine without CUDA only the garbage collection step runs, so the
    helper can be called unconditionally.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()


In [10]:

def compute_metrics_standard(eval_pred, label_text_alphabetical=None):
    """Compute aggregate classification metrics from raw model logits.

    Each row of the predictions is scored independently: the class with the
    highest logit is compared with the gold class id of the same row.

    Args:
        eval_pred: object exposing `.predictions` (logits, one row per example,
            one column per class) and `.label_ids` (gold class ids).
        label_text_alphabetical: optional sequence of class names in which
            position i names class id i. When omitted (the default), the
            detailed report is built from the class ids found in the data
            instead of failing.

    Returns:
        dict with macro/micro F1, precision and recall, plain accuracy and
        balanced accuracy. The aggregate and per-class results are also
        printed.
    """
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)  # argmax on each row (axis=1): the predicted class id

    # metrics
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)

    metrics = {
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'accuracy_balanced': acc_balanced,
        'accuracy': acc_not_balanced,
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
    }
    print("Aggregate metrics: ", metrics)

    # Only pin ids and names in the report when class names were supplied;
    # class id i is reported under the i-th name.
    if label_text_alphabetical is None:
        report_kwargs = {}
    else:
        report_kwargs = {
            "labels": np.arange(len(label_text_alphabetical)),
            "target_names": list(label_text_alphabetical),
        }
    print("Detailed metrics: ", classification_report(
        labels, preds_max, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn', **report_kwargs),
    "\n")

    return metrics


def compute_metrics_nli_binary(eval_pred, label_text_alphabetical=None):
    """Score a classification task that was reformatted as binary NLI.

    The rows are processed in consecutive groups whose size is the number of
    distinct class names; each group holds the hypotheses of one premise.
    Within a group, the predicted class is the position with the highest
    value in logit column 0 (the entailment logit) and the gold class is the
    position of the smallest label value (the 0 among the 1s).

    Args:
        eval_pred: pair that unpacks into (predictions, labels).
        label_text_alphabetical: class names; their number of distinct values
            sets the group size and they name the classes in the report.

    Returns:
        dict with macro/micro F1, precision and recall, plain accuracy and
        balanced accuracy. Predictions, gold positions and the metrics are
        also printed.
    """
    predictions, labels = eval_pred
    group_size = len(set(label_text_alphabetical))

    def chunks(lst, n):
        """Return successive n-sized slices of lst (the last may be shorter)."""
        return (lst[start:start + n] for start in range(0, len(lst), n))

    # per premise: position of the hypothesis with the highest entailment logit (column 0)
    hypo_position_highest_prob = [
        np.argmax(group[:, 0]) for group in chunks(predictions, group_size)
    ]
    # per premise: position of the gold hypothesis (argmin finds the 0 among the 1s)
    label_position_gold = [
        np.argmin(group) for group in chunks(labels, group_size)
    ]

    # for inspection
    print("Highest probability prediction per premise: ", hypo_position_highest_prob)
    print("Correct label per premise: ", label_position_gold)

    ## metrics
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html
    gold, predicted = label_position_gold, hypo_position_highest_prob
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(gold, predicted, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(gold, predicted, average='micro')
    acc_balanced = balanced_accuracy_score(gold, predicted)
    acc_not_balanced = accuracy_score(gold, predicted)

    metrics = dict(
        f1_macro=f1_macro,
        f1_micro=f1_micro,
        accuracy_balanced=acc_balanced,
        accuracy=acc_not_balanced,
        precision_macro=precision_macro,
        recall_macro=recall_macro,
        precision_micro=precision_micro,
        recall_micro=recall_micro,
    )

    # print metrics but without raw label lists
    raw_keys = ("label_gold_raw", "label_predicted_raw")
    printable = {name: score for name, score in metrics.items() if name not in raw_keys}
    print("Aggregate metrics: ", printable)

    report = classification_report(
        gold,
        predicted,
        labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical,
        sample_weight=None,
        digits=2,
        output_dict=True,
        zero_division='warn',
    )
    print("Detailed metrics: ", report, "\n")

    return metrics




In [11]:
# output folder for this run, named after the base model (without the hub organisation) and the run date
training_directory = f'./results/{model_name.split("/")[-1]}-zeroshot-{DATE}'

## test logging with mlflow
# https://mlflow.org/docs/latest/python_api/mlflow.transformers.html
# https://gitlab.com/juliensimon/huggingface-demos/-/blob/main/mlflow/MLflow%20and%20Transformers.ipynb
import mlflow
import json
from datetime import datetime
# https://huggingface.co/docs/transformers/v4.33.3/en/main_classes/callback#transformers.integrations.MLflowCallback
#mlflow.create_experiment(name='your_experiment_name', artifact_location=f'{training_directory}/logs')
# timestamp with minute resolution, used to tell runs of the same model apart
now = datetime.now().strftime("%Y-%m-%d-%H-%M")
run_name = f"mlflow-{model_name.split('/')[-1]}-{now}"
# experiment name and tags are passed on through environment variables
# (presumably picked up by the Trainer's MLflow integration linked above - confirm)
os.environ["MLFLOW_EXPERIMENT_NAME"] = f"mlflow-{model_name.split('/')[-1]}-zeroshot-{DATE}"
mlflow_tags = {
    # distinct task names present in the training data
    "train_data": np.unique(dataset_train["task_name"]).tolist(),
    #"train_data_nli": np.unique(dataset_train_nli["task_name"]).tolist(),
    #"train_data_not_nli": np.unique(dataset_train_not_nli["task_name"]).tolist()
}
os.environ['MLFLOW_TAGS'] = json.dumps(mlflow_tags)
#os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"
# https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.autolog
mlflow.autolog(log_datasets=False, log_models=False, silent=False)


# Mixed precision is only used on a GPU, and never for mDeBERTa, which does not support FP16 yet.
# The model name is compared case-insensitively because hub ids are commonly
# lower-case (e.g. "microsoft/mdeberta-v3-base"); a case-sensitive match on
# "mDeBERTa" would miss them and leave FP16 switched on.
fp16_bool = torch.cuda.is_available() and "mdeberta" not in model_name.lower()

# https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments
# evaluation batch size; also quoted in the results table of the model card
eval_batch = 64

# Training configuration: checkpoints and logs go to training_directory,
# evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the best accuracy is reloaded when training ends.
train_args = TrainingArguments(
    output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    #deepspeed="ds_config_zero3.json",  # if using deepspeed
    lr_scheduler_type= "linear",
    group_by_length=False,  # can increase speed with dynamic padding, by grouping similar length texts https://huggingface.co/transformers/main_classes/trainer.html
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=eval_batch,
    gradient_accumulation_steps=2,  # (!adapt/halve batch size accordingly). gradients are accumulated over this many batches before each optimizer update, i.e. 8 x 2 = 16 examples per update and device. decreases memory usage, but also slightly speed
    #eval_accumulation_steps=2,
    num_train_epochs=2,
    #max_steps=400,
    #warmup_steps=0,  # 1000,
    warmup_ratio=0.06,  #0.1, 0.06
    weight_decay=0.01,  #0.1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=fp16_bool,   # ! only makes sense at batch-size > 8. loads two copies of model weights, which creates overhead. https://huggingface.co/transformers/performance.html?#fp16
    fp16_full_eval=fp16_bool,
    evaluation_strategy="epoch",
    seed=SEED_GLOBAL,
    #eval_steps=300  # evaluate every n steps; only relevant if evaluation_strategy=='steps'. defaults to logging_steps
    save_strategy="epoch",  # options: "no"/"steps"/"epoch"
    #save_steps=1_000_000,              # Number of updates steps before two checkpoint saves.
    save_total_limit=3,             # If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in output_dir
    #logging_strategy="epoch",
    report_to="all",  # "all"
    run_name=run_name,
    #push_to_hub=True,
    #push_to_hub_model_id="test97531", #f"{model_name}-finetuned-{task}",
    #hub_token="XXX",  # for pushing to hub  # https://discuss.huggingface.co/t/where-to-put-use-auth-token-in-the-code-if-you-cant-run-hugginface-cli-login-command/11701/2
)


2023/09/28 15:21:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.
2023/09/28 15:21:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/09/28 15:21:15 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


In [12]:
# Build the Trainer for a quick test run.
# NOTE: only one shard out of 900 of the training data and one shard out of 90
# of the test data are used here, so the resulting metrics come from a small subset.
trainer = Trainer(
    model=model,
    #model_init=model_init,
    tokenizer=tokenizer,
    args=train_args,
    train_dataset=encoded_dataset_train.shard(index=1, num_shards=900),  # https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=encoded_dataset_test.shard(index=1, num_shards=90),
    # row-wise binary metrics; class id 0 is reported as "true", class id 1 as "not_true"
    compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=["true", "not_true"])  #compute_metrics,
    #data_collator=data_collator,  # for weighted sampling per dataset; for dynamic padding probably not necessary because done by default  https://huggingface.co/course/chapter3/3?fw=pt
)

if device == "cuda":
    # free memory
    flush()
    # NOTE(review): the return value is discarded, so `model` itself stays bound in the notebook - confirm this is intended
    release_memory(model)
    #del (model, trainer)


In [13]:
# train
# Runs the fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy Balanced,Accuracy,Precision Macro,Recall Macro,Precision Micro,Recall Micro
0,No log,0.624412,0.397987,0.661094,0.5,0.661094,0.330547,0.5,0.661094,0.661094
1,No log,0.596787,0.591691,0.653495,0.58932,0.653495,0.601054,0.58932,0.653495,0.653495


Aggregate metrics:  {'f1_macro': 0.3979871912168344, 'f1_micro': 0.6610942249240122, 'accuracy_balanced': 0.5, 'accuracy': 0.6610942249240122, 'precision_macro': 0.3305471124620061, 'recall_macro': 0.5, 'precision_micro': 0.6610942249240122, 'recall_micro': 0.6610942249240122}
Detailed metrics:  {'true': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 223}, 'not_true': {'precision': 0.6610942249240122, 'recall': 1.0, 'f1-score': 0.7959743824336688, 'support': 435}, 'accuracy': 0.6610942249240122, 'macro avg': {'precision': 0.3305471124620061, 'recall': 0.5, 'f1-score': 0.3979871912168344, 'support': 658}, 'weighted avg': {'precision': 0.4370455742278804, 'recall': 0.6610942249240122, 'f1-score': 0.5262140674143555, 'support': 658}} 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Aggregate metrics:  {'f1_macro': 0.5916914334236912, 'f1_micro': 0.6534954407294833, 'accuracy_balanced': 0.5893201381372094, 'accuracy': 0.6534954407294833, 'precision_macro': 0.6010543380646365, 'recall_macro': 0.5893201381372094, 'precision_micro': 0.6534954407294833, 'recall_micro': 0.6534954407294833}
Detailed metrics:  {'true': {'precision': 0.4860335195530726, 'recall': 0.3901345291479821, 'f1-score': 0.43283582089552236, 'support': 223}, 'not_true': {'precision': 0.7160751565762005, 'recall': 0.7885057471264367, 'f1-score': 0.7505470459518601, 'support': 435}, 'accuracy': 0.6534954407294833, 'macro avg': {'precision': 0.6010543380646365, 'recall': 0.5893201381372094, 'f1-score': 0.5916914334236912, 'support': 658}, 'weighted avg': {'precision': 0.6381127172811283, 'recall': 0.6534954407294833, 'f1-score': 0.6428728769738004, 'support': 658}} 



TrainOutput(global_step=178, training_loss=0.6378026812264089, metrics={'train_runtime': 106.4409, 'train_samples_per_second': 26.869, 'train_steps_per_second': 1.672, 'total_flos': 259263594573984.0, 'train_loss': 0.6378026812264089, 'epoch': 1.99})

#### Evaluation

In [14]:

# load specific model for evaluation
#model = AutoModelForSequenceClassification.from_pretrained('./results/nli-few-shot/all-nli-3c/DeBERTa-v3-mnli-fever-anli-v1',   # nli_effect/distilroberta-paraphrase-mnli-fever-anli-v1
#                                                           label2id=label2id, id2label=id2label).to(device)

# explicit import: a bare `import tqdm` does not guarantee that the
# `tqdm.notebook` submodule is loaded; tqdm.auto picks the right front end
from tqdm.auto import tqdm as progress_bar

# free memory
if device == "cuda":
    flush()
    release_memory(model)

# Test sets in plain NLI format: every row is an independent true/not_true
# decision, so they are scored row by row with compute_metrics_standard.
# NOTE(review): list derived from the split names of the test collection - confirm.
NLI_TASK_NAMES = ["mnli_m", "mnli_mm", "fever", "anli_r1", "anli_r2", "anli_r3", "wanli", "ling"]
# (very long) datasets that are skipped here and handled later
# ! anthropic hypos not in task_hypotheses !
LONG_TASK_NAME_PARTS = ["anthropic", "banking77", "massive", "empathetic"]

result_dic = {}
for key_task_name, value_dataset in progress_bar(encoded_dataset_test_disaggregated.items(), desc="Iterations over testsets"):
    # The branch must be chosen by task type. Testing membership in the keys
    # of the collection that is being iterated is always true and would make
    # the two other branches unreachable.
    if key_task_name in NLI_TASK_NAMES:
        trainer.compute_metrics = lambda x: compute_metrics_standard(x, label_text_alphabetical=["true", "not_true"])
    elif any(long_dataset in key_task_name for long_dataset in LONG_TASK_NAME_PARTS):
        continue
    else:
        # classification task reformatted as NLI: one hypothesis per class name,
        # scored per premise with compute_metrics_nli_binary
        label_text_alphabetical = np.unique(value_dataset["label_text"]).tolist()  # np.unique returns sorted values
        # bind the names as a default argument so the callback keeps this task's labels
        trainer.compute_metrics = lambda x, names=label_text_alphabetical: compute_metrics_nli_binary(x, label_text_alphabetical=names)

    result = trainer.evaluate(eval_dataset=value_dataset)
    result_dic[key_task_name] = result
    print(f"Result for task {key_task_name}: ", result, "\n")

print("\n\nOverall results: ", result_dic)


Iterations over testsets:   0%|          | 0/34 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Aggregate metrics:  {'f1_macro': 0.3922976905454771, 'f1_micro': 0.6455425369332655, 'accuracy_balanced': 0.5, 'accuracy': 0.6455425369332655, 'precision_macro': 0.32277126846663273, 'recall_macro': 0.5, 'precision_micro': 0.6455425369332655, 'recall_micro': 0.6455425369332655}
Detailed metrics:  {'true': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3479}, 'not_true': {'precision': 0.6455425369332655, 'recall': 1.0, 'f1-score': 0.7845953810909542, 'support': 6336}, 'accuracy': 0.6455425369332655, 'macro avg': {'precision': 0.32277126846663273, 'recall': 0.5, 'f1-score': 0.3922976905454771, 'support': 9815}, 'weighted avg': {'precision': 0.4167251669902364, 'recall': 0.6455425369332655, 'f1-score': 0.5064896927755768, 'support': 9815}} 

Result for task mnli_m:  {'eval_loss': 0.6104028820991516, 'eval_f1_macro': 0.3922976905454771, 'eval_f1_micro': 0.6455425369332655, 'eval_accuracy_balanced': 0.5, 'eval_accuracy': 0.6455425369332655, 'eval_precision_macro': 0.322771268

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Aggregate metrics:  {'f1_macro': 0.3931238812418987, 'f1_micro': 0.6477827502034175, 'accuracy_balanced': 0.5, 'accuracy': 0.6477827502034175, 'precision_macro': 0.32389137510170873, 'recall_macro': 0.5, 'precision_micro': 0.6477827502034175, 'recall_micro': 0.6477827502034175}
Detailed metrics:  {'true': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 3463}, 'not_true': {'precision': 0.6477827502034175, 'recall': 1.0, 'f1-score': 0.7862477624837974, 'support': 6369}, 'accuracy': 0.6477827502034175, 'macro avg': {'precision': 0.32389137510170873, 'recall': 0.5, 'f1-score': 0.3931238812418987, 'support': 9832}, 'weighted avg': {'precision': 0.4196224914611031, 'recall': 0.6477827502034175, 'f1-score': 0.5093177379230376, 'support': 9832}} 

Result for task mnli_mm:  {'eval_loss': 0.6053792834281921, 'eval_f1_macro': 0.3931238812418987, 'eval_f1_micro': 0.6477827502034175, 'eval_accuracy_balanced': 0.5, 'eval_accuracy': 0.6477827502034175, 'eval_precision_macro': 0.32389137

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Aggregate metrics:  {'f1_macro': 0.3999755740107474, 'f1_micro': 0.6665988194585792, 'accuracy_balanced': 0.5, 'accuracy': 0.6665988194585792, 'precision_macro': 0.3332994097292896, 'recall_macro': 0.5, 'precision_micro': 0.6665988194585792, 'recall_micro': 0.6665988194585792}
Detailed metrics:  {'true': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6552}, 'not_true': {'precision': 0.6665988194585792, 'recall': 1.0, 'f1-score': 0.7999511480214948, 'support': 13100}, 'accuracy': 0.6665988194585792, 'macro avg': {'precision': 0.3332994097292896, 'recall': 0.5, 'f1-score': 0.3999755740107474, 'support': 19652}, 'weighted avg': {'precision': 0.4443539861035716, 'recall': 0.6665988194585792, 'f1-score': 0.5332464908956637, 'support': 19652}} 

Result for task fever:  {'eval_loss': 0.6463865637779236, 'eval_f1_macro': 0.3999755740107474, 'eval_f1_micro': 0.6665988194585792, 'eval_accuracy_balanced': 0.5, 'eval_accuracy': 0.6665988194585792, 'eval_precision_macro': 0.333299409

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Aggregate metrics:  {'f1_macro': 0.3997599039615847, 'f1_micro': 0.666, 'accuracy_balanced': 0.5, 'accuracy': 0.666, 'precision_macro': 0.333, 'recall_macro': 0.5, 'precision_micro': 0.666, 'recall_micro': 0.666}
Detailed metrics:  {'true': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 334}, 'not_true': {'precision': 0.666, 'recall': 1.0, 'f1-score': 0.7995198079231693, 'support': 666}, 'accuracy': 0.666, 'macro avg': {'precision': 0.333, 'recall': 0.5, 'f1-score': 0.3997599039615847, 'support': 1000}, 'weighted avg': {'precision': 0.44355600000000006, 'recall': 0.666, 'f1-score': 0.5324801920768308, 'support': 1000}} 

Result for task anli_r1:  {'eval_loss': 0.6528143286705017, 'eval_f1_macro': 0.3997599039615847, 'eval_f1_micro': 0.666, 'eval_accuracy_balanced': 0.5, 'eval_accuracy': 0.666, 'eval_precision_macro': 0.333, 'eval_recall_macro': 0.5, 'eval_precision_micro': 0.666, 'eval_recall_micro': 0.666, 'eval_runtime': 3.4933, 'eval_samples_per_second': 286.259, 'eva

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Aggregate metrics:  {'f1_macro': 0.3997599039615847, 'f1_micro': 0.666, 'accuracy_balanced': 0.5, 'accuracy': 0.666, 'precision_macro': 0.333, 'recall_macro': 0.5, 'precision_micro': 0.666, 'recall_micro': 0.666}
Detailed metrics:  {'true': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 334}, 'not_true': {'precision': 0.666, 'recall': 1.0, 'f1-score': 0.7995198079231693, 'support': 666}, 'accuracy': 0.666, 'macro avg': {'precision': 0.333, 'recall': 0.5, 'f1-score': 0.3997599039615847, 'support': 1000}, 'weighted avg': {'precision': 0.44355600000000006, 'recall': 0.666, 'f1-score': 0.5324801920768308, 'support': 1000}} 

Result for task anli_r2:  {'eval_loss': 0.644112765789032, 'eval_f1_macro': 0.3997599039615847, 'eval_f1_micro': 0.666, 'eval_accuracy_balanced': 0.5, 'eval_accuracy': 0.666, 'eval_precision_macro': 0.333, 'eval_recall_macro': 0.5, 'eval_precision_micro': 0.666, 'eval_recall_micro': 0.666, 'eval_runtime': 3.1258, 'eval_samples_per_second': 319.92, 'eval_

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: ignored

In [16]:
# add disaggregated metrics to mlflow tracking
# look up the run of this training by its run name and take the id of the first match
run_object = mlflow.search_runs(filter_string=f"attributes.run_name = '{run_name}'")
run_id = run_object["run_id"][0]

# NOTE(review): run_id is not used below because the start_run context is
# commented out - confirm that log_metrics writes to the intended run.
#with mlflow.start_run(run_id=run_id):
# flatten {task: {metric: value}} into {"<task>_<metric>": value}
result_dic_unnested = {f"{outer_key}_{inner_key}": value for outer_key, inner_dict in result_dic.items() for inner_key, value in inner_dict.items()}
mlflow.log_metrics(result_dic_unnested)
mlflow.end_run()


In [17]:
"""## Create Model Card """

## testing automatic creation of .md file
# https://mdutils.readthedocs.io/en/latest/mdutils.html#subpackages
from mdutils import MdUtils
mdFile = MdUtils(file_name=f'README-{model_name.split("/")[-1]}-{DATE}', title='Model Card')

# one table column per entry in result_dic, plus a leading column for the row labels
row_dataset_names = list(result_dic.keys())
row_metrics = [str(round(metrics["eval_accuracy"], 3)) for metrics in result_dic.values()]
row_samp_per_sec = [str(round(metrics["eval_samples_per_second"], 0)) for metrics in result_dic.values()]

# the table cells are passed as one flat list, row after row: names, accuracy, throughput
table_lst = (
    ["Datasets"] + row_dataset_names
    + ["Accuracy"] + row_metrics
    + [f"Inference text/sec (A100, batch={eval_batch})"] + row_samp_per_sec
)

# create markdown table with results
results_table_me = mdFile.new_table(columns=len(result_dic) + 1, rows=3, text=table_lst, text_align='center')
print(results_table_me)

# write the markdown file from inside the training directory.
# try/finally guarantees the original working directory is restored even if writing fails,
# so later cells that rely on the working directory keep working.
path_main = os.getcwd()
os.chdir(training_directory)
try:
    mdFile.create_md_file()
finally:
    os.chdir(path_main)



|Datasets|mnli_m|mnli_mm|fever|anli_r1|anli_r2|anli_r3|
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
|Accuracy|0.646|0.648|0.667|0.666|0.666|0.665|
|Inference text/sec (A100, batch=64)|400.0|391.0|152.0|286.0|320.0|245.0|



In [21]:

# Switch for the whole upload step: set to False to skip saving and pushing the model.
upload_to_hub = True

if upload_to_hub:
    # push directly via trainer to hub
    #trainer.push_to_hub()  # does not work for some reason. wheel spins but nothing happens.

    ## save best model to disk
    model_path = f"{training_directory}/best-{model_name.split('/')[-1]}-{DATE}"

    trainer.save_model(output_dir=model_path)

    print(os.getcwd())
    # reload the saved checkpoint with float16 weights; this reloaded model is what gets pushed
    model = AutoModelForSequenceClassification.from_pretrained(model_path, torch_dtype=torch.float16)
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, model_max_length=512)

    ## Push to hub
    #!sudo apt-get install git-lfs
    #!huggingface-cli login
    # unnecessary if token provided below

    # https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.push_to_hub
    # repo_path_or_name=f'{model_name.split("/")[-1]}-{"-".join(NLI_DATASETS_TO_USE)}',
    # Model and tokenizer must land in the same repo with the same settings, so the arguments
    # are defined once. `token` replaces the deprecated `use_auth_token` argument.
    hub_repo_id = f'MoritzLaurer/{model_name.split("/")[-1]}-zeroshot-v1'
    push_kwargs = dict(repo_id=hub_repo_id, use_temp_dir=True, private=True, token=config.HF_ACCESS_TOKEN)
    model.push_to_hub(**push_kwargs)
    tokenizer.push_to_hub(**push_kwargs)



/content/drive/MyDrive/PhD/zero-shot-models




pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

### Optional: inspect MLflow traces in Colab

In [None]:
# Deliberate stop: this assertion always fails, so running the entire notebook halts here
# and the code after it only runs when executed by hand.
assert False, "Block following code from executing when running entire notebook"

In [None]:
## inspect mlflow logs directly in colab with ngrok and terminal
!pip install "pyngrok~=5.2.1" -qqq

import getpass
from pyngrok import ngrok, conf

print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
#conf.get_default().auth_token = getpass.getpass()
conf.get_default().auth_token = config.NGROK_ACCESS_TOKEN
# if the above does not work, try:
#ngrok.set_auth_token("<INSER_YOUR_NGROK_AUTHTOKEN>")

print("In terminal, set working directory: cd", os.getcwd())
print("then run command: mlflow ui")


Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth
You need to create a free ngrok account to get an authtoken. The token looks something like this: ASDO1283YZaDu95vysXYIUXZXYRR_54YfASDIb8cpNfVoz349587
In terminal, set working directory: cd /content/drive/MyDrive/PhD/zero-shot-models
then run command: mlflow ui


In [None]:
# disconnect all existing tunnels to avoid issues when rerunning cells
for tunnel in ngrok.get_tunnels():
    ngrok.disconnect(tunnel.public_url)

# create the public link to local port 5000 (the default port of `mlflow ui`)
ngrok_tunnel = ngrok.connect(5000)
print("You can now access the MLflow UI on localhost with the public link below. (It should look something like 'http://X03b-34-XXX-237-25.ngrok.io')\n")
# print only the URL, not the repr of the tunnel object
print(f"Your ngrok public link: {ngrok_tunnel.public_url}\n")
print("After clicking on the link, there will be a warning, which you can ignore")

In terminal, set working directory: cd /content/drive/MyDrive/PhD/zero-shot-models
then run command: mlflow ui
