<a href="https://colab.research.google.com/github/HimashiRathnayake/CMCS-Text-Classification/blob/main/XLM-R/Multi_task_learning_XLM_R_ipynb_(Single_Model_Multiple_Prediction_Heads).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference: https://towardsdatascience.com/how-to-create-and-train-a-multi-task-transformer-model-18c54a146240

**Libraries Setup**

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install seqeval



**Save datasets as json files**

In [2]:
import logging
import os
import random
import sys
import torch
from dataclasses import dataclass, field
from sklearn.model_selection import train_test_split
from typing import Optional, List
import datasets
import numpy as np
import pandas as pd
from datasets import load_dataset, load_metric
import transformers
import torch.nn as nn
from google.colab import drive
from transformers import (
    AutoModel,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
    DataCollatorForTokenClassification
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# def apply_oversampling(x, y):

#   (unique, counts) = np.unique(y, axis=0, return_counts=True)
#   print("Class Distribution Without Oversampling", counts)

#   oversample = RandomOverSampler(sampling_strategy = {
#       0:int(counts[0]*1), 1:int(counts[0]*0.25), 2:int(counts[0]*0.25)})
  
#   # fit and apply the transform
#   X_over, y_over = oversample.fit_resample(x, y)

#   (unique, counts) = np.unique(y_over, axis=0, return_counts=True)
#   print("Class Distribution After Oversampling", counts)

#   return X_over, y_over

In [5]:
# sent_dataset_path = "/content/drive/Shareddrives/FYP/corpus/çompleted_draft.csv"
# task_name = "Hate_speech"
# df = pd.read_csv(sent_dataset_path)
# df = df[['Sentence', task_name]]
# df.columns = ['sentence', 'label']
# df['label'], uniq = pd.factorize(df['label'])
# trainData, testData = train_test_split(df, test_size=0.10, random_state=42)
# rosTrainData = pd.DataFrame()

# #apply oversampling
# X = trainData['sentence'].values.tolist()
# y = trainData['label'].values.tolist()
# X = np.array(X).reshape(-1, 1)
# X, y = apply_oversampling(X, y)
# X = [x[0] for x in X.tolist()]
# rosTrainData['sentence'] = X
# rosTrainData['label'] = y

# #save the data
# rosTrainData.to_json('/content/drive/Shareddrives/FYP/corpus/hate_ros_train.json', orient='records', lines=True,  force_ascii=False)
# testData.to_json('/content/drive/Shareddrives/FYP/corpus/hate_ros_test.json', orient='records', lines=True,  force_ascii=False)

# tags_ind = ['Sinhala', 'English', 'Sin-Eng', 'Eng-Sin', 'Mixed', 'NameEntity', 'Symbol']
# df = pd.read_json(token_dataset_path, lines=True)
# count = 0
# for labels in df['tags']:
#   temp =[]
#   for label in labels:
#     temp.append(tags_ind.index(label))
#   df['tags'][count] = temp
#   count +=1
# #split the data into train and test set
# trainData,testData = train_test_split(df, test_size=0.10, random_state=42)
# #save the data
# trainData.to_json('/content/drive/Shareddrives/FYP/corpus/lang_id_train.json', orient='records', lines=True,  force_ascii=False)
# testData.to_json('/content/drive/Shareddrives/FYP/corpus/lang_id_test.json', orient='records', lines=True,  force_ascii=False)

In [6]:
max_length = 128

In [7]:
@dataclass
class Task:
    id: int
    name: str
    type: str
    num_labels: int

In [8]:
def tokenize_token_classification_dataset(raw_datasets, tokenizer, task_id, data_args, training_args):

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"],
            padding="max_length",
            truncation=True,
            max_length=data_args.max_seq_length,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples["tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label[word_idx] if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx
            
            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        tokenized_inputs["task_ids"] = [task_id] * len(tokenized_inputs["labels"])
        return tokenized_inputs

    with training_args.main_process_first(desc="dataset map pre-processing"):
        tokenized_datasets = raw_datasets.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=1,
            load_from_cache_file=not data_args.overwrite_cache,
            remove_columns=["tokens"],
        )

    return tokenized_datasets

In [9]:
def tokenize_seq_classification_dataset(
    tokenizer, raw_datasets, task_id, data_args, training_args
):

    def tokenize_text(examples):
        result = tokenizer(examples["sentence"], padding="max_length", max_length=data_args.max_seq_length, truncation=True)
        examples["labels"] = examples.pop("label")
        result["task_ids"] = [task_id] * len(examples["labels"])
        return result

    def tokenize_and_pad_text(examples):
        result = tokenizer(examples["sentence"], padding="max_length", max_length=data_args.max_seq_length, truncation=True)
        examples["labels"] = examples.pop("label")
        result["task_ids"] = [task_id] * len(examples["labels"])
        result["labels"] = [
            [l] + [-100] * (data_args.max_seq_length - 1) for l in examples["labels"]
        ]
        return result

    with training_args.main_process_first(desc="dataset map pre-processing"):
        col_to_remove = ["sentence"]
        train_dataset = raw_datasets["train"].map(
            tokenize_and_pad_text,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            remove_columns=col_to_remove,
            desc="Running tokenizer on dataset",
        )
        validation_dataset = raw_datasets["test"].map(
            tokenize_text,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            remove_columns=col_to_remove,
            desc="Running tokenizer on dataset",
        )

    return train_dataset, validation_dataset

In [10]:
def load_classification_dataset(task_id, task_name, tokenizer, data_args, training_args):

    raw_datasets = load_dataset('json', data_files={'train': f'/content/drive/Shareddrives/FYP/corpus/{task_name}_train.json',
                                           'test': f'/content/drive/Shareddrives/FYP/corpus/{task_name}_test.json'})

    num_labels = 7 if task_name == "lang_id" else len(set(raw_datasets["train"]["label"]))
    task_info = Task(
        id=task_id, name=task_name, num_labels=num_labels, type="token_classification" if task_name=="lang_id" else "seq_classification" 
    )

    if (task_name == "lang_id"):
        tokenized_datasets = tokenize_token_classification_dataset(
            raw_datasets,
            tokenizer,
            task_id,
            data_args,
            training_args,
        )
        return tokenized_datasets["train"], tokenized_datasets["test"], task_info
    else:
        train_dataset, validation_dataset = tokenize_seq_classification_dataset(
            tokenizer,
            raw_datasets,
            task_id,
            data_args,
            training_args,
        )
        return train_dataset, validation_dataset, task_info

In [11]:
def load_datasets(tokenizer, data_args, training_args):
    (train_data_1, test_data_1, task_1) = load_classification_dataset(0, "sentiment", tokenizer, data_args, training_args)
    (train_data_2, test_data_2, task_2) = load_classification_dataset(1, "humor", tokenizer, data_args, training_args)
    (train_data_3, test_data_3, task_3) = load_classification_dataset(2, "hate_ros", tokenizer, data_args, training_args)
    (train_data_4, test_data_4, task_4) = load_classification_dataset(3, "lang_id", tokenizer, data_args, training_args)
    
    # Merge train datasets
    train_dataset_df = train_data_1.to_pandas().append(train_data_2.to_pandas()).append(train_data_3.to_pandas()).append(train_data_4.to_pandas())

    train_dataset = datasets.Dataset.from_pandas(train_dataset_df)
    train_dataset.shuffle(seed=123)

    # Append validation datasets
    validation_dataset = [
        test_data_1, test_data_2, test_data_3, test_data_4
    ]

    dataset = datasets.DatasetDict(
        {"train": train_dataset, "validation": validation_dataset}
    )
    tasks = [task_1, task_2, task_3, task_4]
    return tasks, dataset

In [12]:
class MultiTaskModel(nn.Module):
    def __init__(self, encoder_name_or_path, tasks: List):
        super().__init__()

        self.encoder = AutoModel.from_pretrained(encoder_name_or_path)

        self.output_heads = nn.ModuleDict()
        
        for task in tasks:
            decoder = self._create_output_head(self.encoder.config.hidden_size, task)
            # ModuleDict requires keys to be strings
            self.output_heads[str(task.id)] = decoder

    @staticmethod
    def _create_output_head(encoder_hidden_size: int, task):
        if task.type == "seq_classification":
            return SequenceClassificationHead(encoder_hidden_size, task.num_labels)
        elif task.type == "token_classification":
            return TokenClassificationHead(encoder_hidden_size, task.num_labels)
        else:
            raise NotImplementedError()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        task_ids=None,
        **kwargs,
    ):

        outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output, pooled_output = outputs[:2]

        unique_task_ids_list = torch.unique(task_ids).tolist()

        loss_list = []
        logits = None
        for unique_task_id in unique_task_ids_list:

            task_id_filter = task_ids == unique_task_id
            logits, task_loss = self.output_heads[str(unique_task_id)].forward(
                sequence_output[task_id_filter],
                pooled_output[task_id_filter],
                labels=None if labels is None else labels[task_id_filter],
                attention_mask=attention_mask[task_id_filter],
            )

            if labels is not None:
                loss_list.append(task_loss)

        # logits are only used for eval. and in case of eval the batch is not multi task
        # For training only the loss is used
        outputs = (logits, outputs[2:])

        if loss_list:
            loss = torch.stack(loss_list)
            outputs = (loss.mean(),) + outputs

        return outputs

In [13]:
class TokenClassificationHead(nn.Module):
    def __init__(self, hidden_size, num_labels, dropout_p=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout_p)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.num_labels = num_labels

        self._init_weights()

    def _init_weights(self):
        self.classifier.weight.data.normal_(mean=0.0, std=0.02)
        if self.classifier.bias is not None:
            self.classifier.bias.data.zero_()

    def forward(
        self, sequence_output, pooled_output, labels=None, attention_mask=None, **kwargs
    ):
        sequence_output_dropout = self.dropout(sequence_output)
        logits = self.classifier(sequence_output_dropout)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()

            labels = labels.long()

            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss,
                    labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels),
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return logits, loss

In [14]:
class SequenceClassificationHead(nn.Module):
    def __init__(self, hidden_size, num_labels, dropout_p=0.1):
        super().__init__()
        self.num_labels = num_labels
        self.dropout = nn.Dropout(dropout_p)
        self.classifier = nn.Linear(hidden_size, num_labels)

        self._init_weights()

    def forward(self, sequence_output, pooled_output, labels=None, **kwargs):
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if labels.dim() != 1:
                # Remove padding
                labels = labels[:, 0]

            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.num_labels), labels.long().view(-1)
            )

        return logits, loss

    def _init_weights(self):
        self.classifier.weight.data.normal_(mean=0.0, std=0.02)
        if self.classifier.bias is not None:
            self.classifier.bias.data.zero_()

In [15]:
def compute_metrics(p: EvalPrediction):
    
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

    if preds.ndim == 2:
        # Sequence classification
        preds = np.argmax(preds, axis=1)
        labels = p.label_ids
        # print(preds, labels)
        # return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

        metric1 = load_metric("precision")
        metric2 = load_metric("recall")
        metric3 = load_metric("f1")
        metric4 = load_metric("accuracy")
        
        precision = metric1.compute(predictions=preds, references=labels, average="weighted")["precision"]
        recall = metric2.compute(predictions=preds, references=labels, average="weighted")["recall"]
        f1 = metric3.compute(predictions=preds, references=labels, average="weighted")["f1"]
        accuracy = metric4.compute(predictions=preds, references=labels)["accuracy"]
        macro_precision = metric1.compute(predictions=preds, references=labels, average="macro")["precision"]
        macro_recall = metric2.compute(predictions=preds, references=labels, average="macro")["recall"]
        macro_f1 = metric3.compute(predictions=preds, references=labels, average="macro")["f1"]
        return {"accuracy":accuracy, "precision": precision, "recall": recall, "f1": f1, "macro_precision": macro_precision, "macro_recall": macro_recall, "macro_f1": macro_f1}
    
    elif preds.ndim == 3:
        # Token classification
        metric = load_metric("seqeval")

        predictions = np.argmax(preds, axis=2)

        true_predictions = [
            [f"tag-idx-{p}" for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, p.label_ids)
        ]
        true_labels = [
            [f"tag-idx-{l}" for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, p.label_ids)
        ]

        # Remove ignored index (special tokens)
        results = metric.compute(
            predictions=true_predictions, references=true_labels
        )
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
    else:
        raise NotImplementedError()

In [16]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    encoder_name_or_path: str = field(
        default="xlm-roberta-base",
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=False,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

In [17]:
task_to_keys = {
        "cola": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
        "mrpc": ("sentence1", "sentence2"),
        "qnli": ("question", "sentence"),
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "sst2": ("sentence", None),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
    }

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
            "value if set."
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    label_all_tokens: Optional[bool] = field(
        default=True, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})

In [18]:
model_args = ModelArguments(encoder_name_or_path="xlm-roberta-base")
data_args = DataTrainingArguments(max_seq_length=128)
training_args = TrainingArguments(
    do_train=True,
    do_eval=True,
    output_dir="/tmp/test",
    learning_rate=2e-5,
    num_train_epochs=3,
    overwrite_output_dir=True,
    per_device_train_batch_size=32,  
    per_device_eval_batch_size=32,
    save_steps=10000,
)

In [19]:
tokenizer = AutoTokenizer.from_pretrained(
    model_args.encoder_name_or_path,
    # cache_dir=model_args.cache_dir,
    # use_fast=model_args.use_fast_tokenizer,
    # revision=model_args.model_revision,
    # use_auth_token=True if model_args.use_auth_token else None,
    do_lower_case=True
)

In [20]:
set_seed(training_args.seed)
tasks, raw_datasets = load_datasets(tokenizer, data_args, training_args)
model = MultiTaskModel(model_args.encoder_name_or_path, tasks)
train_dataset = raw_datasets["train"]
eval_datasets = raw_datasets["validation"]
data_collator = DataCollatorForTokenClassification(
    tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
)

Using custom data configuration default-3f0a22b1e59e6040
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-3f0a22b1e59e6040/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-3f0a22b1e59e6040/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-7dfd6b0ce1a19d03.arrow


Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Using custom data configuration default-4605e1ab69a23fa1
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-4605e1ab69a23fa1/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-4605e1ab69a23fa1/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-903e21f8f2a0b5e9.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-4605e1ab69a23fa1/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-1445f873ec45dd05.arrow
Using custom data configuration default-71140e368f460cc9
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-71140e368f460cc9/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-71140e368f460cc9/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-c15ad63629bddcbd.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-71140e368f460cc9/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-0e204d97785a2bdb.arrow
Using custom data configuration default-0b5eb83830de93b0
Reusing dataset json (/root/.cache/huggingface/datasets/json/default-0b5eb83830de93b0/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-0b5eb83830de93b0/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-d3f7af633edc7f63.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-0b5eb83830de93b0/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-4a0a84ea02ea6454.arrow
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be

In [21]:
# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [22]:
# Training
if training_args.do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    max_train_samples = (
        data_args.max_train_samples
        if data_args.max_train_samples is not None
        else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.save_model()  # Saves the tokenizer too for easy upload

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

The following columns in the training set  don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: __index_level_0__, tags.
***** Running training *****
  Num examples = 52964
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4968


Step,Training Loss
500,0.6579
1000,0.5654
1500,0.5283
2000,0.4905
2500,0.4716
3000,0.4487
3500,0.4306


Step,Training Loss
500,0.6579
1000,0.5654
1500,0.5283
2000,0.4905
2500,0.4716
3000,0.4487
3500,0.4306
4000,0.4201
4500,0.4064




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /tmp/test
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in /tmp/test/tokenizer_config.json
Special tokens file saved in /tmp/test/special_tokens_map.json


***** train metrics *****
  epoch                    =        3.0
  total_flos               =        0GF
  train_loss               =     0.4812
  train_runtime            = 1:00:17.05
  train_samples            =      52964
  train_samples_per_second =     43.929
  train_steps_per_second   =      1.373


In [23]:
# Evaluation
if training_args.do_eval:

    for eval_dataset, task in zip(eval_datasets, tasks):
        print(task)
        data_collator = None
        if task.type == "token_classification":
            data_collator = DataCollatorForTokenClassification(
                tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
            )
        else:
            if data_args.pad_to_max_length:
                data_collator = default_data_collator
            elif training_args.fp16:
                data_collator = DataCollatorWithPadding(
                    tokenizer, pad_to_multiple_of=8
                )
            else:
                data_collator = None

        trainer.data_collator = data_collator
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_eval_samples = (
            data_args.max_eval_samples
            if data_args.max_eval_samples is not None
            else len(eval_datasets)
        )
        metrics["eval_samples"] = min(max_eval_samples, len(eval_datasets))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32


Task(id=0, name='sentiment', type='seq_classification', num_labels=4)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32


***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.7685
  eval_f1                 =     0.7625
  eval_loss               =     0.6233
  eval_macro_f1           =     0.5044
  eval_macro_precision    =     0.5277
  eval_macro_recall       =     0.4897
  eval_precision          =     0.7596
  eval_recall             =     0.7685
  eval_runtime            = 0:00:10.13
  eval_samples            =          4
  eval_samples_per_second =    133.397
  eval_steps_per_second   =      4.243
Task(id=1, name='humor', type='seq_classification', num_labels=2)


***** Running Evaluation *****
  Num examples = 1352
  Batch size = 32


***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.9253
  eval_f1                 =     0.9045
  eval_loss               =     0.2276
  eval_macro_f1           =     0.6221
  eval_macro_precision    =     0.8101
  eval_macro_recall       =     0.5857
  eval_precision          =     0.9105
  eval_recall             =     0.9253
  eval_runtime            = 0:00:09.69
  eval_samples            =          4
  eval_samples_per_second =    139.413
  eval_steps_per_second   =      4.434
Task(id=2, name='hate_ros', type='seq_classification', num_labels=3)


The following columns in the evaluation set  don't have a corresponding argument in `MultiTaskModel.forward` and have been ignored: tags.
***** Running Evaluation *****
  Num examples = 1343
  Batch size = 32


***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.8469
  eval_f1                 =     0.8681
  eval_loss               =     0.3949
  eval_macro_f1           =     0.5385
  eval_macro_precision    =     0.5054
  eval_macro_recall       =     0.6826
  eval_precision          =     0.9006
  eval_recall             =     0.8469
  eval_runtime            = 0:00:09.68
  eval_samples            =          4
  eval_samples_per_second =    139.532
  eval_steps_per_second   =      4.438
Task(id=3, name='lang_id', type='token_classification', num_labels=7)




***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =      0.966
  eval_f1                 =     0.9088
  eval_loss               =     0.1204
  eval_precision          =     0.9119
  eval_recall             =     0.9057
  eval_runtime            = 0:00:10.39
  eval_samples            =          4
  eval_samples_per_second =    129.254
  eval_steps_per_second   =      4.042
