In [1]:
import numpy as np
import transformers
transformers.logging.set_verbosity_error()
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset, load_metric
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [96]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

def read_and_tokenize_data(train,test,evaluate):
    d = load_dataset('pandas', data_files={"train":train, "test":test,"evaluate":evaluate})
    for i in d:
        d[i]= d[i].remove_columns(column_names = ['__index_level_0__'])
    d = d.map(tokenize_function, batched=True)
    return d

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at C:\Users\dovyd/.cache\huggingface\transformers\d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at C:\Users\dovyd/.cache\huggingface\transformers\cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at C:\Users\dovyd/.cache\huggingface\transformers\d53fc0fa09b8342651efd4073d7

#### Custom model

In [53]:
from transformers import RobertaPreTrainedModel, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn
from torch.nn import CrossEntropyLoss


class RobertaForSequenceClassification2(RobertaPreTrainedModel):
    def __init__(self, config, **kwargs):
        super().__init__(transformers.PretrainedConfig())
        self.num_labels = kwargs.get("task_labels_map", {})
        self.config = config

        self.roberta = RobertaModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

    
        self.classifier1 = nn.Linear(
            config.hidden_size, list(self.num_labels.values())[0]
        )
        self.classifier2 = nn.Linear(
            config.hidden_size, list(self.num_labels.values())[1]
        )

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_name=None
    ):

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.roberta(
            input_ids = input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        
        
        logits = None
        if task_name == list(self.num_labels.keys())[0]:
            logits = self.classifier1(pooled_output)
        elif task_name == list(self.num_labels.keys())[1]:
            logits = self.classifier2(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.num_labels[task_name]), labels.view(-1)
            )

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


#### Custom dataloader

In [80]:
import dataclasses
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import DataCollator, InputDataClass
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from typing import List, Union, Dict
import numpy as np
import torch
import transformers



class StrIgnoreDevice(str):
    """
    This is a hack. The Trainer is going call .to(device) on every input
    value, but we need to pass in an additional `task_name` string.
    This prevents it from throwing an error
    """

    def to(self, device):
        return self


class DataLoaderWithTaskname:
    """
    Wrapper around a DataLoader to also yield a task name
    """

    def __init__(self, task_name, data_loader):
        self.task_name = task_name
        self.data_loader = data_loader

        self.batch_size = data_loader.batch_size
        self.dataset = data_loader.dataset

    def __len__(self):
        return len(self.data_loader)

    def __iter__(self):
        for batch in self.data_loader:
            batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield batch


class MultitaskDataloader:
    """
    Data loader that combines and samples from multiple single-task
    data loaders.
    """

    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict
        self.num_batches_dict = {
            task_name: len(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }
        self.task_name_list = list(self.dataloader_dict)
        self.dataset = [None] * sum(
            len(dataloader.dataset) for dataloader in self.dataloader_dict.values()
        )

    def __len__(self):
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """
        For each batch, sample a task, and yield a batch from the respective
        task Dataloader.
        We use size-proportional sampling, but you could easily modify this
        to sample from some-other distribution.
        """
        task_choice_list = []
        for i, task_name in enumerate(self.task_name_list):
            task_choice_list += [i] * self.num_batches_dict[task_name]
        task_choice_list = np.array(task_choice_list)
        np.random.shuffle(task_choice_list)
        dataloader_iter_dict = {
            task_name: iter(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }
        for task_choice in task_choice_list:
            task_name = self.task_name_list[task_choice]
            yield next(dataloader_iter_dict[task_name])


class MultitaskTrainer(transformers.Trainer):
    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Create a single-task data loader that also yields task names
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_sampler = (
            RandomSampler(train_dataset)
            if self.args.local_rank == -1
            else DistributedSampler(train_dataset)
        )

        data_loader = DataLoaderWithTaskname(
            task_name=task_name,
            data_loader=DataLoader(
                train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=train_sampler,
                collate_fn=self.data_collator,
            ),
        )
        return data_loader

    def get_train_dataloader(self):
        """
        Returns a MultitaskDataloader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each
        task Dataloader
        """
        return MultitaskDataloader(
            {
                task_name: self.get_single_train_dataloader(task_name, task_dataset)
                for task_name, task_dataset in self.train_dataset.items()
            }
        )
    
    
    def get_eval_dataloader(self,eval_dataset=None):
        """
        Returns a MultitaskDataloader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each
        task Dataloader
        """
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        dataloader =  MultitaskDataloader(
            {
                task_name: self.get_single_train_dataloader(task_name, task_dataset)
                for task_name, task_dataset in eval_dataset.items()
            }
        )
        dataloader.batch_size = self.args.eval_batch_size
        return dataloader

In [97]:
dataset_dict = {
    "author":  read_and_tokenize_data("data/authors_train.pkl","data/authors_test.pkl","data/authors_validation.pkl"),
    "sentiment":  read_and_tokenize_data("data/imdb_train.pkl","data/imdb_test.pkl","data/imdb_validation.pkl")
}


multitask_model = RobertaForSequenceClassification2.from_pretrained(
        "roberta-base",
        task_labels_map={"author": 4, "sentiment": 2},
    )


for i in dataset_dict.values():
    for j in i.values():
        j.set_format(type="torch",columns=["input_ids", "attention_mask", "label"])
    
    
train_dataset = {
        task_name: dataset["train"] for task_name, dataset in dataset_dict.items()}

eval_dataset = {
        task_name: dataset["evaluate"] for task_name, dataset in dataset_dict.items()}

test_dataset = {
        task_name: dataset["evaluate"] for task_name, dataset in dataset_dict.items()}



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\dovyd/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.15.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.

#### Training the model

In [92]:
training_args = TrainingArguments(
    output_dir='results',
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    num_train_epochs=15,
    evaluation_strategy="steps",
    eval_steps = 500,
    save_steps = 500,
    load_best_model_at_end = True
    per_device_eval_batch_size =  8
)


metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = MultitaskTrainer(
    model=multitask_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [93]:
trainer.train()

***** Running training *****
  Num examples = 40
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.

#### Saving the model

In [None]:
def save_model(trainer,tokenizer,trainer_path,tokenizer_path):
    trainer.save_model(trainer_path)
    tokenizer.save_pretrained(tokenizer_path)
    
save_model(trainer,tokenizer,"models/multihead_classification/model","models/multihead_classification/tokenizer")

#### Loading the model

In [None]:
def load_model(model_path,tokenizer_path):
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels = 4)
    tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
    trainer = Trainer(model = model)
    return (model, trainer, tokenizer)
    

model, trainer, tokenizer = load_model("models/multihead_classification/model","models/multihead_classification/tokenizer")

In [122]:
def make_predictions(test_data):
    y_true = []
    y_pred = []
    metric = load_metric("accuracy")
    for task in ["author", "sentiment"]:
            for i in test_data[task]:
                logits = multitask_model(input_ids = torch.unsqueeze(i["input_ids"],0),\
                                         attention_mask = torch.unsqueeze(i["attention_mask"],0),task_name=task)[0]
                y_true.append(i["label"])
                y_pred.append(np.argmax(logits.detach().numpy(), axis=-1))
            
            acc = metric.compute(predictions=y_pred, references=y_true)
            print(f"Task: {task}", f"Accuracy: {acc}")

In [123]:
make_predictions(test_dataset)

Task: author Accuracy: {'accuracy': 0.2}
Task: sentiment Accuracy: {'accuracy': 0.1}


In [120]:
def pipeline(text, task):
    inputs = tokenizer(text, return_tensors="pt")
    logits = multitask_model(**inputs, task_name=task)[0]
    probabilities = torch.FloatTensor(torch.softmax(logits, dim=1).detach().cpu().tolist())
    result = torch.argmax(
        probabilities
    )
    probability = torch.max(
        probabilities
    )
    print({"label":result, "score":probability})

In [121]:
pipeline("hello","author")

{'label': tensor(3), 'score': tensor(0.2630)}
