In [12]:
import numpy as np
import transformers
transformers.logging.set_verbosity_error()
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset, load_metric
from sklearn.metrics import confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

In [13]:
# Pretrained "roberta-base" tokenizer; used by tokenize_function and the padding
# collator defined in this cell, and saved alongside the model at the end of the notebook.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings
            (this function is applied through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    # Pad every row to the model's maximum length rather than to the longest row
    # of the current map chunk: ``padding=True`` yields different lengths for
    # different chunks, and the collator used for training stacks rows with
    # ``torch.stack``, which requires identical shapes.
    return tokenizer(examples["text"], truncation=True, padding="max_length")

def read_and_tokenize_data(train, test, evaluate):
    """Load the three splits of one task and tokenize them.

    Args:
        train: Path of the training-split file.
        test: Path of the test-split file.
        evaluate: Path of the validation-split file (stored under ``"evaluate"``).

    Returns:
        The loaded dataset dict with ``tokenize_function`` mapped over every split.
    """
    # NOTE(review): the "pandas" loader presumably unpickles these files (callers
    # pass .pkl paths) -- only point it at trusted data.
    splits = load_dataset(
        "pandas", data_files={"train": train, "test": test, "evaluate": evaluate}
    )
    # Drop the leftover pandas index column, but only where it actually exists so
    # that frames saved without an index do not make remove_columns fail.
    for split_name in list(splits):
        if "__index_level_0__" in splits[split_name].column_names:
            splits[split_name] = splits[split_name].remove_columns(
                column_names=["__index_level_0__"]
            )
    return splits.map(tokenize_function, batched=True)

# Padding collator built from the tokenizer above.
# NOTE(review): the visible cells never pass this object anywhere -- the trainer
# is constructed with NLPDataCollator() instead; confirm whether it is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
from transformers import RobertaPreTrainedModel, RobertaModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn as nn
from torch.nn import CrossEntropyLoss


class RobertaForSequenceClassification2(RobertaPreTrainedModel):
    """RoBERTa encoder shared by two task-specific linear classification heads.

    The heads are created from the ``task_labels_map`` keyword argument, a dict
    mapping task name -> number of classes (e.g. ``{"author": 4, "sentiment": 2}``).
    The first entry feeds ``classifier1`` and the second ``classifier2``; the
    ``task_name`` passed to :meth:`forward` selects which head is applied.
    """

    def __init__(self, config, **kwargs):
        """Build the encoder, dropout and the two heads.

        Args:
            config: Model configuration (must expose ``classifier_dropout``,
                ``hidden_dropout_prob`` and ``hidden_size``).
            **kwargs: Must contain ``task_labels_map`` with at least two entries.

        Raises:
            ValueError: If ``task_labels_map`` is missing or has fewer than two tasks.
        """
        # Initialise the base class with the real model config (not a blank
        # PretrainedConfig) so everything the base class derives from it matches
        # the loaded checkpoint. The base class also stores it as ``self.config``.
        super().__init__(config)

        # NOTE: despite its name (kept for backward compatibility) this attribute
        # is the task -> class-count dict, not an integer.
        self.num_labels = kwargs.get("task_labels_map", {})
        if len(self.num_labels) < 2:
            raise ValueError(
                "RobertaForSequenceClassification2 requires `task_labels_map` with "
                f"two tasks (task name -> number of labels); got {self.num_labels!r}."
            )
        head_sizes = list(self.num_labels.values())

        self.roberta = RobertaModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

        # Attribute names are kept so existing state dicts keep loading.
        self.classifier1 = nn.Linear(config.hidden_size, head_sizes[0])
        self.classifier2 = nn.Linear(config.hidden_size, head_sizes[1])

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_name=None
    ):
        """Encode the inputs and classify them with the head selected by ``task_name``.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states, return_dict:
                Forwarded unchanged to the underlying ``RobertaModel``.
            label: Optional class indices; when given, a cross-entropy loss is
                computed against the selected head's logits.
            task_name: Name of the task; must be one of the first two keys of
                ``task_labels_map``.

        Returns:
            A ``SequenceClassifierOutput`` (``loss``, ``logits``, ``hidden_states``,
            ``attentions``) or, when ``return_dict`` is false, the equivalent tuple
            with the loss prepended if it was computed.

        Raises:
            ValueError: If ``task_name`` does not name one of the two heads.
        """
        # Fail fast with a clear message instead of carrying ``logits=None``
        # through the encoder pass and crashing later on ``logits.view``.
        task_names = list(self.num_labels)[:2]
        if task_name not in task_names:
            raise ValueError(
                f"Unknown task_name {task_name!r}; expected one of {task_names}."
            )

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        # Second element of the encoder output (the pooled representation).
        pooled_output = self.dropout(outputs[1])

        if task_name == task_names[0]:
            logits = self.classifier1(pooled_output)
        else:
            logits = self.classifier2(pooled_output)

        loss = None
        if label is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.num_labels[task_name]), label.view(-1)
            )

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [15]:
import dataclasses
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import DataCollator, InputDataClass
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from typing import List, Union, Dict
import numpy as np
import torch
import transformers



class NLPDataCollator:
    """
    Collate a list of per-example features into a single batch.

    Dict features (the form produced by the NLP datasets used in this notebook)
    are merged key by key: ``"label"`` values become one label tensor and every
    other non-``None``, non-string value is stacked with ``torch.stack``.
    Any other feature type is delegated to ``transformers.default_data_collator``.
    """

    def __call__(
        self, features: "List[Union[InputDataClass, Dict]]"
    ) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: Non-empty list with one entry per example.

        Returns:
            Mapping from feature name to a batched tensor. ``"label"`` is
            ``torch.long`` when the first label is int64, otherwise ``torch.float``.
        """
        first = features[0]
        if not isinstance(first, dict):
            # Not a dict-style example: fall back to the library's default collator.
            return transformers.default_data_collator(features)

        # Start from an empty batch so unlabeled data (e.g. at inference time)
        # does not hit an unbound ``batch`` below.
        batch = {}
        if first.get("label") is not None:
            # as_tensor accepts both tensors and plain Python numbers.
            label_tensors = [torch.as_tensor(f["label"]) for f in features]
            target_dtype = (
                torch.long if label_tensors[0].dtype == torch.int64 else torch.float
            )
            batch["label"] = torch.stack(label_tensors).to(target_dtype)

        for k, v in first.items():
            if k != "label" and v is not None and not isinstance(v, str):
                batch[k] = torch.stack([f[k] for f in features])
        return batch


class StrIgnoreDevice(str):
    """
    A ``str`` that tolerates being "moved" to a device.

    This is a hack: the Trainer calls ``.to(device)`` on every input value, but
    the batch also carries the ``task_name`` string. Giving the string a no-op
    ``to`` prevents that call from raising.
    """

    def to(self, device=None, *args, **kwargs):
        """Ignore the target device (and any tensor-style extras) and return ``self``.

        Extra positional/keyword arguments such as ``non_blocking=True`` are
        accepted so callers that use the fuller ``Tensor.to`` signature also work.
        """
        return self


class DataLoaderWithTaskname:
    """
    Thin wrapper over a single-task data loader that stamps each batch it
    yields with the name of the task the batch belongs to.
    """

    def __init__(self, task_name, data_loader):
        self.task_name, self.data_loader = task_name, data_loader

        # Re-export the attributes of the wrapped loader that callers read.
        self.batch_size = data_loader.batch_size
        self.dataset = data_loader.dataset

    def __len__(self):
        # Same number of batches as the wrapped loader.
        return len(self.data_loader)

    def __iter__(self):
        wrapped_batches = iter(self.data_loader)
        for task_batch in wrapped_batches:
            # Tag the batch in place; the tag survives the Trainer's `.to(device)`.
            task_batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield task_batch


class MultitaskDataloader:
    """
    Iterable that interleaves batches drawn from several single-task
    data loaders.
    """

    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict

        # Number of batches each task contributes per pass.
        self.num_batches_dict = {}
        for name, loader in self.dataloader_dict.items():
            self.num_batches_dict[name] = len(loader)

        self.task_name_list = list(self.dataloader_dict)

        # Placeholder list whose only purpose is to have the combined length of
        # all task datasets.
        total_examples = sum(
            len(loader.dataset) for loader in self.dataloader_dict.values()
        )
        self.dataset = [None] * total_examples

    def __len__(self):
        # Total batches across every task.
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """
        Yield one batch at a time, each taken from a randomly chosen task.
        Every task index appears as many times as that task has batches
        (size-proportional sampling); the order is shuffled with numpy's
        global RNG. Swap the schedule construction to sample differently.
        """
        schedule = np.array(
            [
                task_index
                for task_index, name in enumerate(self.task_name_list)
                for _ in range(self.num_batches_dict[name])
            ]
        )
        np.random.shuffle(schedule)

        iterators = {}
        for name, loader in self.dataloader_dict.items():
            iterators[name] = iter(loader)

        for task_index in schedule:
            chosen_task = self.task_name_list[task_index]
            yield next(iterators[chosen_task])


class MultitaskTrainer(transformers.Trainer):
    """Trainer whose training loader mixes batches from several task datasets.

    ``train_dataset`` is expected to be a mapping of task name -> dataset.
    """

    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Build the loader for one task; every batch it yields carries the task name.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        # local_rank == -1 selects plain random sampling; any other value uses
        # the distributed sampler.
        if self.args.local_rank == -1:
            sampler = RandomSampler(train_dataset)
        else:
            sampler = DistributedSampler(train_dataset)

        torch_loader = DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
        )
        return DataLoaderWithTaskname(task_name=task_name, data_loader=torch_loader)

    def get_train_dataloader(self):
        """
        Return a MultitaskDataloader -- not a real DataLoader, but an iterable
        whose generator draws batches from the per-task loaders.
        """
        per_task_loaders = {}
        for task_name, task_dataset in self.train_dataset.items():
            per_task_loaders[task_name] = self.get_single_train_dataloader(
                task_name, task_dataset
            )
        return MultitaskDataloader(per_task_loaders)

In [16]:
# One tokenized dataset (train / test / evaluate splits) per task.
dataset_dict = {}
dataset_dict["author"] = read_and_tokenize_data(
    "data/authors_train.pkl",
    "data/authors_test.pkl",
    "data/authors_validation.pkl",
)
dataset_dict["sentiment"] = read_and_tokenize_data(
    "data/imdb_train.pkl",
    "data/imdb_test.pkl",
    "data/imdb_validation.pkl",
)

# Shared encoder with one head per task; the map gives the class count of each head.
multitask_model = RobertaForSequenceClassification2.from_pretrained(
    "roberta-base", task_labels_map={"author": 4, "sentiment": 2}
)

# Expose only the model inputs and the label, as torch tensors, on every split.
for task_splits in dataset_dict.values():
    for split in task_splits.values():
        split.set_format(
            type="torch", columns=["input_ids", "attention_mask", "label"]
        )
    
    
# Per-task views of each split, keyed by task name.
train_dataset = {
    task_name: dataset["train"] for task_name, dataset in dataset_dict.items()
}

eval_dataset = {
    task_name: dataset["evaluate"] for task_name, dataset in dataset_dict.items()
}

# Use the held-out "test" split here; this previously duplicated the "evaluate"
# split, so test-time numbers would have been computed on validation data.
test_dataset = {
    task_name: dataset["test"] for task_name, dataset in dataset_dict.items()
}

Using custom data configuration default-571dfb15c0701b97
Reusing dataset pandas (/scratch/lustre/home/doma6660/.cache/huggingface/datasets/pandas/default-571dfb15c0701b97/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /scratch/lustre/home/doma6660/.cache/huggingface/datasets/pandas/default-571dfb15c0701b97/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade/cache-5dbbbdef9f032114.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Using custom data configuration default-f45029a52432aaca
Reusing dataset pandas (/scratch/lustre/home/doma6660/.cache/huggingface/datasets/pandas/default-f45029a52432aaca/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
# Optimisation settings; everything not listed keeps the library default.
training_args = TrainingArguments(
    output_dir='results',   # checkpoints and logs go here
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
)


# Accuracy metric object; compute_metrics below calls its .compute().
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class of each row is the index of its largest logit along the
    last axis; the result is whatever ``metric.compute`` returns.
    """
    raw_scores, gold = eval_pred
    predicted_classes = np.asarray(raw_scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold)


# NOTE(review): neither `compute_metrics` nor `eval_dataset` is passed here, so
# this trainer is configured for training only -- confirm that is intended.
trainer = MultitaskTrainer(
    args=training_args,
    model=multitask_model,
    data_collator=NLPDataCollator(),
    train_dataset=train_dataset,  # dict: task name -> training split
)

In [None]:
# Run the training loop of the MultitaskTrainer configured in the previous cell.
trainer.train()

***** Running training *****
  Num examples = 5391
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13500


In [None]:
def save_model(trainer, tokenizer, trainer_path, tokenizer_path):
    """Persist the trained model and its tokenizer.

    Args:
        trainer: Object exposing ``save_model(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        trainer_path: Destination passed to ``trainer.save_model``.
        tokenizer_path: Destination passed to ``tokenizer.save_pretrained``.

    Returns:
        None.
    """
    # Model first, tokenizer second.
    write_model = trainer.save_model
    write_model(trainer_path)

    write_tokenizer = tokenizer.save_pretrained
    write_tokenizer(tokenizer_path)
    
# Write the trained multitask model and the tokenizer to sibling directories.
save_model(trainer,tokenizer,"models/multihead_classification/model","models/multihead_classification/tokenizer")