In [1]:
import dataclasses
import json
import logging
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
from transformers import PreTrainedTokenizer
import os

logger = logging.getLogger(__name__)


@dataclass
class KlueNliInputExample:
    """A single raw (untokenized) KLUE natural language inference example.

    Args:
        guid: Unique id for the example.
        text_a: The untokenized first sequence (the premise).
        text_b: The untokenized second sequence (the hypothesis). Required,
            because NLI is a sequence-pair task.
        label: The class name of the example as a string; it is mapped to an
            integer index later via the label list handed to the feature
            converter.
    """

    guid: str
    text_a: str
    text_b: str
    # The label is the string class name, not a number: it is used as a
    # dictionary key when examples are converted to features.
    label: str

    def to_dict(self):
        """Returns the example as a plain ``dict`` keyed by field name."""
        return dataclasses.asdict(self)

    def to_json_string(self):
        """Serializes this instance to an indented JSON string ending in a newline."""
        return json.dumps(self.to_dict(), indent=2) + "\n"


@dataclass(frozen=True)
class KlueNliInputFeatures:
    """Immutable, model-ready encoding of one example.

    Field names mirror the keyword arguments a model's forward pass accepts.

    Args:
        input_ids: Vocabulary indices of the input sequence tokens.
        attention_mask: Optional mask with values in ``[0, 1]``; usually ``1``
            marks real tokens and ``0`` marks padding that attention should
            skip.
        token_type_ids: Optional segment indices separating the first and
            second portions of the input. Only some models use them.
        label: Optional target for the input. Int for classification
            problems, float for regression problems.
    """

    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """Returns a compact single-line JSON rendering followed by a newline."""
        payload = dataclasses.asdict(self)
        return f"{json.dumps(payload)}\n"


class KlueNliDataset:
    """Map-style dataset over KLUE-NLI records, tokenized once at construction.

    Every record is converted to a ``KlueNliInputFeatures`` inside
    ``__init__``; ``__getitem__`` only wraps the stored lists in tensors.

    Attributes:
        labels: Class names; a label's position in this list is its integer id.
        data: The raw json-loaded records.
        tokenizer: Tokenizer used to encode the premise/hypothesis pairs.
        max_seq_length: Length every encoded pair is padded/truncated to.
        features: One ``KlueNliInputFeatures`` per record, in input order.
    """

    labels = ["entailment", "contradiction", "neutral"]

    def __init__(self, data: list, tokenizer: PreTrainedTokenizer, max_seq_length: int):
        """Builds the dataset and eagerly tokenizes every record.

        Args:
            data: json-loaded list of dicts; each dict must provide the keys
                ``guid``, ``premise``, ``hypothesis`` and ``gold_label``.
            tokenizer: Tokenizer applied to each (premise, hypothesis) pair.
            max_seq_length: Maximum (and padded) sequence length.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.features = self._convert_features(self._create_examples(self.data))

    def __len__(self):
        # Count what __getitem__ actually indexes, so the two can never disagree.
        return len(self.features)

    def __getitem__(self, idx):
        """Returns ``(input_ids, attention_mask, token_type_ids, label)`` tensors."""
        feature = self.features[idx]
        input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
        attn_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
        if feature.token_type_ids is None:
            # Fall back to an all-zero segment vector with the same length as
            # input_ids; a scalar 0 would collate to shape (batch,) and no
            # longer line up with the other per-token tensors.
            token_type_ids = torch.zeros_like(input_ids)
        else:
            token_type_ids = torch.tensor(feature.token_type_ids, dtype=torch.long)
        # NOTE(review): the class index is emitted as float. That only suits a
        # single-output regression head; a cross-entropy classifier would need
        # torch.long here -- confirm against the model configuration and
        # change both together.
        labels = torch.tensor(feature.label, dtype=torch.float)
        return (input_ids, attn_mask, token_type_ids, labels)

    def _create_examples(self, data):
        """Wraps each raw record of ``data`` in a ``KlueNliInputExample``."""
        return [
            KlueNliInputExample(
                guid=record["guid"],
                text_a=record["premise"],
                text_b=record["hypothesis"],
                label=record["gold_label"],
            )
            for record in data
        ]

    def _convert_features(
        self, examples: List[KlueNliInputExample]
    ) -> List[KlueNliInputFeatures]:
        """Tokenizes ``examples`` and maps their labels to indices of ``self.labels``."""
        return convert_examples_to_features(
            examples,
            self.tokenizer,
            max_length=self.max_seq_length,
            label_list=self.labels,
        )


def convert_examples_to_features(
    examples: List[KlueNliInputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    label_list=None,
):
    """Tokenizes sentence-pair examples and attaches integer label ids.

    Args:
        examples: Examples whose ``text_a``/``text_b`` are encoded as a pair.
        tokenizer: Callable tokenizer; invoked once on the whole batch with
            ``padding="max_length"`` and ``truncation=True``.
        max_length: Target sequence length. Defaults to
            ``tokenizer.model_max_length`` when ``None``.
        label_list: Ordered class names; a name's position becomes its id.

    Returns:
        A list with one ``KlueNliInputFeatures`` per example, in input order.
        An empty ``examples`` list yields an empty list without calling the
        tokenizer.

    Raises:
        ValueError: If ``label_list`` is not provided.
        KeyError: If an example carries a label that is not in ``label_list``.
    """
    if label_list is None:
        # Without the class names there is no way to turn string labels into ids.
        raise ValueError("label_list must be provided to map example labels to class indices")
    if not examples:
        return []
    if max_length is None:
        max_length = tokenizer.model_max_length

    label_map = {label: i for i, label in enumerate(label_list)}
    try:
        labels = [label_map[example.label] for example in examples]
    except KeyError as err:
        raise KeyError(
            f"Unknown label {err.args[0]!r}; expected one of {list(label_map)}"
        ) from err

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i, label in enumerate(labels):
        # Slice the i-th row out of every field the tokenizer returned.
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        features.append(KlueNliInputFeatures(**inputs, label=label))

    # Log a small sample for sanity checking; arguments are passed lazily so
    # nothing is formatted when INFO logging is disabled.
    for example, feature in zip(examples[:5], features):
        logger.info("*** Example ***")
        logger.info("guid: %s", example.guid)
        logger.info("features: %s", feature)

    return features


In [2]:
from torch.utils.data import DataLoader

class KlueNliDataLoader(object):
    """Factory that turns KLUE-NLI json files into datasets and ``DataLoader``s."""

    def __init__(self, tokenizer: PreTrainedTokenizer, max_length: Optional[int] = None):
        """Stores the tokenizer and the sequence length used for every dataset.

        Args:
            tokenizer: Tokenizer handed to each ``KlueNliDataset``. Its
                ``special_tokens_map`` must contain ``"sep_token"``.
            max_length: Padded/truncated sequence length. A falsy value
                (``None`` or ``0``) falls back to ``tokenizer.model_max_length``.
        """
        self.tokenizer = tokenizer
        self.sep = self.tokenizer.special_tokens_map["sep_token"]
        self.max_length = max_length if max_length else self.tokenizer.model_max_length

    def collate_fn(self, input_examples):
        """Stacks already-padded samples into a batch.

        The dataset pads every sample to the same length, so no custom padding
        is needed and torch's default collation is sufficient. Previously this
        was an empty stub that returned ``None``.
        """
        return torch.utils.data.dataloader.default_collate(input_examples)

    def get_dataloader(self, file_path, batch_size, shuffle=False, **kwargs):
        """Builds a ``DataLoader`` over the examples stored in ``file_path``.

        Args:
            file_path: Path of the json file to load.
            batch_size: Number of samples per batch.
            shuffle: Whether to reshuffle the data every epoch. Defaults to
                ``False``, matching the previous hard-coded behavior.
            **kwargs: Forwarded unchanged to ``DataLoader``.
        """
        dataset = self.get_dataset(file_path, batch_size)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)

    def get_dataset(self, file_path, batch_size=None, **kwargs):
        """Loads ``file_path`` and returns the tokenized ``KlueNliDataset``.

        ``batch_size`` and ``**kwargs`` are accepted only for signature
        compatibility with ``get_dataloader``; they are not used.
        """
        data = read_json(file_path)
        return KlueNliDataset(data, self.tokenizer, self.max_length)

def read_json(file_path):
    """Loads and returns the parsed JSON content of ``file_path``.

    The file is decoded as UTF-8 explicitly so that non-ASCII text is read the
    same way regardless of the platform's default locale encoding.
    """
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

# Single source of truth for the checkpoint so tokenizer, config and weights
# cannot drift apart.
MODEL_NAME = "klue/roberta-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# labels = ["entailment", "contradiction", "neutral"]
# NOTE(review): num_labels=1 builds a single-output head, which the library
# treats as regression. KLUE-NLI has three classes (see KlueNliDataset.labels),
# so this presumably should be num_labels=3. That switch must be made together
# with the dataset emitting integer (torch.long) labels instead of float --
# changing only one side breaks the loss computation.
config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=1)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classif

In [4]:
# Factory for the train/dev loaders; every pair is padded/truncated to 128 tokens.
klue_nli_dataloader = KlueNliDataLoader(tokenizer=tokenizer, max_length=128)

In [6]:
# Extra DataLoader options: a worker process and pinned host memory are only
# requested when a CUDA device is present.
if torch.cuda.is_available():
    kwargs = {"num_workers": 1, "pin_memory": True}
else:
    kwargs = {}

# Build the train loader first, then the dev loader, with identical settings.
train_dataloader, eval_dataloader = [
    klue_nli_dataloader.get_dataloader(
        file_path=os.path.join('klue-nli-v1.1', filename),
        batch_size=8,
        **kwargs,
    )
    for filename in ('klue-nli-v1.1_train.json', 'klue-nli-v1.1_dev.json')
]

In [7]:
from easydict import EasyDict

# Run configuration; values are read by attribute (args.learning_rate, ...).
args = EasyDict({
    "data_dir" : "./klue-nli-v1.1",
    "output_dir": "./output",
    "train_filename" : "klue-nli-v1.1_train.json",
    "valid_filename" : "klue-nli-v1.1_dev.json",
    "max_seq_length":128,
    "learning_rate": 2e-5,
    "save_total_limit":2,
    "gradient_accumulation_steps":1,
    "weight_decay" : 0,
    "evaluation_strategy" : "steps",
    "save_strategy" : "steps",
    "logging_strategy" : "steps",
    "evaluation_steps" : 100,
    "save_steps" : 100,
    "logging_steps" : 100,
    "num_train_epochs":1,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # Filled in by the training cell from the epoch count and loader length.
    "max_train_steps": None,
    # The training loop reads this flag when deciding whether to save a
    # checkpoint; without the key the attribute access raises AttributeError.
    "push_to_hub": False,
})

In [10]:
from transformers import AdamW, get_scheduler
from accelerate import Accelerator
from datasets import load_metric
from tqdm import tqdm
import math

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

metric = load_metric("accuracy")
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

# Derive the step budget *before* logging it; previously the log line below
# always printed the placeholder value None.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

logger.info("***** Running training *****")
# len(train_dataloader) is the number of batches; the example count lives on the dataset.
logger.info(f"  Num examples = {len(train_dataloader.dataset)}")
logger.info(f"  Num Epochs = {args.num_train_epochs}")
logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f"  Total optimization steps = {args.max_train_steps}")

# No learning-rate scheduler is attached: the optimizer runs at a constant
# args.learning_rate for the whole run.

progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
last_step = len(train_dataloader) - 1
# Highest valid class id, used to decode a single-output head (see below).
max_class_id = len(KlueNliDataset.labels) - 1

for epoch in range(args.num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Each batch is (input_ids, attention_mask, token_type_ids, labels).
        input_ids, attention_mask, token_type_ids, labels = batch
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        # Scale so that the accumulated gradient equals the mean over micro-batches.
        loss = outputs.loss / args.gradient_accumulation_steps
        accelerator.backward(loss)
        # Update once every `gradient_accumulation_steps` micro-batches, counted
        # from 1 -- `step % N == 0` would fire on the very first micro-batch --
        # and once more on the final, possibly partial, group.
        if (step + 1) % args.gradient_accumulation_steps == 0 or step == last_step:
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if completed_steps >= args.max_train_steps:
            break

    model.eval()
    for batch in eval_dataloader:
        input_ids, attention_mask, token_type_ids, labels = batch
        # Gradients are not needed for evaluation.
        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
        logits = outputs.logits
        # The accuracy metric compares integer class ids, so decode the logits:
        # a single-output head predicts the class index as a real number
        # (round and clamp into range); a multi-output head uses the arg-max.
        if logits.shape[-1] == 1:
            predictions = logits.squeeze(-1).round().clamp(0, max_class_id).long()
        else:
            predictions = logits.argmax(dim=-1)
        metric.add_batch(
            predictions=accelerator.gather(predictions),
            # Batches are positional sequences, not dicts: labels are element 3.
            references=accelerator.gather(labels.long()),
        )

    eval_metric = metric.compute()
    logger.info(f"epoch {epoch}: {eval_metric}")

    # `.get` keeps the loop working when the config has no push_to_hub entry.
    # NOTE(review): with this guard nothing is saved after the final epoch --
    # confirm whether a final checkpoint is wanted.
    if args.get("push_to_hub", False) and epoch < args.num_train_epochs - 1:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)


  0%|          | 0/3125 [00:07<?, ?it/s]
  0%|          | 1/3125 [00:00<07:16,  7.15it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  1%|          | 17/3125 [00:01<04:41, 11.04it/s]

KeyboardInterrupt: 