In [1]:
import dataclasses
import json
import logging
from dataclasses import dataclass
from typing import List, Optional, Union

import torch
from transformers import PreTrainedTokenizer

logger = logging.getLogger(__name__)


@dataclass
class KlueNliInputExample:
    """A single training/test example for klue natural language inference.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: The label of the example.
    """

    guid: str
    text_a: str
    text_b: str
    label: float

    def to_dict(self):
        return dataclasses.asdict(self)

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2) + "\n"


@dataclass(frozen=True)
class KlueNliInputFeatures:
    """A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    """

    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(dataclasses.asdict(self)) + "\n"


class KlueNliDataset:
    labels = ["entailment", "contradiction", "neutral"]

    def __init__(self, data: list, tokenizer: PreTrainedTokenizer, max_seq_length: int):
        """Dataset for KlueStsDataset

        Args:
            data: json-loaded list
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.features = self._convert_features(self._create_examples(self.data))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        feature = self.features[idx]
        input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
        attn_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
        token_type_ids = torch.tensor(
            0 if feature.token_type_ids is None else feature.token_type_ids,
            dtype=torch.long,
        )
        labels = torch.tensor(feature.label, dtype=torch.float)
        return (input_ids, attn_mask, token_type_ids, labels)

    def _create_examples(self, data):
        examples = [
            KlueNliInputExample(
                guid=d["guid"],
                text_a=d["premise"],
                text_b=d["hypothesis"],
                label=d["gold_label"],
            )
            for d in self.data
        ]
        return examples

    def _convert_features(
        self, examples: List[KlueNliInputExample]
    ) -> List[KlueNliInputFeatures]:
        return convert_examples_to_features(
            examples,
            self.tokenizer,
            max_length=self.max_seq_length,
            label_list=self.labels,
        )


def convert_examples_to_features(
    examples: List[KlueNliInputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    label_list=None,
):
    if max_length is None:
        max_length = tokenizer.model_max_length

    label_map = {label: i for i, label in enumerate(label_list)}
    labels = [label_map[example.label] for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}

        feature = KlueNliInputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features


In [10]:
from torch.utils.data import DataLoader

class KlueNliDataLoader(object):
    def __init__(self, tokenizer: PreTrainedTokenizer, max_length: int = None):
        self.tokenizer = tokenizer
        self.sep = self.tokenizer.special_tokens_map["sep_token"]
        self.max_length = max_length if max_length else self.tokenizer.model_max_length

    def collate_fn(self, input_examples):
        """KlueNliFeature padded all input up to max_seq_length"""
        pass

    def get_dataloader(self, file_path, batch_size, **kwargs):
        data = read_json(file_path)
        dataset = KlueNliDataset(data, self.tokenizer, self.max_length)
        return DataLoader(dataset, batch_size=batch_size, shuffle=False, **kwargs)

    def get_dataset(self, file_path, batch_size, **kwargs):
        data = read_json(file_path)
        dataset = KlueNliDataset(data, self.tokenizer, self.max_length)
        return dataset

def read_json(file_path):
    with open(file_path) as f:
        return json.load(f)

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("klue/roberta-small")
config = AutoConfig.from_pretrained("klue/roberta-small", num_labels=1)
model = AutoModelForSequenceClassification.from_pretrained("klue/roberta-small", config=config)

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'cl

In [11]:
import os
klue_nli_dataloader = KlueNliDataLoader(tokenizer, 510)
kwargs = (
    {"num_workers": 1, "pin_memory": True}
    if torch.cuda.is_available()
    else {}
)
train_dataset = klue_nli_dataloader.get_dataset(
    file_path=os.path.join('klue-nli-v1.1', 'klue-nli-v1.1_train.json'),
    batch_size=4,
    **kwargs,
)
eval_dataset = klue_nli_dataloader.get_dataset(
    file_path=os.path.join('klue-nli-v1.1', 'klue-nli-v1.1_dev.json'),
    batch_size=4,
    **kwargs,
)
# klue_nli_train_loader = klue_nli_dataloader.get_dataloader(
#     file_path=os.path.join('klue-nli-v1.1', 'klue-nli-v1.1_train.json'),
#     batch_size=4,
#     **kwargs,
# )
# klue_nli_dev_loader = klue_nli_dataloader.get_dataloader(
#     file_path=os.path.join('klue-nli-v1.1', 'klue-nli-v1.1_dev.json'),
#     batch_size=4,
#     **kwargs,
# )

In [12]:
from datasets import load_metric
from easydict import EasyDict
from transformers import TrainingArguments, Trainer

args = EasyDict({
    "data_dir" : "./klue-nli-v1.1",
    "model_dir": "./model",
    "output_dir": "./output",
    "train_filename" : "klue-nli-v1.1_train.json",
    "valid_filename" : "klue-nli-v1.1_dev.json",
    "max_seq_length":512,
    "learning_rate": 2e-5,
    "save_total_limit":2,
    # "gradient_accumulation_steps":1,
    # "weight_decay" : 0.01,
    "evaluation_strategy" : "epoch",
    "save_strategy" : "epoch",
    "logging_strategy" : "epoch",
    "num_train_epochs":5,
    "batch_size": 4,
})

accuracy = load_metric("accuracy").compute
def compute_metrics(pred):
    references = pred.label_ids
    predictions = pred.predictions
    metric = accuracy(predictions=predictions, references=references)
    return metric

training_args = TrainingArguments(
    output_dir= args.model_dir,
    save_total_limit=args.save_total_limit,
    save_strategy=args.save_strategy,
    evaluation_strategy= args.evaluation_strategy,
    logging_strategy= args.logging_strategy,
    num_train_epochs=args.num_train_epochs,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=args.batch_size,
    logging_dir='./logs',
    metric_for_best_model = 'accuracy',
    fp16=True,
    fp16_opt_level='O1',
    load_best_model_at_end = True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using amp fp16 backend


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
os.environ["WANDB_DISABLED"] = "true"

trainer.train()

Error in callback <function _WandbInit._resume_backend at 0x7f61f47061f0> (for pre_run_cell):


Exception: The wandb backend process has shutdown

***** Running training *****
  Num examples = 24998
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 31250


TypeError: vars() argument must have __dict__ attribute

Error in callback <function _WandbInit._pause_backend at 0x7f61f4706550> (for post_run_cell):


Exception: The wandb backend process has shutdown