Setup

In [None]:
# setup|
!pip install transformers
!pip install datasets

In [None]:
import gc
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from datasets import load_metric
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding, 
    EarlyStoppingCallback,
    AdamW,
    get_cosine_schedule_with_warmup

)

In [None]:
cd drive/MyDrive

Prepare

In [None]:
# Source: https://dacon.io/en/competitions/official/235875/codeshare/4441?page=1&dtype=recent
def get_basic_example_fn(tokenizer, src_cols=[], tar_col='label', label_fn=None, max_src_len=256, max_tar_len=256, truncation=True, padding="max_length"):
    """Build a batched preprocessing function that tokenizes text columns and attaches labels.

    Args:
        tokenizer: Callable invoked as ``tokenizer(*columns, padding=..., max_length=...,
            truncation=...)``; its return value must support item assignment.
        src_cols: Names of the input columns, in the order they are passed to the tokenizer.
            Columns missing from a batch are skipped. The default list is never mutated.
        tar_col: Name of the label column. When it is absent from a batch, no ``"labels"``
            entry is added.
        label_fn: Optional callable applied to every raw label; raw labels are used as-is
            when it is falsy.
        max_src_len: Value forwarded to the tokenizer as ``max_length``.
        max_tar_len: Accepted for interface compatibility; not used by this function.
        truncation: Value forwarded to the tokenizer as ``truncation``.
        padding: Value forwarded to the tokenizer as ``padding``.

    Returns:
        A function mapping a batch (column name -> list of values) to the tokenizer output,
        extended with a ``"labels"`` entry when ``tar_col`` is present.
    """

    def example_fn(examples):
        # Keep only the requested columns that exist in this batch, preserving `src_cols` order.
        present_columns = [examples[col] for col in src_cols if col in examples]
        output = tokenizer(*present_columns,
                           padding=padding,
                           max_length=max_src_len,
                           # Honour the caller's setting; this used to be hard-coded to True.
                           truncation=truncation)
        if tar_col in examples:
            raw_labels = examples[tar_col]
            output["labels"] = [label_fn(c) for c in raw_labels] if label_fn else raw_labels
        return output

    return example_fn

# Metric object loaded with the 'glue' / 'mnli' configuration; `metric()` calls its `compute` method.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases; confirm the installed
# version still provides it.
metric_fn = load_metric('glue', 'mnli')

def metric(p):
    """Reduce raw model outputs to label predictions and score them with `metric_fn`.

    Args:
        p: A pair that unpacks into ``(preds, labels)``.

    Returns:
        The result of ``metric_fn.compute(predictions=preds, references=labels)``.

    Tuple or list predictions are forwarded unchanged. Otherwise a two-dimensional
    array with a single column is flattened to that column, and predictions with
    exactly one more dimension than the labels are reduced with argmax over the
    last axis.
    """
    preds, labels = p
    if not isinstance(preds, (tuple, list)):
        pred_shape = preds.shape
        if len(pred_shape) == 2 and pred_shape[1] == 1:
            # Single output column: drop the trailing axis.
            preds = preds[:, 0]
        elif len(pred_shape) - len(labels.shape) == 1:
            # One extra axis compared with the labels: pick the highest-scoring index.
            preds = np.argmax(preds, axis=-1)
    return metric_fn.compute(predictions=preds, references=labels)


# back-translation function for data augmentation.
def backtranslate_text(text):
    """Back-translate text through English and into Korean for data augmentation.

    Args:
        text: A single ``str``/``bytes`` value, or an iterable of them. ``bytes``
            values are decoded as UTF-8.

    Returns:
        The back-translated Korean text: a single string when `text` was a single
        string/bytes value, otherwise a list with one entry per input element.
        An empty iterable returns ``[]`` without contacting the translation service.

    Side effects:
        Prints ``"<input> - > <output>"`` and calls the Google Cloud Translation
        client twice (target language "en", then "ko").
    """
    # Imported lazily so the rest of the notebook does not require google-cloud-translate.
    from google.cloud import translate_v2 as translate

    # The client returns a dict for a scalar input and a list of dicts for a list
    # input. Always sending a list keeps the response shape uniform; the previous
    # code iterated over the scalar-case dict and failed.
    is_single = isinstance(text, (str, bytes))
    items = [text] if is_single else list(text)
    items = [t.decode("utf-8") if isinstance(t, bytes) else t for t in items]
    if not items:
        return []

    translate_client = translate.Client()
    # NOTE(review): the client is called without `format_`; translated text may come back
    # HTML-escaped. Confirm whether `format_="text"` is wanted here.
    eng = [t["translatedText"] for t in translate_client.translate(items, target_language="en")]
    kor = [t["translatedText"] for t in translate_client.translate(eng, target_language="ko")]

    result = kor[0] if is_single else kor
    print("{} - > {}".format(items[0] if is_single else text, result))

    return result

Back translation examples

In [None]:
# The `*_gen` columns hold the sentences generated through back-translation.
aug_df = pd.read_csv("dacon/knlu/data/klue-xnli-hypo_premise_gen_include.csv")
# Show a short preview with the notebook's rich table display instead of printing the frame as text.
aug_df.head()

Training arguments

In [None]:
# Data and checkpoint locations, relative to the current working directory.
TRAIN = "dacon/knlu/data/klue_xnli_aug_multi_snli.csv"
# EVAL = 'dacon/knlu/data/full_test_data.csv'
TEST = "dacon/knlu/data/test_data.csv"
CKPT = "dacon/knlu/runs/"
# MODEL = 'klue/roberta-large' # 1st train
MODEL = 'klue/roberta-large-full-data-trained' # 2nd train

MAX_LEN = 256
TRAIN_BATCH = 8
EVAL_BATCH = 16
EVAL_FRACTION = 0.03  # share of the training file held out for evaluation
SEED = 42  # fixes the train/eval split so runs are comparable

# String label -> class id used for training; the inverse mapping is applied at submission time.
map_dict = {'contradiction': 0, 'neutral': 1, 'entailment': 2}
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Tokenizes (premise, hypothesis) pairs and converts string labels to class ids.
example_fn = get_basic_example_fn(
    tokenizer,
    src_cols=['premise', 'hypothesis'],
    tar_col='label',
    label_fn=lambda x: map_dict.get(x),
    max_src_len=MAX_LEN,
    max_tar_len=MAX_LEN,
    truncation=True, padding="max_length"
    )

# Each stage gets its own name so re-running part of the cell never sees a half-transformed `dataset`.
raw_dataset = load_dataset("csv", data_files={"train": TRAIN})
tokenized_train = raw_dataset['train'].map(
    example_fn,
    remove_columns=['index', 'premise', 'hypothesis', 'label'],
    batched=True,
)
# The split was previously unseeded, so the evaluation set (and therefore early stopping and
# best-checkpoint selection) changed on every run.
dataset = tokenized_train.train_test_split(test_size=EVAL_FRACTION, seed=SEED)
train_data, eval_data = dataset['train'], dataset['test']

model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=len(map_dict))

Layer-wise learning rate decay

In [None]:
# Layer-wise learning-rate decay: the encoder layers are split into consecutive groups of four.
# The top group and the classifier train at `learning_rate`; each group below it trains at the
# rate of the group above multiplied by `mu`. Encoder parameters that belong to none of the
# listed layers are placed in the lowest group.
learning_rate = 2e-5
mu = 0.95

LAYERS_PER_GROUP = 4
NUM_LAYER_GROUPS = 6  # covers layer.0. through layer.23.
# NOTE(review): presumably tied to the dataset size and the batch/accumulation/epoch settings of
# the training cell; confirm and update them together if any of those change.
WARMUP_STEPS = 230
TOTAL_TRAIN_STEPS = 2290

# Name fragments identifying the layers of each group; the trailing dot keeps
# 'layer.1.' from matching 'layer.10.' and similar.
layer_groups = [
    ['layer.{}.'.format(i) for i in range(first, first + LAYERS_PER_GROUP)]
    for first in range(0, NUM_LAYER_GROUPS * LAYERS_PER_GROUP, LAYERS_PER_GROUP)
]
all_layer_tags = [tag for tags in layer_groups for tag in tags]

# Built by repeated multiplication from the top group downwards so the values are exactly
# learning_rate, learning_rate*mu, learning_rate*mu*mu, ...; then reversed so that index 0
# is the lowest (most decayed) group.
group_lrs = [learning_rate]
for _ in range(NUM_LAYER_GROUPS - 1):
    group_lrs.append(group_lrs[-1] * mu)
group_lrs.reverse()

# Materialize once instead of walking the module tree for every group.
encoder_named_params = list(model.roberta.named_parameters())

optimizer_parameters = []
for group_idx, (tags, group_lr) in enumerate(zip(layer_groups, group_lrs)):
    group_params = [
        p for n, p in encoder_named_params
        if any(tag in n for tag in tags)
        # The lowest group also takes every encoder parameter outside the listed layers.
        or (group_idx == 0 and not any(tag in n for tag in all_layer_tags))
    ]
    optimizer_parameters.append({'params': group_params, 'lr': group_lr})
optimizer_parameters.append({'params': list(model.classifier.parameters()), 'lr': learning_rate})

optimizer = AdamW(optimizer_parameters, lr=learning_rate, weight_decay=0.01, correct_bias=False)
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=TOTAL_TRAIN_STEPS,
)



Train

In [None]:
# Logging, evaluation and checkpointing share one cadence, so every saved checkpoint has a
# matching evaluation result.
step_interval = 230

training_arguments = TrainingArguments(
    output_dir="runs/roberta_large_hypopreaug_layerlr/",
    num_train_epochs=5,
    # Batching
    per_device_train_batch_size=TRAIN_BATCH,
    per_device_eval_batch_size=EVAL_BATCH,
    gradient_accumulation_steps=128,
    # Loss
    label_smoothing_factor=0.025,
    # Logging / evaluation / checkpoint cadence
    logging_strategy='steps',
    logging_steps=step_interval,
    evaluation_strategy='steps',
    eval_steps=step_interval,
    save_strategy='steps',
    save_steps=step_interval,
    save_total_limit=3,
    # Best-model selection
    metric_for_best_model='accuracy',
    greater_is_better=True,
    load_best_model_at_end=True,
)

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=metric,
    callbacks=[early_stopping],
    # Use the layer-wise optimizer and cosine schedule built earlier instead of the Trainer defaults.
    optimizers=(optimizer, lr_scheduler),
)

# Run garbage collection and release cached CUDA memory before training starts.
gc.collect()
torch.cuda.empty_cache()

# Training.
trainer.train()

Inference

In [None]:
# inference
# Checkpoint to load for prediction.
# NOTE(review): the step number is hard-coded; confirm it is the checkpoint you intend to submit.
BEST_CHECKPOINT = 'runs/roberta_large_hypopreaug_layerlr/checkpoint-690'
model = AutoModelForSequenceClassification.from_pretrained(BEST_CHECKPOINT)

# arguments for Trainer
test_args = TrainingArguments(
    output_dir = "runs/roberta_large_fdtrain_hypopreaug/",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = EVAL_BATCH,
    dataloader_drop_last = False
)

# No EarlyStoppingCallback here: this Trainer is only used for prediction, and `test_args`
# does not configure the best-model tracking that the callback relies on.
trainer = Trainer(
    model=model,
    args=test_args,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=metric,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
)

# Prepare the test data.
raw_test = load_dataset("csv", data_files={"test": TEST})
# The "label" column is dropped before tokenization, so no "labels" entry is produced;
# 'index' is kept for the submission file.
test_data = raw_test['test'].remove_columns("label").map(example_fn, batched=True, remove_columns=['premise', 'hypothesis'])

# Predict the test outputs.
outputs = trainer.predict(test_data)

# Transform the test outputs for the submission.
# Build the inverse mapping once; it was previously rebuilt for every row.
id2label = {class_id: name for name, class_id in map_dict.items()}
predicted_ids = np.argmax(outputs.predictions, axis=-1).tolist()
df_sub = pd.DataFrame({
    "index": test_data['index'],
    "label": [id2label[class_id] for class_id in predicted_ids],
})
df_sub.to_csv("submission-roberta-large-final.csv", index=False)
