In [1]:
%pip install transformers datasets evaluate accelerate scikit-learn sacrebleu sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Python 내장 라이브러리로, 무작위(randomness)를 생성하기 위해 사용
import random
# HuggingFace에서 제공하며, NLP 작업의 평가 지표를 계산하는 데 사용
import evaluate
# 다차원 배열 객체와 이를 처리하기 위한 다양한 수학적 기능을 제공, 주로 수치 계산이나 데이터 분석에 사용
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer

In [3]:
# Download the MNLI task of the GLUE benchmark.
dataset = load_dataset("nyu-mll/glue", "mnli")
# Hold out 20% of the official train split for validation during training.
# The fixed seed makes the shuffle deterministic, so the same rows land in
# train/validation after every kernel restart and runs stay comparable.
dataset_split = dataset["train"].train_test_split(test_size=0.2, seed=42)


def load_train_data() -> list:
    """Return the ``"train"`` part of ``dataset_split`` as a list of dicts.

    Each dict keeps only the ``premise``, ``hypothesis`` and ``label`` fields
    of the corresponding row, in that order.
    """
    wanted = ("premise", "hypothesis", "label")
    return [{key: example[key] for key in wanted} for example in dataset_split["train"]]

def load_valid_data() -> list:
    """Return the ``"test"`` part of ``dataset_split`` as a list of dicts.

    Each dict keeps only the ``premise``, ``hypothesis`` and ``label`` fields
    of the corresponding row, in that order.
    """
    wanted = ("premise", "hypothesis", "label")
    return [{key: example[key] for key in wanted} for example in dataset_split["test"]]

def load_eval_data() -> list:
    """Return the ``"validation_matched"`` split of ``dataset`` as a list of dicts.

    Each dict keeps only the ``premise``, ``hypothesis`` and ``label`` fields
    of the corresponding row, in that order.
    """
    wanted = ("premise", "hypothesis", "label")
    return [{key: example[key] for key in wanted} for example in dataset["validation_matched"]]

In [4]:
# Peek at the first raw row of the official train split to see its fields.
dataset["train"][0]

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1,
 'idx': 0}

In [5]:
# Peek at the first raw row of the matched validation split (used later as eval data).
dataset["validation_matched"][0]

{'premise': 'The new rights are nice enough',
 'hypothesis': 'Everyone really likes the newest benefits ',
 'label': 1,
 'idx': 0}

In [19]:
import torch
from datasets import Dataset

# Tokenizer for the "distilbert-base-uncased" checkpoint, configured with a
# maximum model input length of 128.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    model_max_length=128,
)

def preprocess_data(data):
    """Tokenize premise/hypothesis pairs and attach their labels.

    Args:
        data: Mapping with ``"premise"``, ``"hypothesis"`` and ``"label"``
            entries (it is applied via ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with an extra ``"labels"`` entry copied from
        ``data["label"]``.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors=None,
    )
    encoded = tokenizer(data["premise"], data["hypothesis"], **tokenizer_options)
    encoded['labels'] = data['label']
    return encoded

# Materialize each split as a `datasets.Dataset` (train, validation, eval in that order).
train_data, valid_data, eval_data = (
    Dataset.from_list(loader())
    for loader in (load_train_data, load_valid_data, load_eval_data)
)

# Tokenize every split in batches and drop the raw columns, leaving only the
# fields produced by `preprocess_data`.
train_dataset, valid_dataset, eval_dataset = (
    split.map(preprocess_data, batched=True, remove_columns=split.column_names)
    for split in (train_data, valid_data, eval_data)
)

Map:   0%|          | 0/314161 [00:00<?, ? examples/s]

Map:   0%|          | 0/78541 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

In [14]:
# Inspect the first preprocessed training example.
train_dataset[0]

{'premise': 'i think the better sight seeing is in the uh British Virgins',
 'hypothesis': 'I have only ever been to the British Virgins.',
 'label': 1,
 'input_ids': [101,
  1045,
  2228,
  1996,
  2488,
  4356,
  3773,
  2003,
  1999,
  1996,
  7910,
  2329,
  6261,
  2015,
  102,
  1045,
  2031,
  2069,
  2412,
  2042,
  2000,
  1996,
  2329,
  6261,
  2015,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [15]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate, log and checkpoint once per epoch so the
# best checkpoint can be restored at the end of training.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_strategy="epoch",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Fine-tuning a pretrained checkpoint needs a small step size; the previous
    # 1e-3 suits a tiny from-scratch model and tends to destroy pretrained
    # weights. 2e-5 is within the range commonly used for BERT-style models.
    learning_rate=2e-5,
    load_best_model_at_end=True,
    # Pick the best checkpoint (and drive early stopping) by the accuracy that
    # compute_metrics reports, instead of the default eval loss.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_dir="./logs",
)

In [9]:
import evaluate

# Load the "accuracy" metric from the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: A pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        compared against ``labels``.
    """
    scores, references = pred
    predicted_classes = np.argmax(scores, axis=1)  # highest-scoring class per row
    return accuracy.compute(predictions=predicted_classes, references=references)

In [20]:
from transformers import AutoModelForSequenceClassification, DistilBertConfig

# Alternative kept for reference: build a small DistilBERT from a custom
# config instead of loading the pretrained checkpoint.
# config = DistilBertConfig()

# config.hidden_size = 64  # base hidden dimension of the BERT layers
# config.intermediate_size = 64  # intermediate hidden dimension of the FFN layers
# config.num_hidden_layers = 2  # number of BERT layers
# config.num_attention_heads = 4  # number of heads used in multi-head attention
# config.num_labels = 3  # number of classes the final classifier must predict

# model = AutoModelForSequenceClassification.from_config(config)

# Load the pretrained "distilbert-base-uncased" checkpoint with a 3-class
# sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import EarlyStoppingCallback

# Assemble the Trainer: the held-out 20% split is used for per-epoch
# evaluation, and training stops early after 3 evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [22]:
# Run fine-tuning with the configuration assembled in `trainer`.
trainer.train()

  0%|          | 0/392710 [00:00<?, ?it/s]

In [None]:
# Score the trained model on the tokenized "validation_matched" split, which was not used during training.
trainer.evaluate(eval_dataset)

In [None]:
# Persist the fine-tuned model to ./model.
trainer.save_model("./model")
# The Trainer was not given the tokenizer, so save it explicitly: this keeps
# ./model loadable on its own, including the model_max_length=128 setting.
tokenizer.save_pretrained("./model")