<a href="https://colab.research.google.com/github/KwonDuHyeon/hanghae99/blob/main/4%EC%A3%BC%EC%B0%A8%EA%B8%B0%EB%B3%B8%EA%B3%BC%EC%A0%9C(%EA%B6%8C%EB%91%90%ED%98%84).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 4주차 기본과제(권두현) HuggingFace로 두 문장의 논리적 모순 분류하기

In [None]:
# Third-party dependencies of this notebook; uncomment the next line on a fresh runtime (e.g. Colab).
#!pip install transformers datasets evaluate accelerate scikit-learn

## Import 부분

In [None]:
import random
import evaluate
import numpy as np

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## [MY CODE] MNLI Dataset Load

In [None]:
# Load the "mnli" configuration of the GLUE benchmark; the resulting DatasetDict
# is left as the last expression so the notebook displays its splits.
mnli = load_dataset(path="nyu-mll/glue", name="mnli")
mnli

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

## [MY CODE] 데이터셋 확인

In [None]:
# Show the first training record (premise, hypothesis, label, idx).
first_train_example = mnli["train"][0]
first_train_example

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1,
 'idx': 0}

In [None]:
# Number of examples in the training split.
mnli["train"].num_rows

392702

In [None]:
# The label column is a ClassLabel with three classes.
label_feature = mnli["train"].features["label"]
print(label_feature)

ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)


## [MY CODE] Tokenizer 호출 수정

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def preprocess_function(data):
    """Tokenize a batch of MNLI examples as (premise, hypothesis) sentence pairs.

    `data` is a batch from `Dataset.map(..., batched=True)`; its "premise" and
    "hypothesis" columns are passed to the tokenizer as the first and second
    sequence, with truncation enabled.
    """
    premises = data["premise"]
    hypotheses = data["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)


# Apply the tokenizer to every split of the DatasetDict in batches.
mnli_tokenized = mnli.map(preprocess_function, batched=True)

Map: 100%|████████████████████████████████████████████████████| 9847/9847 [00:00<00:00, 50866.88 examples/s]


## [MY CODE] KEY값 확인

In [None]:
# Confirm which columns a tokenized training example carries.
first_tokenized_example = mnli_tokenized["train"][0]
first_tokenized_example.keys()

dict_keys(['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])

## [MY CODE] Train, val, test split 진행 및 길이 확인

In [None]:
# Hold out 20% of the tokenized MNLI training split for validation; the whole
# training split is used (nothing is subsampled). The fixed seed makes the
# shuffle deterministic so the split is the same after a kernel restart.
mnli_split = mnli_tokenized["train"].train_test_split(test_size=0.2, seed=42)
mnli_train, mnli_val = mnli_split["train"], mnli_split["test"]


In [None]:
# Sizes of the training and validation subsets, in that order.
mnli_train.num_rows, mnli_val.num_rows

(314161, 78541)

## [MY CODE] MODEL 부분

In [None]:
from transformers import BertConfig

# Class names in label-id order, taken from the dataset's ClassLabel feature.
# Storing them in the config makes the saved model report readable class names
# at inference time instead of the generic LABEL_0 / LABEL_1 / LABEL_2.
label_names = mnli["train"].features["label"].names

config = BertConfig(
    hidden_size=64,  # hidden dimension of each BERT layer
    intermediate_size=64,  # inner hidden dimension of the feed-forward (FFN) layer
    num_hidden_layers=2,  # number of BERT layers
    num_attention_heads=4,  # number of heads in multi-head attention
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
)

# The number of output classes is derived from id2label; MNLI has three.
assert config.num_labels == 3

# Build a small, randomly initialised BERT classifier from the config
# (no pretrained weights are loaded).
model = AutoModelForSequenceClassification.from_config(config)

## [MY CODE] 학습인자 정리

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluate, log and checkpoint once per epoch. At the end of training the
# checkpoint with the highest validation accuracy is restored; accuracy is the
# metric this notebook reports, and the default (lowest validation loss) would
# pick a different epoch.
training_args = TrainingArguments(
    output_dir='hf_transformer',
    num_train_epochs=10,
    per_device_train_batch_size=128,  # batch size changed because of GPU memory errors
    per_device_eval_batch_size=128,  # batch size changed because of GPU memory errors
    logging_strategy="epoch",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # keep disk usage bounded; the best checkpoint is always retained
    learning_rate=1e-3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    greater_is_better=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## [MY CODE] accuracy 선언

In [None]:
import evaluate

# Accuracy metric object from the `evaluate` library, shared by compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    `pred` unpacks into (model outputs, reference labels). The predicted class
    is the argmax of the outputs along axis 1; the result is whatever
    `accuracy.compute` returns for those predictions and references.
    """
    scores, references = pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## [MY CODE] train 정의

In [None]:
from transformers import EarlyStoppingCallback


# `processing_class` replaces the deprecated `tokenizer` argument of Trainer;
# passing the tokenizer here lets the Trainer pad batches and save the
# tokenizer alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=mnli_train,
    eval_dataset=mnli_val,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    # Optional: stop once the validation metric stops improving.
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

  trainer = Trainer(


## [MY CODE] 학습진행

In [None]:
# Run the training loop; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9615,0.894614,0.583873
2,0.8611,0.857474,0.612712
3,0.797,0.84033,0.62482
4,0.743,0.848061,0.63097
5,0.6972,0.874042,0.630944
6,0.6587,0.891747,0.625406
7,0.6221,0.944542,0.623381
8,0.5902,0.965022,0.621421
9,0.5626,1.024644,0.618543
10,0.5385,1.065685,0.615067


TrainOutput(global_step=24550, training_loss=0.7031942444469196, metrics={'train_runtime': 479.2261, 'train_samples_per_second': 6555.591, 'train_steps_per_second': 51.228, 'total_flos': 131047941060540.0, 'train_loss': 0.7031942444469196, 'epoch': 10.0})

## [MY CODE] 학습 결과

In [None]:
# Score the model currently held by the trainer on the held-out validation subset.
trainer.evaluate(eval_dataset=mnli_val)

{'eval_loss': 0.8403298258781433,
 'eval_accuracy': 0.6248201576246801,
 'eval_runtime': 7.7767,
 'eval_samples_per_second': 10099.478,
 'eval_steps_per_second': 78.953,
 'epoch': 10.0}

In [None]:
# Persist the model into the trainer's configured output directory.
trainer.save_model(trainer.args.output_dir)

In [None]:
import torch
from transformers import pipeline


# The saved model is an NLI classifier trained on (premise, hypothesis) pairs,
# so it is queried with sentence pairs through the text-classification pipeline
# rather than with single sentences through a sentiment pipeline.
# Fall back to CPU when no GPU is available instead of failing on 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("text-classification", model="./hf_transformer/", device=device)

# Each input is a dict with "text" (premise) and "text_pair" (hypothesis).
nli_examples = [
    {"text": "A man is playing a guitar on stage.", "text_pair": "A man is performing music."},
    {"text": "A man is playing a guitar on stage.", "text_pair": "The stage is empty."},
]
for example in nli_examples:
    print(example["text"], "=>", example["text_pair"], classifier(example))

Device set to use cuda


[{'label': 'LABEL_0', 'score': 0.4697028398513794}]
[{'label': 'LABEL_0', 'score': 0.6322169303894043}]


## [LOG] 최종 accuracy 0.624