# 15. NLP Framework의 활용

**최신 NLP 기술 발전을 선도하는 다양한 NLP Framework에 대해 알아보고, 가장 대표적인 Huggingface transformers를 중심으로 설계 구조와 활용법을 공부해 본다.**

## 15-1. 들어가며

## 15-2. 다양한 NLP Framework의 출현

## 15-3. Huggingface transformers 개요

```bash
$ pip install transformers
```

In [None]:
from transformers import pipeline

# Build a sentiment-analysis pipeline on the TensorFlow backend. No model is
# named here, so the choice of checkpoint is left to the library.
classifier = pipeline("sentiment-analysis", framework="tf")

# Keep the call as the last expression so the notebook displays its result.
sample_text = "We are very happy to include pipeline into the transformers repository."
classifier(sample_text)

## 15-4. Huggingface transformers (1) Model

In [None]:
from transformers import TFBertForPreTraining

# Load the pretrained checkpoint through the task-specific BERT class.
model = TFBertForPreTraining.from_pretrained("bert-base-cased")

# Show which concrete class was instantiated.
print(type(model))

In [None]:
from transformers import TFAutoModel

# Load the same checkpoint name through the generic Auto class instead.
model = TFAutoModel.from_pretrained('bert-base-cased')

# Print the class the Auto factory resolved to.
print(type(model))

## 15-5. Huggingface transformers (2) Tokenizer

In [None]:
from transformers import BertTokenizer

# Load the tokenizer that matches the checkpoint via the BERT-specific class.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
from transformers import AutoTokenizer

# Same checkpoint, loaded through the generic Auto class; rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Tokenize a single sentence and print the resulting encoding.
sample_sentence = "This is Test for aiffel"
encoded = tokenizer(sample_sentence)
print(encoded)

In [None]:
# A small batch of sentences with different lengths.
batch_sentences = [
    "Hello I'm a single sentence",
    "And another sentence",
    "And the very very last one",
]

# Passing the whole list encodes every sentence in a single call.
encoded_batch = tokenizer(batch_sentences)
print(encoded_batch)

In [None]:
# Encode the batch again, this time requesting padding, truncation and
# TensorFlow tensors as the return type.
batch = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    return_tensors="tf",
)
print(batch)

## 15-6. Huggingface transformers (3) Processor

In [None]:
import csv


class DataProcessor:
    """Base data processor for sequence-classification datasets.

    The ``get_*`` methods are placeholders that raise ``NotImplementedError``;
    a concrete processor is expected to override them.
    """

    def get_example_from_tensor_dict(self, tensor_dict):
        """Return an example built from a tensor dict.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def get_train_examples(self, data_dir):
        """Collect the InputExample objects of the train data.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Collect the InputExample objects of the dev (validation) data.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Collect the InputExample objects of the test data.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def get_labels(self):
        """Return the labels used by the data set.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    def tfds_map(self, example):
        """Adapt an example loaded from tfds (tensorflow-datasets) to this processor.

        When the processor defines more than one label, ``example.label`` is
        converted with ``int()`` and used as an index into ``get_labels()``;
        the attribute is overwritten with the selected label.

        Returns:
            The same ``example`` object that was passed in.
        """
        labels = self.get_labels()
        if len(labels) > 1:
            example.label = labels[int(example.label)]
        return example

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a tab-separated file and return its rows as a list of lists.

        The file is opened as ``utf-8-sig`` so a leading byte-order mark does
        not end up in the first cell.

        Args:
            input_file: path of the .tsv file to read.
            quotechar: forwarded unchanged to ``csv.reader``.
        """
        with open(input_file, "r", encoding="utf-8-sig") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

## 15-7. Huggingface transformers (4) Config

In [None]:
from transformers import BertConfig

# Load the configuration of the checkpoint through the BERT-specific class.
config = BertConfig.from_pretrained('bert-base-cased')

# Print the config class first, then the configuration itself.
print(type(config))
print(config)

In [None]:
from transformers import AutoConfig

# Same checkpoint name, loaded through the generic Auto class.
config = AutoConfig.from_pretrained('bert-base-cased')

# Print the class the Auto factory resolved to, then the configuration.
print(type(config))
print(config)

In [None]:
# TFBertForPreTraining is not imported in this cell; it has to be in scope
# already when the cell runs.
model = TFBertForPreTraining.from_pretrained("bert-base-cased")

# Read the configuration off the loaded model instead of loading it separately.
config = model.config
print(type(config))
print(config)

## 15-8. Huggingface transformers (5) Trainer

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForPreTraining, AutoTokenizer

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = TFAutoModelForPreTraining.from_pretrained("bert-base-cased")

sentence = "Hello, This is test for bert TFmodel."

# Encode the sentence, then add a leading axis so the input has batch size 1.
token_ids = tokenizer.encode(sentence, add_special_tokens=True)
input_ids = tf.constant(token_ids)[None, :]

# Compile the model with an optimizer and a loss before calling predict().
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)

pred = model.predict(input_ids)

print("=====Results=====")
print(pred)

In [None]:
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, Optional
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import (
    TFAutoModelForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
    AutoConfig,
    AutoTokenizer,
    glue_convert_examples_to_features,
)

# Training arguments handed to TFTrainer.
training_args = TFTrainingArguments(
    output_dir='./results',            # where outputs are saved
    num_train_epochs=1,                # total number of training epochs
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=64,     # batch size used during evaluation
    warmup_steps=500,                  # warmup steps for the learning-rate scheduler
    weight_decay=0.01,                 # weight decay
    logging_dir='./logs',              # where logs are saved
    do_train=True,                     # whether to run training
    do_eval=True,                      # whether to run evaluation
    eval_steps=1000,
)

# Create the model and the tokenizer from the same checkpoint name.
model_name_or_path = 'bert-base-uncased'

# Ask for PyTorch weights only when the name contains ".bin".
load_from_pytorch = ".bin" in model_name_or_path

# The model is instantiated inside the strategy scope exposed by training_args.
with training_args.strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        from_pt=load_from_pytorch,
    )

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [None]:
# 데이터셋 생성
ds, info = tfds.load('glue/mrpc', with_info=True)
train_dataset = glue_convert_examples_to_features(ds['train'], tokenizer, 128, 'mrpc')
train_dataset = train_dataset.apply(tf.data.experimental.assert_cardinality(info.splits['train'].num_examples))

# TFTrainer 생성
trainer = TFTrainer(
    model=model,                          # 학습시킬 model
    args=training_args,                  # TFTrainingArguments을 통해 설정한 arguments
    train_dataset=train_dataset,   # training dataset
)

# 학습 진행
trainer.train()

# 테스트
test_dataset = glue_convert_examples_to_features(ds['test'], tokenizer, 128, 'mrpc')
test_dataset = test_dataset.apply(tf.data.experimental.assert_cardinality(info.splits['test'].num_examples))
trainer.evaluate(test_dataset)

## 15-9. 마무리하며