In [1]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

In [5]:
# Load imdbs.csv through the generic 'csv' builder and request its 'train' split.
dataset = load_dataset(
    'csv',
    data_files='imdbs.csv',
    split='train',
)

Downloading: 100%|██████████| 2.75k/2.75k [00:00<00:00, 873kB/s]
Using custom data configuration default


Downloading and preparing dataset csv/default-11046c2826f07a01 (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /Users/sunny/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b...


                            

Dataset csv downloaded and prepared to /Users/sunny/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b. Subsequent calls will reuse this data.




In [6]:
# Inspect the type of the object returned by load_dataset.
type(dataset)

nlp.arrow_dataset.Dataset

In [7]:
# Hold out 30% of the rows for evaluation. The split shuffles the rows, so a
# fixed seed is passed to make the train/test membership identical on every
# Restart & Run All.
dataset = dataset.train_test_split(test_size=0.3, seed=42)

100%|██████████| 1/1 [00:00<00:00, 315.15it/s]
100%|██████████| 1/1 [00:00<00:00, 947.87it/s]


In [8]:
# Display the resulting 'train' / 'test' splits with their features and row counts.
dataset

{'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 70),
 'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 30)}

In [9]:
# Pull the two splits out of the mapping produced by train_test_split.
train_set, test_set = dataset['train'], dataset['test']

# BERT (uncased, base size) wrapped for sequence classification.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# Tokenizer: load the fast BERT tokenizer for the same 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading: 100%|██████████| 455k/455k [00:00<00:00, 492kB/s]  


In [11]:
# Dataset preprocessing, spelled out by hand for the sentence "I love Paris".
# BERT's special tokens are the literal strings '[CLS]' and '[SEP]'
# (not nested lists), placed at the start and end of the sequence.
tokens = ['[CLS]', 'I', 'love', 'Paris', '[SEP]']
input_ids = [101, 1045, 2293, 3000, 102]

# Segment (token type) IDs
#   Used to tell one sentence from the other when the input holds two sentences:
#   tokens of the first sentence map to 0, tokens of the following sentence map to 1.
token_type_ids = [0, 0, 0, 0, 0]

# Attention mask: one entry per token (all 1 here because nothing is padded).
attention_mask = [1, 1, 1, 1, 1]

# The tokenizer performs all of the steps above in a single call.
tokenizer('I love Paris')

{'input_ids': [101, 1045, 2293, 3000, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [12]:
# Encode a batch to a fixed length of 5 tokens.
# `max_length` only takes effect together with padding='max_length' and/or
# truncation=True; with padding=True alone the batch is padded to its longest
# member and `max_length` is ignored.
tokenizer(
    ['I love Paris', 'birds fly', 'snow fall'],
    padding='max_length',
    truncation=True,
    max_length=5,
)



{'input_ids': [[101, 1045, 2293, 3000, 102], [101, 5055, 4875, 102, 0], [101, 4586, 2991, 102, 0]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0], [1, 1, 1, 1, 0]]}

In [13]:
def preprocess(data):
    """Tokenize the 'text' column of a batch.

    Args:
        data: A batch (mapping) that exposes the raw strings under the key 'text'.

    Returns:
        The tokenizer output for those strings, padded (padding=True) and
        truncated (truncation=True).
    """
    texts = data['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [14]:
# Run the preprocessing function over each split in one go. The batch size is
# the full split length, so every split is tokenized as a single batch.
train_set = train_set.map(
    preprocess,
    batched=True,
    batch_size=len(train_set),
)
test_set = test_set.map(
    preprocess,
    batched=True,
    batch_size=len(test_set),
)

100%|██████████| 1/1 [00:00<00:00, 17.78it/s]
100%|██████████| 1/1 [00:00<00:00, 43.01it/s]


In [15]:
# Use set_format to pick the columns that are needed and return them as torch tensors.
train_set.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)
test_set.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

#### 모델 학습

In [16]:
# Training hyperparameters.
batch_size = 8
epochs = 2

# This run has only 18 optimization steps in total (70 examples at batch size 8
# -> 9 steps per epoch, times 2 epochs). The warm-up must be shorter than that,
# otherwise the learning rate is still ramping up when training ends and never
# reaches its configured value. Use roughly 10% of the run.
warmup_steps = 2
weight_decay = 0.01

In [21]:
# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    # This option is an integer step count, not a flag. The previous value
    # `True` only worked because bool is an int subclass equal to 1, so the
    # explicit 1 keeps the same behaviour with the correct type.
    eval_accumulation_steps=1,
    logging_dir='./logs',
)

In [22]:
def compute_metrics(eval_pred):
    """Compute classification accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: Object exposing `predictions` (the model outputs; the first
            element is used when a tuple is given) and `label_ids` (the
            reference labels).

    Returns:
        dict with a single key 'accuracy' holding the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {'accuracy': float((predicted == eval_pred.label_ids).mean())}


# Define the trainer. Without compute_metrics, evaluate() would report only the
# loss, which says little about how well a classifier performs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
)

In [23]:
# Run training with the model, arguments, and training set given to the Trainer above.
trainer.train()

***** Running training *****
  Num examples = 70
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18
 11%|█         | 2/18 [00:47<06:11, 23.20s/it]

In [None]:
# Evaluate on the eval_dataset that was passed to the Trainer (the held-out test split).
trainer.evaluate()