In [6]:
from transformers import BertTokenizer,BertForSequenceClassification
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score

In [7]:
# Load the IMDB movie-review dataset; this fetches the train, test and
# unsupervised splits.
dataset = load_dataset("imdb")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 156398.37 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 232712.81 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 249856.08 examples/s]


In [8]:
# Evaluate on a fixed 500-sample subset of the test split; the seeded shuffle
# keeps the subset the same from run to run.
shuffled_test = dataset["test"].shuffle(seed=42)
test_dataset = shuffled_test.select(range(500))

In [9]:
# bert-base-uncased: English checkpoint that does not distinguish letter case;
# inputs are limited to 512 tokens (not characters).

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [15]:
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a "text" entry holding the raw text(s).

    Returns:
        The tokenizer output for ``examples["text"]``, padded to the maximum
        length and truncated where the text is longer than that.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [16]:
# Tokenize in batches so that several samples are processed per call.
test_dataset = test_dataset.map(tokenize_function,batched=True)

Map: 100%|██████████| 500/500 [00:01<00:00, 482.40 examples/s]


In [17]:
# Return only the selected columns, as torch tensors.
test_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])

In [19]:
# Load pretrained BERT with a 2-class classification head.
# NOTE(review): the load warning reports that classifier.weight / classifier.bias
# are newly initialized, i.e. the head is untrained. Evaluating this model as-is
# measures an untrained classifier (the recorded accuracy is 0.5060) — fine-tune
# it or load a fine-tuned checkpoint before reading the score as model quality.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Put the model in evaluation mode (turns off the Dropout layers) before inference.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [21]:
# Accumulators for the predicted and the true class of every evaluated sample.
all_preds, all_labels = [], []

In [None]:
# Explicitly import the submodule that provides DataLoader.
import torch.utils.data

# Run batched inference over the evaluation subset and collect predictions.
# The accumulators are reset here so that re-running this cell does not append
# a second copy of the results to lists left over in the kernel.
all_preds = []
all_labels = []

# Feed the inputs on whatever device the model's parameters live on.
device = next(model.parameters()).device

# batch_size controls how many samples go through the model per forward pass.
for batch in torch.utils.data.DataLoader(test_dataset, batch_size=8):
    with torch.no_grad():  # gradients are not needed for evaluation
        output = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
    # Predicted class = index of the highest logit for each sample.
    preds = output.logits.argmax(dim=1)
    # Move results to the CPU and store them as plain Python ints.
    all_preds.extend(preds.cpu().tolist())
    all_labels.extend(batch["label"].tolist())

In [29]:
# Fraction of evaluated samples whose prediction matches the true label,
# printed with four decimal places.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(format(accuracy, ".4f"))

0.5060
