In [1]:
from datasets import load_dataset

# Load the IMDB example dataset and pull out its "train" and "test" splits
dataset = load_dataset(path="imdb")
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [2]:
from transformers import BertTokenizer

# Load the tokenizer that matches the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [3]:
# Preprocessing function applied to the dataset examples
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; whatever it returns is handed back unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [4]:
# Tokenize both splits, feeding preprocess_function batches of examples
train_dataset, test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [5]:
# Convert the tokenized train and test splits into NumPy arrays for Keras
import numpy as np


def _split_to_arrays(split):
    """Return ``(input_ids, attention_mask, token_type_ids, label)`` arrays for ``split``.

    Each column is read once through column access (``split[name]``), the
    same way the labels were already being read. Iterating row by row
    (``for example in split``) once per feature builds a dict of every
    column for every example on each pass, which is needlessly slow.
    """
    columns = ("input_ids", "attention_mask", "token_type_ids", "label")
    return tuple(np.array(split[name]) for name in columns)


(train_input_ids, train_attention_masks,
 train_token_type_ids, train_labels) = _split_to_arrays(train_dataset)

(test_input_ids, test_attention_masks,
 test_token_type_ids, test_labels) = _split_to_arrays(test_dataset)

In [6]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Load the TensorFlow BERT model with a 2-class sequence-classification head
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Training configuration.
# - The classification head outputs raw logits (no softmax), so the loss must
#   be built with from_logits=True. The string alias
#   'sparse_categorical_crossentropy' uses from_logits=False and would treat
#   the logits as probabilities, giving a wrong loss and gradients.
# - The 'adam' string alias uses Adam's default learning rate (1e-3), which is
#   far too large for fine-tuning BERT; use a small rate (2e-5) instead.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model on the training arrays
model.fit(
    x=dict(
        input_ids=train_input_ids,
        attention_mask=train_attention_masks,
        token_type_ids=train_token_type_ids,
    ),
    y=train_labels,
    epochs=3,
    batch_size=4,
)

Epoch 1/3

In [None]:
# Evaluate on the test arrays and print whatever evaluate() returns
eval_inputs = dict(
    input_ids=test_input_ids,
    attention_mask=test_attention_masks,
    token_type_ids=test_token_type_ids,
)
results = model.evaluate(x=eval_inputs, y=test_labels)
print(results)