## PreTrained BERT를 이용한 감성분류 전이학습 샘플코드

In [13]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import warnings

warnings.filterwarnings('ignore')

# 모델과 토크나이저 불러오기
model_name = "bert-base-multilingual-cased"
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 전이학습용 데이터셋 구성하기
train_data = [
    ('1', '별로에요', 1), 
    ('2', '비싸요', 1),
    ('3', '좋아요! 다음에 또 구매하겠습니다.', 0),
    ('4', '훌륭합니다. 강추합니다.', 0),
    ('5', '이정도로 시장에 나오기에는 무리가 있는듯',1),
    ('6', '가성비가 썩 좋진 않네요',1),
    ('7', '그래도 살만하네요',0)
]

# 토큰화 및 패딩 처리하기
def tokenize_and_pad(sentence, max_length):
    inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    return inputs['input_ids'], inputs['attention_mask']

# 데이터셋 변환하기
def preprocess(data):
    processed = []
    max_length = 64
    for item in data:
        input_id, attention_mask = tokenize_and_pad(item[1], max_length)
        label = torch.tensor(item[2]).long()
        processed.append((input_id, attention_mask, label))
    return processed

train_data = preprocess(train_data)

# 학습 관련 파라미터 설정하기
batch_size = 3
num_epochs = 10
learning_rate = 1e-4

# 모델 학습하기
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, batch in enumerate(torch.utils.data.DataLoader(train_data, batch_size=batch_size)):
        inputs, masks, labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(inputs.squeeze(1), attention_mask=masks.squeeze(1))
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print('[%d] loss: %.3f' % (epoch + 1, running_loss / len(train_data)))


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

[1] loss: 0.478
[2] loss: 0.395
[3] loss: 0.277
[4] loss: 0.228
[5] loss: 0.158
[6] loss: 0.297
[7] loss: 0.131
[8] loss: 0.129
[9] loss: 0.110
[10] loss: 0.043


## 모델 결과 테스트

In [24]:
sentences = ["이 가격이면 그래도 살 것 같네요.", "너무 부담스럽네요.", "이런걸 살바에는 다른걸 사는게 낫죠", "오늘 본 제품 중 최고입니다."]
for test_sentence in sentences:
    test_input_id, test_attention_mask = tokenize_and_pad(test_sentence, 64)
    test_input_id = test_input_id.to(device)
    test_attention_mask = test_attention_mask.to(device)

    with torch.no_grad():
        test_output = model(test_input_id, attention_mask=test_attention_mask)
    test_probs = nn.Softmax(dim=1)(test_output.logits).squeeze()
    
    if test_probs[0] < test_probs[1]:
        print(f"{test_sentence} - 불만입니다(불만확률 : {test_probs[1]})")
    else:
        print(f"{test_sentence} - 만족입니다(불만확률 : {test_probs[1]})")


이 가격이면 그래도 살 것 같네요. - 만족입니다(불만확률 : 0.11037643998861313)
너무 부담스럽네요. - 불만입니다(불만확률 : 0.8615490794181824)
이런걸 살바에는 다른걸 사는게 낫죠 - 불만입니다(불만확률 : 0.8861760497093201)
오늘 본 제품 중 최고입니다. - 만족입니다(불만확률 : 0.014779518358409405)
