# 감정분석 - klue/bert-base model

In [1]:
# 필요한 라이브러리 설치
!pip install torch transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np

# 샘플 데이터 (단어별 감정 점수: -1=부정, 0=중립, 1=긍정)
texts = [
    "이 영화는 정말 재미있고 감동적이었습니다",
    "음식이 너무 맛없고 서비스도 별로였어요",
    "날씨가 좋아서 기분이 좋습니다",
    "오늘은 평범한 하루였습니다"
]

# 토큰별 감정 레이블 (실제로는 더 정교한 어노테이션 필요)
token_emotions = [
    [0, 0, 0, 1, 1, 1, 0],      # 재미있고, 감동적 -> 긍정
    [0, 0, 0, -1, 0, 0, -1, 0], # 맛없고, 별로 -> 부정
    [0, 0, 1, 0, 1, 1, 0],      # 좋아서, 기분, 좋습니다 -> 긍정
    [0, 0, 0, 0, 0, 0]          # 평범한 -> 중립
]

# 감정 분류 모델
class TokenEmotionClassifier(nn.Module):
    def __init__(self, model_name, num_emotions=3):  # 부정, 중립, 긍정
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_emotions)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        return logits

# 토크나이저 및 모델 초기화
# https://huggingface.co/klue/bert-base
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
model = TokenEmotionClassifier('klue/bert-base')

# 감정 레이블 매핑
emotion_map = {-1: 0, 0: 1, 1: 2}  # 부정=0, 중립=1, 긍정=2
emotion_names = ['부정', '중립', '긍정']

# 데이터 전처리
def preprocess_emotion_data(texts, token_emotions):
    all_input_ids = []
    all_attention_masks = []
    all_labels = []

    for text, emotions in zip(texts, token_emotions):
        encoded = tokenizer(text, truncation=True, padding='max_length',
                          max_length=128, return_tensors='pt')

        # 레이블 패딩 및 정렬
        labels = [emotion_map[e] for e in emotions]
        labels = labels + [1] * (128 - len(labels))  # 중립으로 패딩
        labels = labels[:128]

        all_input_ids.append(encoded['input_ids'].squeeze())
        all_attention_masks.append(encoded['attention_mask'].squeeze())
        all_labels.append(torch.tensor(labels))

    return torch.stack(all_input_ids), torch.stack(all_attention_masks), torch.stack(all_labels)

# 데이터 전처리
input_ids, attention_masks, labels = preprocess_emotion_data(texts, token_emotions)

# 훈련 설정
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss(ignore_index=1)  # 중립은 무시

# 간단한 훈련 루프
model.train()
for epoch in range(5):
    optimizer.zero_grad()
    logits = model(input_ids, attention_masks)
    loss = criterion(logits.view(-1, 3), labels.view(-1))
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

# 예측 함수
def predict_token_emotion(text):
    model.eval()
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    with torch.no_grad():
        logits = model(encoded['input_ids'], encoded['attention_mask'])
        predictions = torch.argmax(logits, dim=-1)

    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    emotions = [emotion_names[pred.item()] for pred in predictions[0]]

    result = []
    for token, emotion in zip(tokens, emotions):
        if token not in ['[CLS]', '[SEP]', '[PAD]']:
            result.append((token, emotion))

    return result

# 테스트
test_text = "이 제품은 훌륭하지만 가격이 비싸요"
result = predict_token_emotion(test_text)
print("토큰별 감정 분석 결과:", result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch 1, Loss: 1.0280
Epoch 2, Loss: 0.6790
Epoch 3, Loss: 0.4136
Epoch 4, Loss: 0.2554
Epoch 5, Loss: 0.1372
토큰별 감정 분석 결과: [('이', '긍정'), ('제품', '긍정'), ('##은', '긍정'), ('훌륭', '긍정'), ('##하', '긍정'), ('##지만', '긍정'), ('가격', '긍정'), ('##이', '긍정'), ('비싸', '긍정'), ('##요', '긍정')]
