# 품사 태깅

In [1]:
# 필요한 라이브러리 설치
!pip install torch konlpy


Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from konlpy.tag import Okt
import numpy as np
from collections import Counter, defaultdict

# Sample corpus: whitespace-tokenised sentences with one tag per token.
sentences = [
    "나는 학교에 간다",
    "그는 책을 읽는다",
    "우리는 공부를 한다",
    "그녀는 음악을 듣는다",
    "아이들이 놀고 있다",
    "예쁜 꽃이 핀다",
    "큰 개가 뛴다",
    "작은 새가 난다",
    "맛있는 음식을 먹는다",
    "재미있는 영화를 본다",
    "빠른 자동차가 온다",
    "높은 건물이 보인다",
    "따뜻한 날씨가 좋다"
]

# Gold tags, row-aligned with `sentences` (row i tags the tokens of sentence i).
pos_tags = [
    ["Noun", "Noun", "Verb"],
    ["Noun", "Noun", "Verb"],
    ["Noun", "Noun", "Verb"],
    ["Noun", "Noun", "Verb"],
    ["Noun", "Verb", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"],
    ["Adjective", "Noun", "Verb"]
]

# Vocabulary construction. Both maps start with "<PAD>" at index 0; the word
# map additionally holds "<UNK>" at index 1. Every other entry receives the
# next free index in order of first appearance.
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}

for sentence in sentences:
    for word in sentence.split():
        # setdefault keeps an existing index and only inserts unseen words.
        word_to_idx.setdefault(word, len(word_to_idx))

for tags in pos_tags:
    for tag in tags:
        tag_to_idx.setdefault(tag, len(tag_to_idx))

# Reverse lookup: tag index -> tag name.
idx_to_tag = dict(zip(tag_to_idx.values(), tag_to_idx.keys()))

# BiLSTM 모델 정의
class BiLSTM_POS(nn.Module):
    """Bidirectional LSTM that scores every tag for each input token.

    Args:
        vocab_size: Number of rows in the word-embedding table.
        tagset_size: Number of scores produced per token.
        embedding_dim: Size of each word embedding.
        hidden_dim: Requested width of the concatenated forward/backward
            LSTM output. Each direction gets ``hidden_dim // 2`` units, so an
            odd value results in ``hidden_dim - 1`` LSTM output features.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        # The attribute name (spelling included) is kept as-is because it is
        # part of the module's parameter / state_dict keys.
        self.word_embedings = nn.Embedding(vocab_size, embedding_dim)
        units_per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(
            embedding_dim,
            units_per_direction,
            bidirectional=True,
            batch_first=True,
        )
        # Size the projection from the width the LSTM actually emits rather
        # than from hidden_dim: for odd hidden_dim the two differ by one and
        # the forward pass would otherwise fail with a shape mismatch.
        self.hidden2tag = nn.Linear(2 * units_per_direction, tagset_size)

    def forward(self, sentence):
        """Return unnormalised tag scores for every token.

        Args:
            sentence: Integer tensor of word indices. The LSTM is built with
                ``batch_first=True``, so the layout is (batch, seq_len).

        Returns:
            Tensor with one ``tagset_size``-wide score vector per token; no
            softmax is applied here.
        """
        embeds = self.word_embedings(sentence)
        lstm_out, _ = self.lstm(embeds)
        return self.hidden2tag(lstm_out)

# 데이터셋 클래스
class POSDataset(Dataset):
    """Dataset that pairs each sentence with its tag sequence as index tensors.

    Args:
        sentences: Sequence of whitespace-separated sentence strings.
        tags: Sequence of tag-name lists, aligned by position with ``sentences``.
        word_to_idx: Mapping from word to integer id; must contain ``"<UNK>"``
            for words that are not in the mapping.
        tag_to_idx: Mapping from tag name to integer id.
    """

    def __init__(self, sentences, tags, word_to_idx, tag_to_idx):
        self.sentences, self.tags = sentences, tags
        self.word_to_idx, self.tag_to_idx = word_to_idx, tag_to_idx

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` as two LongTensors for item ``idx``."""
        tokens = self.sentences[idx].split()
        labels = self.tags[idx]

        # Words missing from the vocabulary fall back to the <UNK> id;
        # an unknown tag name raises KeyError.
        vocab = self.word_to_idx
        word_ids = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
        tag_ids = list(map(self.tag_to_idx.__getitem__, labels))

        return torch.LongTensor(word_ids), torch.LongTensor(tag_ids)

# Data loader: one sentence per batch, visited in shuffled order each epoch.
dataset = POSDataset(
    sentences=sentences,
    tags=pos_tags,
    word_to_idx=word_to_idx,
    tag_to_idx=tag_to_idx,
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

# Model, loss and optimiser.
EMBEDDING_DIM = 50
HIDDEN_DIM = 100
model = BiLSTM_POS(
    vocab_size=len(word_to_idx),
    tagset_size=len(tag_to_idx),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
)
# Targets equal to 0 (the "<PAD>" tag index) do not contribute to the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Training: 100 passes over the data, one optimiser step per sentence.
for epoch in range(100):
    total_loss = 0
    for sentence, tags in dataloader:
        model.zero_grad()
        tag_scores = model(sentence)
        # Collapse the batch and sequence axes so scores are (tokens, tags)
        # and targets are a flat vector, as CrossEntropyLoss expects.
        loss = loss_function(
            tag_scores.view(-1, tag_scores.size(-1)),
            tags.view(-1),
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Print the loss summed over the epoch on every 20th epoch.
    if not epoch % 20:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

# 예측 함수
def predict_pos(sentence):
    """Tag each whitespace-separated token of ``sentence`` with the trained model.

    Uses the module-level ``model``, ``word_to_idx`` and ``idx_to_tag``.

    Args:
        sentence: Raw text; tokens are obtained with ``str.split()``.

    Returns:
        A list of ``(word, tag_name)`` tuples in input order. An empty or
        whitespace-only sentence yields an empty list.
    """
    words = sentence.split()
    if not words:
        # Nothing to tag, and a zero-length sequence cannot be run through
        # the LSTM, so return before building any tensors.
        return []

    # Words not in the vocabulary are mapped to the <UNK> index.
    unk_idx = word_to_idx["<UNK>"]
    word_indices = [word_to_idx.get(word, unk_idx) for word in words]
    # Add a leading batch axis of size 1.
    inputs = torch.LongTensor(word_indices).unsqueeze(0)

    with torch.no_grad():
        tag_scores = model(inputs)
        # Highest-scoring tag index per token (tag axis is dim 2).
        predicted_tags = torch.argmax(tag_scores, dim=2)

    return [
        (word, idx_to_tag[tag_idx])
        for word, tag_idx in zip(words, predicted_tags[0].tolist())
    ]

# Smoke test: tag a handful of sentences and print the predictions.
test_sentence = "고양이가 물고기를 먹는다"
test_sentences = [
    test_sentence,                    # original test sentence
    "빨간 사과가 달다",               # adjective + new words
    "높은 산이 보인다",               # contains an adjective
    "작은 아이가 뛴다",               # adjective present in the training data
    "어려운 문제를 푼다",             # new adjective
    "그는 빠르게 달린다",             # adverb vs. adjective distinction
]
for sentence in test_sentences:
    result = predict_pos(sentence)
    print("POS 태깅 결과:", result)

Epoch 0, Loss: 17.3179
Epoch 20, Loss: 1.0607
Epoch 40, Loss: 0.2567
Epoch 60, Loss: 0.1304
Epoch 80, Loss: 0.0842
POS 태깅 결과: [('고양이가', 'Noun'), ('물고기를', 'Verb'), ('먹는다', 'Verb')]
POS 태깅 결과: [('빨간', 'Noun'), ('사과가', 'Noun'), ('달다', 'Verb')]
POS 태깅 결과: [('높은', 'Adjective'), ('산이', 'Noun'), ('보인다', 'Verb')]
POS 태깅 결과: [('작은', 'Adjective'), ('아이가', 'Noun'), ('뛴다', 'Verb')]
POS 태깅 결과: [('어려운', 'Noun'), ('문제를', 'Noun'), ('푼다', 'Verb')]
POS 태깅 결과: [('그는', 'Noun'), ('빠르게', 'Noun'), ('달린다', 'Verb')]
