# RNN 텍스트 분류기

In [31]:
corpus = [
    "자연어 처리는 재미있다",
    "Python이 자연어 처리보다 쉽다.",
    "자연어 처리 공부는 어렵다.",
    "Python 활용법 즐겁게 찾자"
]
labels = [1, 0, 0, 1]

In [32]:
# 토큰화
tokenized_corpus = [sentence.split() for sentence in corpus]

# 단어사전
vocab = {}
for tokens in tokenized_corpus:
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab) + 1


# 문자 인덱싱
indexed_corpus = []
for tokens in tokenized_corpus:
    indexed_sent = [vocab[token] for token in tokens]
    indexed_corpus.append(indexed_sent)

# 패딩 처리
max_seq_len = max(len(seq) for seq in indexed_corpus)
def pad_sequences(seq, max_len):
    if len(seq) < max_len:
        seq = seq + [0] * (max_len - len(seq))
        return seq

padded_corpus = [pad_sequences(seq, max_seq_len) for seq in indexed_corpus]

In [33]:
indexed_corpus, max_seq_len, padded_corpus

([[1, 2, 3], [4, 1, 5, 6], [1, 7, 8, 9], [10, 11, 12, 13]],
 4,
 [[1, 2, 3, 0], None, None, None])

In [34]:
import torch

inputs = torch.tensor(padded_corpus, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

TypeError: not a sequence

In [None]:
# RNN 기반 텍스트 분류기 모델 정의
import torch.nn as nn

class RNNClassfifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(RNNClassfifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.embedding(x)
        output, hidden = self.rnn(x)
        return self.sigmoid(self.fc(hidden[-1]))

In [None]:
VOCAB_SIZE = len(vocab) + 1
EMBED_SIZE = 128
HIDDEN_SIZE = 64
NUM_CLASSES = 1

model = RNNClassfifier(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_classes=NUM_CLASSES
)

print(model)

RNNClassfifier(
  (embedding): Embedding(14, 128)
  (rnn): RNN(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
import torch.optim as optim

# 학습
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 20

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}/{epochs} | Loss == {loss.item():.4f}')

NameError: name 'inputs' is not defined

In [35]:
test_text = [
    "자연어 처리 잼다",
    "파이썬 어렵다"
]

def preprocessed_sentence(sentence, vocab, max_len):
    tokens = sentence.split()
    indices = [vocab.get(token, 0) for token in tokens]
    indices = pad_sequences(indices, max_len)
    return torch.tensor(indices, dtype=torch.long)

test_inputs = []
for sent in test_text:
    test_inputs.append(preprocessed_sentence(sent, vocab, max_seq_len))
test_inputs = torch.stack(test_inputs)

In [36]:
model.eval()
with torch.no_grad():
    outputs = model(test_inputs)
    print(outputs)

AttributeError: 'OneVsRestClassifier' object has no attribute 'eval'

In [45]:
# 데이터 준비
corpus = [
    "장녕",
    'ㅇㅈㄷㄺ',
    'ㅇㅈㄷㄹㅇ',
    'ㅇㅈㄷㅇㄹ',
    '파이썬 딥러 오어어',
    '파이썬 딥러 오어어'
]

labels = [["자연어 처리"], ["파이썬"], ["딥러닝"], ["자연어처리", '파이썬'], ["딥러"], ["파이", "딥러"]
]

In [46]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=['자연어 처리', 'Python', '딥러닝'])
y = mlb.fit_transform(labels)
y

array([[1, 0, 0],
       [0, 0, 0],
       [0, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus)

In [48]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

model = OneVsRestClassifier(LogisticRegression(max_iter=100))
model.fit(X_train, y)



0,1,2
,estimator,LogisticRegression()
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [52]:
X_test = ["자연어 처리랑 딥러닝이랑 공부 잼따"]
y_test = ['자연어 처리', '딥러닝']

x_test = vectorizer.transform(X_test)
y_test = mlb.transform(y_test)

X_test, y_test



(['자연어 처리랑 딥러닝이랑 공부 잼따'],
 array([[0, 0, 0],
        [0, 0, 0]]))

In [53]:
y_pred = model.predict(X_test)
y_pred

ValueError: Expected 2D array, got 1D array instead:
array=['자연어 처리랑 딥러닝이랑 공부 잼따'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [42]:
from sklearn.metrics import classification_report, hamming_loss

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred, target_names=mlb.classes_))
print(hamming_loss(y_test, y_pred))

AttributeError: 'OneVsRestClassifier' object has no attribute 'estimators_'