### torchtext 라이브러리로 텍스트 분류
<hr>

- 1단계 : 데이터 전처리 : 숫자형식으로 변환하는 것 까지
- 2단계 : 모델 구현

- 1-1 데이터 준비 => 내장 데이터셋 활용
    * AG_NEWS 데이터셋 반복자 : 레이블 (label) + 문장의 튜플(tuple) 형태

In [1]:
import torch
from torchtext.datasets import AG_NEWS

# ==> DataPipe 타입 >> iterator 타입 형변환
train_iter = iter(AG_NEWS(split='train'))



In [2]:
## 데이터 확인 => (label, text), label 1-4
next(train_iter)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

- (2) 데이터 처리 파이프라인 준비 

<hr>

* 어휘집(vocab), 단어 벡터(word vector), 토크나이저(tokenizer)
* 가공되지 않은 텍스트 문자열에 대한 데이터 처리 빌딜 블록
* 일반적인 NLP 데이터 처리
    * 첫번째 단계 : 가공되지 않은 학습 데이터셋으로 어휘집 생성
        => 토큰 목록 또는 반복자 받는 내장 팩토리 함수(factor function) : build_vocab_from_iterator
    * 사용자는 어휘집에 추가할 특수 기호 전달 가능

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

train_iter = AG_NEWS(split='train')
test_iter = AG_NEWS(split='test')

In [4]:
# 토큰 제너레이터 함수 : 데이터 추출하여 토큰화
def yield_tokens(data_iter):
    for _, text in data_iter:
        # 라벨, 텍스트  ---> 텍스트 토큰화
        yield tokenizer(text)


# 단어 사전 생성
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>'])

## <UNK> 인덱스를 0번으로 지정
vocab.set_default_index(vocab['<unk>'])



In [5]:
vocab(['<unk>','am', 'there', 'example', 'beta'])

[0, 1913, 229, 5297, 2238]

In [6]:
# 텍스트 ===> 정수 인코딩
text_pipeline = lambda x : vocab(tokenizer(x))

# 레이블 ===> 정수 인코딩
label_pipeline = lambda x : int(x) - 1


### (3) 데이터 배치(batch)와 반복자 생성
<hr>

- torch.utils.data.DataLoader : getitem(), len(), 구현한 맵 형태(map-style)
- collate_fn() : DataLoader 로부터 생성된 샘플 배치 함수
    * 입력 : DataLoader에 배치 크기(batch size)가 있는 배치(Batch) 데이터

In [7]:
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def collate_batch(batch):
    # 배치크기 만큼의 라벨, 텍스트, 오프셋 값 저장 변수
    label_list, text_list, offsets = [], [], [0]

    # 1개씩 뉴스기사, 라벨 추출 해서 저장
    for (_label, _text) in batch:
        # 라벨 인코딩 후 저장
        label_list.append(label_pipeline(_label))

        # 텍스트 인코딩 후 저장
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)

        # 텍스트 offset 즉, 텍스트 크기/길이 저장
        offsets.append(processed_text.size(0))


    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)

    return label_list.to(device), text_list.to(device), offsets.to(device)





In [8]:
train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=256, shuffle=False, collate_fn=collate_batch)
testloader = DataLoader(test_iter, batch_size=256, shuffle=False, collate_fn=collate_batch)



In [9]:
num_class = len(set([label for (label, text) in train_iter]))
vocab_size = len(vocab)

print('num class = ',num_class,', vocab_size = ' , vocab_size)

num class =  4 , vocab_size =  95811


In [10]:
for labels, texts, offsets in dataloader:
    print(labels, texts, offsets)
    break

for labels, texts, offsets in testloader:
    print(labels, texts, offsets)
    break



tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], device='cuda:0') tensor([ 431,  425,    1,  ...,  593, 1805,    1], device='cuda:0') tensor([    0,    29,    71,   111,   151,   194,   242,  

In [11]:
import torch.nn as nn 
import torch.nn.functional as F

embed_dim = 10
HIDDEN = 30
# 입력층 : EmbeddingBag Layer - 레이어와 분류(classification) 목적을 위한 선형 레이어, 텍스트의 길이는 오프셋(offset)
# 은닉층 : Linear - 4개 클래스 분류

class TextSentiment(nn.Module): 
    def __init__(self, vocab_size, embed_dim, num_class): 
        super(TextSentiment, self).__init__() 
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False) 
        self.rnn = nn.LSTM(embed_dim, hidden_size=HIDDEN)
        self.fc = nn.Linear(HIDDEN, num_class) 
        self.init_weights() 

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        output, _ = self.rnn(embedded)
        return self.fc(output)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TextSentiment(vocab_size, embed_dim, num_class).to(device)
criterion = F.cross_entropy
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)


In [12]:
import numpy as np
from torchmetrics.classification import MulticlassAccuracy
from torchmetrics.classification import MulticlassConfusionMatrix

def train(model, dataloader, criterion, optimizer):
    model.train()
    totalloss = []
    accu = MulticlassAccuracy(num_classes=num_class).to(device)
    matrix = MulticlassConfusionMatrix(num_classes=num_class).to(device)
    for labels, texts, offsets in dataloader:
        
        logits = model(texts, offsets)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        totalloss.append(loss.item())
        
        # 기울기 소실 및 폭주 예방을 위한 양극단 값 자르기
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        accu(logits, labels)
        matrix(logits, labels)
    
    print(f"trainloss : {np.mean(totalloss)}")
    print(f"confusionmatrix :\n{matrix.compute().cpu().numpy()}")
    print(f"trainaccu : {accu.compute().item()}")

    # print(logits.shape, labels.shape)
    return np.mean(totalloss)


def test(model, dataloader, criterion):
    model.eval()
    totalloss = []
    accu = MulticlassAccuracy(num_classes=num_class).to(device)
    matrix = MulticlassConfusionMatrix(num_classes=num_class).to(device)
    
    with torch.no_grad():
        for labels, texts, offsets in dataloader:
            
            logits = model(texts, offsets)
            loss = criterion(logits, labels)

            totalloss.append(loss.item())
            accu(logits, labels)
            matrix(logits, labels)

        
    # print(f"testloss : {np.mean(totalloss)}")
    print(f"confusionmatrix :\n{matrix.compute().cpu().numpy()}")
    print(f"testaccu : {accu.compute().item()}")
    return np.mean(totalloss)

In [13]:
EPOCHES = 10
for epo in range(EPOCHES):
    train(model, dataloader, criterion, optimizer)
    test(model, testloader, criterion)
    



trainloss : 0.669166087945387
confusionmatrix :
[[21019  3871  2883  2227]
 [ 3273 23728  1292  1707]
 [ 2657  1502 20546  5295]
 [ 1872  1467  4306 22355]]
trainaccu : 0.7303999662399292
confusionmatrix :
[[ 806  143  689  262]
 [  66 1447   67  320]
 [ 142   67 1173  518]
 [  41   73  199 1587]]
testaccu : 0.6596052646636963
trainloss : 0.6958914637438524
confusionmatrix :
[[19058  2306  6156  2480]
 [ 1493 26440   441  1626]
 [ 4298   709 20247  4746]
 [ 1449  1409  3844 23298]]
trainaccu : 0.7420250177383423
confusionmatrix :
[[1219  174  416   91]
 [ 157 1593   41  109]
 [ 361   79 1198  262]
 [ 153  126  259 1362]]
testaccu : 0.7068421244621277
trainloss : 0.6376678488656148
confusionmatrix :
[[19974  2086  5673  2267]
 [ 1365 26663   423  1549]
 [ 3707   609 21482  4202]
 [ 1163  1264  3355 24218]]
trainaccu : 0.769474983215332
confusionmatrix :
[[ 923  156  565  256]
 [  64 1565   56  215]
 [ 180   52 1221  447]
 [  33   85  181 1601]]
testaccu : 0.6986842155456543
trainloss : 

KeyboardInterrupt: 

In [25]:
def prediction(model, text, text_pipeline=text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text), dtype=torch.int64, device=device)
        output = model(text, torch.tensor([0], device=device))
        return (output.argmax(1) + 1).item()
    

ag_news_label = {
    1: 'World',
    2: 'Sports',
    3: 'Business',
    4: 'Sci/Tec'
}

ag_news_label[prediction(model, "australia open")]

'World'