In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.legacy import data, datasets
import random

#from torchtext import data, datasets

In [2]:
# Show the installed torchtext version; this notebook imports from `torchtext.legacy`.
import torchtext
torchtext.__version__

'0.10.0'

In [3]:
# Seed both Python's `random` module and PyTorch with the same fixed value.
SEED = 5
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2128318cad0>

In [4]:
# Hyperparameters
BATCH_SIZE = 64
lr = 0.001
EPOCHS = 10

# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

print(" GPU ? CPU ?  - {}".format(DEVICE))

 GPU ? CPU ?  - cuda:0


In [5]:
# Report the GPU model. `get_device_name` raises when no CUDA device exists,
# so guard it to keep the notebook runnable on the CPU fallback path.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; running on CPU")

NVIDIA GeForce RTX 3080


In [6]:
# Load the data: declare how each column is processed.
# TEXT is a lower-cased, batch-first sequence field; LABEL is a non-sequential field.
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.Field(sequential=False, batch_first=True)


In [8]:
# Get the IMDB train and test splits, processed through the TEXT and LABEL fields.
trainset, testset = datasets.IMDB.splits(TEXT, LABEL)


In [None]:
# Show which fields the training split carries.
print('trainset의 구성 요소 출력 : ', trainset.fields)


In [None]:
# Show which fields the test split carries.
print('testset의 구성 요소 출력 : ', testset.fields)

In [None]:
# Inspect the attributes of the first training example.
print(vars(trainset[0]))

In [None]:
# Build the vocabularies from the training data; min_freq=10 is passed for TEXT.
# NOTE(review): this runs on the full `trainset`, before the later cell carves
# out a validation set, so validation reviews also contribute to the
# vocabulary — confirm this is intended.
TEXT.build_vocab(trainset, min_freq=10)
LABEL.build_vocab(trainset)


In [None]:
# Size of the TEXT vocabulary and the number of target classes; both are used
# later to size the model.
vocab_size = len(TEXT.vocab)
n_classes = 2

In [None]:
# Report the vocabulary size and the number of classes.
print('단어 집합의 크기 : {}'.format(vocab_size))
print('클래스의 개수 : {}'.format(n_classes))

In [None]:
# Preview the token -> index mapping. The full `stoi` table has one entry per
# vocabulary word, so only the first 20 entries are shown to keep output readable.
print(dict(list(TEXT.vocab.stoi.items())[:20]))

In [None]:
# Split the training data into 80% training and 20% validation.
# NOTE(review): `trainset` is rebound here, so re-running this cell splits the
# already-reduced training set again.
trainset, valset = trainset.split(split_ratio=0.8)


In [None]:
# Display the training split object.
trainset

In [None]:
# Report the sizes of the training and validation splits.
print('train 데이터의 크기 : {}' .format(len(trainset)))
print('val 데이터의 크기 : {}' .format(len(valset)))


In [None]:
# Build batched iterators for the train / validation / test splits, all using
# BATCH_SIZE. `repeat=False` is passed so each iterator is expected to end
# after one pass over its dataset.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (trainset, valset, testset), batch_size=BATCH_SIZE,
        shuffle=True, repeat=False)

In [None]:
# Pull one mini-batch from the training iterator and show the shape of its text tensor.
batch = next(iter(train_iter)) # first mini-batch
print(batch.text.shape)

In [None]:
# Display the `text` tensor of the sampled mini-batch.
batch.text

In [None]:
# Print the batch object itself.
print(batch)

In [None]:
# One-line summary: train / validation / test sizes, vocabulary size and class count.
print("[학습셋]: %d [검증셋]: %d [테스트셋]: %d [단어수]: %d [클래스] %d"
      % (len(trainset),len(valset), len(testset), vocab_size, n_classes))

In [None]:
class BasicLSTM(nn.Module):
    """Text classifier: embedding -> stacked LSTM -> dropout -> linear layer.

    Args:
        n_layers: Number of stacked LSTM layers.
        hidden_dim: Size of the LSTM hidden state.
        n_vocab: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        n_classes: Number of output logits.
        dropout_p: Dropout probability applied to the final hidden state.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(BasicLSTM, self).__init__()
        print("Building Basic LSTM model...")
        self.n_layers = n_layers
        self.embed = nn.Embedding(n_vocab, embed_dim)

        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers, batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, n_classes)."""
        x = self.embed(x)

        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded. For this unidirectional LSTM, `hn` is stacked
        # along dim 0 with one entry per layer.
        _, (hn, _) = self.lstm(x)
        # Take the top layer's final hidden state. Indexing with -1 rather than
        # a hard-coded 1 keeps this correct for any n_layers (1 would raise
        # IndexError for a single layer and pick a lower layer for 3+ layers).
        hn = hn[-1]
        hn = self.dropout(hn)
        logit = self.out(hn)
        return logit



In [None]:
def train(model, optimizer, train_iter):
    """Run one optimisation pass over ``train_iter``.

    Args:
        model: Module mapping a batch of token ids to class logits.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_iter: Iterable of batches exposing ``.text`` and ``.label``.
    """
    model.train()
    for batch in train_iter:
        x = batch.text.to(DEVICE)
        # Shift labels to 0 and 1 (1 -> 0, 2 -> 1). Done out of place so the
        # batch's own label tensor is never modified, even when `.to(DEVICE)`
        # hands back the same tensor because it already lives on DEVICE.
        y = batch.label.to(DEVICE) - 1

        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

In [None]:
def evaluate(model, val_iter):
    """Compute the mean loss and accuracy of ``model`` over ``val_iter``.

    Args:
        model: Module mapping a batch of token ids to class logits.
        val_iter: Iterable of batches exposing ``.text`` and ``.label``, with a
            ``.dataset`` attribute whose length is the number of examples.

    Returns:
        ``(avg_loss, avg_accuracy)``: the summed cross-entropy divided by the
        dataset size (a Python float), and the percentage of correct
        predictions (a 0-dim tensor, as accumulated from tensor sums).
    """
    model.eval()
    corrects, total_loss = 0, 0
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for batch in val_iter:
            x = batch.text.to(DEVICE)
            # Shift labels to 0 and 1 without mutating the batch's label tensor.
            y = batch.label.to(DEVICE) - 1

            logit = model(x)
            # Sum (not mean) per batch so the final division by the dataset
            # size is exact even when the last batch is smaller.
            loss = F.cross_entropy(logit, y, reduction='sum')
            total_loss += loss.item()
            corrects += (logit.argmax(dim=1) == y).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [None]:
# Two stacked LSTM layers, 256 hidden units, 400-dimensional embeddings, dropout 0.5.
model = BasicLSTM(
    n_layers=2,
    hidden_dim=256,
    n_vocab=vocab_size,
    embed_dim=400,
    n_classes=n_classes,
    dropout_p=0.5,
)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# Train for EPOCHS epochs, checkpointing whenever the validation loss improves.
best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("[epoch: %d] val loss:%.2f | val accuracy :%.2f" % (e, val_loss, val_accuracy))

    # Save the model with the lowest validation loss seen so far.
    # Compare against None explicitly: a truthiness check would also fire for a
    # best loss of exactly 0.0 and let a worse epoch overwrite the checkpoint.
    if best_val_loss is None or val_loss < best_val_loss:
        torch.save(model.state_dict(), './txtclassification.pt')
        best_val_loss = val_loss

In [None]:
# Restore the best checkpoint and report test-set metrics.
# map_location remaps the saved tensors onto DEVICE, so a checkpoint written on
# a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('./txtclassification.pt', map_location=DEVICE))
test_loss, test_acc = evaluate(model, test_iter)
print('테스트 오차: %5.2f | 테스트 정확도: %5.2f' % (test_loss, test_acc))