In [35]:
import torch
from torchtext import data
from torchtext import datasets
import torch.nn as nn
import os

In [36]:
# Field definitions for the IMDB corpus.
# include_lengths=True makes every text batch a (token_ids, lengths) pair,
# which the training code later unpacks as `text, text_lengths = batch.text`.
# (Alternative tried earlier: pad_first=True, fix_length=500, include_lengths=True.)
TEXT = data.Field(include_lengths=True)
# Labels are stored as floats, the dtype the loss used later compares against logits.
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB train and test splits using the fields above.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [37]:
# Show the attribute dict (text tokens and label) of the first training example.
first_example = train_data.examples[0]
print(vars(first_example))

{'text': ['Zentropa', 'has', 'much', 'in', 'common', 'with', 'The', 'Third', 'Man,', 'another', 'noir-like', 'film', 'set', 'among', 'the', 'rubble', 'of', 'postwar', 'Europe.', 'Like', 'TTM,', 'there', 'is', 'much', 'inventive', 'camera', 'work.', 'There', 'is', 'an', 'innocent', 'American', 'who', 'gets', 'emotionally', 'involved', 'with', 'a', 'woman', 'he', "doesn't", 'really', 'understand,', 'and', 'whose', 'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast', 'with', 'the', 'natives.<br', '/><br', '/>But', "I'd", 'have', 'to', 'say', 'that', 'The', 'Third', 'Man', 'has', 'a', 'more', 'well-crafted', 'storyline.', 'Zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this', 'respect.', 'Perhaps', 'this', 'is', 'intentional:', 'it', 'is', 'presented', 'as', 'a', 'dream/nightmare,', 'and', 'making', 'it', 'too', 'coherent', 'would', 'spoil', 'the', 'effect.', '<br', '/><br', '/>This', 'movie', 'is', 'unrelentingly', 'grim--"noir"', 'in', 'more', 'than', 'one', 'sense;', 

In [38]:
import string

# Translation table that deletes every character in string.punctuation.
PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_tokens(tokens):
    """Normalise a list of raw tokens.

    Each token is lower-cased, has every "<br" fragment removed, and is then
    stripped of all ``string.punctuation`` characters. Tokens that end up
    empty are dropped.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list[str]: the cleaned, non-empty tokens in their original order.
    """
    cleaned = (
        tok.lower().replace("<br", "").translate(PUNCT_TABLE)
        for tok in tokens
    )
    return [tok for tok in cleaned if tok]


# Both splits receive exactly the same cleaning; each example's token list
# is replaced in place.
for dataset in (train_data, test_data):
    for example in dataset.examples:
        fields = vars(example)
        fields['text'] = clean_tokens(fields['text'])

In [39]:
import random
# random.seed() returns None, so passing its result as random_state hands
# split() a None and relies on the seeding side effect. Seed first, then pass
# the resulting generator state explicitly so the 80/20 split is reproducible
# by construction.
random.seed(0)
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [40]:
# Build the text vocabulary from the training split only, capped at
# max_vocab_size entries, and attach the pretrained "glove.6B.300d" vectors.
# unk_init is the initialiser torchtext applies to words that have no
# pretrained vector (per the torchtext Vocab API).
max_vocab_size = 50000
TEXT.build_vocab(
    train_data,
    max_size=max_vocab_size,
    vectors="glove.6B.300d",
    unk_init=torch.Tensor.normal_,
)

# The label vocabulary is likewise built from the training split.
LABEL.build_vocab(train_data)

In [41]:
# Mini-batch size and compute device (CUDA when available, otherwise CPU).
batch_size = 100
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One bucketed iterator per split. sort_within_batch=True keeps each batch
# ordered by length, which the model's pack_padded_sequence call relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=batch_size,
    sort_within_batch=True,
)

In [42]:
# x = [sentence length, batch size]
# emb = [sentence length, batch size, embedding dim]

# LSTM
# out = [sentence length, batch size, hidden dim * num directions]
# h = [num layers * num directions, batch size, hidden dim]
# c = [num layers * num directions, batch size, hidden dim]

# GRU
# out = [sentence length, batch size, hidden dim * num directions]
# h = [num layers * num directions, batch size, hidden dim]


import torch.nn as nn 

class Sentiment(nn.Module):
    """LSTM sentiment classifier on top of pretrained word embeddings.

    The input is embedded, packed by true sequence length, run through a
    (optionally bidirectional, optionally stacked) LSTM, and the final hidden
    state of the top layer is projected to ``output_dim`` logits.

    Args:
        vocab_size: vocabulary size. Kept for interface compatibility; the
            embedding table size is taken from the pretrained vectors.
        embed_dim: embedding width; must equal the width of the pretrained
            vectors because it is used as the LSTM input size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability for the embeddings, between LSTM layers,
            and on the final hidden state.
        pad_idx: index of the padding token in the embedding table.
        pretrained_vectors: optional float tensor ``[vocab, embed_dim]`` used
            to initialise the embedding. Defaults to the notebook's global
            ``TEXT.vocab.vectors`` so existing call sites keep working.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx, pretrained_vectors=None):
        super().__init__()

        if pretrained_vectors is None:
            # Backward-compatible default: vectors attached to the global vocab.
            pretrained_vectors = TEXT.vocab.vectors

        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        # Fine-tune the pretrained vectors; the padding row receives no gradient.
        self.embed = nn.Embedding.from_pretrained(
            pretrained_vectors, freeze=False, padding_idx=pad_idx)
        # Inter-layer dropout only exists when there is more than one layer.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0)
        # The classifier sees one hidden vector per direction of the top layer,
        # so its width depends on the number of directions, not of layers.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, x_length):
        """Compute logits for a padded batch.

        Args:
            x: token ids, ``[sent len, batch size]``.
            x_length: true length of each sequence, sorted in decreasing order
                (the default requirement of ``pack_padded_sequence``).

        Returns:
            Tensor ``[batch size, output_dim]`` of raw logits.
        """
        embed = self.drop(self.embed(x))

        # pack_padded_sequence expects the lengths on the CPU.
        lengths = x_length.cpu() if torch.is_tensor(x_length) else x_length
        packed = nn.utils.rnn.pack_padded_sequence(embed, lengths)

        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.lstm(packed)

        if self.bidirectional:
            # Top layer: forward state is hidden[-2], backward state is hidden[-1].
            final = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final = hidden[-1, :, :]

        # final = [batch size, hid dim * num directions]
        return self.fc(self.drop(final))

In [43]:
def binary_accuracy(preds, y):
    """Fraction of predictions whose thresholded sigmoid matches the target.

    Args:
        preds: raw logits.
        y: targets with the same shape as ``preds`` (0.0 or 1.0 values).

    Returns:
        Scalar tensor holding the fraction of matching entries.
    """
    # Squash logits to probabilities, then snap each to the nearest integer.
    hard_labels = torch.sigmoid(preds).round()
    # Float mask of hits so the sum can be divided.
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [44]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(text, text_lengths)``.
        iterator: sized iterable of batches exposing ``.text`` (a
            ``(tokens, lengths)`` pair) and ``.label``.
        optimizer: optimizer updating the model's parameters.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    model.train()  # training mode

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        tokens, lengths = batch.text

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: drop the trailing singleton dim so logits match labels.
        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    # Averages are taken over the number of batches.
    num_batches = len(iterator)
    return loss_total / num_batches, acc_total / num_batches

In [45]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any parameters.

    Args:
        model: module called as ``model(text, text_lengths)``.
        iterator: sized iterable of batches exposing ``.text`` (a
            ``(tokens, lengths)`` pair) and ``.label``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        Tuple ``(mean loss per batch, mean accuracy per batch)``.
    """
    model.eval()  # evaluation mode

    running_loss = 0
    running_acc = 0

    # Gradients are not tracked during evaluation.
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)

            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

In [46]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, each truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = duration - 60 * whole_minutes
    return whole_minutes, int(leftover_seconds)

In [47]:
# Model hyper-parameters.
input_dim = len(TEXT.vocab)                  # number of entries in the text vocabulary
# Take the embedding width from the loaded pretrained vectors (300 for
# "glove.6B.300d") so it cannot drift out of sync with the LSTM input size.
embed_dim = TEXT.vocab.vectors.shape[1]
hidden_dim = 256                             # LSTM hidden size per direction
output_dim = 1                               # a single logit per review
n_layers = 2
bidirectional = True
bidrectional = bidirectional                 # misspelled legacy name, still read when the model is built
dropout = 0.2
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]    # vocabulary index of the padding token

In [48]:
# Build the classifier from the hyper-parameters above and move it to the
# selected device.
model = Sentiment(
    vocab_size=input_dim,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidrectional,
    dropout=dropout,
    pad_idx=pad_idx,
)
model = model.to(device)

# Adam with its default settings over all model parameters.
optimizer = torch.optim.Adam(model.parameters())

# Binary cross-entropy that takes raw logits (sigmoid applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [49]:
# Number of full passes over the training data.
N_EPOCHS = 10
# Lowest validation loss seen so far; starts at +inf so any finite loss from
# the first epoch counts as an improvement.
best_val_loss = float("inf")

In [50]:
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep the weights with the lowest validation loss. best_val_loss starts
    # at +inf, so a plain comparison suffices; a truthiness test on it would
    # wrongly treat a best loss of exactly 0.0 as "nothing saved yet".
    if val_loss < best_val_loss:
        # exist_ok avoids the separate existence check before creating the directory.
        os.makedirs("model", exist_ok=True)
        torch.save(model.state_dict(), './model/imdb3.pt')
        best_val_loss = val_loss

    print('Epoch: {:02} | Epoch Time: {}m {}s | train_loss: {:.3f} | train_acc: {:.2%} val_loss: {:.3f} |  val_acc: {:.2%}'
         .format(epoch + 1, epoch_mins, epoch_secs, train_loss, train_acc, val_loss, val_acc))

Epoch: 01 | Epoch Time: 0m 29s | train_loss: 0.543 | train_acc: 72.82% val_loss: 0.695 |  val_acc: 52.88%
Epoch: 02 | Epoch Time: 0m 29s | train_loss: 0.410 | train_acc: 81.25% val_loss: 0.316 |  val_acc: 87.34%
Epoch: 03 | Epoch Time: 0m 30s | train_loss: 0.417 | train_acc: 81.62% val_loss: 0.439 |  val_acc: 81.62%
Epoch: 04 | Epoch Time: 0m 30s | train_loss: 0.248 | train_acc: 90.54% val_loss: 0.271 |  val_acc: 89.40%
Epoch: 05 | Epoch Time: 0m 30s | train_loss: 0.163 | train_acc: 94.10% val_loss: 0.306 |  val_acc: 88.48%
Epoch: 06 | Epoch Time: 0m 30s | train_loss: 0.084 | train_acc: 97.21% val_loss: 0.337 |  val_acc: 89.10%
Epoch: 07 | Epoch Time: 0m 30s | train_loss: 0.045 | train_acc: 98.63% val_loss: 0.378 |  val_acc: 88.56%
Epoch: 08 | Epoch Time: 0m 30s | train_loss: 0.029 | train_acc: 99.28% val_loss: 0.414 |  val_acc: 87.92%
Epoch: 09 | Epoch Time: 0m 30s | train_loss: 0.019 | train_acc: 99.57% val_loss: 0.466 |  val_acc: 88.14%
Epoch: 10 | Epoch Time: 0m 30s | train_loss: 0

In [51]:
# Take one mini-batch for inspection. batch.text is a (token_ids, lengths)
# pair, as the printed output shows.
batch = next(iter(train_iterator)) # first mini-batch yielded by a new iterator (not the second)
print(batch.text)

(tensor([[   21,     9,    11,  ...,    11,   143,     4],
        [    4,     7,    25,  ...,    61,    21,   313],
        [   83,  1043,    21,  ...,   496,   242,     0],
        ...,
        [  237, 29026,  2968,  ...,   662,   634, 26998],
        [   71,     9,    35,  ...,     1,     1,     1],
        [  287,   121,    69,  ...,     1,     1,     1]], device='cuda:0'), tensor([148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 147, 147, 147, 147, 147, 147, 147,
        147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147,
        147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147,
        147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 147, 146, 146, 146,
        146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
        146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146,
        146, 146], device='cuda:0'))


In [52]:
# Print the batch summary: its size and the type/shape of the .text and .label fields.
print(batch)


[torchtext.data.batch.Batch of size 100]
	[.text]:('[torch.cuda.LongTensor of size 148x100 (GPU 0)]', '[torch.cuda.LongTensor of size 100 (GPU 0)]')
	[.label]:[torch.cuda.FloatTensor of size 100 (GPU 0)]
