In [1]:
import torch
from torchtext import data
from torchtext import datasets
import torch.nn as nn
import os

In [2]:
# Review text field: fixed sequence length of 500 with padding placed first
# (see the torchtext Field documentation for the exact pad/truncate rules).
TEXT = data.Field(fix_length=500, pad_first=True)
# Sentiment label field, stored as float.
LABEL = data.LabelField(dtype=torch.float)

# Load the IMDB train and test splits using the two fields above.
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [3]:
# Show the attribute dictionary of the first training example.
print(train_data.examples[0].__dict__)

{'text': ['Zentropa', 'has', 'much', 'in', 'common', 'with', 'The', 'Third', 'Man,', 'another', 'noir-like', 'film', 'set', 'among', 'the', 'rubble', 'of', 'postwar', 'Europe.', 'Like', 'TTM,', 'there', 'is', 'much', 'inventive', 'camera', 'work.', 'There', 'is', 'an', 'innocent', 'American', 'who', 'gets', 'emotionally', 'involved', 'with', 'a', 'woman', 'he', "doesn't", 'really', 'understand,', 'and', 'whose', 'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast', 'with', 'the', 'natives.<br', '/><br', '/>But', "I'd", 'have', 'to', 'say', 'that', 'The', 'Third', 'Man', 'has', 'a', 'more', 'well-crafted', 'storyline.', 'Zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this', 'respect.', 'Perhaps', 'this', 'is', 'intentional:', 'it', 'is', 'presented', 'as', 'a', 'dream/nightmare,', 'and', 'making', 'it', 'too', 'coherent', 'would', 'spoil', 'the', 'effect.', '<br', '/><br', '/>This', 'movie', 'is', 'unrelentingly', 'grim--"noir"', 'in', 'more', 'than', 'one', 'sense;', 

In [4]:
import string

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_tokens(tokens):
    """Normalise the token list of one review.

    Each token is lower-cased, has the "<br" fragment of HTML line breaks
    removed, and is stripped of ASCII punctuation. Tokens that end up empty
    are dropped.

    Args:
        tokens: iterable of str tokens.

    Returns:
        A new list of cleaned, non-empty tokens in their original order.
    """
    cleaned = (
        tok.lower().replace("<br", "").translate(_PUNCT_TABLE)
        for tok in tokens
    )
    return [tok for tok in cleaned if tok]


# Apply the identical normalisation to both splits (previously two
# copy-pasted loops) so train and test tokens are processed the same way.
for dataset in (train_data, test_data):
    for example in dataset.examples:
        example.text = clean_tokens(example.text)

In [5]:
import random

# random.seed() returns None, so passing its result as random_state only
# worked as a side effect of seeding the global RNG. Seed first, then pass
# the resulting generator state explicitly.
random.seed(0)

# 80% of the examples stay in train_data, the rest become valid_data.
# NOTE: this rebinds train_data, so re-running this cell on its own would
# split the already-reduced training set again; re-run from the data-loading
# cell instead.
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [6]:
# Build the token vocabulary from the training split only, with max_size=50000
# limiting how many tokens are kept.
TEXT.build_vocab(train_data, max_size = 50000)
# Build the label vocabulary from the same training split.
LABEL.build_vocab(train_data)

In [7]:
batch_size = 100

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One iterator per split, all sharing the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=batch_size,
)

In [8]:
# x = [sentence length, batch size]
# emb = [sentence length, batch size, embedding dim]

# LSTM
# out = [sentence length, batch size, hidden dim]
# h = [number of layers, batch size, hidden dim]
# c = [number of layers, batch size, hidden dim]

# GRU
# out = [sentence length, batch size, hidden dim]
# h = [number of layers, batch size, hidden dim]

import torch.nn as nn 

class Sentiment(nn.Module):
    """Recurrent sentiment classifier: embedding -> LSTM or GRU -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        hidden_dim: hidden size of the recurrent layers.
        output_dim: number of values produced per input sequence.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability used on the embeddings, between
            recurrent layers and on the final recurrent output.
        method: which recurrent unit ``forward`` uses, 'LSTM' or 'GRU'.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """

    _SUPPORTED_METHODS = ('LSTM', 'GRU')

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout, method):
        super().__init__()

        # Fail early: previously an unknown method only surfaced in forward()
        # as an UnboundLocalError on `out`.
        if method not in self._SUPPORTED_METHODS:
            raise ValueError(
                "method must be one of {}, got {!r}".format(self._SUPPORTED_METHODS, method)
            )
        self.method = method

        # nn.LSTM / nn.GRU apply dropout only between stacked layers and warn
        # when a non-zero value is given for a single-layer network.
        rnn_dropout = dropout if n_layers > 1 else 0.0

        self.embed = nn.Embedding(vocab_size, embed_dim)
        # Both recurrent modules are kept (even though only one is used) so the
        # state_dict keys stay the same as before.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=rnn_dropout)
        self.gru = nn.GRU(embed_dim, hidden_dim, n_layers, dropout=rnn_dropout)
        # forward() feeds the classifier out[-1], whose last dimension is
        # hidden_dim. The earlier hidden_dim * n_layers layer was immediately
        # overwritten and has been removed.
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return one output row per sequence in the batch.

        Args:
            x: token-index tensor laid out as [sentence length, batch size].

        Returns:
            Tensor of shape [batch size, output_dim], computed from the
            recurrent output at the last position of the first axis.
        """
        emb = self.drop(self.embed(x))

        if self.method == 'LSTM':
            out, _ = self.lstm(emb)
        elif self.method == 'GRU':
            out, _ = self.gru(emb)
        else:
            # Only reachable if self.method was reassigned after construction.
            raise ValueError("unsupported method: {!r}".format(self.method))

        out = self.drop(out[-1])
        return self.fc(out)

In [9]:
def binary_accuracy(preds, y):
    """Fraction of predictions whose rounded sigmoid equals the target.

    Args:
        preds: tensor of raw model outputs (sigmoid is applied here).
        y: tensor of targets compared element-wise with the rounded values.

    Returns:
        Scalar tensor: number of matches divided by ``len`` of the match tensor.
    """
    # Squash to (0, 1), then snap to the nearest integer to get hard labels.
    hard_labels = torch.sigmoid(preds).round()
    # Float cast so the matches can be summed and divided.
    hits = hard_labels.eq(y).float()
    return hits.sum() / len(hits)

In [10]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called on ``batch.text``; its output is squeezed on dim 1.
        iterator: iterable of batches exposing ``.text`` and ``.label``; must
            support ``len()``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches (not over the number of examples).
    """
    total_loss = 0
    total_acc = 0

    model.train()  # switch the model to training mode

    for batch in iterator:
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass.
        outputs = model(batch.text).squeeze(1)
        batch_loss = criterion(outputs, batch.label)
        batch_acc = binary_accuracy(outputs, batch.label)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches  # per-batch averages

In [11]:
def evaluate(model, iterator, criterion):
    """Measure loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: module called on ``batch.text``; its output is squeezed on dim 1.
        iterator: iterable of batches exposing ``.text`` and ``.label``; must
            support ``len()``.
        criterion: loss called as ``criterion(predictions, batch.label)``.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches.
    """
    total_loss = 0
    total_acc = 0

    model.eval()  # switch the model to evaluation mode

    # Gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for batch in iterator:
            outputs = model(batch.text).squeeze(1)
            batch_loss = criterion(outputs, batch.label)
            batch_acc = binary_accuracy(outputs, batch.label)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [12]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # Seconds left over once the whole minutes are removed.
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [13]:
# Hyperparameters passed to the Sentiment constructor.
input_dim = len(TEXT.vocab)  # vocabulary size, used as the embedding table size
embed_dim = 400  # size of each embedding vector
hidden_dim = 256  # hidden size of the recurrent layers
output_dim = 1  # one output value per review
n_layers = 2  # number of stacked recurrent layers
dropout = 0.5  # dropout probability

In [14]:
# Build the classifier with the GRU variant.
# To train the LSTM variant instead, pass method='LSTM'.
model = Sentiment(
    vocab_size=input_dim,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    dropout=dropout,
    method='GRU',
).to(device)

# Adam with its default settings over all model parameters.
optimizer = torch.optim.Adam(model.parameters())

# Binary cross-entropy on raw outputs, moved to the same device as the model.
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [15]:
# Number of passes over the training iterator.
N_EPOCHS = 20
# Lowest validation loss seen so far; starts at +inf so the first epoch always improves on it.
best_val_loss = float('inf')

In [16]:
# Make sure the checkpoint directory exists once, before training starts.
os.makedirs("model", exist_ok=True)

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    val_loss, val_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the weights only when the validation loss improves.
    # best_val_loss starts at +inf, so the first epoch always checkpoints.
    # The former `not best_val_loss` test was dropped: it was true whenever the
    # best loss was exactly 0.0, letting later, worse epochs overwrite the
    # best checkpoint.
    if val_loss < best_val_loss:
        torch.save(model.state_dict(), './model/imdb2.pt')
        best_val_loss = val_loss

    print('Epoch: {:02} | Epoch Time: {}m {}s | train_loss: {:.3f} | train_acc: {:.2%} val_loss: {:.3f} |  val_acc: {:.2%}'
         .format(epoch + 1, epoch_mins, epoch_secs, train_loss, train_acc, val_loss, val_acc))

Epoch: 01 | Epoch Time: 0m 49s | train_loss: 0.632 | train_acc: 63.68% val_loss: 0.530 |  val_acc: 74.06%
Epoch: 02 | Epoch Time: 0m 49s | train_loss: 0.532 | train_acc: 73.57% val_loss: 0.458 |  val_acc: 80.10%
Epoch: 03 | Epoch Time: 0m 49s | train_loss: 0.368 | train_acc: 83.83% val_loss: 0.317 |  val_acc: 87.66%
Epoch: 04 | Epoch Time: 0m 49s | train_loss: 0.274 | train_acc: 88.81% val_loss: 0.357 |  val_acc: 87.28%
Epoch: 05 | Epoch Time: 0m 50s | train_loss: 0.224 | train_acc: 90.78% val_loss: 0.295 |  val_acc: 89.82%
Epoch: 06 | Epoch Time: 0m 49s | train_loss: 0.186 | train_acc: 92.60% val_loss: 0.298 |  val_acc: 89.10%
Epoch: 07 | Epoch Time: 0m 50s | train_loss: 0.155 | train_acc: 94.06% val_loss: 0.355 |  val_acc: 89.00%
Epoch: 08 | Epoch Time: 0m 49s | train_loss: 0.134 | train_acc: 94.82% val_loss: 0.326 |  val_acc: 90.20%
Epoch: 09 | Epoch Time: 0m 49s | train_loss: 0.116 | train_acc: 95.66% val_loss: 0.372 |  val_acc: 88.52%
Epoch: 10 | Epoch Time: 0m 50s | train_loss: 0