<a href="https://colab.research.google.com/github/Minsoo1036/DeepLearning-and-PyTorch/blob/main/Colab_rnn_lstm_gru_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
! pip list | grep "torch"

torch                         1.5.1
torchaudio                    0.11.0+cu113
torchsummary                  1.5.1
torchtext                     0.3.1
torchvision                   0.6.1


In [None]:
! pip install --upgrade torch==1.5.1

In [None]:
! pip install --upgrade torchsummary==1.5.1

In [None]:
! pip install --upgrade torchtext==0.3.1

In [None]:
! pip install --upgrade torchvision==0.6.1

In [None]:
! pip list | grep "torch"

torch                         1.5.1
torchaudio                    0.11.0+cu113
torchsummary                  1.5.1
torchtext                     0.3.1
torchvision                   0.6.1


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
!python -c 'import torch; print(torch.__version__) '

1.5.1


In [None]:
!ls -d /usr/local/cuda-*
!which nvcc

/usr/local/cuda-10.0  /usr/local/cuda-11    /usr/local/cuda-11.1
/usr/local/cuda-10.1  /usr/local/cuda-11.0
/usr/local/cuda/bin/nvcc


In [24]:
import os
p = os.getenv('PATH')
ld = os.getenv('LD_LIBRARY_PATH')
os.environ['PATH'] = f"/usr/local/cuda-10.1/bin:{p}"
os.environ['LD_LIBRARY_PATH'] = f"/usr/local/cuda-10.1/lib64:{ld}"
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [25]:
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())
assert torch.__version__.startswith("1.5.1")

1.5.1 True


In [None]:
! pip list | grep "torch"

torch                         1.5.1
torchaudio                    0.11.0+cu113
torchsummary                  1.5.1
torchtext                     0.3.1
torchvision                   0.6.1


In [26]:
import re
import sys
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets

In [27]:

# Data Setting
TEXT = data.Field(batch_first = True,
                  fix_length = 500,
                  tokenize=str.split,
                  pad_first=True,
                  pad_token='[PAD]',
                  unk_token='[UNK]')

LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(text_field = TEXT, 
                                             label_field = LABEL)

In [28]:
# Data Length
print(f'Train Data Length : {len(train_data.examples)}')
print(f'Test Data Length : {len(test_data.examples)}')

Train Data Length : 25000
Test Data Length : 25000


In [29]:
# Data Fields
train_data.fields

{'label': <torchtext.data.field.LabelField at 0x7f5795441190>,
 'text': <torchtext.data.field.Field at 0x7f57a5a28c50>}

In [30]:
# Data Sample
print('---- Data Sample ----')
print('Input : ')
print(' '.join(vars(train_data.examples[1])['text']),'\n')
print('Label : ')
print(vars(train_data.examples[1])['label'])

---- Data Sample ----
Input : 
I don't think most of us would tend to apply the term "must-see" to action films, but I was very impressed at how good this film was and it deservedly gets the "must-see" stamp from me.<br /><br />Mandy played by Shannon Lee (daughter of the late and great Bruce Lee and sister of the late Brandon Lee) is recruited by Martin, a professional thief to help pull off a diamond heist at a museum for a criminal syndicate, and get rewarded handsomely for it. Little do they know that another pair of thieves (Lucy and Tommy, a pair of lovebirds), who were spurned earlier by Mandy and Martin to get in on the deal, are also planning to steal the diamond.<br /><br />How each pair of thieves plans out the heist is a thrill to watch. Things go awry, as Martin and Mandy unknowingly find themselves a step behind Lucy and Tommy.<br /><br />You'll find yourself rooting for these thieves as they find that they need each other to stay alive from the crime syndicate, who are n

In [31]:
def PreProcessingText(input_sentence):
    input_sentence = input_sentence.lower() # 소문자화
    input_sentence = re.sub('<[^>]*>', repl= ' ', string = input_sentence) # "<br />" 처리
    input_sentence = re.sub('[!"#$%&\()*+,-./:;<=>?@[\\]^_`{|}~]', repl= ' ', string = input_sentence) # 특수문자 처리 ("'" 제외)
    input_sentence = re.sub('\s+', repl= ' ', string = input_sentence) # 연속된 띄어쓰기 처리
    if input_sentence:
        return input_sentence

In [32]:
for example in train_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()
    
for example in test_data.examples:
    vars(example)['text'] = PreProcessingText(' '.join(vars(example)['text'])).split()

In [33]:
model_config = {'emb_type' : 'glove', 'emb_dim' : 300}

In [34]:
# making vocab
TEXT.build_vocab(train_data,
                 min_freq = 2, 
                 max_size = None,
                 vectors = f"glove.6B.{model_config['emb_dim']}d")

## vector list
# charngram.100d
# fasttext.en.300d
# fasttext.simple.300d
# glove.42B.300d
# glove.840B.300d
# glove.twitter.27B.25d
# glove.twitter.27B.50d
# glove.twitter.27B.100d
# glove.twitter.27B.200d
# glove.6B.50d
# glove.6B.100d
# glove.6B.200d
# glove.6B.300d

LABEL.build_vocab(train_data)

model_config['vocab_size'] = len(TEXT.vocab)

In [35]:
# Vocabulary Info
print(f'Vocab Size : {len(TEXT.vocab)}')

print('Vocab Examples : ')
for idx, (k, v) in enumerate(TEXT.vocab.stoi.items()):
    if idx >= 10:
        break    
    print('\t', k, v)

print('---------------------------------')

# Label Info
print(f'Label Size : {len(LABEL.vocab)}')

print('Lable Examples : ')
for idx, (k, v) in enumerate(LABEL.vocab.stoi.items()):
    print('\t', k, v)

Vocab Size : 51956
Vocab Examples : 
	 [UNK] 0
	 [PAD] 1
	 the 2
	 and 3
	 a 4
	 of 5
	 to 6
	 is 7
	 in 8
	 it 9
---------------------------------
Label Size : 2
Lable Examples : 
	 neg 0
	 pos 1


In [36]:
# Check embedding vectors
TEXT.vocab.vectors.shape

torch.Size([51956, 300])

In [37]:
# Spliting Valid set
train_data, valid_data = train_data.split(random_state = random.seed(0),
                                          split_ratio=0.8)

In [38]:
model_config['batch_size'] = 30

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=model_config['batch_size'],
    device=device)

In [39]:
# Check batch data
sample_for_check = next(iter(train_iterator))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 30]
	[.text]:[torch.cuda.LongTensor of size 30x500 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 30 (GPU 0)]
tensor([[    1,     1,     1,  ...,   275,     5,   260],
        [    1,     1,     1,  ...,    58,     0,     0],
        [    1,     1,     1,  ...,   203,   220,   554],
        ...,
        [    1,     1,     1,  ...,    98,    82,    17],
        [    1,     1,     1,  ...,  9169, 12922,  4930],
        [    1,     1,     1,  ..., 26492,  1977,  3791]], device='cuda:0')
tensor([0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
        0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1.], device='cuda:0')


In [40]:
# Check reverting data
print(' '.join([TEXT.vocab.itos[int(x)] for x in sample_for_check.text[0,:] if x not in [0,1]]))
print(LABEL.vocab.itos[int(sample_for_check.label[0])]) 

when you get your hands on a british film you expect some sort of quality and when it comes to acting camera work lighting etc this film does the business it's done by highly skilled craftsmen that alone can bring you an enjoyable one and a half hours but when you look under the layers of professionalism you don't really find anything apart from making you feel good and advocate a drug liberal view there's really nothing there the script is mediocre the plot is predictable and the ending must be one of the worst east of hollywood in all it's english it's just a shameful and cynical attempt to make another full monty why they made this film i haven't got a clue apart from making money of course
neg


In [41]:
class SentenceClassification(nn.Module):
    def __init__(self, **model_config):
        super(SentenceClassification, self).__init__()

        if model_config['emb_type'] == 'glove' or 'fasttext':
            self.emb = nn.Embedding(model_config['vocab_size'],
                                    model_config['emb_dim'],
                                    _weight = TEXT.vocab.vectors)
        else:
            self.emb = nn.Embedding(model_config['vocab_size'],
                                    model_config['emb_dim'])
        
        self.bidirectional = model_config['bidirectional']
        self.num_direction = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type'] 

        self.RNN = nn.RNN (input_size = model_config['emb_dim'],
                           hidden_size = model_config['hidden_dim'],
                           dropout=model_config['dropout'],
                           bidirectional = model_config['bidirectional'],
                           batch_first = model_config['batch_first'])
        
        self.LSTM= nn.LSTM(input_size = model_config['emb_dim'],
                           hidden_size = model_config['hidden_dim'],
                           dropout=model_config['dropout'],
                           bidirectional = model_config['bidirectional'],
                           batch_first = model_config['batch_first'])
        
        self.GRU = nn.GRU (input_size = model_config['emb_dim'],
                           hidden_size = model_config['hidden_dim'],
                           dropout=model_config['dropout'],
                           bidirectional = model_config['bidirectional'],
                           batch_first = model_config['batch_first'])
        
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_direction,
                            model_config['output_dim'])
        
        self.drop = nn.Dropout(model_config['dropout'])

    def forward(self, x):
        
        emb = self.emb(x) 
        # emb : (Batch_Size, Max_Seq_Length, Emb_dim)

        if self.model_type == 'RNN':
            output, hidden = self.RNN(emb) 
        elif self.model_type == 'LSTM':
            output, (hidden, cell) = self.LSTM(emb)
        elif self.model_type == 'GRU':
            output, hidden = self.GRU(emb)
        else:
            raise NameError('Select model_type in [RNN, LSTM, GRU]')
        
        # output : (Batch_Size, Max_Seq_Length, Hidden_dim * num_direction) 
        # hidden : (num_direction, Batch_Size, Hidden_dim)
        
        last_output = output[:,-1,:]

        # last_output : (Batch_Size, Hidden_dim * num_direction)
        return self.fc(self.drop(last_output))

In [42]:
model_config.update(dict(batch_first = True,
                         model_type = 'RNN',
                         bidirectional = True,
                         hidden_dim = 128,
                         output_dim = 1,
                         dropout = 0))

In [43]:
model = SentenceClassification(**model_config).to(device)

In [44]:
predictions = model.forward(sample_for_check.text).squeeze()

In [45]:
loss_fn = nn.BCEWithLogitsLoss().to(device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() 
    acc = correct.sum()/len(correct)
    return acc

In [46]:
loss = loss_fn(predictions, sample_for_check.label)
acc = binary_accuracy(predictions, sample_for_check.label)

In [47]:
print(predictions)
print(loss, acc)

tensor([-0.1703,  0.0098, -0.2055, -0.0231,  0.0826, -0.0016, -0.0780, -0.1905,
         0.0180, -0.0034, -0.1470, -0.1159, -0.5296, -0.1090, -0.0193, -0.0725,
        -0.3045, -0.0887, -0.1807, -0.0810,  0.0352,  0.0361, -0.3563, -0.1840,
         0.1674,  0.0008,  0.0846,  0.1702, -0.0997,  0.1138], device='cuda:0',
       grad_fn=<SqueezeBackward0>)
tensor(0.6688, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>) tensor(0.7000, device='cuda:0')


In [48]:
def train(model, iterator, optimizer, loss_fn, idx_epoch, **model_params):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train() 
    batch_size = model_params['batch_size']

    for idx, batch in enumerate(iterator):
        
        # Initializing
        optimizer.zero_grad()
        
        # Forward 
        predictions = model(batch.text).squeeze()
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        sys.stdout.write(
                    "\r" + f"[Train] Epoch : {idx_epoch:^3}"\
                    f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size} ({100. * (idx + 1) / len(iterator) :.4}%)]"\
                    f"  Loss: {loss.item():.4}"\
                    f"  Acc : {acc.item():.4}"\
                    )

        # Backward 
        loss.backward()
        optimizer.step()
        
        # Update Epoch Performance
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss/len(iterator) , epoch_acc/len(iterator) 

In [49]:
def evaluate(model, iterator, loss_fn):
    
    epoch_loss = 0
    epoch_acc = 0
    
    # evaluation mode
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [50]:
model_config['model_type'] = 'RNN'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [51]:
%%time

N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-RNN_glove
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.6133 | Train Acc : 0.6602
	 Epoch : 0 | Valid Loss : 0.5663 | Valid Acc : 0.7182
	 Epoch : 1 | Train Loss : 0.5549 | Train Acc : 0.7154
	 Epoch : 1 | Valid Loss : 0.5786 | Valid Acc : 0.7084
	 Epoch : 2 | Train Loss : 0.4282 | Train Acc : 0.8051
	 Epoch : 2 | Valid Loss : 0.636 | Valid Acc : 0.6554
	 Epoch : 3 | Train Loss : 0.4199 | Train Acc : 0.807
	 Epoch : 3 | Valid Loss : 0.6079 | Valid Acc : 0.7326
	 Epoch : 4 | Train Loss : 0.3042 | Train Acc : 0.8785
	 Epoch : 4 | Valid Loss : 0.5847 | Valid Acc : 0.7562
CPU times: user 5min 30s, sys: 1.96 s, total: 5min 31s
Wall time: 5min 31s


In [52]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.5711 | Test Acc : 0.7154


In [53]:
model_config['model_type'] = 'LSTM'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [55]:
%%time

N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-LSTM_glove
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.408 | Train Acc : 0.8242
	 Epoch : 0 | Valid Loss : 0.3361 | Valid Acc : 0.8616
	 Epoch : 1 | Train Loss : 0.1922 | Train Acc : 0.9309
	 Epoch : 1 | Valid Loss : 0.3484 | Valid Acc : 0.8654
	 Epoch : 2 | Train Loss : 0.06827 | Train Acc : 0.9781
	 Epoch : 2 | Valid Loss : 0.4313 | Valid Acc : 0.8656
	 Epoch : 3 | Train Loss : 0.01949 | Train Acc : 0.9953
	 Epoch : 3 | Valid Loss : 0.5766 | Valid Acc : 0.8605
	 Epoch : 4 | Train Loss : 0.009919 | Train Acc : 0.9978
	 Epoch : 4 | Valid Loss : 0.5877 | Valid Acc : 0.8492
CPU times: user 6min 53s, sys: 2.65 s, total: 6min 55s
Wall time: 6min 54s


In [56]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.3463 | Test Acc : 0.8524


In [57]:
model_config['model_type'] = 'GRU'
model = SentenceClassification(**model_config).to(device)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [58]:
%%time

N_EPOCH = 5

best_valid_loss = float('inf')
model_name = f"{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

print('---------------------------------')
print(f'Model name : {model_name}')
print('---------------------------------')

for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, loss_fn, epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {epoch}-epoch')

    print(f'\t Epoch : {epoch} | Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')
    print(f'\t Epoch : {epoch} | Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

---------------------------------
Model name : bi-GRU_glove
---------------------------------
	 Saved at 0-epoch
	 Epoch : 0 | Train Loss : 0.3869 | Train Acc : 0.819
	 Epoch : 0 | Valid Loss : 0.241 | Valid Acc : 0.8996
	 Epoch : 1 | Train Loss : 0.1392 | Train Acc : 0.9501
	 Epoch : 1 | Valid Loss : 0.3141 | Valid Acc : 0.8799
	 Epoch : 2 | Train Loss : 0.03612 | Train Acc : 0.989
	 Epoch : 2 | Valid Loss : 0.4118 | Valid Acc : 0.8834
	 Epoch : 3 | Train Loss : 0.008606 | Train Acc : 0.998
	 Epoch : 3 | Valid Loss : 0.5333 | Valid Acc : 0.8858
	 Epoch : 4 | Train Loss : 0.003652 | Train Acc : 0.9995
	 Epoch : 4 | Valid Loss : 0.5968 | Valid Acc : 0.8792
CPU times: user 6min 41s, sys: 2.67 s, total: 6min 44s
Wall time: 6min 42s


In [59]:
# Test set
model.load_state_dict(torch.load(f'./{model_name}.pt'))
test_loss, test_acc = evaluate(model, test_iterator, loss_fn)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.2598 | Test Acc : 0.8933


In [60]:
model_config['model_type'] = 'GRU'
model = SentenceClassification(**model_config).to(device)
model.load_state_dict(torch.load(f"./{'bi-' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}.pt"))

<All keys matched successfully>

In [61]:
def predict_sentiment(model, sentence):
    model.eval()
    indexed = TEXT.numericalize(TEXT.pad([TEXT.tokenize(PreProcessingText(sentence))]))
    input_data = torch.LongTensor(indexed).to(device)
    prediction = torch.sigmoid(model(input_data))
    return prediction.item()

In [62]:
test_sentence = 'this movie is FUN'
predict_sentiment(model = model, sentence = test_sentence)

0.858779788017273