In [0]:
!pip install torch

In [0]:
 #CUDA_LAUNCH_BLOCKING=1

In [0]:
import torch
from torch import nn
from string import punctuation
from collections import Counter
import numpy as np

In [0]:
###############################################################################
#######################  1. LOAD THE TRAINING TEXT  ###########################
###############################################################################
with open("reviews.txt") as f:
    reviews = f.read()
    
with open("labels.txt") as f:
    labels = f.read()


In [0]:
###############################################################################
##########################  2. TEXT PRE-PROCESSING  ###########################
###############################################################################

def preprocess(text):
    text = text.lower() #소문자
    text= "".join([ch for ch in text if ch not in punctuation])
    #문장 부호를 제외하고, ""(아무것도 없다는 뜻)를 매 글자 사이에 넣어서 연결 - 문장 부호만 제거하고 그대로.
    #띄어쓰기 남아있음.
    all_reviews = text.split("\n") #줄바꿈으로 나눔.
    #text = " ".join(text) #띄어쓰기 다시 만들어줌.
    all_words = text.split() #단어마다 쪼갬.
    
    return all_reviews, all_words


all_reviews, all_words = preprocess(reviews)

In [0]:
###############################################################################
##################  3. CREATE DICTIONARIES & ENCODE REVIEWS  ##################
###############################################################################

word_counts = Counter(all_words) #list의 개수 세는 dic 생성, key에 개수
word_list = sorted(word_counts, reverse = True) #get하면 key값들을 얻음, 많은 순서대로 정렬 - list
vocab_to_int = {word:idx+1 for idx, word in enumerate(word_list)} #단어를 1,2,3,4,..로 번호 매긴 dic로.
int_to_vocab = {idx:word for word, idx in vocab_to_int.items()} #번호를 다시 단어로 dic.
encoded_reviews = [[vocab_to_int[word] for word in review.split()] for review in all_reviews] #review들을 단어의 번호들로 표현

In [0]:
###############################################################################
#############################  4. ENCODE LABELS ###############################
###############################################################################
all_labels = labels.split("\n")
encoded_labels = [1 if label == "positive" else 0 for label in all_labels]
assert len(encoded_reviews) == len(encoded_labels), "# of encoded reivews & encoded labels must be the same!"

In [0]:
###############################################################################
#####################  5. GET RID OF LENGTH-0 REVIEWS   #######################
###############################################################################

encoded_labels = np.array( [label for idx, label in enumerate(encoded_labels) if len(encoded_reviews[idx]) > 0] )
encoded_reviews = [review for review in encoded_reviews if len(review) > 0]

In [0]:
torch.Tensor([1,2,3]).to("cuda") #이게 갑자기 안될 때가 있음

In [0]:
###############################################################################
######################  6. MAKE ALL REVIEWS SAME LENGTH  #######################
###############################################################################
def pad_text(encoded_reviews, seq_length):
    
    reviews = []
    
    for review in encoded_reviews:
        if len(review) >= seq_length:
            reviews.append(review[:seq_length])
        else:
            reviews.append([0]*(seq_length-len(review)) + review)
        
    return np.array(reviews)


padded_reviews = pad_text(encoded_reviews, seq_length = 200)

In [0]:
###############################################################################
##############  7. SPLIT DATA & GET (REVIEW, LABEL) DATALOADER  ###############
###############################################################################
train_ratio = 0.8
valid_ratio = (1 - train_ratio)/2 #2로 나누는 이유는? test set도 만들기 위해.
total = padded_reviews.shape[0] #25000
train_cutoff = int(total * train_ratio)
valid_cutoff = int(total * (1 - valid_ratio))

train_x, train_y = padded_reviews[:train_cutoff], encoded_labels[:train_cutoff]
valid_x, valid_y = padded_reviews[train_cutoff : valid_cutoff], encoded_labels[train_cutoff : valid_cutoff]
test_x, test_y = padded_reviews[valid_cutoff:], encoded_labels[valid_cutoff:]


from torch.utils.data import TensorDataset, DataLoader

#이부분 안해주면 TensorDataset에서 int not callable 에러 발생
#longtensor(정수)로 만들어줘야함, 추후에 간단한 형태의 코드 생각해볼수도.
train_x, train_y=torch.Tensor(train_x).long(), torch.Tensor(train_y).long()
valid_x, valid_y=torch.Tensor(valid_x).long(), torch.Tensor(valid_y).long()
test_x, test_y=torch.Tensor(test_x).long(), torch.Tensor(test_y).long()


train_data = TensorDataset(train_x, train_y)
valid_data = TensorDataset(valid_x, valid_y)
test_data = TensorDataset(test_x, test_y)


batch_size = 50
train_loader = DataLoader(train_data, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_data, batch_size = batch_size, shuffle = True)

#dataloader의[0]은 batch size*문장 길이, [1]은 batch size*1(label이므로)

In [0]:
###############################################################################
#########################  8. DEFINE THE LSTM MODEL  ##########################
###############################################################################
class SentimentLSTM(nn.Module):
    
    def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
        super().__init__()
        # params: "n_" means dimension
        self.n_vocab = n_vocab     # number of unique words in vocabulary
        self.n_layers = n_layers   # number of LSTM layers 
        self.n_hidden = n_hidden   # number of hidden nodes in LSTM
        
        self.embedding = nn.Embedding(n_vocab, n_embed)
        self.lstm = nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
        self.dropout = nn.Dropout(drop_p)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()
        
        
    def forward (self, input_words):
                                             # INPUT   :  (batch_size, seq_length)
        batch_size=input_words.shape[0]
        embedded_words = self.embedding(input_words)    # (batch_size, seq_length, n_embed)
        lstm_out, h = self.lstm(embedded_words)         # (batch_size, seq_length, n_hidden)
        lstm_out = self.dropout(lstm_out)
        lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
        fc_out = self.fc(lstm_out)                      # (batch_size*seq_length, n_output)
        sigmoid_out = self.sigmoid(fc_out)              # (batch_size*seq_length, n_output)
        sigmoid_out = sigmoid_out.view(batch_size, -1)  # (batch_size, seq_length*n_output)
        
        # extract the output of ONLY the LAST output of the LAST element of the sequence
        sigmoid_last = sigmoid_out[:, -1]               # (batch_size, 1)
        
        return sigmoid_last, h
    
    
    def init_hidden (self, batch_size):  # initialize hidden weights (h,c) to 0
      #최초의 hidden state와 cell state를 0으로 하는것 같은데..
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        weights = next(self.parameters()).data #parameter들을 불러옴.
        h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device), #weights와 똑같은 data와 cuda 여부의 tensor 생성.
             weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
             
        return h

In [0]:
###############################################################################
################  9. INSTANTIATE THE MODEL W/ HYPERPARAMETERS #################
###############################################################################
n_vocab = len(vocab_to_int)+1
n_embed = 400
n_hidden = 512
n_output = 1   # 1 ("positive") or 0 ("negative")
n_layers = 2
device = 'cuda' if torch.cuda.is_available else 'cpu'


net_cpu = SentimentLSTM(n_vocab, n_embed, n_hidden, n_output, n_layers)
net=net_cpu.to(device)

###############################################################################
#######################  10. DEFINE LOSS & OPTIMIZER  #########################
###############################################################################
from torch import optim

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr = 0.001)

In [0]:
#net=net.to(device)

In [22]:
###############################################################################
##########################  11. TRAIN THE NETWORK!  ###########################
###############################################################################
print_every = 100
step = 0
n_epochs = 4  # validation loss increases from ~ epoch 3 or 4
clip = 5  # for gradient clip to prevent exploding gradient problem in LSTM/RNN
device = 'cuda' if torch.cuda.is_available else 'cpu'

#net=net.to(device)

for epoch in range(n_epochs):
    h = net.init_hidden(batch_size)
    
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.cuda(), labels.cuda()
        
        # making requires_grad = False for the latest set of h
        h = tuple([each.data for each in h])   
        
        net.zero_grad()
        output, h = net(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        
        if (step % print_every) == 0:            
            ######################
            ##### VALIDATION #####
            ######################
            net.eval()
            valid_losses = []
            v_h = net.init_hidden(batch_size)
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = inputs.to(device), labels.to(device)
        
                v_h = tuple([each.data for each in v_h])
                
                v_output, v_h = net(v_inputs)
                v_loss = criterion(v_output.squeeze(), v_labels.float())
                valid_losses.append(v_loss.item())

            print("Epoch: {}/{}".format((epoch+1), n_epochs),
                  "Step: {}".format(step),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)))
            net.train()

Epoch: 1/4 Step: 100 Training Loss: 0.6804 Validation Loss: 0.6719
Epoch: 1/4 Step: 200 Training Loss: 0.6822 Validation Loss: 0.6834
Epoch: 1/4 Step: 300 Training Loss: 0.5584 Validation Loss: 0.5176
Epoch: 1/4 Step: 400 Training Loss: 0.6567 Validation Loss: 0.6038
Epoch: 2/4 Step: 500 Training Loss: 0.4862 Validation Loss: 0.4441
Epoch: 2/4 Step: 600 Training Loss: 0.5540 Validation Loss: 0.4670
Epoch: 2/4 Step: 700 Training Loss: 0.3078 Validation Loss: 0.2559
Epoch: 2/4 Step: 800 Training Loss: 0.2578 Validation Loss: 0.2488
Epoch: 3/4 Step: 900 Training Loss: 0.3034 Validation Loss: 0.2673
Epoch: 3/4 Step: 1000 Training Loss: 0.2207 Validation Loss: 0.2246
Epoch: 3/4 Step: 1100 Training Loss: 0.2144 Validation Loss: 0.2092
Epoch: 3/4 Step: 1200 Training Loss: 0.3669 Validation Loss: 0.3383
Epoch: 4/4 Step: 1300 Training Loss: 0.1768 Validation Loss: 0.1301
Epoch: 4/4 Step: 1400 Training Loss: 0.1220 Validation Loss: 0.0969
Epoch: 4/4 Step: 1500 Training Loss: 0.0469 Validation Lo

In [25]:
device

'cuda'

In [0]:
asd=torch.Tensor([1,2]).to(device)

In [0]:
st=0
for inputs, labels in test_loader:
    st+=1
    inputs, labels=inputs.to(device), labels.to(device)
    if(st==1):
      asdf=inputs

In [42]:
asdf.float().type()

'torch.cuda.FloatTensor'

In [54]:
###############################################################################
################  12. TEST THE TRAINED MODEL ON THE TEST SET  #################
###############################################################################
net.eval()
test_losses = []
num_correct = 0
test_h = net.init_hidden(batch_size)

for inputs, labels in test_loader:
    inputs, labels=inputs.to(device), labels.to(device)
    
    test_h = tuple([each.data for each in test_h])
    test_output, test_h = net(inputs)
    loss = criterion(test_output.float(), labels.float())
    test_losses.append(loss.item())
    
    preds = torch.round(test_output.squeeze())
    correct_tensor = preds.eq(labels.float().view_as(preds))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

Test Loss: 0.4961
Test Accuracy: 0.83


In [0]:
###############################################################################
############  13. TEST THE TRAINED MODEL ON A RANDOM SINGLE REVIEW ############
###############################################################################
def predict(net, review, seq_length = 200):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    words = preprocess(review)
    assert False not in [word in vocab_to_int for word in words[1]], "word not in dictionary"
    encoded_words = [vocab_to_int[word] for word in words[1]]
    padded_words = pad_text([encoded_words], seq_length)
    padded_words = torch.from_numpy(padded_words).to(device)
    
    if(len(padded_words) == 0):
        "Your review must contain at least 1 word!"
        return None
    
    net.eval()
    h = net.init_hidden(1)
   # output, h = net(padded_words, h)
    output, h = net(padded_words)
    pred = torch.round(output.squeeze())
    msg = "This is a positive review." if pred == 1 else "This is a negative review."
    
    return msg


review1 = "It made me cry."
review2 = "It was so good it made me cry."
review3 = "It's ok."
review4 = "This movie had the best acting and the dialogue was so good. I loved it."
review5 = "Garbage"
                       ### OUTPUT ###
predict(net, review1)  ## negative ##
predict(net, review2)  ## positive ##
predict(net, review3)  ## negative ##
predict(net, review4)  ## positive ##
predict(net, review5)  ## negative ##
    

In [85]:
predict(net, "It was perfect")

'This is a negative review.'