In [186]:
# prepare data

from torchnlp.datasets import imdb_dataset

# Load both IMDB splits; each record is indexed by 'text' and 'sentiment'.
train, test = imdb_dataset(train=True, test=True)

# Lower-cased review texts, plus binary labels (1 for 'pos', 0 for anything else).
train_x = [record['text'].lower() for record in train]
train_y = [int(record['sentiment'] == 'pos') for record in train]

test_x = [record['text'].lower() for record in test]
test_y = [int(record['sentiment'] == 'pos') for record in test]

In [187]:
# text preprocessing
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_multiple_whitespaces, remove_stopwords, strip_punctuation

# Gensim filters, applied in this order to every lower-cased review.
filters = [
    strip_tags,
    strip_multiple_whitespaces,
    remove_stopwords,
    strip_punctuation,
    strip_multiple_whitespaces,
]

# Run the same filter chain over the train split first, then the test split;
# each review becomes a list of tokens.
preprocessed_train_x, preprocessed_test_x = (
    [preprocess_string(document, filters) for document in corpus]
    for corpus in (train_x, test_x)
)

print(preprocessed_train_x[0])
print(preprocessed_test_x[0])

['bromwell', 'high', 'cartoon', 'comedy', 'ran', 'time', 'programs', 'school', 'life', 'teachers', '35', 'years', 'teaching', 'profession', 'lead', 'believe', 'bromwell', 'high', 's', 'satire', 'closer', 'reality', 'teachers', 'scramble', 'survive', 'financially', 'insightful', 'students', 'right', 'pathetic', 'teachers', 'pomp', 'pettiness', 'situation', 'remind', 'schools', 'knew', 'students', 'saw', 'episode', 'student', 'repeatedly', 'tried', 'burn', 'school', 'immediately', 'recalled', 'high', 'classic', 'line', 'inspector', 'i', 'm', 'sack', 'teachers', 'student', 'welcome', 'bromwell', 'high', 'expect', 'adults', 'age', 'think', 'bromwell', 'high', 'far', 'fetched', 'pity', 'isn', 't']
['went', 'saw', 'movie', 'night', 'coaxed', 'friends', 'mine', 'i', 'll', 'admit', 'reluctant', 'knew', 'ashton', 'kutcher', 'able', 'comedy', 'wrong', 'kutcher', 'played', 'character', 'jake', 'fischer', 'well', 'kevin', 'costner', 'played', 'ben', 'randall', 'professionalism', 'sign', 'good', 'm

In [188]:
# only use the top_n_words for vocab since most of the words are too rare
import numpy as np

# Number of vocabulary entries kept, and the fixed number of tokens per review.
n_vocab = 5000
max_review_length = 200

# Keep only the first n_vocab lines of the vocabulary file (one word per line).
# The encoding is explicit because the whole file is decoded by f.read(); without
# it the result depends on the platform's default locale and can fail on
# non-ASCII entries. NOTE(review): assumes imdb.vocab is UTF-8 — confirm.
with open("data/aclimdb/imdb.vocab", 'r', encoding="utf-8") as f:
    vocab = f.read().splitlines()[:n_vocab]

# word -> index; numbering starts at 1, which leaves 0 unused by any word
# (pad_text below uses 0 as its fill value).
vocab_to_int = {word: i + 1 for i, word in enumerate(vocab)}

# index -> word, the reverse of vocab_to_int.
int_to_vocab = {i: word for word, i in vocab_to_int.items()}

def get_encode(word):
    """Return the index of ``word`` in ``vocab_to_int``, or ``np.nan`` if it is absent."""
    return vocab_to_int.get(word, np.nan)

#print(preprocessed_train_x[0])
# Map every token to its vocabulary index, drop tokens that are not in the
# vocabulary (get_encode returns NaN for those), then keep at most
# max_review_length indices per review.
encoded_train_x = [
    [code for code in map(get_encode, review) if not np.isnan(code)][:max_review_length]
    for review in preprocessed_train_x
]
print(encoded_train_x[0][:10])

# Same encoding for the test split.
encoded_test_x = [
    [code for code in map(get_encode, review) if not np.isnan(code)][:max_review_length]
    for review in preprocessed_test_x
]
print(encoded_test_x[0][:10])


# pad the data sequence
def pad_text(encoded_reviews, seq_length):
    """Bring every encoded review to exactly ``seq_length`` entries.

    Reviews with ``seq_length`` or more entries keep only their first
    ``seq_length``; shorter ones get zeros prepended on the left.

    Returns ``np.array`` built from the resulting rows.
    """
    def fit(review):
        missing = seq_length - len(review)
        if missing > 0:
            # Too short: left-pad with zeros.
            return [0] * missing + review
        # Long enough: truncate from the right.
        return review[:seq_length]

    return np.array([fit(review) for review in encoded_reviews])


# Fixed-length index arrays for both splits (train first, then test).
padded_train_x, padded_test_x = (
    pad_text(encoded_split, max_review_length)
    for encoded_split in (encoded_train_x, encoded_test_x)
)

[322, 1058, 208, 2097, 58, 391, 117, 151, 4939, 469]
[422, 214, 16, 305, 351, 1869, 9, 938, 672, 491]


In [189]:
# prepare the dataset
from torch.utils.data import TensorDataset, DataLoader
from torch import IntTensor
import torch


# Mini-batch size used by both loaders.
batch_size = 100

# Inputs are int64 token indices (fed to the embedding lookup); labels are
# float32 because they are compared against probabilities by the loss.
train_data = TensorDataset(torch.tensor(padded_train_x, dtype=torch.int64), torch.tensor(train_y, dtype=torch.float32))
test_data = TensorDataset(torch.tensor(padded_test_x, dtype=torch.int64), torch.tensor(test_y, dtype=torch.float32))

# Shuffle only the training data. Evaluation does not benefit from shuffling,
# and a fixed order keeps the test batches the same from run to run.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [190]:
# define model
from torch import nn

n_embedding = 200 # embedding vector size
n_hidden = 200 # hidden state size of the LSTM (and input size of the linear layer)
n_layers = 1 # number of stacked LSTM layers
n_output = 1 # output features of the final linear layer

p_drop = 0.5 # dropout probability handed to both nn.LSTM and nn.Dropout

class SentimentLstm(nn.Module):
    """Embedding -> LSTM -> dropout -> linear -> sigmoid sentiment classifier.

    Layer sizes are read from the module-level settings ``n_vocab``,
    ``n_embedding``, ``n_hidden``, ``n_layers``, ``n_output`` and ``p_drop``
    when the model is constructed.
    """

    def __init__(self):
        super().__init__()

        # n_vocab + 1 rows: words are numbered from 1 and 0 is the padding value.
        self.embedding = nn.Embedding(n_vocab+1, n_embedding)
        # nn.LSTM only applies dropout between stacked layers. With a single
        # layer a non-zero value does nothing and triggers a UserWarning, so
        # it is passed only when there is more than one layer.
        lstm_dropout = p_drop if n_layers > 1 else 0.0
        self.lstm = nn.LSTM(n_embedding, n_hidden, n_layers, batch_first=True, dropout=lstm_dropout)
        self.dropout = nn.Dropout(p_drop)
        self.fc = nn.Linear(n_hidden, n_output)
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        """Score a batch of encoded reviews.

        Args:
            inputs: integer tensor of token indices, shape (batch, seq_len).

        Returns:
            A tuple ``(probs, hiddens)``. ``probs`` has shape (batch,) and holds
            the sigmoid output for the last time step of each sequence;
            ``hiddens`` is the final state tuple returned by ``nn.LSTM``.
        """
        embedded = self.embedding(inputs)
        outputs, hiddens = self.lstm(embedded)

        # Only the last time step is used for the prediction, so select it
        # before the dropout / linear / sigmoid layers instead of running them
        # over every time step and discarding all but one result.
        last_step = self.dropout(outputs[:, -1, :])
        probs = self.sigmoid(self.fc(last_step))

        # Take the batch size from the input itself rather than a global, so
        # partial batches and single-sample inference work as well.
        probs = probs.view(inputs.size(0), -1)[:, -1]

        return probs, hiddens

    #def init_hidden(self, batch_size):
    #    device = "cuda" if torch.cuda.is_available() else "cpu"
    #    weights = next(self.parameters()).data
    #    hiddens = (weights.new(n_layers, batch_size, n_hidden).zero_().to(device),
    #         weights.new(n_layers, batch_size, n_hidden).zero_().to(device))
        
    #    return hiddens

In [191]:
# create and train the model
from torch import optim
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The model has to live on the same device as the batches it receives;
# otherwise the first forward pass fails on a CUDA machine.
model = SentimentLstm().to(device)
model.train()

# Binary cross-entropy on the sigmoid probabilities returned by the model.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

step = 0          # global mini-batch counter across all epochs
n_epoches = 20    # number of passes over the training data
max_norm = 5      # gradient-norm clipping threshold

for epoch in range(n_epoches):
    for inputs, labels in train_loader:
        step += 1
        inputs, labels = inputs.to(device), labels.to(device)

        model.zero_grad()
        output, hiddens = model(inputs)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # clip_grad_norm_ is the in-place replacement for the deprecated
        # clip_grad_norm; it rescales gradients so their total norm is at
        # most max_norm.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        # Report the loss of the current batch every 100 steps.
        if (step % 100)  == 0:
            print("epoch {} training Loss: {:.4f}".format(epoch, loss.item()))

epoch 0 training Loss: 0.5361
epoch 0 training Loss: 0.4672
epoch 1 training Loss: 0.4409
epoch 1 training Loss: 0.3587
epoch 1 training Loss: 0.3271
epoch 2 training Loss: 0.2503
epoch 2 training Loss: 0.3008
epoch 3 training Loss: 0.1975
epoch 3 training Loss: 0.3376
epoch 3 training Loss: 0.2737
epoch 4 training Loss: 0.2175
epoch 4 training Loss: 0.2234
epoch 5 training Loss: 0.1027
epoch 5 training Loss: 0.1206
epoch 5 training Loss: 0.2138
epoch 6 training Loss: 0.1273
epoch 6 training Loss: 0.1266
epoch 7 training Loss: 0.1109
epoch 7 training Loss: 0.1162
epoch 7 training Loss: 0.0821
epoch 8 training Loss: 0.0603
epoch 8 training Loss: 0.0756
epoch 9 training Loss: 0.0639
epoch 9 training Loss: 0.0714
epoch 9 training Loss: 0.0863
epoch 10 training Loss: 0.0378
epoch 10 training Loss: 0.0308
epoch 11 training Loss: 0.0159
epoch 11 training Loss: 0.0649
epoch 11 training Loss: 0.0382
epoch 12 training Loss: 0.0084
epoch 12 training Loss: 0.1202
epoch 13 training Loss: 0.0088
ep

In [192]:
# save the model
# NOTE(review): passing the module object to torch.save pickles the whole model,
# so loading the file later needs the SentimentLstm class to be available.
# PyTorch's documentation recommends saving model.state_dict() instead —
# confirm how "lstm.model" is loaded before changing this.

torch.save(model, "lstm.model")


In [193]:
# test the model
import numpy as np

# Switch dropout to inference behaviour.
model.eval()
test_losses = []
num_correct = 0

# Evaluation needs no gradients; no_grad avoids building autograd graphs.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output, _ = model(inputs)
        loss = criterion(output.squeeze(), labels.float())
        test_losses.append(loss.item())

        # Round the probabilities to 0/1 predictions and compare with the labels.
        preds = torch.round(output.squeeze())
        correct_tensor = preds.eq(labels.float().view_as(preds))
        # Count on the tensor's own device; calling .numpy() directly would
        # fail for CUDA tensors.
        num_correct += correct_tensor.sum().item()

# Mean of the per-batch losses, and the fraction of correctly classified reviews.
print("Test Loss: {:.4f}".format(np.mean(test_losses)))
print("Test Accuracy: {:.2f}".format(num_correct/len(test_loader.dataset)))

Test Loss: 0.8722
Test Accuracy: 0.85
