In [1]:
import torch
from torch import nn
from torch.autograd import Variable as V


emb = nn.Embedding(10000, 20, padding_idx=0)
inp = V(torch.LongTensor([1, 2, 5, 2, 10]))
out = emb(inp)
out

Variable containing:

Columns 0 to 9 
-1.3856 -0.9772  0.9081 -0.7059  0.8401 -0.7006  0.1452  1.7527 -1.5233  0.5608
 0.1024  1.2889 -0.1652  2.4374  0.6860 -0.8651 -0.0569 -0.3290 -0.3376  0.6104
-0.0752 -0.9036  1.6446  0.0394  1.7200  0.8020  0.8306 -0.4192  0.5159  0.2212
 0.1024  1.2889 -0.1652  2.4374  0.6860 -0.8651 -0.0569 -0.3290 -0.3376  0.6104
-0.0380  0.3274 -1.9475  0.7265  0.3106 -0.9905  0.5445 -0.3918 -0.7089  0.4364

Columns 10 to 19 
-0.1867  0.1660  1.3522  1.0327  0.1075  0.9357 -1.2392 -0.0001  1.1365 -0.3622
 0.8614  0.5901  2.0195 -0.5507  0.3969 -0.0277  2.0770  1.6329 -0.3054  0.5680
-0.9626 -0.4658  0.0678 -1.9309  0.1012  1.0150  2.1796 -1.4209 -0.3320  0.2843
 0.8614  0.5901  2.0195 -0.5507  0.3969 -0.0277  2.0770  1.6329 -0.3054  0.5680
 0.3169 -2.0373  1.1344  0.8220  0.4814 -2.2129  0.0020  0.6707 -1.1446  0.8665
[torch.FloatTensor of size 5x20]

In [2]:
# http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
import glob
import pathlib
import re


remove_marks_regex = re.compile('[,\.\(\)\[\]\*:;]|<.*?>')
shift_marks_regex = re.compile('([?!])')

def text2ids(text, vacab_dict):
    text = remove_marks_regex.sub('', text)
    text = shift_marks_regex.sub(r' \1 ', text)
    tokens = text.split()
    return [vacab_dict.get(token, 0) for token in tokens]

def list2tensor(token_idxes, max_len=100, padding=True):
    if len(token_idxes) > max_len:
        token_idxes = token_idxes[:max_len]
    n_tokens = len(token_idxes)
    if padding:
        token_idxes = token_idxes + [0] * (max_len - len(token_idxes))
    return torch.LongTensor(token_idxes), n_tokens

In [3]:
from torch.utils.data import Dataset


class IMDBDataset(Dataset):
    def __init__(self, dir_path, train=True, max_len=100, padding=True):
        self.max_len = max_len
        self.padding = padding
        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath('imdb.vocab')
        self.vocab_array = vocab_path.open().read().strip().splitlines()
        self.vocab_dict = dict((w, i+1) for (i, w) in enumerate(self.vocab_array))
        if train:
            target_path = path.joinpath('train')
        else:
            target_path = path.joinpath('test')
        pos_files = sorted(glob.glob(str(target_path.joinpath('pos/*.txt'))))
        neg_files = sorted(glob.glob(str(target_path.joinpath('neg/*.txt'))))
        self.labeled_files = list(zip([0]*len(neg_files), neg_files)) + list(zip([1]*len(pos_files), pos_files))
        
    @property
    def vocab_size(self):
        return len(self.vocab_array)
    
    def __len__(self):
        return len(self.labeled_files)
    
    def __getitem__(self, idx):
        label, f = self.labeled_files[idx]
        data = open(f).read().lower()
        data = text2ids(data, self.vocab_dict)
        data, n = list2tensor(data, self.max_len, self.padding)
        return data, label, n

In [4]:
from torch.utils.data import DataLoader


train_data = IMDBDataset('./aclImdb/')
test_data = IMDBDataset('./aclImdb/', train=False)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4)

In [5]:
class SequenceTaggingNet(nn.Module):
    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x, h0=None, l=None):
        x = self.emb(x)
        x, h = self.lstm(x, h0)
        if l is not None:
            x = x[list(range(len(x))), l-1, :]
        else:
            x = x[:, -1, :]
        x = self.linear(x)
        x = x.squeeze()
        return x

In [6]:
def eval_net(net, data_loader):
    net.eval()
    ys = []
    ypreds = []
    for x, y, l in data_loader:
        x = V(x, volatile=True)
        y = V(y, volatile=True)
        y_pred = net(x, l=l)
        y_pred = (y_pred > 0).long()
        ys.append(y.data)
        ypreds.append(y_pred.data)
    ys = torch.cat(ys)
    ypreds = torch.cat(ypreds)
    acc = (ys == ypreds).float().sum() / len(ys)
    return acc

In [7]:
from statistics import mean
from torch import optim


net = SequenceTaggingNet(train_data.vocab_size+1, num_layers=2)
opt = optim.Adam(net.parameters())
loss_f = nn.BCEWithLogitsLoss()
for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in train_loader:
        x = V(x)
        y = V(y.float()).float()
        y_pred = net(x, l=l)
        loss = loss_f(y_pred, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.data[0])
    train_acc = eval_net(net, train_loader)
    val_acc = eval_net(net, test_loader)
    print(epoch, mean(losses), train_acc, val_acc)

0 0.6651672329896551 0.6784 0.65876
1 0.6114181152466313 0.75264 0.70544
2 0.47725831536228397 0.85144 0.77424
3 0.3472488750620266 0.89936 0.79236
4 0.26870434299644913 0.9316 0.79744
5 0.20496220736647658 0.95676 0.79912
6 0.1490502182889701 0.97184 0.78936
7 0.10384575003648505 0.98588 0.7876
8 0.07579018946086792 0.99024 0.78312
9 0.05390472393399557 0.99332 0.78392
