In [1]:
import torch
from torch import nn
from torch.autograd import Variable as V


# Lookup table with 10000 rows of 20-dimensional vectors; index 0 is declared
# as the padding index.
emb = nn.Embedding(num_embeddings=10000, embedding_dim=20, padding_idx=0)
# Five token ids; id 2 occurs twice, so the 2nd and 4th result rows are equal.
token_ids = [1, 2, 5, 2, 10]
inp = V(torch.LongTensor(token_ids))
out = emb(inp)
out

Variable containing:

Columns 0 to 9 
 1.3466 -0.1936  0.2819  0.0078  0.3915 -0.3868 -0.3683  2.5334 -1.7047  2.2971
-0.1275 -0.3767 -0.2802  0.4301 -2.2578 -1.9774  0.4781  0.4948 -2.6422 -0.3606
-1.0050 -0.5673  0.2331 -1.4810 -0.0858  0.3819 -0.3272 -0.3714  0.8162  0.5598
-0.1275 -0.3767 -0.2802  0.4301 -2.2578 -1.9774  0.4781  0.4948 -2.6422 -0.3606
-1.6596 -0.7401  0.0841  0.4323 -0.2487 -0.7484  0.9155  0.7197 -0.3392  1.0007

Columns 10 to 19 
-0.8721 -0.3254  0.5644  1.7397 -0.8100  2.4798 -0.2664  0.1063  0.6808  0.3436
 0.5718 -1.6982  0.9012 -0.2116  0.1281  0.0507 -0.9467 -0.1902 -1.1626  0.2493
 0.6968  0.9578 -1.1826 -1.0494 -0.8746 -1.3724 -1.6439  0.9874 -0.5436 -0.2417
 0.5718 -1.6982  0.9012 -0.2116  0.1281  0.0507 -0.9467 -0.1902 -1.1626  0.2493
-1.5601  0.3365  1.2847 -0.1421  0.4739  0.7873 -1.5906  1.0184 -1.0457 -0.2532
[torch.FloatTensor of size 5x20]

In [2]:
# http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
import glob
import pathlib
import re


# Punctuation marks and HTML-style tags (anything between "<" and ">") that
# are deleted outright.  The pattern is a raw string because it contains regex
# escapes (backslash + ".", "(", "[", "*") that are not valid *string* escapes
# and make a plain literal emit invalid-escape warnings.
remove_marks_regex = re.compile(r'[,\.\(\)\[\]\*:;]|<.*?>')
# "?" and "!" are captured as a group so a substitution can re-insert them.
shift_marks_regex = re.compile(r'([?!])')

def text2ids(text, vacab_dict):
    """Convert raw text into a list of vocabulary ids.

    Characters matched by ``remove_marks_regex`` are deleted, every match of
    ``shift_marks_regex`` is surrounded by spaces so it becomes its own token,
    and the result is split on whitespace.

    Args:
        text: Input string.
        vacab_dict: Mapping from token string to integer id.

    Returns:
        One id per token, in order; tokens missing from ``vacab_dict`` map
        to 0.
    """
    without_marks = remove_marks_regex.sub('', text)
    spaced = shift_marks_regex.sub(r' \1 ', without_marks)
    ids = []
    for word in spaced.split():
        ids.append(vacab_dict.get(word, 0))
    return ids

def list2tensor(token_idxes, max_len=100, padding=True):
    """Truncate (and optionally zero-pad) an id list and wrap it in a tensor.

    Args:
        token_idxes: List of integer token ids.
        max_len: Maximum number of ids kept; also the padded length.
        padding: When true, append zeros until the list has ``max_len`` items.

    Returns:
        ``(tensor, n_tokens)`` where ``tensor`` is a ``torch.LongTensor`` of
        the (possibly padded) ids and ``n_tokens`` is the number of ids kept
        after truncation, before padding.
    """
    # Slicing is a no-op for lists that are already short enough.
    kept = token_idxes[:max_len]
    n_tokens = len(kept)
    if padding:
        kept = kept + [0] * (max_len - n_tokens)
    return torch.LongTensor(kept), n_tokens

In [3]:
from torch.utils.data import Dataset


class IMDBDataset(Dataset):
    """Labelled text files from a directory laid out as ``{train,test}/{pos,neg}/*.txt``.

    The vocabulary is read from ``imdb.vocab`` in the same directory (one
    token per line); token ids start at 1.  Files under ``neg`` get label 0
    and files under ``pos`` get label 1.
    """

    def __init__(self, dir_path, train=True, max_len=100, padding=True):
        """Index the files of one split; file contents are read lazily.

        Args:
            dir_path: Directory containing ``imdb.vocab``, ``train/`` and ``test/``.
            train: Use the ``train`` sub-directory when true, else ``test``.
            max_len: Forwarded to ``list2tensor`` for every item.
            padding: Forwarded to ``list2tensor`` for every item.
        """
        self.max_len = max_len
        self.padding = padding
        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath('imdb.vocab')
        # Context manager closes the handle; the encoding is pinned so the
        # result does not depend on the platform's default locale.
        with vocab_path.open(encoding='utf-8') as vocab_file:
            self.vocab_array = vocab_file.read().strip().splitlines()
        # Ids are shifted by one so that 0 stays free.
        self.vocab_dict = {w: i + 1 for i, w in enumerate(self.vocab_array)}
        target_path = path.joinpath('train' if train else 'test')
        pos_files = sorted(glob.glob(str(target_path.joinpath('pos/*.txt'))))
        neg_files = sorted(glob.glob(str(target_path.joinpath('neg/*.txt'))))
        # (label, path) pairs: all negatives first, then all positives.
        self.labeled_files = [(0, f) for f in neg_files] + [(1, f) for f in pos_files]

    @property
    def vocab_size(self):
        """Number of tokens listed in the vocabulary file."""
        return len(self.vocab_array)

    def __len__(self):
        return len(self.labeled_files)

    def __getitem__(self, idx):
        """Return ``(ids_tensor, label, n_tokens)`` for the ``idx``-th file."""
        label, f = self.labeled_files[idx]
        with open(f, encoding='utf-8') as review_file:
            data = review_file.read().lower()
        data = text2ids(data, self.vocab_dict)
        data, n = list2tensor(data, self.max_len, self.padding)
        return data, label, n

In [4]:
from torch.utils.data import DataLoader


# Both splits are read from the same directory and batched with the same
# loader settings; only the training batches are shuffled.
DATA_DIR = './aclImdb/'
LOADER_KWARGS = dict(batch_size=32, num_workers=4)

train_data = IMDBDataset(DATA_DIR, train=True)
test_data = IMDBDataset(DATA_DIR, train=False)
train_loader = DataLoader(train_data, shuffle=True, **LOADER_KWARGS)
test_loader = DataLoader(test_data, shuffle=False, **LOADER_KWARGS)

In [5]:
class SequenceTaggingNet(nn.Module):
    """Embedding -> LSTM -> linear head producing one logit per sequence.

    The LSTM output at a single time step (the last real token when lengths
    are given, otherwise the last position) is fed to the linear layer.
    """

    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        """Build the layers.

        Args:
            num_embeddings: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_size: LSTM hidden state size.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout between stacked LSTM layers.
        """
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer has no effect and only triggers
        # a warning, so it is dropped in that case.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x, h0=None, l=None):
        """Return one logit per sequence, shape ``(batch,)``.

        Args:
            x: LongTensor of token ids, batch first.
            h0: Optional initial LSTM state, passed through to ``nn.LSTM``.
            l: Optional LongTensor with the number of real tokens per
                sequence; the output at position ``l - 1`` is used.  When
                omitted, the output at the last position is used.
        """
        x = self.emb(x)
        x, _ = self.lstm(x, h0)
        if l is not None:
            # Pick, for every sequence in the batch, its own last real step.
            x = x[list(range(len(x))), l-1, :]
        else:
            x = x[:, -1, :]
        x = self.linear(x)
        # Drop only the trailing feature axis; an argument-less squeeze()
        # would also remove the batch axis when the batch holds one sample.
        return x.squeeze(-1)

In [6]:
def eval_net(net, data_loader):
    """Compute classification accuracy of ``net`` over ``data_loader``.

    The network is switched to evaluation mode and left in it.  A logit
    greater than 0 counts as predicting label 1, anything else as label 0.

    Args:
        net: Callable model accepting ``net(x, l=lengths)`` and exposing
            ``eval()``.
        data_loader: Iterable yielding ``(x, y, l)`` batches.

    Returns:
        Fraction of samples whose predicted label equals the true label.
    """
    net.eval()
    labels, predictions = [], []
    for batch_x, batch_y, lengths in data_loader:
        batch_x = V(batch_x, volatile=True)
        batch_y = V(batch_y, volatile=True)
        logits = net(batch_x, l=lengths)
        predicted = (logits > 0).long()
        labels.append(batch_y.data)
        predictions.append(predicted.data)
    labels = torch.cat(labels)
    predictions = torch.cat(predictions)
    n_correct = (labels == predictions).float().sum()
    return n_correct / len(labels)

In [None]:
from statistics import mean
from torch import optim


# Two stacked LSTM layers.  The embedding table gets one extra row because
# the dataset's vocabulary ids start at 1 and id 0 is used for padding and
# for tokens missing from the vocabulary.
net = SequenceTaggingNet(train_data.vocab_size+1, num_layers=2)
opt = optim.Adam(net.parameters())
loss_f = nn.BCEWithLogitsLoss()
for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in train_loader:
        x = V(x)
        # BCEWithLogitsLoss needs float targets; one cast is enough.
        y = V(y.float())
        y_pred = net(x, l=l)
        loss = loss_f(y_pred, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        # `loss.data[0]` only works while the loss is a 1-element vector;
        # flattening first also handles a 0-dim loss, and float() yields a
        # plain Python number for statistics.mean either way.
        losses.append(float(loss.data.view(-1)[0]))
    # Accuracy on both splits after every epoch.
    train_acc = eval_net(net, train_loader)
    val_acc = eval_net(net, test_loader)
    print(epoch, mean(losses), train_acc, val_acc)

0 0.6599768895627288 0.7222 0.6944
1 0.5338538296311103 0.79404 0.7454
2 0.4424481616948572 0.83256 0.76496
3 0.3784253418902912 0.86392 0.77488
4 0.31670292347783935 0.89372 0.78396
5 0.26823607367723035 0.91796 0.7894


In [None]:
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression


# Baseline: logistic regression on the precomputed `labeledBow.feat` feature
# files (svmlight format) shipped in the dataset directory.
# NOTE(review): confirm what the labels in these files encode (binary
# sentiment vs. a rating) before comparing the scores with the LSTM accuracy.
TRAIN_FEATURES = './aclImdb/train/labeledBow.feat'
TEST_FEATURES = './aclImdb/test/labeledBow.feat'

train_X, train_y = load_svmlight_file(TRAIN_FEATURES)
# n_features pins the test matrix to the same column count as the train one.
test_X, test_y = load_svmlight_file(TEST_FEATURES, n_features=train_X.shape[1])
model = LogisticRegression(C=0.1, max_iter=1000)
model.fit(train_X, train_y)
train_score = model.score(train_X, train_y)
test_score = model.score(test_X, test_y)
train_score, test_score

In [None]:
class SequenceTaggingNet(nn.Module):
    """Embedding -> LSTM -> linear head that packs padded batches.

    When sequence lengths are supplied, the padded batch is packed before it
    is fed to the LSTM and the final hidden state of the top layer is used as
    the sequence summary.  Without lengths, the LSTM output at the last
    position is used instead.
    """

    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        """Build the layers.

        Args:
            num_embeddings: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_size: LSTM hidden state size.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout between stacked LSTM layers.
        """
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and only triggers a warning.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x, h0=None, l=None):
        """Return one logit per sequence, shape ``(batch,)``.

        Args:
            x: LongTensor of token ids, batch first.
            h0: Optional ``(hidden, cell)`` initial LSTM state.
            l: Optional per-sequence lengths (list of ints or LongTensor), in
                any order.  Lengths of 0 are treated as 1.
        """
        x = self.emb(x)
        if l is None:
            x, _ = self.lstm(x, h0)
            x = x[:, -1, :]
        else:
            # Packing rejects empty sequences, so a zero length is bumped to
            # 1 (the sequence then consists of a single padding token).
            lengths = torch.LongTensor([max(int(n), 1) for n in l])
            # pack_padded_sequence expects lengths in decreasing order.  Sort
            # here so callers may pass batches in any order, and remember the
            # inverse permutation to restore the caller's order afterwards.
            lengths, sort_idx = torch.sort(lengths, descending=True)
            _, unsort_idx = torch.sort(sort_idx)
            if h0 is not None:
                # Initial states are laid out (layers, batch, hidden).
                h0 = tuple(state[:, sort_idx] for state in h0)
            packed = nn.utils.rnn.pack_padded_sequence(x[sort_idx], lengths.tolist(), batch_first=True)
            _, (hidden_state, _cell_state) = self.lstm(packed, h0)
            # Final hidden state of the top layer, back in input order.
            x = hidden_state[-1][unsort_idx]
        # Drop only the trailing feature axis so a batch of one keeps shape (1,).
        return self.linear(x).squeeze(-1)

In [None]:
# SequenceTaggingNet was redefined in the previous cell, but `net` and `opt`
# would still refer to the earlier model instance.  Build a fresh model from
# the current class and an optimizer bound to its parameters.
net = SequenceTaggingNet(train_data.vocab_size+1, num_layers=2)
opt = optim.Adam(net.parameters())


def sort_batches_by_length(loader):
    """Yield ``(x, y, l)`` batches from ``loader`` reordered by decreasing length."""
    for x, y, l in loader:
        l, sort_idx = torch.sort(l, descending=True)
        yield x[sort_idx], y[sort_idx], l


for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in sort_batches_by_length(train_loader):
        x = V(x)
        y = V(y.float())
        # Lengths are handed over as a plain list of Python ints.
        y_pred = net(x, l=l.tolist())
        loss = loss_f(y_pred, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        # `loss.data[0]` only works while the loss is a 1-element vector;
        # flattening first also handles a 0-dim loss.
        losses.append(float(loss.data.view(-1)[0]))
    # Evaluation batches are sorted the same way as the training batches so
    # the model sees lengths in decreasing order there too.
    train_acc = eval_net(net, sort_batches_by_length(train_loader))
    val_acc = eval_net(net, sort_batches_by_length(test_loader))
    print(epoch, mean(losses), train_acc, val_acc)

In [None]:
import string


# Character-level vocabulary: each character of string.printable is mapped to
# its position in that string.
all_chars = string.printable
vocab_size = len(all_chars)
vocab_dict = {c: i for i, c in enumerate(all_chars)}
# Alias kept for the name this cell originally (mis)spelled.
vocab_dic = vocab_dict

def str2ints(s, vocab_dict):
    """Map every character of ``s`` to its id in ``vocab_dict``.

    Raises:
        KeyError: If a character is not a key of ``vocab_dict``.
    """
    ids = []
    for ch in s:
        ids.append(vocab_dict[ch])
    return ids

def ints2str(x, vocab_array):
    """Concatenate ``vocab_array[i]`` for every index ``i`` in ``x`` into one string."""
    pieces = []
    for idx in x:
        pieces.append(vocab_array[idx])
    return "".join(pieces)

In [None]:
# curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt


class ShakespeareDataset(Dataset):
    """Fixed-size chunks of character ids read from a text file.

    Relies on the module-level ``str2ints`` helper and ``vocab_dict`` mapping
    to turn characters into ids.
    """

    def __init__(self, path, chunk_size=200):
        """Read ``path``, encode it and split it into ``chunk_size`` pieces.

        Args:
            path: Text file to read.
            chunk_size: Number of character ids per chunk.
        """
        # Context manager so the file handle is closed as soon as it is read.
        with open(path) as text_file:
            text = text_file.read().strip()
        data = str2ints(text, vocab_dict)
        data = torch.LongTensor(data).split(chunk_size)
        # Drop a trailing chunk that is shorter than chunk_size so every
        # item has the same length.
        if len(data[-1]) < chunk_size:
            data = data[:-1]
        self.data = data
        self.n_chunks = len(self.data)

    def __len__(self):
        return self.n_chunks

    def __getitem__(self, idx):
        """Return the ``idx``-th chunk."""
        return self.data[idx]