In [1]:
import torch
from torch import nn
from torch.autograd import Variable as V


emb = nn.Embedding(10000, 20, padding_idx=0)
inp = V(torch.LongTensor([1, 2, 5, 2, 10]))
out = emb(inp)
out

Variable containing:

Columns 0 to 9 
 1.3984  0.5117 -1.2055  0.5170  0.1951  0.6920  1.2170  0.0148 -1.3198 -0.0557
 0.7494 -0.2517 -0.6541  1.8658  0.7754 -0.1279 -0.8152  0.6345  1.6759 -0.8899
 0.8958 -1.0117 -0.0512  0.9927  0.3822  0.0791  0.8742 -1.0904 -0.7844  2.8853
 0.7494 -0.2517 -0.6541  1.8658  0.7754 -0.1279 -0.8152  0.6345  1.6759 -0.8899
-0.4250 -0.7694  1.0069  0.2646 -1.1900  0.4994 -0.2357 -0.7308  1.9614 -0.0608

Columns 10 to 19 
-0.6374  0.9245 -1.1659  0.7989  0.4804 -0.8946  0.1104  0.4608  0.1480 -1.9502
 0.6689 -0.2627 -0.5234 -1.7893 -1.6666  0.4311  1.2278  0.2549 -0.6590  0.5883
-0.7339 -1.1809 -0.7005 -0.2419 -0.9851 -0.1792  0.7095  0.6903  1.5535  1.3207
 0.6689 -0.2627 -0.5234 -1.7893 -1.6666  0.4311  1.2278  0.2549 -0.6590  0.5883
 0.6197  0.1856  0.2308 -0.6874 -0.0595  0.2519 -2.3235  1.3661  0.3801 -0.3416
[torch.FloatTensor of size 5x20]

In [2]:
# http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
import glob
import pathlib
import re


remove_marks_regex = re.compile('[,\.\(\)\[\]\*:;]|<.*?>')
shift_marks_regex = re.compile('([?!])')

def text2ids(text, vacab_dict):
    text = remove_marks_regex.sub('', text)
    text = shift_marks_regex.sub(r' \1 ', text)
    tokens = text.split()
    return [vacab_dict.get(token, 0) for token in tokens]

def list2tensor(token_idxes, max_len=100, padding=True):
    if len(token_idxes) > max_len:
        token_idxes = token_idxes[:max_len]
    n_tokens = len(token_idxes)
    if padding:
        token_idxes = token_idxes + [0] * (max_len - len(token_idxes))
    return torch.LongTensor(token_idxes), n_tokens

In [3]:
from torch.utils.data import Dataset


class IMDBDataset(Dataset):
    def __init__(self, dir_path, train=True, max_len=100, padding=True):
        self.max_len = max_len
        self.padding = padding
        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath('imdb.vocab')
        self.vocab_array = vocab_path.open().read().strip().splitlines()
        self.vocab_dict = dict((w, i+1) for (i, w) in enumerate(self.vocab_array))
        if train:
            target_path = path.joinpath('train')
        else:
            target_path = path.joinpath('test')
        pos_files = sorted(glob.glob(str(target_path.joinpath('pos/*.txt'))))
        neg_files = sorted(glob.glob(str(target_path.joinpath('neg/*.txt'))))
        self.labeled_files = list(zip([0]*len(neg_files), neg_files)) + list(zip([1]*len(pos_files), pos_files))
        
    @property
    def vocab_size(self):
        return len(self.vocab_array)
    
    def __len__(self):
        return len(self.labeled_files)
    
    def __getitem__(self, idx):
        label, f = self.labeled_files[idx]
        data = open(f).read().lower()
        data = text2ids(data, self.vocab_dict)
        data, n = list2tensor(data, self.max_len, self.padding)
        return data, label, n

In [4]:
from torch.utils.data import DataLoader


train_data = IMDBDataset('./aclImdb/')
test_data = IMDBDataset('./aclImdb/', train=False)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4)

In [5]:
class SequenceTaggingNet(nn.Module):
    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x, h0=None, l=None):
        x = self.emb(x)
        x, h = self.lstm(x, h0)
        if l is not None:
            x = x[list(range(len(x))), l-1, :]
        else:
            x = x[:, -1, :]
        x = self.linear(x)
        x = x.squeeze()
        return x

In [6]:
def eval_net(net, data_loader):
    net.eval()
    ys = []
    ypreds = []
    for x, y, l in data_loader:
        x = V(x, volatile=True)
        y = V(y, volatile=True)
        y_pred = net(x, l=l)
        y_pred = (y_pred > 0).long()
        ys.append(y.data)
        ypreds.append(y_pred.data)
    ys = torch.cat(ys)
    ypreds = torch.cat(ypreds)
    acc = (ys == ypreds).float().sum() / len(ys)
    return acc

In [None]:
from statistics import mean
from torch import optim


net = SequenceTaggingNet(train_data.vocab_size+1, num_layers=2)
opt = optim.Adam(net.parameters())
loss_f = nn.BCEWithLogitsLoss()
for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in train_loader:
        x = V(x)
        y = V(y.float()).float()
        y_pred = net(x, l=l)
        loss = loss_f(y_pred, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.data[0])
    train_acc = eval_net(net, train_loader)
    val_acc = eval_net(net, test_loader)
    print(epoch, mean(losses), train_acc, val_acc)

0 0.6599768895627288 0.7222 0.6944
1 0.5338538296311103 0.79404 0.7454
2 0.4424481616948572 0.83256 0.76496
3 0.3784253418902912 0.86392 0.77488
4 0.31670292347783935 0.89372 0.78396
5 0.26823607367723035 0.91796 0.7894


In [7]:
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression


train_X, train_y = load_svmlight_file('./aclImdb/train/labeledBow.feat')
test_X, test_y = load_svmlight_file('./aclImdb/test/labeledBow.feat', n_features=train_X.shape[1])
model = LogisticRegression(C=0.1, max_iter=1000)
model.fit(train_X, train_y)
model.score(train_X, train_y), model.score(test_X, test_y)

(0.89876, 0.39608)

In [7]:
class SequenceTaggingNet(nn.Module):
    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x, h0=None, l=None):
        x = self.emb(x)
        if l is not None:
            x = nn.utils.rnn.pack_padded_sequence(x, l, batch_first=True)
        x, h = self.lstm(x, h0)
        if l is not None:
            hidden_state, cell_state = h
            x = hidden_state[-1]
        else:
            x = x[:, -1, :]
        x = self.linear(x).squeeze()
        return x

In [None]:
for epoch in range(10):
    losses = []
    net.train()
    for x, y, l in train_loader:
        l, sort_idx = torch.sort(l, descending=True)
        x = x[sort_idx]
        y = y[sort_idx]
        x = V(x)
        y = V(y.float())
        y_pred = net(x, l=list(l))
        loss = loss_f(y_pred, y)
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.data[0])
    train_acc = eval_net(net, train_loader)
    val_acc = eval_net(net, test_loader)
    print(epoch, mean(losses), train_acc, val_acc)

In [8]:
import string


all_chars = string.printable
vocab_size = len(all_chars)
vocab_dict = dict((c, i) for (i, c) in enumerate(all_chars))

def str2ints(s, vocab_dict):
    return [vocab_dict[c] for c in s]

def ints2str(x, vocab_array):
    return "".join([vocab_array[i] for i in x])

In [9]:
# curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > tinyshakespeare.txt


class ShakespeareDataset(Dataset):
    def __init__(self, path, chunk_size=200):
        data = str2ints(open(path).read().strip(), vocab_dict)
        data = torch.LongTensor(data).split(chunk_size)
        if len(data[-1]) < chunk_size:
            data = data[:-1]
        self.data = data
        self.n_chunks = len(self.data)
        
    def __len__(self):
        return self.n_chunks
    
    def __getitem__(self, idx):
        return self.data[idx]

In [10]:
ds = ShakespeareDataset('./tinyshakespeare.txt', chunk_size=200)
loader = DataLoader(ds, batch_size=32, shuffle=True, num_workers=4)

In [11]:
class SequenceGenerationNet(nn.Module):
    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, num_embeddings)
        
    def forward(self, x, h0=None):
        x = self.emb(x)
        x, h = self.lstm(x, h0)
        x = self.linear(x)
        return x, h

In [12]:
def generate_seq(net, start_phrase='The King said', length=200, temperature=0.8):
    net.eval()
    result = []
    start_tensor = torch.LongTensor(str2ints(start_phrase, vocab_dict))
    x0 = V(start_tensor.unsqueeze(0), volatile=True)
    o, h = net(x0)
    out_dist = o[:, -1].data.view(-1).exp()
    top_i = torch.multinomial(out_dist, 1)[0]
    result.append(top_i)
    for i in range(length):
        inp = torch.LongTensor([[top_i]])
        o, h = net(V(inp), h)
        out_dist = o.data.view(-1).exp()
        top_i = torch.multinomial(out_dist, 1)[0]
        result.append(top_i)
    return start_phrase + ints2str(result, all_chars)

In [33]:
from statistics import mean
from torch import optim


net = SequenceGenerationNet(vocab_size, 20, 50, num_layers=2, dropout=0.1)
opt = optim.Adam(net.parameters())
loss_f = nn.CrossEntropyLoss()
for epoch in range(50):
    net.train()
    losses = []
    for data in loader:
        x = V(data[:, :-1])
        y = V(data[:, 1:])
        y_pred, _ = net(x)
        # https://discuss.pytorch.org/t/runtimeerror-input-is-not-contiguous/930/8
        loss = loss_f(y_pred.view(-1, vocab_size), y.contiguous().view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        losses.append(loss.data[0])
    print('=================================================================================')
    print(epoch, mean(losses))
    print(generate_seq(net))

0 3.494255264827183
The King saidecCy sErntTtAthrnmTmoel
bdGsuoetSS.este,sur:orot 
tt
aghaea ,tkobbnb
v.
dEfM ut 
 
Iuec anns:y mdcu
Tsga
hut suwounRloeto .iboldbartyers aoaEcmg of iusmn pniha C,tu
c mrtadt hua  dtvtm. m Ithoed ueitse
1 3.0875517668042862
The King saidn
Hor tem, aee bun ake hy tho
laigrCte

: saoa edt bote.
Thr,
M ban l sne; pes~on eeiy.Tufunr urhmef foeeE,ro tocrd teo wegtt tu'dr rkelrem
shileh amhauusha
ehik aleed lctesrmee bo dlhah oert:
Ath ewat
2 2.702447132383074
The King saidsd'tet;8s I
Nrrhioaav ihs wo. huurut'g wherse gicik Airate eeetedy a the mef eind
Hhto yhod wair, iee the, sfoit;
No haf ymlle hy shaue. ledrlestr !,f mons sihe fary't

ECTAATBTSRD:
Q
Roed
To sad, gudy
3 2.489848403930664
The King saidce Pnshegal co ersibl hwe taed for tis diyiy gon;
matend art, her malk werted dusser ans, eiyech Khee wht ardise thulm conpwhanl ceipod the oom, wocve b nec,
Nhfit on:
RuP eis, thesud eody
hi; lhif sav
4 2.3617121750967844
The King said souing oy it I shabuisTi

In [27]:
import re
import collections
import itertools


# http://www.manythings.org/anki/spa-eng.zip
remove_marks_regex = re.compile('[,\.\(\)\[\]\*:;¿¡]|<.*?>')
shift_marks_regex = re.compile('([?!\.])')
unk = 0
sos = 1
eos = 2

def normalize(text):
    text = text.lower()
    text = remove_marks_regex.sub('', text)
    text = shift_marks_regex.sub(r' \1', text)
    return text

def parse_line(line):
    line = normalize(line.strip())
    src, trg = line.split('\t')
    src_tokens = src.strip().split()
    trg_tokens = trg.strip().split()
    return src_tokens, trg_tokens

def build_vocab(tokens):
    counts = collections.Counter(tokens)
    sorted_counts = sorted(counts.items(), key=lambda c: c[1], reverse=True)
    word_list = ['<UNK>', '<SOS>', '<EOS>'] + [x[0] for x in sorted_counts]
    word_dict = dict((w, i) for i, w in enumerate(word_list))
    return word_list, word_dict

def words2tensor(words, word_dict, max_len, padding=0):
    words = words + ['<EOS>']
    words = [word_dict.get(w, 0) for w in words]
    seq_len = len(words)
    if seq_len < max_len + 1:
        words = words + [padding] * (max_len + 1 - seq_len)
    return torch.LongTensor(words), seq_len

In [28]:
class TranslationPairDataset(Dataset):
    def __init__(self, path, max_len=15):
        def filter_pair(p):
            return not (len(p[0]) > max_len or len(p[1]) > max_len)
        
        with open(path) as fp:
            pairs = map(parse_line, fp)
            pairs = filter(filter_pair, pairs)
            pairs = list(pairs)
        src = [p[0] for p in pairs]
        trg = [p[1] for p in pairs]
        self.src_word_list, self.src_word_dict = build_vocab(itertools.chain.from_iterable(src))
        self.trg_word_list, self.trg_word_dict = build_vocab(itertools.chain.from_iterable(trg))
        self.src_data = [words2tensor(words, self.src_word_dict, max_len) for words in src]
        self.trg_data = [words2tensor(words, self.trg_word_dict, max_len, -100) for words in trg]
        
    def __len__(self):
        return len(self.src_data)
    
    def __getitem__(self, idx):
        src, lsrc = self.src_data[idx]
        trg, ltrg = self.trg_data[idx]
        return src, lsrc, trg, ltrg

In [29]:
batch_size = 64
max_len = 10
path = './spa.txt'
ds = TranslationPairDataset(path, max_len=max_len)
loader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=4)

In [30]:
class Encoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout)
        
    def forward(self, x, h0=None, l=None):
        x = self.emb(x)
        if l is not None:
            x = nn.utils.rnn.pack_padded_sequence(x, l, batch_first=True)
        _, h = self.lstm(x, h0)
        return h

In [31]:
class Decoder(nn.Module):
    def __init__(self, num_embeddings, embedding_dim=50, hidden_size=50, num_layers=1, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.linear = nn.Linear(hidden_size, num_embeddings)
    
    def forward(self, x, h):
        x = self.emb(x)
        x, h = self.lstm(x, h)
        x = x.view(-1, self.lstm.hidden_size)
        x = self.linear(x)
        return x, h

In [32]:
def translate(input_str, enc, dec, max_len=15):
    words = normalize(input_str).split()
    input_tensor, seq_len = words2tensor(words, ds.src_word_dict, max_len=max_len)
    input_tensor = input_tensor.unsqueeze(0)
    seq_len = [seq_len]
    sos_inputs = torch.LongTensor([sos]).unsqueeze(1)
    ctx = enc(V(input_tensor, volatile=True), l=seq_len)
    z = V(sos_inputs, volatile=True)
    h = ctx
    results = []
    for i in range(max_len):
        o, h = dec(z, h)
        wi = o.data.max(1)[1].view(1)
        if wi[0] == eos:
            break
        results.append(wi[0])
        z = V(wi.view(1, 1), volatile=True)
    return " ".join(ds.trg_word_list[i] for i in results)

In [33]:
enc = Encoder(len(ds.src_word_list), 100, 100, 2)
dec = Decoder(len(ds.trg_word_list), 100, 100, 2)
translate('I am a student.', enc, dec)

'monja lavásemos lavásemos lavásemos lavásemos cuarentas preso residentes residentes reventado reventado prius vivir vivir reventado'

In [34]:
enc = Encoder(len(ds.src_word_list), 100, 100, 1, dropout=0.1)
dec = Decoder(len(ds.trg_word_list), 100, 100, 1, dropout=0.1)
opt_enc = optim.Adam(enc.parameters(), 0.002)
opt_dec = optim.Adam(dec.parameters(), 0.01)
loss_f = nn.CrossEntropyLoss()

In [35]:
from statistics import mean


for epoc in range(10):
    enc.train()
    dec.train()
    losses = []
    for x, lx, y, ly in loader:
        sos_inputs = torch.LongTensor([sos] * len(x)).unsqueeze(1)
        lx, sort_idx = lx.sort(descending=True)
        x, y = x[sort_idx], y[sort_idx]
        x, y = V(x), V(y)
        loss = 0
        ctx = enc(x, l=list(lx))
        z = V(sos_inputs)
        h = ctx
        for i in range(max_len):
            o, h = dec(z, h)
            loss += loss_f(o, y[:, i])
            wi = o.data.max(1)[1].unsqueeze(1)
            z = V(wi)
        enc.zero_grad()
        dec.zero_grad()
        loss.backward()
        opt_enc.step()
        opt_dec.step()
        losses.append(loss.data[0])
    enc.eval()
    dec.eval()
    print('===================================================================================')
    print(epoc, mean(losses))
    print(translate('I am a student.', enc, dec, max_len=max_len))
    print(translate('He likes to eat pizza.', enc, dec, max_len=max_len))
    print(translate('She is my mother.', enc, dec, max_len=max_len))

0 48.09250288635291
soy un
a le gusta a a
ella es mi padre
1 38.03870529137383
soy un
a gusta gusta a a
ella es mi madre madre
2 32.706015105630485
soy un estudiante
a gusta le gusta pizza pizza
ella es mi madre
3 29.407002432078837
soy un estudiante
a gustan comer pizza pizza
ella es mi
4 27.15345342578806
soy un estudiante
a gusta le gustan
ella es mi
5 25.54423598377782
soy estudiante
le gusta pizza pizza pizza pizza
ella es mi madre
6 24.301904748510832
soy un estudiante
a gusta le pizza pizza pizza
ella es mi madre
7 23.38268745464463
soy un estudiante
a gusta le gusta pizza pizza
ella es mi madre
8 22.60478694361459
soy estudiante estudiante
a gusta quiere pizza pizza
ella es mi madre
9 22.026191638336087
soy estudiante
a gusta comer pizza pizza
ella es mi madre
