In [64]:
from time import time
from random import random, choice

import numpy as np
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.nn import init
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

import torchtext

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

np.random.seed(42)
CUDA = torch.cuda.is_available()

CUDA

True

In [2]:
# alphabet from the paper
# https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
ALPHABET = ['<UNK>'] + ['\n'] + [s for s in """ abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}"""]
char2int = {s: i for s, i in zip(ALPHABET, range(len(ALPHABET)))}

MAX_WORD_LEN = 8  # chars in word (try 32?)
MAX_TEXT_LEN = 256  # words in text

BATCH_SIZE = 32
VALID_SIZE = 0.1

# Data preparation

Чтобы использовать CNN на слова, нужно фиксировать длину слова.

In [3]:
class HieracialIMDB(torchtext.datasets.imdb.IMDB):
    """
    Zero vector used for padding
    """
    noise_level = 0
    alphabet = ALPHABET

    def __getitem__(self, idx):
        item = super(HieracialIMDB, self).__getitem__(idx)
        _text_tensor = self.preprocess(item.text)

        label = int(item.label == 'pos')
        return _text_tensor, label
    
    def preprocess(self, text, with_noise=True):
        _text_tensor = torch.zeros([MAX_WORD_LEN * MAX_TEXT_LEN, len(self.alphabet)])

        for i, token in enumerate(text):
            if i >= MAX_TEXT_LEN:
                break
            if with_noise:
                token = self.noise_generator(token)
            for j, char in enumerate(token):
                if j >= MAX_WORD_LEN:
                    break
                _text_tensor[i*MAX_WORD_LEN + j, char2int.get(char, char2int['<UNK>'])] = 1.
        return _text_tensor
    
#     def _encode_word(self, word):
#         word_tensor = torch.zeros([MAX_WORD_LEN, len(ALPHABET)])
        
#         for i, char in enumerate(word):
#             word_tensor[i,char2int[char]] = 1.
        
#         return word_tensor

    def noise_generator(self, string):
        # removed '' symbol from alphabet for safety on word vectors
        noised = ""
        for c in string:
            if random() > self.noise_level:
                noised += c
            if random() < self.noise_level:
                noised += choice(self.alphabet)
        return noised


In [54]:
def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)

    train_idx, valid_idx = indices[:-val_actual_size], indices[-val_actual_size:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=4
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=4
    )

    return train_loader, valid_loader

def onehot2text(one_hotted_text, batch_size=None, show_pad=False):
    if batch_size is None:
        text = ''
        max_values, idx = torch.max(one_hotted_text, 1)
        for c, i in enumerate(idx):
            if max_values[c] == 0:
                if show_pad:
                    symb = '<PAD>'
                else:
                    symb = ''
            else:
                symb = ALPHABET[i]
            text += symb
        return text
    else:
        texts = []
        for text in one_hotted_text:
            texts.append(onehot2text(one_hotted_text, batch_size=None))
        return texts

def get_metrics_from_dataset(model, test_dataset, noise_level=None):
    """
    Moder will be in TRAIN mode after that
    """
    model.eval()

    predictions = []
    lables = []
    
    if noise_level is not None:
        test_dataset.noise_level = noise_level

    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=BATCH_SIZE
    )

    for text, label in test_dataloader:
        if CUDA:
            text = Variable(text.cuda())
        else:
            text = Variable(text)

        text = text.permute(1, 0, 2)  # (1, 0, 2) for RNN
        prediction = model(text)

        _, idx = torch.max(prediction, 1)
        predictions += idx.data.tolist()
        lables += label.tolist()

    acc = accuracy_score(lables, predictions)
    f1 = f1_score(lables, predictions)
    model.train()
    return {'accuracy': acc, 'f1': f1}

def get_metrics_from_dataloader(model, test_dataloader):
    """
    Moder will be in TRAIN mode after that
    """
    model.eval()

    predictions = []
    lables = []
    
    for text, label in test_dataloader:
        if CUDA:
            text = Variable(text.cuda())
        else:
            text = Variable(text)

        text = text.permute(1, 0, 2)
        prediction = model(text)

        _, idx = torch.max(prediction, 1)
        predictions += idx.data.tolist()
        lables += label.tolist()

    acc = accuracy_score(lables, predictions)
    f1 = f1_score(lables, predictions)
    model.train()
    return {'accuracy': acc, 'f1': f1}


In [5]:
# without spacy tokenizer it's commas all after the words =(

text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=MAX_TEXT_LEN, tensor_type=torch.FloatTensor, batch_first=True,
    use_vocab=False, tokenize='spacy'
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

In [6]:
%%time
train, test = HieracialIMDB.splits(text_field, label_field)

CPU times: user 1min 6s, sys: 623 ms, total: 1min 7s
Wall time: 1min 7s


In [7]:
onehot2text(train[0][0])  # no spaces is onehot2text problem, not a data one

"thismusicalisdecidedlymixed,andnoneoftheelementsreallyfittogether,butitsomehowmanagestobemostlyenjoyable.theplotcontainssomeoftheelementsofwodehouse'snovel,butnoneofitsvirtues,thoughheco-wrotethescript.thesongs,thoughcharming,havenothingtodowiththisparticularfilm,andareunusuallycrudelysqueezedintotheplot,evenbypre-oklahomastandards.burnsandallendotheirusualshtickquitecompetently,butitmissesthetoneoftherestofthefilmbyaboutfortyiqpoints.<br/><br/>thereareafewhighpoints.reginaldgardinerdoesgoodworkwhenheremembersthatthisisatalkie,andstopsmugginglikeasilentactor.andthereareafewbitsofwritingwhichcouldonlyhavebeenwrittenbywodehouse,thoughmostofthefilmfeelsliketheproductionofoneofthehollywoodmeetingshelaterparodied."

In [7]:
dataloader, val_dataloader = get_train_valid_loader(train, VALID_SIZE, BATCH_SIZE)

test_dataloader = torch.utils.data.DataLoader(
    test, batch_size=BATCH_SIZE
)
# from https://github.com/akurniawan/pytorch-transformer

# Model

In [47]:
# https://github.com/akurniawan/pytorch-transformer
class MultiHeadAttention(nn.Module):
    def __init__(self,
                 query_dim,
                 key_dim,
                 num_units,
                 dropout_p=0.5,
                 h=8,
                 is_masked=False):
        super(MultiHeadAttention, self).__init__()

        if query_dim != key_dim:
            raise ValueError("query_dim and key_dim must be the same")
        if num_units % h != 0:
            raise ValueError("num_units must be dividable by h")
        if query_dim != num_units:
            raise ValueError("to employ residual connection, the number of "
                             "query_dim and num_units must be the same")

        self._num_units = num_units
        self._h = h
        self._key_dim = Variable(torch.FloatTensor([key_dim]))
        if CUDA:
            self._key_dim = self._key_dim.cuda()
        self._dropout_p = dropout_p
        self._is_masked = is_masked

        self.query_layer = nn.Linear(query_dim, num_units, bias=False)
        self.key_layer = nn.Linear(key_dim, num_units, bias=False)
        self.value_layer = nn.Linear(key_dim, num_units, bias=False)
        self.bn = nn.BatchNorm1d(num_units)

    def forward(self, query, keys):
        Q = self.query_layer(query)
        K = self.key_layer(keys)
        V = self.value_layer(keys)

        # split each Q, K and V into h different values from dim 2
        # and then merge them back together in dim 0
        chunk_size = int(self._num_units / self._h)
        Q = torch.cat(Q.split(split_size=chunk_size, dim=2), dim=0)
        K = torch.cat(K.split(split_size=chunk_size, dim=2), dim=0)
        V = torch.cat(V.split(split_size=chunk_size, dim=2), dim=0)

        # calculate QK^T
        attention = torch.matmul(Q, K.transpose(1, 2))
        # normalize with sqrt(dk)
        attention = attention / torch.sqrt(self._key_dim)
        # use masking (usually for decoder) to prevent leftward
        # information flow and retains auto-regressive property
        # as said in the paper
        if self._is_masked:
            diag_vals = attention[0].sign().abs()
            diag_mat = diag_vals.tril()
            diag_mat = diag_mat.unsqueeze(0).expand(attention.size())
            # we need to enforce converting mask to Variable, since
            # in pytorch we can't do operation between Tensor and
            # Variable
            mask = Variable(
                torch.ones(diag_mat.size()) * (-2**32 + 1), requires_grad=False)
            # this is some trick that I use to combine the lower diagonal
            # matrix and its masking. (diag_mat-1).abs() will reverse the value
            # inside diag_mat, from 0 to 1 and 1 to zero. with this
            # we don't need loop operation andn could perform our calculation
            # faster
            attention = (attention * diag_mat) + (mask * (diag_mat-1).abs())
        # put it to softmax
        attention = F.softmax(attention, dim=-1)
        # apply dropout
        attention = F.dropout(attention, self._dropout_p)
        # multiplyt it with V
        attention = torch.matmul(attention, V)
        # convert attention back to its input original size
        restore_chunk_size = int(attention.size(0) / self._h)
        attention = torch.cat(
            attention.split(split_size=restore_chunk_size, dim=0), dim=2)
        # residual connection
        attention += query
        # apply batch normalization
#         attention = self.bn(attention.transpose(1, 2)).transpose(1, 2)

        return attention

In [49]:
class AttentionedYoonKimModel(nn.Module):
    def __init__(self,
                 n_filters,
                 cnn_kernel_size,
                 hidden_dim_out,
                 dropout=0.5,
                 init_function=None,
                 embedding_dim=len(ALPHABET),
                 pool_kernel_size=MAX_WORD_LEN,
                 heads=1):
        """
        CharCNN-WordRNN model with multi-head attention
        Default pooling is MaxOverTime pooling
        """
        assert cnn_kernel_size % 2  # for 'same' padding

        super(AttentionedYoonKimModel, self).__init__()
        self.dropout = dropout
        self.init_function = init_function
        self.embedding_dim = embedding_dim
        self.n_filters = n_filters
        self.cnn_kernel_size = cnn_kernel_size
        self.hidden_dim_out = hidden_dim_out
        self.heads = heads

        self.embedding = nn.Linear(len(ALPHABET), embedding_dim)
        self.chars_cnn = nn.Sequential(
            nn.Conv1d(embedding_dim, n_filters, kernel_size=cnn_kernel_size, stride=1, padding=int(cnn_kernel_size - 1) // 2),  # 'same' padding
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_kernel_size)
        )
        if init_function is not None:
            self.chars_cnn[0].weight = init_function(self.chars_cnn[0].weight)

        _conv_stride = 1  # by default
        _pool_stride = pool_kernel_size  # by default
        # I am not sure this formula is always correct:
        self.conv_dim = n_filters * max(1, int(((MAX_WORD_LEN - cnn_kernel_size) / _conv_stride - pool_kernel_size) / _pool_stride + 1))

        self.words_rnn = nn.GRU(self.conv_dim, hidden_dim_out, dropout=dropout)
        self.attention = MultiHeadAttention(hidden_dim_out, hidden_dim_out, hidden_dim_out, dropout_p=self.dropout, h=self.heads)
        self.projector = nn.Linear(hidden_dim_out, 2)
        
    def forward(self, x):
        batch_size = x.size(1)
        # TODO: hadrcode! (for CUDA)
        words_tensor = Variable(torch.zeros(MAX_TEXT_LEN, batch_size, self.conv_dim)).cuda()
        
        for i in range(MAX_TEXT_LEN):
            word = x[i * MAX_WORD_LEN : (i + 1) * MAX_WORD_LEN, :]
            word = self.embedding(word)
            word = word.permute(1, 2, 0)
            word = self.chars_cnn(word)
            word = word.view(word.size(0), -1)
            words_tensor[i, :] = word

        x, _ = self.words_rnn(words_tensor)
        x = self.attention(x, x)
        x = self.projector(x[-1])
        return x


In [63]:
def model_params_num(model):
    return sum(np.prod(list(p.size())) for p in model.parameters())

def mk_dataline(model_type, epochs, lr, noise_level_train, noise_level_test, acc_train, acc_test,
                f1_train, f1_test, dropout, model, init_function=None):
    return {
        'task': 'IMDB binary classification',
        'model_type': model_type,
        'trainable_params': model_params_num(model), 'dropout': dropout, 'init_function': init_function,
        'epochs': epochs, 'lr': lr,
        'noise_level_train': noise_level_train, 'noise_level_test': noise_level_test,
        'acc_train': acc_train, 'acc_test': acc_test,
        'f1_train': f1_train, 'f1_test': f1_test,
        'model_desc': str(model),
        'data_desc': 'Maxlen 512'
    }

In [8]:
results = []

In [50]:
def run_model_with(noise_level, n_filters, cnn_kernel_size, hidden_dim_out, dropout=0.5,
                   lr=1e-4, epochs=30, heads=1, init_function=None, _model=None):
    start_time = time()
    HieracialIMDB.noise_level = noise_level

    if _model is None:
        model = AttentionedYoonKimModel(
            n_filters=n_filters, cnn_kernel_size=cnn_kernel_size, hidden_dim_out=hidden_dim_out, dropout=dropout,
            init_function=init_function
        )
        if CUDA:
            model.cuda()
        model.train()
    
    else:
        model = _model
    
    model_name = '_AttentionedYoonKim_lr%s_dropout%s_noise_level%s_spacy_wordlen8_heads%s' % (
        int(-np.log10(lr)), model.dropout, noise_level, model.heads
    )

    writer = SummaryWriter(comment=model_name)
    print('Writer: %s' % list(writer.all_writers.keys()))

    optimizer = optim.Adam(params=model.parameters(), lr=lr)
    optimizer.zero_grad()
    
    global_step = 0

    loss_f = F.cross_entropy

    for epoch in range(epochs):

        for batch_idx, (text, label) in enumerate(dataloader):
            optimizer.zero_grad()

            if CUDA:
                text = Variable(text.cuda())
                label = Variable(torch.LongTensor(label).cuda())
            else:
                text = Variable(text)
                label = Variable(torch.LongTensor(label))

            text = text.permute(1, 0, 2)
            prediction = model(text)
            loss = loss_f(prediction, label)

            writer.add_scalar('loss', loss.data[0], global_step=global_step)

            loss.backward()        
            torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
            optimizer.step()

            if CUDA:
                torch.cuda.synchronize()
            global_step += 1

        # evaluation
        if epoch % 10 == 0:
        if epoch % 10 == 0:
            print('Epoch %s. Global step %s. T=%s min' % (epoch, global_step, (time() - start_time) / 60.))
            print('Loss               : %s' % loss.data[0])

        # in-batch
        _, idx = torch.max(prediction, 1)
        _labels = label.data.tolist()
        _predictions = idx.data.tolist()
        acc = accuracy_score(_labels, _predictions)
        f1 = f1_score(_labels, _predictions)
        writer.add_scalar('accuracy_train', acc, global_step=global_step)
        writer.add_scalar('f1_train', f1, global_step=global_step)
        if epoch % 10 == 0:
            print('In-batch accuracy  :', acc)

        # validation
        acc = get_accuracy(model, val_dataloader)
        writer.add_scalar('accuracy_val', acc, global_step=global_step)
        if epoch % 10 == 0:
            print('Validation accuracy:', acc)
            print()

    # Test

    acc = get_accuracy(model, test_dataloader)
    print('Final test accuracy:', acc)
    writer.add_scalar('accuracy_test_final', acc, global_step=global_step)
    print()
    model.eval()
    # model is in EVAL mode!
    return model


In [None]:
model = run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-3, epochs=30, heads=1
)

Epoch 0. Global step 704
Loss               : 0.6714678406715393
In-batch accuracy  : 1.0
Validation accuracy: 0.5048

Epoch 10. Global step 7744
Loss               : 0.0345512330532074
In-batch accuracy  : 1.0
Validation accuracy: 0.8504



In [38]:
model = run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-3, epochs=30, heads=1
)

torch.Size([256, 32, 128])


RuntimeError: cuDNN requires contiguous input tensor

In [30]:
model = run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-3, epochs=30, heads=1
)

torch.Size([256, 32, 128])


RuntimeError: cuDNN requires contiguous input tensor