# Yoon Kim

In [1]:
from random import random, choice

import numpy as np
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

import torchtext

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

SEED = 42  # single seed shared by every RNG seeded in this cell

# Seed NumPy (used for the train/validation index shuffle) and PyTorch
# (weight initialisation, random samplers) so that repeated runs match.
np.random.seed(SEED)
torch.manual_seed(SEED)

CUDA = torch.cuda.is_available()
if CUDA:
    # Explicitly seed the generators of all visible GPUs as well.
    torch.cuda.manual_seed_all(SEED)

CUDA

True

In [2]:
# Character inventory, following the alphabet from the paper
# https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
# Index 0 is the out-of-alphabet marker, index 1 is the newline character.
# NOTE(review): a few symbols occur more than once in the literal below
# (e.g. '-', the space and the right quote), so some one-hot columns are never
# set. The list is kept as is because its length defines the input width of
# the model and of the encoded tensors.
# The backslash is written as '\\' so the literal contains no invalid escape.
ALPHABET = ['<UNK>', '\n'] + list(""" abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'’’/\\|_@#$%ˆ&* ̃‘+-=<>()[]{}""")
# For duplicated symbols the later position overwrites the earlier one.
char2int = {symbol: index for index, symbol in enumerate(ALPHABET)}

MAX_WORD_LEN = 16  # chars in word (try 32?)
MAX_TEXT_LEN = 256  # words in text

BATCH_SIZE = 32
VALID_SIZE = 0.1

# Data preparation

Чтобы применять CNN к словам, нужно зафиксировать длину слова.

In [3]:
class HieracialIMDB(torchtext.datasets.imdb.IMDB):
    """
    IMDB dataset whose items are character-level one-hot encoded texts.

    Indexing returns ``(text_tensor, label)``.  ``text_tensor`` has shape
    ``(MAX_WORD_LEN * MAX_TEXT_LEN, len(alphabet))``: token ``i`` occupies rows
    ``i * MAX_WORD_LEN`` .. ``(i + 1) * MAX_WORD_LEN - 1`` and every row that is
    not filled by a character stays all-zero, i.e. the zero vector is used for
    padding.  ``label`` is 1 for a ``'pos'`` example and 0 otherwise.
    """
    # Probability used by ``noise_generator`` both for dropping a character and
    # for inserting a random one.  It is a class attribute, so assigning
    # ``HieracialIMDB.noise_level`` changes it for every dataset instance.
    noise_level = 0
    alphabet = ALPHABET

    def __getitem__(self, idx):
        """Return the encoded text tensor and the integer label of example ``idx``."""
        item = super(HieracialIMDB, self).__getitem__(idx)
        text_tensor = self.preprocess(item.text)

        label = int(item.label == 'pos')
        return text_tensor, label

    def preprocess(self, text, with_noise=True):
        """
        One-hot encode a tokenised text.

        :param text: iterable of string tokens; tokens after the first
            ``MAX_TEXT_LEN`` are ignored
        :param with_noise: if true, every token goes through
            ``noise_generator`` before it is encoded
        :return: float tensor of shape
            ``(MAX_WORD_LEN * MAX_TEXT_LEN, len(self.alphabet))``
        """
        unk_index = char2int['<UNK>']
        text_tensor = torch.zeros([MAX_WORD_LEN * MAX_TEXT_LEN, len(self.alphabet)])

        for word_pos, token in enumerate(text):
            if word_pos >= MAX_TEXT_LEN:
                break
            if with_noise:
                token = self.noise_generator(token)
            row_offset = word_pos * MAX_WORD_LEN
            # Truncation happens after the noise was applied; characters that
            # are not in the alphabet are mapped to the '<UNK>' column.
            for char_pos, char in enumerate(token[:MAX_WORD_LEN]):
                text_tensor[row_offset + char_pos, char2int.get(char, unk_index)] = 1.
        return text_tensor

    def noise_generator(self, string):
        """
        Return ``string`` with random character deletions and insertions.

        Each character is kept when ``random() > noise_level``; independently,
        a random alphabet symbol is appended after its position when
        ``random() < noise_level``.
        """
        # Only single-character entries are candidates for insertion: a
        # multi-character entry such as '<UNK>' would otherwise be spliced into
        # the word as several separate characters.
        insertable = [symbol for symbol in self.alphabet if len(symbol) == 1]

        noised = []
        for c in string:
            if random() > self.noise_level:
                noised.append(c)
            if insertable and random() < self.noise_level:
                noised.append(choice(insertable))
        return "".join(noised)


In [4]:
def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):
    """
    Split ``dataset`` into train and validation parts and build a loader for each.

    Both loaders wrap the same ``dataset`` object and differ only in the
    indices their ``SubsetRandomSampler`` draws from.

    :param dataset: indexable dataset with a length
    :param valid_size: fraction of the dataset (0..1) used for validation;
        the validation part holds ``int(len(dataset) * valid_size)`` items
    :param batch_size: batch size of both loaders
    :param random_seed: seed passed to ``np.random.seed`` before shuffling
        (this re-seeds NumPy's global generator)
    :param shuffle: shuffle the indices before splitting; without it the
        validation part is the tail of the dataset
    :param num_workers: number of worker processes for both loaders
    :return: ``(train_loader, valid_loader)``
    :raises ValueError: if ``valid_size`` is outside ``[0, 1]``
    """
    if not 0 <= valid_size <= 1:
        raise ValueError('valid_size must be in [0, 1], got %s' % valid_size)

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)
    # Split on an explicit position: slicing with ``-val_actual_size`` breaks
    # when the validation part is empty, because ``indices[:-0]`` is ``[]``.
    split_point = len_dataset - val_actual_size
    train_idx, valid_idx = indices[:split_point], indices[split_point:]

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(train_idx), num_workers=num_workers
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=SubsetRandomSampler(valid_idx), num_workers=num_workers
    )

    return train_loader, valid_loader

def get_accuracy(model, test_dataloader):
    """
    Compute the classification accuracy of ``model`` over ``test_dataloader``.

    The loader must yield ``(text, label)`` batches; ``text`` is moved to the
    GPU when ``CUDA`` is set and its first two axes are swapped before it is
    passed to the model.  The predicted class is the arg-max over axis 1 of the
    model output.

    The model is put into eval mode for the pass and will be in TRAIN mode
    after that, also when the evaluation raises.

    :return: accuracy as computed by ``accuracy_score``
    """
    model.eval()

    predictions = []
    labels = []

    try:
        for text, label in test_dataloader:
            if CUDA:
                text = text.cuda()
            text = Variable(text).permute(1, 0, 2)

            prediction = model(text)

            _, idx = torch.max(prediction, 1)
            predictions.extend(idx.data.tolist())
            labels.extend(label.tolist())
    finally:
        # Never leave the caller's model stuck in eval mode.
        model.train()

    return accuracy_score(labels, predictions)


def onehot2text(one_hotted_text, batch_size=None, show_pad=False, alphabet=None):
    """
    Decode one-hot encoded characters back into a string.

    :param one_hotted_text: 2-D tensor ``(positions, alphabet)`` or, when
        ``batch_size`` is given, a batch of such tensors
    :param batch_size: ``None`` to decode a single text; any other value makes
        the function iterate over the first axis and return a list of strings
    :param show_pad: render all-zero rows as ``'<PAD>'`` instead of skipping them
    :param alphabet: index-to-symbol list; defaults to the notebook ``ALPHABET``
    :return: decoded string, or list of decoded strings in batch mode

    Characters are concatenated without any separator, so the word boundaries
    of the encoded text are not restored.
    """
    if alphabet is None:
        alphabet = ALPHABET

    if batch_size is not None:
        # Decode every sample on its own and forward the display options.
        return [
            onehot2text(sample, batch_size=None, show_pad=show_pad, alphabet=alphabet)
            for sample in one_hotted_text
        ]

    pad_symbol = '<PAD>' if show_pad else ''
    max_values, idx = torch.max(one_hotted_text, 1)

    symbols = []
    for position, char_index in enumerate(idx):
        if max_values[position] == 0:
            # An all-zero row is padding (no character was written there).
            symbols.append(pad_symbol)
        else:
            symbols.append(alphabet[int(char_index)])
    return ''.join(symbols)

In [5]:
# Tokenisation caveat: without the spaCy tokenizer the commas stay attached to
# the end of the words. To try spaCy, add tokenize='spacy' to the text field.

text_field = torchtext.data.Field(
    use_vocab=False,
    lower=True,
    batch_first=True,
    include_lengths=False,
    fix_length=MAX_TEXT_LEN,
    tensor_type=torch.FloatTensor,
)
label_field = torchtext.data.Field(use_vocab=False, sequential=False)

In [6]:
%%time
# Build the train and test splits of the dataset, using the text and label
# fields defined in the previous cell.
train, test = HieracialIMDB.splits(text_field, label_field)

CPU times: user 3.74 s, sys: 788 ms, total: 4.53 s
Wall time: 8.69 s


In [7]:
# Sanity check: decode the first training example back to text.
# The words run together because onehot2text joins the characters without a
# separator; the spaces are missing in the decoder, not in the data.
onehot2text(train[0][0])

"thismusicalisdecidedlymixed,andnoneoftheelementsreallyfittogether,butitsomehowmanagestobemostlyenjoyable.theplotcontainssomeoftheelementsofwodehouse'snovel,butnoneofitsvirtues,thoughheco-wrotethescript.thesongs,thoughcharming,havenothingtodowiththisparticularfilm,andareunusuallycrudelysqueezedintotheplot,evenbypre-oklahomastandards.burnsandallendotheirusualshtickquitecompetently,butitmissesthetoneoftherestofthefilmbyaboutfortyiqpoints.<br/><br/>thereareafewhighpoints.reginaldgardinerdoesgoodworkwhenheremembersthatthisisatalkie,andstopsmugginglikeasilentactor.andthereareafewbitsofwritingwhichcouldonlyhavebeenwrittenbywodehouse,thoughmostofthefilmfeelsliketheproductionofoneofthehollywoodmeetingshelaterparodied."

In [8]:
# Train and validation loaders are both built on top of the `train` split.
dataloader, val_dataloader = get_train_valid_loader(
    dataset=train, valid_size=VALID_SIZE, batch_size=BATCH_SIZE
)

# The held-out test split gets a plain loader with default settings.
test_dataloader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE)

# Model

Статья: https://arxiv.org/abs/1508.06615

Модель в целом работает так же, как в статье, но с существенными упрощениями:
  * нет highway-слоя
  * используются фильтры только одного размера (а не трёх, как в оригинальной статье)

In [26]:
class YoonKimModel(nn.Module):
    """
    Character-aware text classifier: a character CNN per word, a GRU over words.

    Every word (a window of ``MAX_WORD_LEN`` one-hot character rows) is mapped
    through a linear embedding, a 1-D convolution with 'same' padding, ReLU and
    max pooling, and is flattened into one word vector.  The ``MAX_TEXT_LEN``
    word vectors are fed to a GRU and the output of the last time step is
    projected onto two class scores.
    """

    def __init__(self, n_filters, cnn_kernel_size, hidden_dim_out,
                 dropout=0.5, init_function=None, embedding_dim=len(ALPHABET), pool_kernel_size=MAX_WORD_LEN):
        """
        Default pooling is MaxOverTime pooling

        :param n_filters: number of convolution filters (output channels)
        :param cnn_kernel_size: convolution width; must be odd so that 'same'
            padding keeps the word length unchanged
        :param hidden_dim_out: hidden size of the GRU
        :param dropout: value passed to ``nn.GRU(dropout=...)``
        :param init_function: optional callable applied to the convolution
            weight (e.g. an in-place initialiser from ``torch.nn.init``)
        :param embedding_dim: output size of the character embedding layer
        :param pool_kernel_size: max-pooling window (the stride is the same);
            the default pools over the whole word
        :raises ValueError: if ``pool_kernel_size`` is not in ``1..MAX_WORD_LEN``
        """
        assert cnn_kernel_size % 2, "cnn_kernel_size must be odd for 'same' padding"
        if not 1 <= pool_kernel_size <= MAX_WORD_LEN:
            raise ValueError('pool_kernel_size must be in 1..%s, got %s' % (MAX_WORD_LEN, pool_kernel_size))

        super(YoonKimModel, self).__init__()
        self.dropout = dropout
        self.init_function = init_function
        self.embedding_dim = embedding_dim
        self.n_filters = n_filters
        self.cnn_kernel_size = cnn_kernel_size
        self.hidden_dim_out = hidden_dim_out

        # A linear layer over one-hot rows acts as the character embedding.
        self.embedding = nn.Linear(len(ALPHABET), embedding_dim)
        self.chars_cnn = nn.Sequential(
            nn.Conv1d(embedding_dim, n_filters, kernel_size=cnn_kernel_size, stride=1,
                      padding=(cnn_kernel_size - 1) // 2),  # 'same' padding
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_kernel_size)
        )
        if init_function is not None:
            conv_layer = self.chars_cnn[0]
            initialised = init_function(conv_layer.weight)
            # In-place initialisers have already updated the weight; only
            # rebind when the callable handed back a parameter object.
            if isinstance(initialised, nn.Parameter):
                conv_layer.weight = initialised

        # With 'same' padding the convolution output keeps MAX_WORD_LEN
        # positions; pooling with stride == kernel size leaves
        # MAX_WORD_LEN // pool_kernel_size of them per filter.
        self.conv_dim = n_filters * (MAX_WORD_LEN // pool_kernel_size)

        # NOTE(review): the GRU has a single layer, so its `dropout` argument
        # is presumably without effect — confirm against the nn.GRU docs.
        self.words_rnn = nn.GRU(self.conv_dim, hidden_dim_out, dropout=dropout)
        self.projector = nn.Linear(hidden_dim_out, 2)

    def forward(self, x):
        """
        Classify a batch of encoded texts.

        :param x: tensor of shape
            ``(MAX_TEXT_LEN * MAX_WORD_LEN, batch, len(ALPHABET))``
        :return: class scores of shape ``(batch, 2)``
        """
        word_vectors = []
        for i in range(MAX_TEXT_LEN):
            word = x[i * MAX_WORD_LEN : (i + 1) * MAX_WORD_LEN, :]
            word = self.embedding(word)       # (MAX_WORD_LEN, batch, embedding_dim)
            word = word.permute(1, 2, 0)      # (batch, embedding_dim, MAX_WORD_LEN)
            word = self.chars_cnn(word)       # (batch, n_filters, pooled length)
            word_vectors.append(word.view(word.size(0), -1))

        # Stacking keeps the result on the same device as the input, so the
        # model runs on the CPU as well as on the GPU.
        words_tensor = torch.stack(word_vectors, 0)  # (MAX_TEXT_LEN, batch, conv_dim)

        x, _ = self.words_rnn(words_tensor)
        x = self.projector(x[-1])  # output of the last time step
        return x

#     def describe(self):
#         return 'YoonKimModel_filters_%s_cnn_%s_pool_%s_stride_%s'

In [48]:
def run_model_with(noise_level, n_filters, cnn_kernel_size, hidden_dim_out, dropout=0.5,
                   lr=1e-4, epochs=30, _model=None):
    """
    Train a ``YoonKimModel`` and report train/validation/test accuracy.

    The function reads the notebook-level ``dataloader``, ``val_dataloader``
    and ``test_dataloader`` and sets ``HieracialIMDB.noise_level``, which
    changes the noise of every dataset instance.  Loss and accuracies are
    printed per epoch and written to a tensorboardX ``SummaryWriter``.

    :param noise_level: value assigned to ``HieracialIMDB.noise_level``
    :param n_filters: passed to ``YoonKimModel``
    :param cnn_kernel_size: passed to ``YoonKimModel``
    :param hidden_dim_out: passed to ``YoonKimModel``
    :param dropout: passed to ``YoonKimModel``
    :param lr: Adam learning rate
    :param epochs: number of passes over ``dataloader``
    :param _model: optional existing model to continue training; when given,
        the architecture arguments are not used to build a new one
    :return: the trained model, in EVAL mode
    """
    HieracialIMDB.noise_level = noise_level

    if _model is None:
        model = YoonKimModel(
            n_filters=n_filters, cnn_kernel_size=cnn_kernel_size, hidden_dim_out=hidden_dim_out, dropout=dropout
        )
        if CUDA:
            model.cuda()
    else:
        model = _model
    # A model returned by an earlier call is in eval mode, so switch to train
    # mode in both cases before optimising.
    model.train()

    writer = SummaryWriter(comment='_YoonKim_lr%s_dropout%s_noise_level%s' %
                           (int(-np.log10(lr)), dropout, noise_level))

    try:
        optimizer = optim.Adam(params=model.parameters(), lr=lr)

        global_step = 0

        loss_f = F.cross_entropy

        for epoch in range(epochs):

            for batch_idx, (text, label) in enumerate(dataloader):
                optimizer.zero_grad()

                text = Variable(text.cuda() if CUDA else text)
                label = torch.LongTensor(label)
                label = Variable(label.cuda() if CUDA else label)

                # The model expects the character axis first, the batch axis second.
                text = text.permute(1, 0, 2)
                prediction = model(text)
                loss = loss_f(prediction, label)

                # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch idiom;
                # newer versions need `loss.item()` instead.
                writer.add_scalar('loss', loss.data[0], global_step=global_step)

                loss.backward()
                torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
                optimizer.step()

                if CUDA:
                    torch.cuda.synchronize()
                global_step += 1

            # evaluation
            print('Epoch %s. Global step %s' % (epoch, global_step))
            print('Loss               : %s' % loss.data[0])

            # Train accuracy is measured on the last batch of the epoch only.
            _, idx = torch.max(prediction, 1)
            acc = accuracy_score(label.data.tolist(), idx.data.tolist())
            writer.add_scalar('accuracy_train', acc, global_step=global_step)
            print('In-batch accuracy  :', acc)

            acc = get_accuracy(model, val_dataloader)
            print('Validation accuracy:', acc)
            writer.add_scalar('accuracy_val', acc, global_step=global_step)
            print()

        # Test

        acc = get_accuracy(model, test_dataloader)
        print('Final test accuracy:', acc)
        writer.add_scalar('accuracy_test_final', acc, global_step=global_step)
        print()
    finally:
        # Flush and release the event file even when training is interrupted.
        writer.close()

    model.eval()
    # model is in EVAL mode!
    return model


In [None]:
%%time
# Train on text without noise (noise_level=0) for 10 epochs.
# The returned model is in eval mode.
model = run_model_with(
    noise_level=0, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-4, epochs=10
)

adding scalar
adding scalar
adding scalar
adding scalar
adding scalar
adding scalar
adding scalar
adding scalar
adding scalar
adding scalar


# Нужное ненужное

In [29]:
# Take the first batch yielded by the training loader for manual inspection;
# `item` is the (texts, labels) pair of that batch.
for i in dataloader:
    item = i
    break

In [38]:
# Fresh model with the same hyper-parameters as the training call above.
# NOTE: this rebinds `model`, replacing the object returned by run_model_with.
model = YoonKimModel(n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5)

In [42]:
# Move the model to the GPU; the cell output is the module summary.
model.cuda()

YoonKimModel(
  (embedding): Linear(in_features=74, out_features=74, bias=True)
  (chars_cnn): Sequential(
    (0): Conv1d(74, 256, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=16, stride=16, padding=0, dilation=1, ceil_mode=False)
  )
  (words_rnn): GRU(256, 128, dropout=0.5)
  (projector): Linear(in_features=128, out_features=2, bias=True)
)

In [33]:
# Batch texts on the GPU with the first two axes swapped, matching what
# run_model_with does before it calls the model.
text = Variable(item[0].cuda()).permute(1, 0, 2)

In [45]:
# Batch labels as a LongTensor variable on the GPU (the target for cross_entropy).
label = Variable(torch.LongTensor(item[1])).cuda()

In [46]:
# Cross-entropy of the model on this single batch; a value near ln 2 ≈ 0.693
# corresponds to chance level for two classes.
F.cross_entropy(model(text), label)

Variable containing:
 0.6893
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [76]:
# Raw text tensor of the inspected batch (first element of the batch pair).
item[0]


( 0  ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 1  ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 2  ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
 ... 

( 29 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

( 30 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...   

In [50]:
# First MAX_WORD_LEN character rows (the slot of the first word) of every
# sample in the batch.
item[0][:,0:MAX_WORD_LEN,:]


(0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(1 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(2 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
...

(29,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(30,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
 

In [23]:
# NOTE(review): exploratory cell. The slice is taken on the first axis of
# item[0] (the batch axis in the neighbouring cells) and permute() names
# dimension 0 twice; the following cell appears to be the intended version
# (permute(1, 0, 2) before slicing, then permute(1, 2, 0)) — confirm before reuse.
Variable(item[0][0:MAX_WORD_LEN,:]).permute(0, 2, 0)

Variable containing:
(0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(1 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(2 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
...

(29,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(30,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0

In [51]:
# First word of every sample rearranged to (batch, alphabet, MAX_WORD_LEN),
# the same channel-first layout YoonKimModel.forward feeds to its CNN.
item[0].permute(1, 0, 2)[0:MAX_WORD_LEN,:].permute(1, 2, 0)


(0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(1 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(2 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
...

(29,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
     ...       ⋱       ...    
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0

(30,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
 

In [81]:
# Character CNN applied to the embedded first word of every sample, mirroring
# the per-word steps of YoonKimModel.forward (embedding -> permute -> CNN).
# The original line did not parse: the permute call lacked a comma and the
# outer call was never closed.
model.chars_cnn(
    model.embedding(Variable(item[0].permute(1, 0, 2)[0:MAX_WORD_LEN, :])).permute(1, 2, 0)
)

Variable containing:
(0 ,.,.) = 
  0.1022 -0.0039 -0.0890  ...   0.0393  0.1056 -0.0511
  0.0761 -0.0932  0.0260  ...  -0.0641  0.1511 -0.0113
  0.0657 -0.0369  0.0588  ...   0.0276  0.1004  0.1344
           ...             ⋱             ...          
  0.1275 -0.0825  0.1114  ...   0.0630  0.0929  0.0624
 -0.0074 -0.0157 -0.0175  ...  -0.0782  0.0081  0.0615
 -0.0074 -0.0157 -0.0175  ...  -0.0782  0.0081  0.0615

(1 ,.,.) = 
  0.0255 -0.0172  0.1354  ...  -0.0489 -0.0233  0.0601
  0.1022 -0.0039 -0.0890  ...   0.0393  0.1056 -0.0511
  0.0827 -0.0978  0.1132  ...  -0.0385  0.0374 -0.0501
           ...             ⋱             ...          
  0.0824 -0.0364  0.0261  ...  -0.0096  0.0528  0.0438
  0.0761 -0.0932  0.0260  ...  -0.0641  0.1511 -0.0113
  0.0761 -0.0932  0.0260  ...  -0.0641  0.1511 -0.0113

(2 ,.,.) = 
  0.0824 -0.0364  0.0261  ...  -0.0096  0.0528  0.0438
  0.0857  0.0450 -0.0070  ...  -0.0097 -0.0208  0.1098
  0.1022 -0.0039 -0.0890  ...   0.0393  0.1056 -0.0511
      

In [79]:
# Character CNN applied directly to the one-hot first word, skipping the
# embedding layer. This only works because embedding_dim equals the alphabet
# size here, so the convolution accepts the one-hot channels as they are.
model.chars_cnn(
    Variable(item[0].permute(1, 0, 2)[0:MAX_WORD_LEN,:].permute(1, 2, 0))
)

Variable containing:
( 0 ,.,.) = 
  0.1181
  0.0433
  0.0000
   ⋮    
  0.0908
  0.0478
  0.1122

( 1 ,.,.) = 
  0.1486
  0.0545
  0.0115
   ⋮    
  0.1511
  0.1207
  0.0366

( 2 ,.,.) = 
  0.1427
  0.0846
  0.0000
   ⋮    
  0.1290
  0.1473
  0.1483
... 

(29 ,.,.) = 
  0.0911
  0.0000
  0.0000
   ⋮    
  0.0569
  0.0477
  0.0612

(30 ,.,.) = 
  0.0919
  0.0242
  0.0022
   ⋮    
  0.1367
  0.0610
  0.0957

(31 ,.,.) = 
  0.0919
  0.0242
  0.0022
   ⋮    
  0.1367
  0.0610
  0.0957
[torch.FloatTensor of size 32x256x1]