# HRED

In [1]:
from random import random, choice

import numpy as np
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.nn import init
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

import torchtext

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

np.random.seed(42)
CUDA = torch.cuda.is_available()

CUDA

True

In [2]:
# alphabet from the paper
# https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
ALPHABET = ['<UNK>'] + ['\n'] + [s for s in """ abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}"""]
char2int = {s: i for s, i in zip(ALPHABET, range(len(ALPHABET)))}

MAX_WORD_LEN = 16  # chars in word (try 32?)
MAX_TEXT_LEN = 256  # words in text

BATCH_SIZE = 32
VALID_SIZE = 0.1

# Data Preparation

In [3]:
class HieracialIMDB(torchtext.datasets.imdb.IMDB):
    """
    Zero vector used for padding
    """
    noise_level = 0
    alphabet = ALPHABET

    def __getitem__(self, idx):
        item = super(HieracialIMDB, self).__getitem__(idx)
        _text_tensor = self.preprocess(item.text)

        label = int(item.label == 'pos')
        return _text_tensor, label
    
    def preprocess(self, text, with_noise=True):
        _text_tensor = torch.zeros([MAX_WORD_LEN * MAX_TEXT_LEN, len(self.alphabet)])

        for i, token in enumerate(text):
            if i >= MAX_TEXT_LEN:
                break
            if with_noise:
                token = self.noise_generator(token)
            for j, char in enumerate(token):
                if j >= MAX_WORD_LEN:
                    break
                _text_tensor[i*MAX_WORD_LEN + j, char2int.get(char, char2int['<UNK>'])] = 1.
        return _text_tensor
    
#     def _encode_word(self, word):
#         word_tensor = torch.zeros([MAX_WORD_LEN, len(ALPHABET)])
        
#         for i, char in enumerate(word):
#             word_tensor[i,char2int[char]] = 1.
        
#         return word_tensor

    def noise_generator(self, string):
        # removed '' symbol from alphabet for safety on word vectors
        noised = ""
        for c in string:
            if random() > self.noise_level:
                noised += c
            if random() < self.noise_level:
                noised += choice(self.alphabet)
        return noised


In [9]:
def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)

    train_idx, valid_idx = indices[:-val_actual_size], indices[-val_actual_size:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=4
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=4
    )

    return train_loader, valid_loader

def get_accuracy(model, test_dataloader, noise_level=0):
    """
    Moder will be in TRAIN mode after that
    """
    model.eval()

    predictions = []
    lables = []

    for text, label in test_dataloader:
        if CUDA:
            text = Variable(text.cuda())
        else:
            text = Variable(text)

        text = text.permute(1, 0, 2)
        prediction = model(text)

        _, idx = torch.max(prediction, 1)
        predictions += idx.data.tolist()
        lables += label.tolist()

    acc = accuracy_score(lables, predictions)
    model.train()
    return acc


def onehot2text(one_hotted_text, batch_size=None, show_pad=False):
    if batch_size is None:
        text = ''
        max_values, idx = torch.max(one_hotted_text, 1)
        for c, i in enumerate(idx):
            if max_values[c] == 0:
                if show_pad:
                    symb = '<PAD>'
                else:
                    symb = ''
            else:
                symb = ALPHABET[i]
            text += symb
        return text
    else:
        texts = []
        for text in one_hotted_text:
            texts.append(onehot2text(one_hotted_text, batch_size=None))
        return texts

In [5]:
# without spacy tokenizer it's commas all after the words =(

text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=MAX_TEXT_LEN, tensor_type=torch.FloatTensor, batch_first=True,
    use_vocab=False, tokenize='spacy'
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

In [6]:
%%time
train, test = HieracialIMDB.splits(text_field, label_field)

CPU times: user 1min 13s, sys: 1.41 s, total: 1min 14s
Wall time: 1min 20s


In [7]:
dataloader, val_dataloader = get_train_valid_loader(train, VALID_SIZE, BATCH_SIZE)

test_dataloader = torch.utils.data.DataLoader(
    test, batch_size=BATCH_SIZE
)

# Model

Like YoonKim model but with RNN on char level

In [40]:
class HREDModel(nn.Module):
    def __init__(self, hidden_dim_charRNN, hidden_dim_wordRNN,
                 dropout=0.5, init_function=None, embedding_dim=len(ALPHABET), pool_kernel_size=MAX_WORD_LEN):
        super(HREDModel, self).__init__()
        self.dropout = dropout
        self.embedding_dim = embedding_dim
        self.hidden_dim_charRNN = hidden_dim_charRNN
        self.hidden_dim_out = hidden_dim_wordRNN

        self.embedding = nn.Linear(len(ALPHABET), embedding_dim)
        self.char_rnn = nn.GRU(embedding_dim, hidden_dim_charRNN, dropout=dropout)
        self.word_rnn = nn.GRU(hidden_dim_charRNN, hidden_dim_wordRNN, dropout=dropout)
        self.projector = nn.Linear(hidden_dim_wordRNN, 2)
        
    def forward(self, x):
        batch_size = x.size(1)
        # TODO: hadrcode! (for CUDA)
        words_tensor = Variable(torch.zeros(MAX_TEXT_LEN, batch_size, self.hidden_dim_charRNN)).cuda()
        
        for i in range(MAX_TEXT_LEN):
            word = x[i * MAX_WORD_LEN : (i + 1) * MAX_WORD_LEN, :]
            word = self.embedding(word)
            word, _ = self.char_rnn(word)
            words_tensor[i, :] = word[-1]

        x, _ = self.word_rnn(words_tensor)
        x = self.projector(x[-1])
        return x


In [50]:
def model_params_num(model):
    return sum(np.prod(list(p.size())) for p in model.parameters())

def mk_dataline(model_type, epochs, lr, noise_level_train, noise_level_val, train_acc, val_acc,
                dropout, model, init_function=None):
    return {
        'model_type': model_type,
        'trainable_params': model_params_num(model), 'dropout': dropout, init_function = init_function,
        'epochs': epochs, 'lr': lr,
        'noise_level_train': noise_level_train, 'noise_level_val': noise_level_val,
        'train_acc': train_acc, 'val_acc': val_acc,
        'model_desc': str(model)
    }

SyntaxError: invalid syntax (<ipython-input-50-ee656b204fe4>, line 1)

In [48]:
def run_model_with(noise_level, hidden_dim_charRNN, hidden_dim_wordRNN, dropout=0.5,
                   lr=1e-4, epochs=30, init_function=None, _model=None):
    HieracialIMDB.noise_level = noise_level

    if _model is None:
        model = HREDModel(
            hidden_dim_charRNN=hidden_dim_charRNN, hidden_dim_wordRNN=hidden_dim_wordRNN, dropout=dropout
        )
        if CUDA:
            model.cuda()
        model.train()
    
    else:
        model = _model

    writer = SummaryWriter(comment='_HRED_lr%s_dropout%s_noise_level%s' %
                           (int(-np.log10(lr)), dropout, noise_level))

    optimizer = optim.Adam(params=model.parameters(), lr=lr)
    optimizer.zero_grad()
    
    global_step = 0

    loss_f = F.cross_entropy

    for epoch in range(epochs):

        for batch_idx, (text, label) in enumerate(dataloader):
            optimizer.zero_grad()

            if CUDA:
                text = Variable(text.cuda())
                label = Variable(torch.LongTensor(label).cuda())
            else:
                text = Variable(text)
                label = Variable(torch.LongTensor(label))

            text = text.permute(1, 0, 2)
            prediction = model(text)
            loss = loss_f(prediction, label)

            writer.add_scalar('loss', loss.data[0], global_step=global_step)

            loss.backward()        
            torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
            optimizer.step()

            if CUDA:
                torch.cuda.synchronize()
            global_step += 1

        # evaluation
        print('Epoch %s. Global step %s' % (epoch, global_step))
        print('Loss               : %s' % loss.data[0])

        _, idx = torch.max(prediction, 1)
        acc = accuracy_score(label.data.tolist(), idx.data.tolist())
        writer.add_scalar('accuracy_train', acc, global_step=global_step)
        print('In-batch accuracy  :', acc)

        acc = get_accuracy(model, val_dataloader)
        print('Validation accuracy:', acc)
        writer.add_scalar('accuracy_val', acc, global_step=global_step)
        print()

    # Test

    acc = get_accuracy(model, test_dataloader)
    print('Final test accuracy:', acc)
    writer.add_scalar('accuracy_test_final', acc, global_step=global_step)
    print()
    model.eval()
    # model is in EVAL mode!
    return model


In [49]:
%%time
model = run_model_with(
    noise_level=0, hidden_dim_charRNN=64, hidden_dim_wordRNN=256, dropout=0.5,
    lr=1e-3, epochs=20
)

Epoch 0. Global step 704
Loss               : 0.6931390762329102
In-batch accuracy  : 0.5
Validation accuracy: 0.4988

Epoch 1. Global step 1408
Loss               : 0.7017548084259033
In-batch accuracy  : 0.0
Validation accuracy: 0.5012

Epoch 2. Global step 2112
Loss               : 0.7052668333053589
In-batch accuracy  : 0.5
Validation accuracy: 0.5012

Epoch 3. Global step 2816
Loss               : 0.6934149265289307
In-batch accuracy  : 0.25
Validation accuracy: 0.5012

Epoch 4. Global step 3520
Loss               : 0.6925745010375977
In-batch accuracy  : 0.75
Validation accuracy: 0.4988

Epoch 5. Global step 4224
Loss               : 0.6931474804878235
In-batch accuracy  : 0.5
Validation accuracy: 0.5012

Epoch 6. Global step 4928
Loss               : 0.6825548410415649
In-batch accuracy  : 1.0
Validation accuracy: 0.4988

Epoch 7. Global step 5632
Loss               : 0.6900827884674072
In-batch accuracy  : 0.75
Validation accuracy: 0.4988

Epoch 8. Global step 6336
Loss        

In [51]:
%%time
model = run_model_with(
    noise_level=0, hidden_dim_charRNN=64, hidden_dim_wordRNN=256, dropout=0.5,
    lr=1e-2, epochs=10
)

Epoch 0. Global step 704
Loss               : nan
In-batch accuracy  : 1.0
Validation accuracy: 0.4988

Epoch 1. Global step 1408
Loss               : nan
In-batch accuracy  : 0.5
Validation accuracy: 0.4988



Process Process-187:
Process Process-186:
Process Process-185:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process Process-188:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 50, in _worker_loop
    r = index_queue.get()
Tracebac

KeyboardInterrupt: 