In [1]:
from time import time
from random import random, choice

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.nn import init
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

import torchtext

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

np.random.seed(42)
CUDA = torch.cuda.is_available()

CUDA

True

In [2]:
# alphabet from the paper
# https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
ALPHABET = ['<UNK>'] + ['\n'] + [s for s in """ abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}"""]
char2int = {s: i for s, i in zip(ALPHABET, range(len(ALPHABET)))}

MAX_WORD_LEN = 8  # chars in word (try 32?)
MAX_TEXT_LEN = 256  # words in text

BATCH_SIZE = 32
VALID_SIZE = 0.1

NOISE_LEVELS = [0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2]

# Data preparation

Чтобы использовать CNN на слова, нужно фиксировать длину слова.

In [3]:
class HieracialIMDB(torchtext.datasets.imdb.IMDB):
    """
    Zero vector used for padding
    """
    noise_level = 0
    alphabet = ALPHABET

    def __getitem__(self, idx):
        item = super(HieracialIMDB, self).__getitem__(idx)
        _text_tensor = self.preprocess(item.text)

        label = int(item.label == 'pos')
        return _text_tensor, label
    
    def preprocess(self, text, with_noise=True):
        _text_tensor = torch.zeros([MAX_WORD_LEN * MAX_TEXT_LEN, len(self.alphabet)])

        for i, token in enumerate(text):
            if i >= MAX_TEXT_LEN:
                break
            if with_noise:
                token = self.noise_generator(token)
            for j, char in enumerate(token):
                if j >= MAX_WORD_LEN:
                    break
                _text_tensor[i*MAX_WORD_LEN + j, char2int.get(char, char2int['<UNK>'])] = 1.
        return _text_tensor
    
#     def _encode_word(self, word):
#         word_tensor = torch.zeros([MAX_WORD_LEN, len(ALPHABET)])
        
#         for i, char in enumerate(word):
#             word_tensor[i,char2int[char]] = 1.
        
#         return word_tensor

    def noise_generator(self, string):
        # removed '' symbol from alphabet for safety on word vectors
        noised = ""
        for c in string:
            if random() > self.noise_level:
                noised += c
            if random() < self.noise_level:
                noised += choice(self.alphabet)
        return noised


In [4]:
def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)

    train_idx, valid_idx = indices[:-val_actual_size], indices[-val_actual_size:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=4
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=4
    )

    return train_loader, valid_loader

def onehot2text(one_hotted_text, batch_size=None, show_pad=False):
    if batch_size is None:
        text = ''
        max_values, idx = torch.max(one_hotted_text, 1)
        for c, i in enumerate(idx):
            if max_values[c] == 0:
                if show_pad:
                    symb = '<PAD>'
                else:
                    symb = ''
            else:
                symb = ALPHABET[i]
            text += symb
        return text
    else:
        texts = []
        for text in one_hotted_text:
            texts.append(onehot2text(one_hotted_text, batch_size=None))
        return texts

def get_metrics(model, test_data, noise_level=None):
    """
    :param test_data: dataset or dataloader

    Moder will be in TRAIN mode after that
    """
    model.eval()

    predictions = []
    lables = []
    
    if isinstance(test_data, torch.utils.data.Dataset):
        if noise_level is not None:
            test_data.noise_level = noise_level

        test_dataloader = torch.utils.data.DataLoader(
            test_data, batch_size=BATCH_SIZE
        )
    else:
        assert isinstance(test_data, torch.utils.data.DataLoader)
        test_dataloader = test_data

    for text, label in test_dataloader:
        if CUDA:
            text = Variable(text.cuda())
        else:
            text = Variable(text)

        text = text.permute(1, 0, 2)  # (1, 0, 2) for RNN
        prediction = model(text)

        _, idx = torch.max(prediction, 1)
        predictions += idx.data.tolist()
        lables += label.tolist()

    acc = accuracy_score(lables, predictions)
    f1 = f1_score(lables, predictions)
    model.train()
    return {'accuracy': acc, 'f1': f1}


In [5]:
# without spacy tokenizer it's commas all after the words =(

text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=MAX_TEXT_LEN, tensor_type=torch.FloatTensor, batch_first=True,
    use_vocab=False, tokenize='spacy'
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

In [6]:
%%time
train, test = HieracialIMDB.splits(text_field, label_field)

CPU times: user 1min 7s, sys: 1.09 s, total: 1min 8s
Wall time: 1min 13s


In [7]:
onehot2text(train[0][0])  # no spaces is onehot2text problem, not a data one

"thismusicalisdecidedlmixed,andnoneoftheelementsreallyfittogether,butitsomehowmanagestobemostlyenjoyabl.theplotcontainssomeoftheelementsofwodehous'snovel,butnoneofitsvirtues,thoughheco-wrotethescript.thesongs,thoughcharming,havenothingtodowiththisparticulfilm,andareunusuallcrudelysqueezedintotheplot,evenbypre-oklahomastandard.burnsandallendotheirusualshtickquitecompeten,butitmissesthetoneoftherestofthefilmbyaboutfortyiqpoints.</><br/>thereareafewhighpoints.reginaldgardinerdoesgoodworkwhenherememberthatthisisatalkie,andstopsmugginglikeasilentactor.andthereareafewbitsofwritingwhichcouldonlyhavebeenwrittenbywodehous,thoughmostofthefilmfeelsliketheproductiofoneofthehollywoomeetingshelaterparodied."

In [8]:
dataloader, val_dataloader = get_train_valid_loader(train, VALID_SIZE, BATCH_SIZE)

test_dataloader = torch.utils.data.DataLoader(
    test, batch_size=BATCH_SIZE
)
# from https://github.com/akurniawan/pytorch-transformer

# Model

In [9]:
# https://github.com/akurniawan/pytorch-transformer
class MultiHeadAttention(nn.Module):
    def __init__(self,
                 query_dim,
                 key_dim,
                 num_units,
                 dropout_p=0.5,
                 h=8,
                 is_masked=False):
        super(MultiHeadAttention, self).__init__()

        if query_dim != key_dim:
            raise ValueError("query_dim and key_dim must be the same")
        if num_units % h != 0:
            raise ValueError("num_units must be dividable by h")
        if query_dim != num_units:
            raise ValueError("to employ residual connection, the number of "
                             "query_dim and num_units must be the same")

        self._num_units = num_units
        self._h = h
        self._key_dim = Variable(torch.FloatTensor([key_dim]))
        if CUDA:
            self._key_dim = self._key_dim.cuda()
        self._dropout_p = dropout_p
        self._is_masked = is_masked

        self.query_layer = nn.Linear(query_dim, num_units, bias=False)
        self.key_layer = nn.Linear(key_dim, num_units, bias=False)
        self.value_layer = nn.Linear(key_dim, num_units, bias=False)
        self.bn = nn.BatchNorm1d(num_units)

    def forward(self, query, keys):
        Q = self.query_layer(query)
        K = self.key_layer(keys)
        V = self.value_layer(keys)

        # split each Q, K and V into h different values from dim 2
        # and then merge them back together in dim 0
        chunk_size = int(self._num_units / self._h)
        Q = torch.cat(Q.split(split_size=chunk_size, dim=2), dim=0)
        K = torch.cat(K.split(split_size=chunk_size, dim=2), dim=0)
        V = torch.cat(V.split(split_size=chunk_size, dim=2), dim=0)

        # calculate QK^T
        attention = torch.matmul(Q, K.transpose(1, 2))
        # normalize with sqrt(dk)
        attention = attention / torch.sqrt(self._key_dim)
        # use masking (usually for decoder) to prevent leftward
        # information flow and retains auto-regressive property
        # as said in the paper
        if self._is_masked:
            diag_vals = attention[0].sign().abs()
            diag_mat = diag_vals.tril()
            diag_mat = diag_mat.unsqueeze(0).expand(attention.size())
            # we need to enforce converting mask to Variable, since
            # in pytorch we can't do operation between Tensor and
            # Variable
            mask = Variable(
                torch.ones(diag_mat.size()) * (-2**32 + 1), requires_grad=False)
            # this is some trick that I use to combine the lower diagonal
            # matrix and its masking. (diag_mat-1).abs() will reverse the value
            # inside diag_mat, from 0 to 1 and 1 to zero. with this
            # we don't need loop operation andn could perform our calculation
            # faster
            attention = (attention * diag_mat) + (mask * (diag_mat-1).abs())
        # put it to softmax
        attention = F.softmax(attention, dim=-1)
        # apply dropout
        attention = F.dropout(attention, self._dropout_p)
        # multiplyt it with V
        attention = torch.matmul(attention, V)
        # convert attention back to its input original size
        restore_chunk_size = int(attention.size(0) / self._h)
        attention = torch.cat(
            attention.split(split_size=restore_chunk_size, dim=0), dim=2)
        # residual connection
        attention += query
        # apply batch normalization
#         attention = self.bn(attention.transpose(1, 2)).transpose(1, 2)

        return attention

In [10]:
class AttentionedYoonKimModel(nn.Module):
    def __init__(self,
                 n_filters,
                 cnn_kernel_size,
                 hidden_dim_out,
                 dropout=0.5,
                 init_function=None,
                 embedding_dim=len(ALPHABET),
                 pool_kernel_size=MAX_WORD_LEN,
                 heads=1):
        """
        CharCNN-WordRNN model with multi-head attention
        Default pooling is MaxOverTime pooling
        """
        assert cnn_kernel_size % 2  # for 'same' padding

        super(AttentionedYoonKimModel, self).__init__()
        self.dropout = dropout
        self.init_function = init_function
        self.embedding_dim = embedding_dim
        self.n_filters = n_filters
        self.cnn_kernel_size = cnn_kernel_size
        self.hidden_dim_out = hidden_dim_out
        self.heads = heads

        self.embedding = nn.Linear(len(ALPHABET), embedding_dim)
        self.chars_cnn = nn.Sequential(
            nn.Conv1d(embedding_dim, n_filters, kernel_size=cnn_kernel_size, stride=1, padding=int(cnn_kernel_size - 1) // 2),  # 'same' padding
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_kernel_size)
        )
        if init_function is not None:
            self.chars_cnn[0].weight = init_function(self.chars_cnn[0].weight)

        _conv_stride = 1  # by default
        _pool_stride = pool_kernel_size  # by default
        # I am not sure this formula is always correct:
        self.conv_dim = n_filters * max(1, int(((MAX_WORD_LEN - cnn_kernel_size) / _conv_stride - pool_kernel_size) / _pool_stride + 1))

        self.words_rnn = nn.GRU(self.conv_dim, hidden_dim_out, dropout=dropout)
        self.attention = MultiHeadAttention(hidden_dim_out, hidden_dim_out, hidden_dim_out, dropout_p=self.dropout, h=self.heads)
        self.projector = nn.Linear(hidden_dim_out, 2)
        
    def forward(self, x):
        batch_size = x.size(1)
        # TODO: hadrcode! (for CUDA)
        words_tensor = Variable(torch.zeros(MAX_TEXT_LEN, batch_size, self.conv_dim)).cuda()
        
        for i in range(MAX_TEXT_LEN):
            word = x[i * MAX_WORD_LEN : (i + 1) * MAX_WORD_LEN, :]
            word = self.embedding(word)
            word = word.permute(1, 2, 0)
            word = self.chars_cnn(word)
            word = word.view(word.size(0), -1)
            words_tensor[i, :] = word

        x, _ = self.words_rnn(words_tensor)
        x = self.attention(x, x)
        x = self.projector(x[-1])
        return x


In [11]:
def model_params_num(model):
    return sum(np.prod(list(p.size())) for p in model.parameters())

def mk_dataline(model_type, epochs, lr, noise_level_train, noise_level_test, acc_train, acc_test,
                f1_train, f1_test, dropout, model, init_function=None):
    return {
        'task': 'IMDB binary classification',
        'model_type': model_type,
        'trainable_params': model_params_num(model), 'dropout': dropout, 'init_function': init_function,
        'epochs': epochs, 'lr': lr,
        'noise_level_train': noise_level_train, 'noise_level_test': noise_level_test,
        'acc_train': acc_train, 'acc_test': acc_test,
        'f1_train': f1_train, 'f1_test': f1_test,
        'model_desc': str(model),
        'data_desc': 'MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256'
    }

In [12]:
results = []

In [20]:
def run_model_with(noise_level, n_filters, cnn_kernel_size, hidden_dim_out, dropout=0.5,
                   lr=1e-4, epochs=30, heads=1, print_every=10, init_function=None, _model=None):
    start_time = time()
    HieracialIMDB.noise_level = noise_level

    if _model is None:
        model = AttentionedYoonKimModel(
            n_filters=n_filters, cnn_kernel_size=cnn_kernel_size, hidden_dim_out=hidden_dim_out, dropout=dropout,
            init_function=init_function, heads=heads
        )
        if CUDA:
            model.cuda()
        model.train()
    
    else:
        model = _model
    
    model_name = '_AttentionedYoonKim_lr%s_dropout%s_noise_level%s_spacy_wordlen8_heads%s' % (
        int(-np.log10(lr)), model.dropout, noise_level, model.heads
    )

    writer = SummaryWriter(comment=model_name)
    print('Writer: %s' % list(writer.all_writers.keys()))

    optimizer = optim.Adam(params=model.parameters(), lr=lr)
    optimizer.zero_grad()
    
    global_step = 0

    loss_f = F.cross_entropy

    for epoch in range(epochs):

        for batch_idx, (text, label) in enumerate(dataloader):
            optimizer.zero_grad()

            if CUDA:
                text = Variable(text.cuda())
                label = Variable(torch.LongTensor(label).cuda())
            else:
                text = Variable(text)
                label = Variable(torch.LongTensor(label))

            text = text.permute(1, 0, 2)
            prediction = model(text)
            loss = loss_f(prediction, label)

            writer.add_scalar('loss', loss.data[0], global_step=global_step)

            loss.backward()        
            torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
            optimizer.step()

            if CUDA:
                torch.cuda.synchronize()
            global_step += 1

        # evaluation
        if epoch % print_every == 0:
            print('Epoch %s. Global step %s. T=%s min' % (epoch, global_step, (time() - start_time) / 60.))
            print('Loss               : %s' % loss.data[0])

        # in-batch
        _, idx = torch.max(prediction, 1)
        _labels = label.data.tolist()
        _predictions = idx.data.tolist()
        acc = accuracy_score(_labels, _predictions)
        f1 = f1_score(_labels, _predictions)
        writer.add_scalar('accuracy_train', acc, global_step=global_step)
        writer.add_scalar('f1_train', f1, global_step=global_step)
        if epoch % print_every == 0:
            print('In-batch accuracy  :', acc)

        # validation
        metrics = get_metrics(model, val_dataloader)
        if epoch % print_every == 0:
            print('Validation accuracy: %s, f1: %s' % (metrics['accuracy'], metrics['f1']))
            print()

        writer.add_scalar('accuracy_val', metrics['accuracy'], global_step=global_step)
        writer.add_scalar('f1_val', metrics['f1'], global_step=global_step)

    # Test
    metrics_test = None

    print('Calculating validation metrics... Time %s min' % ((time() - start_time) / 60.))
    metrics_train = get_metrics(model, dataloader)
    acc_train = metrics_train['accuracy']
    f1_train = metrics_train['f1']

    for test_noise in NOISE_LEVELS:
        metrics = get_metrics(model, test, test_noise)
        if test_noise == noise_level:
            metrics_test = metrics

        acc_test = metrics['accuracy']
        f1_test = metrics['f1']
        results.append(mk_dataline(
            model_type='AttentionedYoonKim', epochs=epochs, lr=lr,
            noise_level_train=noise_level, acc_train=acc_train, f1_train=f1_train,
            noise_level_test=test_noise, acc_test=acc_test, f1_test=f1_test,
            dropout=dropout, model=model,
            init_function=init_function
        ))

    print('Final test metrics: %s, Time %s min' % (metrics_test, ((time() - start_time) / 60.)))
    if metrics_test is not None:
        writer.add_scalar('accuracy_test_final', metrics_test['accuracy'], global_step=global_step)
        writer.add_scalar('f1_test_final', metrics_test['f1'], global_step=global_step)
    print()
    model.eval()
    # model is in EVAL mode!
    return model

# Main exp

In [16]:
results = []

In [None]:
for noise_level in tqdm(NOISE_LEVELS):
    run_model_with(
        noise_level=noise_level, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5, lr=1e-3, epochs=30, heads=1
    )

  0%|          | 0/10 [00:00<?, ?it/s]

Writer: ['runs/May08_18-58-19_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0_spacy_wordlen8_heads1']
Epoch 0. Global step 704. T=2.3250306129455565 min
Loss               : 0.6998939514160156
In-batch accuracy  : 0.25


  'precision', 'predicted', average, warn_for)


Validation accuracy: 0.4976, f1: 0.1581769436997319



  'recall', 'true', average, warn_for)


Epoch 10. Global step 7744. T=26.54800899028778 min
Loss               : 0.0964764803647995
In-batch accuracy  : 1.0
Validation accuracy: 0.8564, f1: 0.8547146904087414

Epoch 20. Global step 14784. T=50.86627300182978 min
Loss               : 3.135204315185547e-05
In-batch accuracy  : 1.0
Validation accuracy: 0.8524, f1: 0.8540925266903914



In [None]:
for noise_level in tqdm(NOISE_LEVELS):
    run_model_with(
        noise_level=noise_level, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5, lr=1e-3, epochs=30, heads=4
    )

  0%|          | 0/10 [00:00<?, ?it/s]

Writer: ['runs/May09_13-43-24_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0_spacy_wordlen8_heads4']
Epoch 0. Global step 704. T=2.3463613828023275 min
Loss               : 0.6847973465919495
In-batch accuracy  : 1.0
Validation accuracy: 0.5036, f1: 0.25195901145268235

Epoch 10. Global step 7744. T=26.84283440510432 min
Loss               : 0.06819652020931244
In-batch accuracy  : 1.0
Validation accuracy: 0.8536, f1: 0.8604118993135013

Epoch 20. Global step 14784. T=51.34217491547267 min
Loss               : 0.0001227855682373047
In-batch accuracy  : 1.0
Validation accuracy: 0.85, f1: 0.851602690937871



  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)


Calculating validation metrics... Time 73.48504288991292 min


 10%|█         | 1/10 [1:26:44<13:00:43, 5204.87s/it]

Final test metrics: {'accuracy': 0.72036, 'f1': 0.7240794095591429}, Time 86.74774930874507 min

Writer: ['runs/May09_15-10-09_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.01_spacy_wordlen8_heads4']
Epoch 0. Global step 704. T=2.336410439014435 min
Loss               : 0.6932069659233093
In-batch accuracy  : 0.5
Validation accuracy: 0.4932, f1: 0.6207722238850644



  'recall', 'true', average, warn_for)


Epoch 10. Global step 7744. T=26.901063871383666 min
Loss               : 0.31047651171684265
In-batch accuracy  : 0.75
Validation accuracy: 0.8504, f1: 0.8458367683429514

Epoch 20. Global step 14784. T=51.47357052564621 min
Loss               : 1.9057666063308716
In-batch accuracy  : 0.75
Validation accuracy: 0.846, f1: 0.8385744234800839

Calculating validation metrics... Time 73.69015954732895 min


 20%|██        | 2/10 [2:53:45<11:35:03, 5212.96s/it]

Final test metrics: {'accuracy': 0.77724, 'f1': 0.7659002059775526}, Time 87.01751616795858 min

Writer: ['runs/May09_16-37-10_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.025_spacy_wordlen8_heads4']
Epoch 0. Global step 704. T=2.3406466404596964 min
Loss               : 0.6962195634841919
In-batch accuracy  : 0.25
Validation accuracy: 0.498, f1: 0.17704918032786884

Epoch 10. Global step 7744. T=26.86790764729182 min
Loss               : 0.3474670350551605
In-batch accuracy  : 0.75
Validation accuracy: 0.8552, f1: 0.8604471858134156



  'recall', 'true', average, warn_for)


Epoch 20. Global step 14784. T=51.39845228592555 min
Loss               : 0.32556477189064026
In-batch accuracy  : 0.75
Validation accuracy: 0.8472, f1: 0.8520526723470179

Calculating validation metrics... Time 73.52286429802577 min


 30%|███       | 3/10 [4:20:34<10:08:01, 5211.64s/it]

Final test metrics: {'accuracy': 0.80208, 'f1': 0.8010614345448698}, Time 86.81639763514201 min

Writer: ['runs/May09_18-03-59_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.05_spacy_wordlen8_heads4']
Epoch 0. Global step 704. T=2.340618352095286 min
Loss               : 0.6931796073913574
In-batch accuracy  : 0.5


  'precision', 'predicted', average, warn_for)


Validation accuracy: 0.5012, f1: 0.29026750142287994

Epoch 10. Global step 7744. T=26.870443864663443 min
Loss               : 0.23138980567455292
In-batch accuracy  : 0.75
Validation accuracy: 0.8512, f1: 0.8498789346246973

Epoch 20. Global step 14784. T=51.39888294935226 min
Loss               : 0.01804119348526001
In-batch accuracy  : 1.0
Validation accuracy: 0.8448, f1: 0.8375209380234506



  'recall', 'true', average, warn_for)


Calculating validation metrics... Time 73.58182549476624 min


 40%|████      | 4/10 [5:47:28<8:41:12, 5212.08s/it] 

Final test metrics: {'accuracy': 0.7522, 'f1': 0.7483241925655089}, Time 86.889908182621 min

Writer: ['runs/May09_19-30-53_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.075_spacy_wordlen8_heads4']
Epoch 0. Global step 704. T=2.332847269376119 min
Loss               : 0.6834204792976379
In-batch accuracy  : 0.75
Validation accuracy: 0.4972, f1: 0.08448652585579024



  'recall', 'true', average, warn_for)


In [22]:
import pandas as pd

In [23]:
results_df = pd.DataFrame(results)

In [34]:
print(results_df['model_desc'][218])

AttentionedYoonKimModel(
  (embedding): Linear(in_features=74, out_features=74, bias=True)
  (chars_cnn): Sequential(
    (0): Conv1d(74, 256, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (words_rnn): GRU(256, 128, dropout=0.5)
  (attention): MultiHeadAttention(
    (query_layer): Linear(in_features=128, out_features=128, bias=False)
    (key_layer): Linear(in_features=128, out_features=128, bias=False)
    (value_layer): Linear(in_features=128, out_features=128, bias=False)
    (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
  )
  (projector): Linear(in_features=128, out_features=2, bias=True)
)


In [35]:
print(results_df['model_desc'][100])

AttentionedYoonKimModel(
  (embedding): Linear(in_features=74, out_features=74, bias=True)
  (chars_cnn): Sequential(
    (0): Conv1d(74, 256, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (words_rnn): GRU(256, 128, dropout=0.5)
  (attention): MultiHeadAttention(
    (query_layer): Linear(in_features=128, out_features=128, bias=False)
    (key_layer): Linear(in_features=128, out_features=128, bias=False)
    (value_layer): Linear(in_features=128, out_features=128, bias=False)
    (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
  )
  (projector): Linear(in_features=128, out_features=2, bias=True)
)


In [41]:
results_4heads = results_df.iloc[-100:]

In [42]:
results_4heads

Unnamed: 0,acc_test,acc_train,data_desc,dropout,epochs,f1_test,f1_train,init_function,lr,model_desc,model_type,noise_level_test,noise_level_train,task,trainable_params
120,0.72036,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.724079,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.000,0.000,IMDB binary classification,298416
121,0.71260,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.718268,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.010,0.000,IMDB binary classification,298416
122,0.68996,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.698815,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.025,0.000,IMDB binary classification,298416
123,0.67116,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.683162,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.050,0.000,IMDB binary classification,298416
124,0.65020,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.669013,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.075,0.000,IMDB binary classification,298416
125,0.62444,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.649363,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.100,0.000,IMDB binary classification,298416
126,0.60816,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.639137,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.125,0.000,IMDB binary classification,298416
127,0.59036,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.623257,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.150,0.000,IMDB binary classification,298416
128,0.57716,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.614464,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.175,0.000,IMDB binary classification,298416
129,0.56752,0.983822,"MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256",0.5,30,0.612556,0.984064,,0.001,AttentionedYoonKimModel(\n (embedding): Linea...,AttentionedYoonKim,0.200,0.000,IMDB binary classification,298416


In [43]:
results_4heads.to_csv('results/Attentioned4HYoonKim.csv')

In [48]:
results_1heads = results_df.iloc[:-100]
results_1heads.to_csv('results/Attentioned1HYoonKim.csv')

In [50]:
results_1heads.iloc[4]

acc_test                                                         0.686
acc_train                                                     0.992178
data_desc                         MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256
dropout                                                            0.5
epochs                                                              30
f1_test                                                       0.665445
f1_train                                                       0.99213
init_function                                                     None
lr                                                               0.001
model_desc           AttentionedYoonKimModel(\n  (embedding): Linea...
model_type                                          AttentionedYoonKim
noise_level_test                                                 0.075
noise_level_train                                                    0
task                                        IMDB binary classification
traina

In [47]:
results_4heads.iloc[4]

acc_test                                                        0.6502
acc_train                                                     0.983822
data_desc                         MAX_WORD_LEN = 8, MAX_TEXT_LEN = 256
dropout                                                            0.5
epochs                                                              30
f1_test                                                       0.669013
f1_train                                                      0.984064
init_function                                                     None
lr                                                               0.001
model_desc           AttentionedYoonKimModel(\n  (embedding): Linea...
model_type                                          AttentionedYoonKim
noise_level_test                                                 0.075
noise_level_train                                                    0
task                                        IMDB binary classification
traina

In [None]:
for noise_level in tqdm(NOISE_LEVELS[1:]):
    run_model_with(
        noise_level=noise_level, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5, lr=1e-3, epochs=30, heads=1
    )


  0%|          | 0/9 [00:00<?, ?it/s][A

Writer: ['runs/May08_13-32-35_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.01_spacy_wordlen8_heads1']







Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Epoch 0. Global step 704. T=2.350466295083364 min
Loss               : 0.6909279823303223
In-batch accuracy  : 0.5
Validation accuracy: 0.4972, f1: 0.6334208223972003



  'recall', 'true', average, warn_for)


Epoch 10. Global step 7744. T=27.021090630690257 min
Loss               : 0.12470026314258575
In-batch accuracy  : 1.0
Validation accuracy: 0.8424, f1: 0.8330508474576271



# Other exps

In [14]:
%%time
run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5, lr=1e-3, epochs=1, heads=1, print_every=1
)

Writer: ['runs/May08_18-24-24_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.025_spacy_wordlen8_heads1']
Epoch 0. Global step 704. T=2.3408157149950664 min
Loss               : 0.7010097503662109
In-batch accuracy  : 0.25
Validation accuracy: 0.5012, f1: 0.2728862973760933

Calculating validation metrics... Time 2.4388691782951355 min
Final test metrics: {'accuracy': 0.49872, 'f1': 0.25324752711238235}, Time 15.579216667016347 min

CPU times: user 32min 16s, sys: 34.8 s, total: 32min 51s
Wall time: 15min 34s


AttentionedYoonKimModel(
  (embedding): Linear(in_features=74, out_features=74, bias=True)
  (chars_cnn): Sequential(
    (0): Conv1d(74, 256, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (words_rnn): GRU(256, 128, dropout=0.5)
  (attention): MultiHeadAttention(
    (query_layer): Linear(in_features=128, out_features=128, bias=False)
    (key_layer): Linear(in_features=128, out_features=128, bias=False)
    (value_layer): Linear(in_features=128, out_features=128, bias=False)
    (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True)
  )
  (projector): Linear(in_features=128, out_features=2, bias=True)
)

In [None]:
run_model_with(
    noise_level=0.1, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5, lr=1e-3, epochs=30, heads=1
)

Writer: ['runs/May07_19-43-00_phobos-aijun_AttentionedYoonKim_lr3_dropout0.5_noise_level0.1_spacy_wordlen8_heads1']
Epoch 0. Global step 704. T=2.3722752690315247 min
Loss               : 0.689220130443573
In-batch accuracy  : 0.5
Validation accuracy: 0.496, f1: 0.4241316270566728



  'recall', 'true', average, warn_for)


Epoch 10. Global step 7744. T=27.09961405197779 min
Loss               : 0.25278401374816895
In-batch accuracy  : 1.0
Validation accuracy: 0.8252, f1: 0.8232915487262433

Epoch 20. Global step 14784. T=51.80704816977183 min
Loss               : 0.019888877868652344
In-batch accuracy  : 1.0


In [None]:
model = run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-3, epochs=30, heads=1
)

Epoch 0. Global step 704
Loss               : 0.6714678406715393
In-batch accuracy  : 1.0
Validation accuracy: 0.5048

Epoch 10. Global step 7744
Loss               : 0.0345512330532074
In-batch accuracy  : 1.0
Validation accuracy: 0.8504



In [38]:
model = run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-3, epochs=30, heads=1
)

torch.Size([256, 32, 128])


RuntimeError: cuDNN requires contiguous input tensor

In [30]:
model = run_model_with(
    noise_level=0.025, n_filters=256, cnn_kernel_size=5, hidden_dim_out=128, dropout=0.5,
    lr=1e-3, epochs=30, heads=1
)

torch.Size([256, 32, 128])


RuntimeError: cuDNN requires contiguous input tensor