In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

from random import random, choice

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score

import torchtext
from collections import Counter

In [2]:
# Number of characters kept per text by preprocess_text_nobatch (default `maxlen`).
MAXLEN = 256

# Mini-batch size handed to the data loaders.
BATCH_SIZE = 32
# Fraction of the training split held out for validation.
VALID_SIZE = 0.1

# Noise level assigned to CharIMDB.noise_level when the datasets are built.
NOISE_LEVEL = 0.1

In [3]:
# Raw text is passed through untouched (identity tokenizer, no vocabulary);
# the character-level encoding happens later in the dataset's __getitem__.
text_field = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    tokenize=lambda x: x,
    lower=True,
    include_lengths=False,
    fix_length=2048,
    tensor_type=torch.FloatTensor,
    batch_first=True,
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

# Characters recognised by the one-hot encoding; the two trailing entries are the
# special 'UNK' (unknown character) and 'PAD' (padding) tokens.
ALPHABET = [
    ' ', 'e', 't', 'a', 'i', 'o', 's', 'n', 'r', 'h', 'l', 'd', 'c', 'm', 'u', 'f', 'g', 'y', 'b', 'w', 'p',
    '.', 'v', ',', 'k', "'", '/', '>', '<', '-', '"', 'j', 'x', ')', '(', '!', 'z', 'q', '0', '1', '?', ':',
    '9', '2', '*', ';', '3', '5', '8', '4', '7', '&', '6', 'é', '\x96', '`', '$', '\x85', '_', '%', '=', '#',
    'UNK', 'PAD',
]

ALPHABET_LEN = len(ALPHABET)

# Map every alphabet entry to its position, i.e. its one-hot index.
char2int = {symbol: position for position, symbol in enumerate(ALPHABET)}

In [4]:
def one_hot(char):
    """
    One-hot encode a single character.

    :param char: character to encode
    :return: numpy array of length ALPHABET_LEN holding a single 1. at the index of
        `char`; characters missing from `char2int` use the 'UNK' index
    """
    encoded = np.zeros(ALPHABET_LEN)
    # A single lookup with an 'UNK' fallback replaces the if/else branches.
    encoded[char2int.get(char, char2int['UNK'])] = 1.
    # The original built the vector but never returned it.
    return encoded

def preprocess_text_nobatch(text, maxlen=MAXLEN):
    """
    One-hot encode a single text into a fixed-size tensor.

    :param text: string to encode; characters past `maxlen` are dropped and
        characters missing from `char2int` are encoded as 'UNK'
    :param maxlen: number of rows (character positions) of the result
    :return: torch.FloatTensor of shape (maxlen, ALPHABET_LEN); positions after the
        end of the text are marked with the 'PAD' symbol
    """
    one_hotted_text = np.zeros((maxlen, ALPHABET_LEN))
    unk_idx = char2int['UNK']

    # Honour the `maxlen` argument (the original compared against the global MAXLEN).
    n_chars = min(len(text), maxlen)
    for i, char in enumerate(text[:n_chars]):
        one_hotted_text[i, char2int.get(char, unk_idx)] = 1.

    # Pad the remaining rows; this also covers empty text, where the original
    # raised NameError because its loop variable was never bound.
    one_hotted_text[n_chars:, char2int['PAD']] = 1.

    return torch.FloatTensor(one_hotted_text)

def onehot2text(one_hotted_text, batch_size=None):
    """
    Decode one-hot encoded text back into a string.

    :param one_hotted_text: tensor of shape (seq_len, ALPHABET_LEN) when `batch_size`
        is None, otherwise an iterable of such tensors (one per sample)
    :param batch_size: only checked against None; any other value switches to batch mode
    :return: decoded string, or a list of decoded strings in batch mode; decoding
        stops at the first 'PAD' symbol
    """
    if batch_size is not None:
        # Decode each sample separately (the original passed the whole batch
        # to the recursive call instead of the current sample).
        return [onehot2text(sample, batch_size=None) for sample in one_hotted_text]

    _, idx = torch.max(one_hotted_text, 1)
    symbols = []
    for i in idx.tolist():
        symb = ALPHABET[i]
        if symb == 'PAD':
            break
        symbols.append(symb)
    return ''.join(symbols)

def noise_generator(string, noise_level, chars=ALPHABET+['']):
    """
    Randomly drop characters from `string` and insert random symbols.

    For every input character two independent draws are made: the character is
    kept when the first draw exceeds `noise_level`, and a random element of
    `chars` is appended when the second draw is below `noise_level`.

    :param string: text to corrupt
    :param noise_level: threshold compared against random() for both draws
    :param chars: pool of symbols that may be inserted
    :return: the noised string
    """
    # NOTE(review): the default pool contains the multi-character entries
    # 'UNK' and 'PAD', which are inserted verbatim -- confirm this is intended.
    pieces = []
    for symbol in string:
        keep_symbol = random() > noise_level
        insert_extra = random() < noise_level
        if keep_symbol:
            pieces.append(symbol)
        if insert_extra:
            pieces.append(choice(chars))
    return "".join(pieces)

class CharIMDB(torchtext.datasets.imdb.IMDB):
    """IMDB dataset whose items are noised, one-hot encoded texts with integer labels."""

    # Class-wide noise level handed to noise_generator for every item.
    noise_level = 0

    def __getitem__(self, idx):
        """
        :param idx: index of the example
        :return: (one-hot text tensor from preprocess_text_nobatch, label), where the
            label is 1 when the example's label equals 'pos' and 0 otherwise
        """
        example = super().__getitem__(idx)
        noised_text = noise_generator(example.text, self.noise_level)
        is_positive = example.label == 'pos'
        return preprocess_text_nobatch(noised_text), int(is_positive)
def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):
    """
    Split `dataset` into train and validation parts and wrap both in DataLoaders.

    :param dataset: dataset supporting len() and indexing
    :param valid_size: fraction of the dataset used for validation, within [0, 1]
    :param batch_size: batch size of both loaders
    :param random_seed: seed for numpy's global RNG, applied before shuffling the indices
    :param shuffle: whether to shuffle the indices before splitting
    :param num_workers: number of worker processes of both loaders
    :return: (train_loader, valid_loader)
    :raises ValueError: if `valid_size` is outside [0, 1]
    """
    if not 0 <= valid_size <= 1:
        raise ValueError('valid_size must be within [0, 1], got %s' % valid_size)

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)

    # Split on a positive offset: with a validation size of 0 the original
    # `indices[:-0]` / `indices[-0:]` slices produced an empty training set
    # and put every sample into validation.
    split_at = len_dataset - val_actual_size
    train_idx, valid_idx = indices[:split_at], indices[split_at:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    # `num_workers` is now forwarded instead of being hard-coded to 4.
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers
    )

    return train_loader, valid_loader

def get_accuracy(model, test_dataset):
    """
    Compute the accuracy of `model` over all batches of `test_dataset`.

    The model is switched to EVAL mode for the evaluation and is always put back
    into TRAIN mode afterwards, even when the evaluation is interrupted.

    :param model: module mapping a (seq_len, batch, dim) input to per-class scores
    :param test_dataset: iterable of (text, label) batches; text is assumed to be
        batch-first, i.e. (batch, seq_len, dim)
    :return: accuracy as computed by sklearn's accuracy_score
    """
    model.eval()

    predictions = []
    labels = []

    try:
        for text, label in test_dataset:
            text = Variable(text.cuda()) if CUDA else Variable(text)

            # The model expects the sequence dimension first.
            text = text.permute(1, 0, 2)
            prediction = model(text)

            _, idx = torch.max(prediction, 1)
            predictions += idx.data.tolist()
            labels += label.tolist()
    finally:
        # Restore TRAIN mode even if evaluation raised (e.g. KeyboardInterrupt),
        # so an interrupted evaluation cannot leave the model stuck in EVAL mode.
        model.train()

    return accuracy_score(labels, predictions)


In [5]:
# noise_level is a class attribute, so it applies to every CharIMDB split created below.
CharIMDB.noise_level = NOISE_LEVEL
train, test = CharIMDB.splits(text_field, label_field)

# Hold out VALID_SIZE of the training split for validation. These loaders are
# module-level globals that run_model_with reads directly.
dataloader, val_dataloader = get_train_valid_loader(train, valid_size=VALID_SIZE, batch_size=BATCH_SIZE)

# The test loader is built with DataLoader defaults apart from the batch size.
test_dataloader = torch.utils.data.DataLoader(
    test, batch_size=BATCH_SIZE
)

In [6]:
from sru import SRU

In [6]:
from torch.nn import init

In [7]:
class CharRNN(nn.Module):
    """Character-level GRU classifier that emits two class scores per sequence."""

    def __init__(self, hidden, dropout=0.5, num_layers=2):
        """
        :param hidden: hidden size of the GRU
        :param dropout: dropout probability, used after the embedding and inside the GRU
        :param num_layers: number of stacked GRU layers
        """
        super().__init__()

        # A square linear layer over the one-hot characters plays the role of an embedding.
        embedding = nn.Linear(ALPHABET_LEN, ALPHABET_LEN)
        embedding.weight = init.xavier_normal(embedding.weight)
        self.embed = embedding

        self.dropout = nn.Dropout(dropout)
        # An SRU with the same arguments was tried here as an alternative recurrent layer.
        self.rnn = nn.GRU(ALPHABET_LEN, hidden, num_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(hidden, 2)

    def forward(self, x):
        """
        :param x: (seq_len, batch_size, signal_dim)
        :return: (batch_size, 2) scores computed from the GRU output at the last time step
        """
        embedded = self.dropout(self.embed(x))
        outputs, _ = self.rnn(embedded)
        return self.fc(outputs[-1])


In [8]:
def run_model_with(noise_level, hidden, lr=1e-4, dropout=0.5, layers=1, epochs=30):
    """
    Train a CharRNN and report in-batch, validation and final test accuracy.

    Reads the module-level `dataloader`, `val_dataloader` and `test_dataloader`,
    sets `CharIMDB.noise_level` as a side effect and logs scalars to tensorboardX.

    :param noise_level: value assigned to CharIMDB.noise_level before training
    :param hidden: hidden size of the recurrent layer
    :param lr: learning rate of the Adam optimizer
    :param dropout: dropout probability forwarded to CharRNN and recorded in the run name
    :param layers: number of recurrent layers
    :param epochs: number of passes over `dataloader`
    :return: the trained model, switched to EVAL mode
    """
    CharIMDB.noise_level = noise_level

    # Forward `dropout`: the original only put it into the run name while the
    # model silently used its own default.
    model = CharRNN(hidden, dropout=dropout, num_layers=layers)
    if CUDA:
        model.cuda()
    model.train()

    writer = SummaryWriter(comment='_charGRU_dropout%s_embed_layers%s' % (dropout, layers))

    optimizer = optim.Adam(params=model.parameters(), lr=lr)

    global_step = 0

    loss_f = F.cross_entropy

    # try/finally guarantees the event file is closed even when training is
    # interrupted from the notebook.
    try:
        for epoch in range(epochs):

            for batch_idx, (text, label) in enumerate(dataloader):
                optimizer.zero_grad()

                if CUDA:
                    text = Variable(text.cuda())
                    label = Variable(torch.LongTensor(label).cuda())
                else:
                    text = Variable(text)
                    label = Variable(torch.LongTensor(label))

                # Loader batches are batch-first; the model expects (seq_len, batch, dim).
                text = text.permute(1, 0, 2)
                prediction = model(text)

                loss = loss_f(prediction, label)

                # NOTE(review): `loss.data[0]` is the pre-0.4 PyTorch idiom; newer
                # releases require `loss.item()` -- confirm the target version.
                writer.add_scalar('loss', loss.data[0], global_step=global_step)

                loss.backward()
                torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
                optimizer.step()

                if CUDA:
                    torch.cuda.synchronize()
                global_step += 1

            # evaluation (the in-batch numbers below refer to the last batch of the epoch)
            print('Global step: %s' % global_step)
            print('Loss after epoch %s: %s' % (epoch, loss.data[0]))

            _, idx = torch.max(prediction, 1)
            acc = accuracy_score(label.data.tolist(), idx.data.tolist())
            writer.add_scalar('accuracy_train', acc, global_step=global_step)
            print('In-batch accuracy:', acc)

            acc = get_accuracy(model, val_dataloader)
            print('Validation accuracy:', acc)
            writer.add_scalar('accuracy_val', acc, global_step=global_step)
            print()

        # Test

        acc = get_accuracy(model, test_dataloader)
        print('Final test accuracy:', acc)
        writer.add_scalar('accuracy_test_final', acc, global_step=global_step)
        print()
    finally:
        writer.close()

    model.eval()
    # model is in EVAL mode!
    return model


In [9]:
# Noise-free run with hidden size 256; every other hyper-parameter uses the run_model_with defaults.
run_model_with(noise_level=0, hidden=256)

SRU loaded for gpu 0
Global step: 704
Loss after epoch 0: 0.6822301745414734
In-batch accuracy: 0.75
Validation accuracy: 0.4972

Global step: 1408
Loss after epoch 1: 0.6573439240455627
In-batch accuracy: 1.0
Validation accuracy: 0.4988

Global step: 2112
Loss after epoch 2: 0.6799705028533936
In-batch accuracy: 0.75
Validation accuracy: 0.4936

Global step: 2816
Loss after epoch 3: 0.7229750156402588
In-batch accuracy: 0.25
Validation accuracy: 0.4856

Global step: 3520
Loss after epoch 4: 0.7095352411270142
In-batch accuracy: 0.25
Validation accuracy: 0.4916

Global step: 4224
Loss after epoch 5: 0.7096357345581055
In-batch accuracy: 0.25
Validation accuracy: 0.4948

Global step: 4928
Loss after epoch 6: 0.643151581287384
In-batch accuracy: 1.0
Validation accuracy: 0.494

Global step: 5632
Loss after epoch 7: 0.7027839422225952
In-batch accuracy: 0.5
Validation accuracy: 0.492

Global step: 6336
Loss after epoch 8: 0.6687096953392029
In-batch accuracy: 1.0
Validation accuracy: 0.491

Process Process-93:
Process Process-96:
Process Process-94:
Process Process-95:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 55, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 55, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-4-cd72a0080c8b>", line 54, in __getitem__
    return preprocess_text_nobatch(text), label
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", 

KeyboardInterrupt: 

In [16]:
# Noise-free run: hidden size 256, one recurrent layer, lr=1e-3, up to 100 epochs; keeps the returned model.
model = run_model_with(noise_level=0, hidden=256, epochs=100, layers=1, lr=1e-3)

Global step: 704
Loss after epoch 0: 0.6877833604812622
In-batch accuracy: 0.5
Validation accuracy: 0.5012

Global step: 1408
Loss after epoch 1: 0.6759294271469116
In-batch accuracy: 0.5
Validation accuracy: 0.5224

Global step: 2112
Loss after epoch 2: 0.6749019026756287
In-batch accuracy: 0.5
Validation accuracy: 0.526

Global step: 2816
Loss after epoch 3: 0.74471116065979
In-batch accuracy: 0.0
Validation accuracy: 0.5172

Global step: 3520
Loss after epoch 4: 0.6857150793075562
In-batch accuracy: 0.5
Validation accuracy: 0.5076

Global step: 4224
Loss after epoch 5: 0.6910227537155151
In-batch accuracy: 0.75
Validation accuracy: 0.514

Global step: 4928
Loss after epoch 6: 0.696495532989502
In-batch accuracy: 0.75
Validation accuracy: 0.512

Global step: 5632
Loss after epoch 7: 0.7819569110870361
In-batch accuracy: 0.0
Validation accuracy: 0.5048

Global step: 6336
Loss after epoch 8: 0.629566490650177
In-batch accuracy: 0.75
Validation accuracy: 0.5008

Global step: 7040
Loss a

Global step: 52800
Loss after epoch 74: 0.5699837803840637
In-batch accuracy: 0.75
Validation accuracy: 0.5792

Global step: 53504
Loss after epoch 75: 0.5464096665382385
In-batch accuracy: 0.75
Validation accuracy: 0.5852

Global step: 54208
Loss after epoch 76: 0.5671347379684448
In-batch accuracy: 0.75
Validation accuracy: 0.6012

Global step: 54912
Loss after epoch 77: 0.6364257335662842
In-batch accuracy: 0.5
Validation accuracy: 0.5824

Global step: 55616
Loss after epoch 78: 0.4664842486381531
In-batch accuracy: 1.0
Validation accuracy: 0.5872

Global step: 56320
Loss after epoch 79: 0.7226962447166443
In-batch accuracy: 0.5
Validation accuracy: 0.5896

Global step: 57024
Loss after epoch 80: 0.5735809803009033
In-batch accuracy: 0.75
Validation accuracy: 0.5892

Global step: 57728
Loss after epoch 81: 0.8028784990310669
In-batch accuracy: 0.25
Validation accuracy: 0.6004

Global step: 58432
Loss after epoch 82: 0.9222941994667053
In-batch accuracy: 0.25
Validation accuracy: 0.5

In [9]:
# Noise-free run: hidden size 256, one recurrent layer, lr=1e-3, up to 100 epochs; keeps the returned model.
model = run_model_with(noise_level=0, hidden=256, epochs=100, layers=1, lr=1e-3)

Global step: 704
Loss after epoch 0: 0.6701354384422302
In-batch accuracy: 0.75
Validation accuracy: 0.5116

Global step: 1408
Loss after epoch 1: 0.7083399295806885
In-batch accuracy: 0.25
Validation accuracy: 0.5012

Global step: 2112
Loss after epoch 2: 0.6849496364593506
In-batch accuracy: 0.5
Validation accuracy: 0.5028

Global step: 2816
Loss after epoch 3: 0.7296500205993652
In-batch accuracy: 0.5
Validation accuracy: 0.5204

Global step: 3520
Loss after epoch 4: 0.6584969162940979
In-batch accuracy: 0.75
Validation accuracy: 0.514

Global step: 4224
Loss after epoch 5: 0.70598965883255
In-batch accuracy: 0.5
Validation accuracy: 0.5136

Global step: 4928
Loss after epoch 6: 0.6887087821960449
In-batch accuracy: 0.5
Validation accuracy: 0.5124

Global step: 5632
Loss after epoch 7: 0.7495694160461426
In-batch accuracy: 0.25
Validation accuracy: 0.5488

Global step: 6336
Loss after epoch 8: 0.5603877902030945
In-batch accuracy: 0.75
Validation accuracy: 0.6056

Global step: 7040


Process Process-579:
Process Process-577:
Process Process-580:
Process Process-578:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 55, in _worker_loop
    samples = collate_

KeyboardInterrupt: 