In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

from random import random, choice

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score

import torchtext
from collections import Counter

In [2]:
# Number of character positions kept per text; default `maxlen` of
# preprocess_text_nobatch (longer texts are cut, shorter ones are PAD-filled).
MAXLEN = 256

# Mini-batch size used for every DataLoader built in this notebook.
BATCH_SIZE = 32
# Fraction of the training split handed to get_train_valid_loader as validation data.
VALID_SIZE = 0.1

# Noise probability assigned to CharIMDB.noise_level when the datasets are created.
NOISE_LEVEL = 0.1

In [3]:
# torchtext fields handed to CharIMDB.splits: the text field lower-cases its
# input and uses an identity tokenizer; neither field builds a vocabulary.
text_field = torchtext.data.Field(
    lower=True,
    include_lengths=False,
    fix_length=2048,
    tensor_type=torch.FloatTensor,
    batch_first=True,
    tokenize=lambda x: x,
    use_vocab=False,
    sequential=False,
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

# Character inventory; the position of a symbol is its one-hot index.
# The last two entries are the special multi-character tokens 'UNK' and 'PAD'.
ALPHABET = [
    ' ', 'e', 't', 'a', 'i', 'o', 's', 'n', 'r', 'h', 'l', 'd', 'c', 'm', 'u', 'f', 'g', 'y', 'b', 'w', 'p',
    '.', 'v', ',', 'k', "'", '/', '>', '<', '-', '"', 'j', 'x', ')', '(', '!', 'z', 'q', '0', '1', '?', ':',
    '9', '2', '*', ';', '3', '5', '8', '4', '7', '&', '6', 'é', '\x96', '`', '$', '\x85', '_', '%', '=', '#',
    'UNK', 'PAD',
]

ALPHABET_LEN = len(ALPHABET)

# Symbol -> one-hot index lookup.
char2int = {symbol: index for index, symbol in enumerate(ALPHABET)}

In [4]:
def one_hot(char):
    """Return the one-hot encoding of a single symbol.

    Args:
        char: symbol to encode; anything missing from ``char2int`` is mapped
            to the 'UNK' slot.

    Returns:
        A numpy array of length ``ALPHABET_LEN`` with exactly one entry set to 1.
    """
    zeros = np.zeros(ALPHABET_LEN)
    # Same fallback rule as preprocess_text_nobatch: unknown symbols become 'UNK'.
    zeros[char2int.get(char, char2int['UNK'])] = 1.
    # The original built the vector but never returned it.
    return zeros

def preprocess_text_nobatch(text, maxlen=MAXLEN):
    """One-hot encode a single text into a fixed-size tensor.

    Args:
        text: string to encode; characters missing from ``char2int`` are
            encoded as 'UNK'.
        maxlen: number of rows in the result. Longer texts are truncated,
            shorter ones are filled up with 'PAD' rows.

    Returns:
        ``torch.FloatTensor`` of shape ``(maxlen, ALPHABET_LEN)``.
    """
    unk_idx = char2int['UNK']
    one_hotted_text = np.zeros((maxlen, ALPHABET_LEN))

    # Honour the `maxlen` argument (the original compared against the global
    # MAXLEN, which broke any call with a non-default maxlen).
    kept = text[:maxlen]
    for i, char in enumerate(kept):
        one_hotted_text[i, char2int.get(char, unk_idx)] = 1.

    # Everything after the last real character is padding. Slicing also covers
    # the empty-text case, where the original hit an unbound loop variable.
    one_hotted_text[len(kept):, char2int['PAD']] = 1.

    return torch.FloatTensor(one_hotted_text)

def onehot2text(one_hotted_text, batch_size=None):
    """Decode one-hot encoded text back into a string (or list of strings).

    Args:
        one_hotted_text: when ``batch_size`` is None, a 2-D tensor with one row
            per position and one column per ``ALPHABET`` entry; otherwise an
            iterable of such tensors.
        batch_size: only checked against None; any other value switches to
            batch decoding.

    Returns:
        A string for a single sample, or a list of strings for a batch.
        Decoding of a sample stops at the first 'PAD' symbol.
    """
    if batch_size is not None:
        # Decode each sample on its own. The original passed the whole batch
        # back into the recursive call instead of the current sample.
        return [onehot2text(sample, batch_size=None) for sample in one_hotted_text]

    _, idx = torch.max(one_hotted_text, 1)
    symbols = []
    for i in idx.tolist():
        symb = ALPHABET[i]
        if symb == 'PAD':
            break
        symbols.append(symb)
    return ''.join(symbols)

def noise_generator(string, noise_level, chars=ALPHABET+['']):
    """Return a randomly corrupted copy of ``string``.

    For every input character two independent draws are made, in this order:
    the character is kept only if the first draw exceeds ``noise_level``, and
    a random element of ``chars`` is appended if the second draw is below
    ``noise_level``.

    NOTE(review): the default ``chars`` contains the multi-character entries
    'UNK' and 'PAD' as well as the empty string, so an insertion can add
    several characters or nothing at all.
    """
    pieces = []
    for c in string:
        keep_original = random() > noise_level
        if keep_original:
            pieces.append(c)
        insert_extra = random() < noise_level
        if insert_extra:
            pieces.append(choice(chars))
    return "".join(pieces)

class CharIMDB(torchtext.datasets.imdb.IMDB):
    """IMDB dataset whose items are noised, one-hot encoded character tensors."""

    # Probability handed to noise_generator; a class attribute, so assigning
    # CharIMDB.noise_level changes it for every instance at once.
    noise_level = 0

    def __getitem__(self, idx):
        """Return ``(one-hot text tensor, label)`` with label 1 for 'pos', else 0."""
        example = super(CharIMDB, self).__getitem__(idx)
        # this is bad (original author's note).
        # NOTE(review): presumably because noise is re-drawn on every access
        # and for every split, validation/test included -- confirm.
        noisy_text = noise_generator(example.text, self.noise_level)
        is_positive = example.label == 'pos'
        return preprocess_text_nobatch(noisy_text), int(is_positive)

def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):
    """Split ``dataset`` into train/validation DataLoaders over disjoint indices.

    Args:
        dataset: indexable dataset with a length.
        valid_size: fraction (0..1) of the dataset used for validation; the
            validation indices are taken from the tail of the index list.
        batch_size: batch size of both loaders.
        random_seed: seed applied to numpy's *global* RNG before shuffling.
        shuffle: whether to shuffle the indices before splitting.
        num_workers: worker processes used by each loader.

    Returns:
        ``(train_loader, valid_loader)``, each drawing from its own
        ``SubsetRandomSampler``.

    Raises:
        ValueError: if ``valid_size`` is outside ``[0, 1]``.
    """
    if not 0 <= valid_size <= 1:
        raise ValueError('valid_size must be in [0, 1], got %r' % (valid_size,))

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        # Seeds the global numpy RNG (side effect kept from the original).
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    # Split by a positive index: `indices[:-0]` would be empty and
    # `indices[-0:]` would be everything when the validation part has size 0.
    split = len_dataset - int(len_dataset * valid_size)
    train_idx, valid_idx = indices[:split], indices[split:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    # Pass the caller's num_workers through (it used to be hard-coded to 4).
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers
    )

    return train_loader, valid_loader

def get_accuracy(model, test_dataset):
    """Compute classification accuracy of ``model`` over ``test_dataset``.

    The model is switched to eval mode for the pass and is ALWAYS left in
    train mode afterwards, whatever mode it was in before the call.

    Args:
        model: module mapping a permuted text batch to per-class scores.
        test_dataset: iterable yielding ``(text, label)`` batches of tensors.

    Returns:
        Accuracy as computed by ``accuracy_score`` over all batches.
    """
    model.eval()

    predictions = []
    labels = []

    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for text, label in test_dataset:
            if CUDA:
                text = Variable(text.cuda())
            else:
                text = Variable(text)

            text = text.permute(0, 2, 1)  # (1, 0, 2) for RNN
            prediction = model(text)

            # Index of the highest score along dim 1 is the predicted class.
            _, idx = torch.max(prediction, 1)
            predictions += idx.data.tolist()
            labels += label.tolist()

    acc = accuracy_score(labels, predictions)
    model.train()
    return acc


In [5]:
# Set the class-wide noise probability used by CharIMDB.__getitem__.
CharIMDB.noise_level = NOISE_LEVEL
train, test = CharIMDB.splits(text_field, label_field)

# Carve a validation subset out of the training split.
dataloader, val_dataloader = get_train_valid_loader(train, valid_size=VALID_SIZE, batch_size=BATCH_SIZE)

# Test items are fetched through CharIMDB.__getitem__ as well, so they are
# noised with the current CharIMDB.noise_level too.
test_dataloader = torch.utils.data.DataLoader(
    test, batch_size=BATCH_SIZE
)

# Model

In [11]:
class CharCNN(nn.Module):
    """Single-layer character CNN: Conv1d -> ReLU -> MaxPool1d -> Linear(2).

    Args:
        init_function: called with the convolution weight; its return value is
            assigned back as the weight, so it must return that parameter
            (the initialisers in ``torch.nn.init`` do).
        dropout: stored on the instance only; no dropout layer is applied.
        maxlen: length of the input sequences. ``None`` (default) means the
            notebook-level ``MAXLEN``. The size of the final linear layer is
            derived from it (5888 inputs for 256, 30464 for 1024).

    Raises:
        ValueError: if ``maxlen`` is too short for the conv + pooling windows.
    """

    def __init__(self, init_function, dropout=0.5, maxlen=None):  #, hidden_dim=256, kernel_size=16):
        super(CharCNN, self).__init__()
        self.init_function = init_function
        self.dropout = dropout

        if maxlen is None:
            maxlen = MAXLEN

        channels, conv_kernel, pool_kernel, pool_stride = 256, 16, 64, 8

        # Sequence length after the stride-1, unpadded convolution.
        conv_len = maxlen - conv_kernel + 1
        if conv_len < pool_kernel:
            raise ValueError(
                'maxlen=%s is too short: need at least %s positions'
                % (maxlen, conv_kernel + pool_kernel - 1)
            )
        # Sequence length after unpadded max pooling (floor mode).
        pooled_len = (conv_len - pool_kernel) // pool_stride + 1

        self.conv = nn.Sequential(
            nn.Conv1d(ALPHABET_LEN, channels, kernel_size=conv_kernel, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_kernel, stride=pool_stride)
        )
        self.conv[0].weight = init_function(self.conv[0].weight)

        # Flattened conv output size; replaces the hard-coded 5888 that was
        # only valid for MAXLEN=256.
        self.fc = nn.Linear(channels * pooled_len, 2)

    def forward(self, x):
        """Map a ``(batch, ALPHABET_LEN, maxlen)`` tensor to ``(batch, 2)`` scores."""
        x = self.conv(x)
        x = x.view(x.size(0), -1)  # flatten channels x positions per sample
        x = self.fc(x)
        return x


In [12]:
from torch.nn import init

In [13]:
def run_model_with(noise_level, init_function, lr=1e-4, dropout=0.5, epochs=30):
    """Train a fresh CharCNN and report train / validation / test accuracy.

    Uses the notebook-level ``dataloader``, ``val_dataloader`` and
    ``test_dataloader`` and logs scalars to a new tensorboardX SummaryWriter.

    Args:
        noise_level: assigned to ``CharIMDB.noise_level``; this is class-wide,
            so it applies to every CharIMDB-backed loader, not only training.
        init_function: forwarded to ``CharCNN``.
        lr: learning rate of the Adam optimizer.
        dropout: forwarded to ``CharCNN``.
        epochs: number of passes over ``dataloader``.

    Returns:
        The trained model, left in eval mode.
    """
    CharIMDB.noise_level = noise_level

    # Forward `dropout` so the argument is no longer silently discarded.
    model = CharCNN(init_function=init_function, dropout=dropout)
    if CUDA:
        model.cuda()
    model.train()

    writer = SummaryWriter(comment='_charCNN_smaller2_lr%s_noise%s_NOdropout' % (
        int(-np.log10(lr)), noise_level
    ))

    optimizer = optim.Adam(params=model.parameters(), lr=lr)

    global_step = 0

    loss_f = F.cross_entropy

    # try/finally so the event file is flushed and closed even if training
    # is interrupted or fails.
    try:
        for epoch in range(epochs):
            for text, label in dataloader:
                optimizer.zero_grad()

                if CUDA:
                    text = Variable(text.cuda())
                    label = Variable(torch.LongTensor(label).cuda())
                else:
                    text = Variable(text)
                    label = Variable(torch.LongTensor(label))

                text = text.permute(0, 2, 1)  # (1, 0, 2) for RNN
                prediction = model(text)

                loss = loss_f(prediction, label)

                # .item() replaces `loss.data[0]`, which fails on 0-dim tensors
                # in newer PyTorch releases.
                writer.add_scalar('loss', loss.item(), global_step=global_step)

                loss.backward()
                # In-place variant; the un-suffixed name is deprecated.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1e-1)
                optimizer.step()

                if CUDA:
                    torch.cuda.synchronize()
                global_step += 1

            # evaluation
            print('Loss after epoch %s:' % epoch)
            print('Global step: %s' % global_step)
            print(loss.item())

            # Accuracy on the LAST training batch of the epoch only.
            _, idx = torch.max(prediction, 1)
            acc = accuracy_score(label.data.tolist(), idx.data.tolist())
            writer.add_scalar('accuracy_train', acc, global_step=global_step)
            print('In-batch accuracy:', acc)

            acc = get_accuracy(model, val_dataloader)
            print('Validation accuracy:', acc)
            writer.add_scalar('accuracy_val', acc, global_step=global_step)
            print()

        # Test

        acc = get_accuracy(model, test_dataloader)
        print('Final test accuracy:', acc)
        writer.add_scalar('accuracy_test_final', acc, global_step=global_step)
        print()
    finally:
        writer.close()

    model.eval()
    # model is in EVAL mode!
    return model

In [22]:
%%time
run_model_with(noise_level=0.1, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6670222878456116
In-batch accuracy: 0.75
Validation accuracy: 0.5112

Loss after epoch 1:
Global step: 1408
0.6755492091178894
In-batch accuracy: 0.5
Validation accuracy: 0.5172

Loss after epoch 2:
Global step: 2112
0.699364960193634
In-batch accuracy: 0.25
Validation accuracy: 0.5364

Loss after epoch 3:
Global step: 2816
0.6891615390777588
In-batch accuracy: 0.75
Validation accuracy: 0.6676

Loss after epoch 4:
Global step: 3520
0.5262390375137329
In-batch accuracy: 0.5
Validation accuracy: 0.7088

Loss after epoch 5:
Global step: 4224
0.41326266527175903
In-batch accuracy: 0.75
Validation accuracy: 0.7372

Loss after epoch 6:
Global step: 4928
0.5759977102279663
In-batch accuracy: 0.5
Validation accuracy: 0.7484

Loss after epoch 7:
Global step: 5632
0.4313564896583557
In-batch accuracy: 0.5
Validation accuracy: 0.746

Loss after epoch 8:
Global step: 6336
0.5776655673980713
In-batch accuracy: 0.75
Validation accuracy: 0.7544

Loss after epoch

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=61184, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc3): Linear(in_features=1024, out_features=2, bias=True)
)

In [32]:
%%time
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.7154096364974976
In-batch accuracy: 0.5
Validation accuracy: 0.6276

Loss after epoch 1:
Global step: 1408
0.3147427439689636
In-batch accuracy: 1.0
Validation accuracy: 0.7424

Loss after epoch 2:
Global step: 2112
0.43052104115486145
In-batch accuracy: 0.75
Validation accuracy: 0.7732

Loss after epoch 3:
Global step: 2816
0.14214535057544708
In-batch accuracy: 1.0
Validation accuracy: 0.8052

Loss after epoch 4:
Global step: 3520
0.16783644258975983
In-batch accuracy: 1.0
Validation accuracy: 0.8112

Loss after epoch 5:
Global step: 4224
0.5878106951713562
In-batch accuracy: 0.5
Validation accuracy: 0.82

Loss after epoch 6:
Global step: 4928
0.1835513859987259
In-batch accuracy: 1.0
Validation accuracy: 0.8224

Loss after epoch 7:
Global step: 5632
0.036214977502822876
In-batch accuracy: 1.0
Validation accuracy: 0.8056

Loss after epoch 8:
Global step: 6336
0.036427706480026245
In-batch accuracy: 1.0
Validation accuracy: 0.8236

Loss after epo

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=30464, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc3): Linear(in_features=1024, out_features=2, bias=True)
)

In [38]:
%%time
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.785068154335022
In-batch accuracy: 0.25
Validation accuracy: 0.6084

Loss after epoch 1:
Global step: 1408
0.3312198519706726
In-batch accuracy: 1.0
Validation accuracy: 0.7292

Loss after epoch 2:
Global step: 2112
0.2843222916126251
In-batch accuracy: 1.0
Validation accuracy: 0.7424

Loss after epoch 3:
Global step: 2816
0.1944502741098404
In-batch accuracy: 1.0
Validation accuracy: 0.7604

Loss after epoch 4:
Global step: 3520
0.24288120865821838
In-batch accuracy: 0.75
Validation accuracy: 0.8008

Loss after epoch 5:
Global step: 4224
0.36140114068984985
In-batch accuracy: 0.75
Validation accuracy: 0.8036

Loss after epoch 6:
Global step: 4928
0.2669448256492615
In-batch accuracy: 0.75
Validation accuracy: 0.8048

Loss after epoch 7:
Global step: 5632
0.18363156914710999
In-batch accuracy: 1.0
Validation accuracy: 0.8092

Loss after epoch 8:
Global step: 6336
0.48155519366264343
In-batch accuracy: 0.75
Validation accuracy: 0.8184

Loss after e

CharCNN(
  (conv): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=30464, out_features=2, bias=True)
)

In [14]:
%%time
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6622531414031982
In-batch accuracy: 0.5
Validation accuracy: 0.5076

Loss after epoch 1:
Global step: 1408
0.6478318572044373
In-batch accuracy: 1.0
Validation accuracy: 0.654

Loss after epoch 2:
Global step: 2112
0.5919772386550903
In-batch accuracy: 0.5
Validation accuracy: 0.684

Loss after epoch 3:
Global step: 2816
0.8037890195846558
In-batch accuracy: 0.0
Validation accuracy: 0.6764

Loss after epoch 4:
Global step: 3520
0.48778462409973145
In-batch accuracy: 1.0
Validation accuracy: 0.7124

Loss after epoch 5:
Global step: 4224
0.41007521748542786
In-batch accuracy: 1.0
Validation accuracy: 0.6928

Loss after epoch 6:
Global step: 4928
0.5600640177726746
In-batch accuracy: 0.75
Validation accuracy: 0.7276

Loss after epoch 7:
Global step: 5632
0.5205470323562622
In-batch accuracy: 0.75
Validation accuracy: 0.7052

Loss after epoch 8:
Global step: 6336
0.534763753414154
In-batch accuracy: 0.75
Validation accuracy: 0.7228

Loss after epoch 9

CharCNN(
  (conv): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=5888, out_features=2, bias=True)
)