In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

from random import random, choice

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

# True when torch reports a usable CUDA device; the training and evaluation
# code below checks this flag before moving the model and batches to the GPU.
CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score

import torchtext
from collections import Counter

In [2]:
# Default `maxlen` of preprocess_text_nobatch: number of character positions
# in every one-hot encoded text.
MAXLEN = 1024

# Batch size passed to every DataLoader built in this notebook.
BATCH_SIZE = 32
# Fraction of the training split that get_train_valid_loader holds out for validation.
VALID_SIZE = 0.1

# Noise level assigned to CharIMDB.noise_level before the data is loaded.
NOISE_LEVEL = 0.1

In [3]:
# torchtext fields handed to CharIMDB.splits when the dataset is loaded.
# NOTE(review): CharIMDB.__getitem__ one-hot encodes the raw text itself via
# preprocess_text_nobatch, so it is unclear whether fix_length=2048 (which differs
# from MAXLEN) or tensor_type ever take effect -- confirm against torchtext.
text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
# Labels are kept as-is by the field; CharIMDB.__getitem__ converts 'pos' to 1 and anything else to 0.
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

# Character vocabulary. The last two entries are multi-character sentinels:
# 'UNK' stands for any symbol outside the vocabulary, 'PAD' fills unused positions.
ALPHABET = [
    ' ', 'e', 't', 'a', 'i', 'o', 's', 'n', 'r', 'h', 'l', 'd', 'c', 'm', 'u', 'f',
    'g', 'y', 'b', 'w', 'p', '.', 'v', ',', 'k', "'", '/', '>', '<', '-', '"', 'j',
    'x', ')', '(', '!', 'z', 'q', '0', '1', '?', ':', '9', '2', '*', ';', '3', '5',
    '8', '4', '7', '&', '6', 'é', '\x96', '`', '$', '\x85', '_', '%', '=', '#',
    'UNK', 'PAD',
]

ALPHABET_LEN = len(ALPHABET)

# Symbol -> column index in the one-hot encoding (position in ALPHABET).
char2int = {symbol: position for position, symbol in enumerate(ALPHABET)}

In [4]:
def one_hot(char):
    """Return the one-hot encoding of a single character.

    Args:
        char: the symbol to encode. Symbols missing from ``char2int`` are
            encoded as ``'UNK'``.

    Returns:
        A numpy array of length ``ALPHABET_LEN`` that is 1.0 at the symbol's
        index and 0.0 everywhere else.
    """
    vector = np.zeros(ALPHABET_LEN)
    vector[char2int.get(char, char2int['UNK'])] = 1.
    # The original built the vector but never returned it, so callers got None.
    return vector

def preprocess_text_nobatch(text, maxlen=MAXLEN):
    """One-hot encode a single text into a fixed-size float tensor.

    Args:
        text: iterable of characters (normally a ``str``). Characters missing
            from ``char2int`` are encoded as ``'UNK'``.
        maxlen: number of rows in the result. Longer texts are truncated,
            shorter ones (including the empty text) are filled with ``'PAD'``.

    Returns:
        ``torch.FloatTensor`` of shape ``(maxlen, ALPHABET_LEN)`` with exactly
        one 1.0 per row.
    """
    encoded = np.zeros((maxlen, ALPHABET_LEN))
    unk_index = char2int['UNK']

    # zip() stops at whichever runs out first, which truncates to `maxlen`
    # (the parameter, not the MAXLEN global) without indexing past the array.
    n_chars = 0
    for position, char in zip(range(maxlen), text):
        encoded[position, char2int.get(char, unk_index)] = 1.
        n_chars = position + 1

    # Pad every remaining row; this also covers empty input, where the
    # original failed because its loop variable was never bound.
    encoded[n_chars:, char2int['PAD']] = 1.

    return torch.FloatTensor(encoded)

def onehot2text(one_hotted_text, batch_size=None):
    """Decode one-hot encoded text back into strings.

    Args:
        one_hotted_text: with ``batch_size=None``, a 2-D tensor whose rows are
            one-hot (or score) vectors over ``ALPHABET``; otherwise an iterable
            of such tensors (one per sample).
        batch_size: ``None`` to decode a single sample; any other value makes
            the input be treated as a batch. The value itself is not used.

    Returns:
        A string for a single sample, or a list of strings for a batch.
        Decoding stops at the first ``'PAD'`` row; ``'UNK'`` rows are emitted
        as the literal text ``'UNK'``.
    """
    if batch_size is not None:
        # Decode each sample separately. The original passed the whole batch
        # back into the recursive call, which cannot be decoded as one sample.
        return [onehot2text(sample, batch_size=None) for sample in one_hotted_text]

    _, indices = torch.max(one_hotted_text, 1)
    symbols = []
    for index in indices.tolist():
        symbol = ALPHABET[index]
        if symbol == 'PAD':
            break
        symbols.append(symbol)
    return ''.join(symbols)

def noise_generator(string, noise_level, chars=ALPHABET+['']):
    """Return a randomly corrupted copy of `string`.

    Two independent draws are made per input character: the first decides
    whether the character is kept (probability ``1 - noise_level``), the second
    whether a random element of `chars` is inserted after it (probability
    ``noise_level``). The empty string in the default `chars` makes some
    insertions no-ops.
    """
    pieces = []
    for original_char in string:
        keep_original = random() > noise_level
        if keep_original:
            pieces.append(original_char)
        insert_random = random() < noise_level
        if insert_random:
            pieces.append(choice(chars))
    return "".join(pieces)

class CharIMDB(torchtext.datasets.imdb.IMDB):
    """IMDB dataset whose items are noised, one-hot encoded character tensors.

    Indexing returns ``(encoded_text, label)`` where ``encoded_text`` is the
    output of ``preprocess_text_nobatch`` on the noised review text and
    ``label`` is 1 for ``'pos'`` reviews and 0 otherwise.
    """

    # Class-level setting: assigning CharIMDB.noise_level changes the noise
    # applied by every instance on each item access.
    noise_level = 0

    def __getitem__(self, idx):
        example = super().__getitem__(idx)
        noisy_text = noise_generator(example.text, self.noise_level)
        is_positive = example.label == 'pos'
        return preprocess_text_nobatch(noisy_text), int(is_positive)

def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):
    """Split `dataset` into training and validation DataLoaders.

    Args:
        dataset: map-style dataset (supports ``len`` and integer indexing).
        valid_size: fraction of the dataset reserved for validation; the
            number of validation items is ``int(len(dataset) * valid_size)``.
        batch_size: batch size of both loaders.
        random_seed: seed given to ``np.random.seed`` before shuffling the
            indices (this reseeds numpy's global generator).
        shuffle: when True the indices are shuffled before splitting;
            otherwise the validation items are the last ones of the dataset.
        num_workers: number of worker processes for both loaders.

    Returns:
        ``(train_loader, valid_loader)``; each draws its own index subset in
        random order through a ``SubsetRandomSampler``.
    """
    indices = list(range(len(dataset)))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    n_valid = int(len(indices) * valid_size)
    # Split on an explicit position: slicing with -n_valid breaks for
    # n_valid == 0, where indices[:-0] is empty and indices[-0:] is everything.
    split_at = len(indices) - n_valid
    train_idx, valid_idx = indices[:split_at], indices[split_at:]

    def make_loader(subset_indices):
        # Honour the caller's num_workers (it used to be hard-coded to 4).
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            sampler=SubsetRandomSampler(subset_indices),
            num_workers=num_workers,
        )

    return make_loader(train_idx), make_loader(valid_idx)

def get_accuracy(model, test_dataset):
    """
    Return the accuracy of `model` over an iterable of (text, label) batches.

    The model is switched to eval mode for the pass.
    The model will be in TRAIN mode after this call.
    """
    model.eval()

    predicted, expected = [], []

    for batch_text, batch_label in test_dataset:
        batch_text = Variable(batch_text.cuda() if CUDA else batch_text)

        # Swap the last two axes so the alphabet dimension comes second,
        # matching the Conv1d input channels ((1, 0, 2) would be used for an RNN).
        scores = model(batch_text.permute(0, 2, 1))

        # Index of the highest score per sample is the predicted class.
        best_class = torch.max(scores, 1)[1]
        predicted.extend(best_class.data.tolist())
        expected.extend(batch_label.tolist())

    accuracy = accuracy_score(expected, predicted)
    model.train()
    return accuracy


In [5]:
# noise_level is a class attribute, so this setting applies to items of every
# CharIMDB split. run_model_with overwrites it again on each call.
CharIMDB.noise_level = NOISE_LEVEL
train, test = CharIMDB.splits(text_field, label_field)

# Hold out VALID_SIZE of the training split for validation; both loaders read
# from the same `train` dataset through disjoint index samplers.
dataloader, val_dataloader = get_train_valid_loader(train, valid_size=VALID_SIZE, batch_size=BATCH_SIZE)

# Test loader over the whole test split; no sampler or worker count is given,
# so DataLoader's defaults apply.
test_dataloader = torch.utils.data.DataLoader(
    test, batch_size=BATCH_SIZE
)

# Model

In [17]:
class CharCNN(nn.Module):
    """Character-level 1-D CNN classifier with two output classes.

    Six convolutional blocks (``conv1`` .. ``conv6``, 256 channels each, no
    padding) are followed by three fully connected layers (``fc1`` .. ``fc3``).
    Blocks 1, 2 and 6 end with a max-pool of size and stride 3.

    Args:
        init_function: callable applied to the weight of every convolution.
            Its return value is assigned back as the layer weight, so it must
            return the initialised parameter (e.g. ``init.xavier_normal``).
        dropout: dropout probability used after ``fc1`` and ``fc2``.
        input_len: length (last axis) of the inputs given to ``forward``. It
            only determines the input size of ``fc1``; the default of 1024
            yields the 8704 features that used to be hard-coded.

    Raises:
        ValueError: if ``input_len`` is too short to survive the unpadded
            convolutions and poolings.
    """

    # (kernel_size, followed_by_max_pool) for conv1 .. conv6, in order.
    _CONV_LAYOUT = ((7, True), (7, True), (3, False), (3, False), (3, False), (3, True))
    _CHANNELS = 256
    _POOL_SIZE = 3

    def __init__(self, init_function, dropout=0.5, input_len=1024):  #, hidden_dim=256, kernel_size=16):
        super(CharCNN, self).__init__()
        self.init_function = init_function
        self.dropout = dropout

        # Blocks are registered as conv1 .. conv6 in this order, so submodule
        # names (and therefore state_dict keys) match the hand-written version.
        in_channels = ALPHABET_LEN
        for number, (kernel_size, pooled) in enumerate(self._CONV_LAYOUT, start=1):
            block = self._conv_block(in_channels, kernel_size, pooled, init_function)
            setattr(self, 'conv%d' % number, block)
            in_channels = self._CHANNELS

        self.fc1 = nn.Sequential(
            nn.Linear(self._flattened_size(input_len), 1024),
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )

        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )

        self.fc3 = nn.Linear(1024, 2)

    @classmethod
    def _conv_block(cls, in_channels, kernel_size, pooled, init_function):
        """Build Conv1d -> ReLU (-> MaxPool1d) and initialise the conv weight."""
        layers = [
            nn.Conv1d(in_channels, cls._CHANNELS, kernel_size=kernel_size, stride=1),
            nn.ReLU(),
        ]
        if pooled:
            layers.append(nn.MaxPool1d(kernel_size=cls._POOL_SIZE, stride=cls._POOL_SIZE))
        block = nn.Sequential(*layers)
        block[0].weight = init_function(block[0].weight)
        return block

    @classmethod
    def _flattened_size(cls, input_len):
        """Number of features left after conv6 for inputs of length `input_len`."""
        length = input_len
        for kernel_size, pooled in cls._CONV_LAYOUT:
            # Unpadded stride-1 convolution shortens the sequence by kernel_size - 1.
            length = length - kernel_size + 1
            if pooled and length >= cls._POOL_SIZE:
                # Floor-mode pooling with kernel == stride.
                length = (length - cls._POOL_SIZE) // cls._POOL_SIZE + 1
            elif pooled:
                length = 0
            if length < 1:
                raise ValueError('input_len=%s is too short for CharCNN' % (input_len,))
        return cls._CHANNELS * length

    def forward(self, x):
        """Return class scores of shape (batch, 2).

        `x` must have ALPHABET_LEN channels on axis 1 and the `input_len`
        given at construction on axis 2.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)

        # collapse channels and positions into one feature vector per sample
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x


In [11]:
from torch.nn import init

In [18]:
def run_model_with(noise_level, init_function, lr=1e-4, dropout=0.5, epochs=30):
    """Train a CharCNN on the noised IMDB data and report its accuracy.

    Uses the module-level ``dataloader``, ``val_dataloader`` and
    ``test_dataloader``. Loss and accuracies are printed and also logged to a
    tensorboardX run whose name encodes the hyper-parameters.

    Args:
        noise_level: value assigned to ``CharIMDB.noise_level``. This is a
            class attribute, so it stays in effect after the call.
        init_function: weight initialiser passed to ``CharCNN``.
        lr: Adam learning rate.
        dropout: dropout probability passed to ``CharCNN``.
        epochs: number of passes over ``dataloader``.

    Returns:
        The trained model, in EVAL mode.
    """
    CharIMDB.noise_level = noise_level

    # Pass dropout through; it used to be logged in the run name but never applied.
    model = CharCNN(init_function=init_function, dropout=dropout)
    if CUDA:
        model.cuda()
    model.train()

    # One placeholder per argument: the trailing %s for the initialiser name was
    # missing, which makes the % formatting raise TypeError.
    writer = SummaryWriter(comment='_charCNN_BIG_lr%s_noise%s_dropout%s_init_%s' % (
        int(-np.log10(lr)), noise_level, dropout, str(init_function).split()[1]
    ))

    optimizer = optim.Adam(params=model.parameters(), lr=lr)

    global_step = 0

    loss_f = F.cross_entropy

    try:
        for epoch in range(epochs):
            for batch_idx, (text, label) in enumerate(dataloader):
                optimizer.zero_grad()

                if CUDA:
                    text = Variable(text.cuda())
                    label = Variable(torch.LongTensor(label).cuda())
                else:
                    text = Variable(text)
                    label = Variable(torch.LongTensor(label))

                text = text.permute(0, 2, 1)  # (1, 0, 2) for RNN
                prediction = model(text)

                loss = loss_f(prediction, label)

                # Scalar losses are 0-dim on newer PyTorch, where loss.data[0]
                # fails; fall back to the old indexing when .item() is absent.
                loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
                writer.add_scalar('loss', loss_value, global_step=global_step)

                loss.backward()
                torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
                optimizer.step()

                if CUDA:
                    torch.cuda.synchronize()
                global_step += 1

            # evaluation (loss and in-batch accuracy refer to the last batch of the epoch)
            print('Loss after epoch %s:' % epoch)
            print('Global step: %s' % global_step)
            print(loss_value)

            _, idx = torch.max(prediction, 1)
            acc = accuracy_score(label.data.tolist(), idx.data.tolist())
            writer.add_scalar('accuracy_train', acc, global_step=global_step)
            print('In-batch accuracy:', acc)

            acc = get_accuracy(model, val_dataloader)
            print('Validation accuracy:', acc)
            writer.add_scalar('accuracy_val', acc, global_step=global_step)
            print()

        # Test

        acc = get_accuracy(model, test_dataloader)
        print('Final test accuracy:', acc)
        writer.add_scalar('accuracy_test_final', acc, global_step=global_step)
        print()
    finally:
        # Flush and release the event file even if training is interrupted.
        writer.close()

    # get_accuracy leaves the model in train mode; hand it back in EVAL mode.
    model.eval()
    return model

In [19]:
# Experiment: noise_level=0.1 with init.xavier_normal initialisation. The returned
# model is the cell's value, hence the CharCNN repr after the training log.
run_model_with(noise_level=0.1, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6915237903594971
In-batch accuracy: 0.5
Validation accuracy: 0.4988

Loss after epoch 1:
Global step: 1408
0.7383397817611694
In-batch accuracy: 0.5
Validation accuracy: 0.5424

Loss after epoch 2:
Global step: 2112
0.6445028781890869
In-batch accuracy: 0.75
Validation accuracy: 0.576

Loss after epoch 3:
Global step: 2816
1.007167100906372
In-batch accuracy: 0.0
Validation accuracy: 0.6452

Loss after epoch 4:
Global step: 3520
0.2538266181945801
In-batch accuracy: 1.0
Validation accuracy: 0.6412

Loss after epoch 5:
Global step: 4224
0.44906681776046753
In-batch accuracy: 0.75
Validation accuracy: 0.7124

Loss after epoch 6:
Global step: 4928
0.46940934658050537
In-batch accuracy: 0.75
Validation accuracy: 0.728

Loss after epoch 7:
Global step: 5632
0.6819859743118286
In-batch accuracy: 0.5
Validation accuracy: 0.7132

Loss after epoch 8:
Global step: 6336
1.3677139282226562
In-batch accuracy: 0.5
Validation accuracy: 0.7284

Loss after epoch 9

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv6): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=8704, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc2): Se

In [26]:
# Experiment: same setup with noise_level=0.15.
run_model_with(noise_level=0.15, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6712560653686523
In-batch accuracy: 0.75
Validation accuracy: 0.5184

Loss after epoch 1:
Global step: 1408
0.6933161020278931
In-batch accuracy: 0.5
Validation accuracy: 0.5148

Loss after epoch 2:
Global step: 2112
0.6862149238586426
In-batch accuracy: 0.75
Validation accuracy: 0.514

Loss after epoch 3:
Global step: 2816
0.7016793489456177
In-batch accuracy: 0.5
Validation accuracy: 0.5704

Loss after epoch 4:
Global step: 3520
0.9100275039672852
In-batch accuracy: 0.25
Validation accuracy: 0.6268

Loss after epoch 5:
Global step: 4224
0.7255017161369324
In-batch accuracy: 0.5
Validation accuracy: 0.608

Loss after epoch 6:
Global step: 4928
0.43671470880508423
In-batch accuracy: 1.0
Validation accuracy: 0.6308

Loss after epoch 7:
Global step: 5632
0.7293297648429871
In-batch accuracy: 0.5
Validation accuracy: 0.6644

Loss after epoch 8:
Global step: 6336
0.48420223593711853
In-batch accuracy: 0.75
Validation accuracy: 0.636

Loss after epoch 

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv6): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=8704, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc2): Se

In [28]:
# Experiment: same setup with noise_level=0.2.
run_model_with(noise_level=0.2, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6963770985603333
In-batch accuracy: 0.5
Validation accuracy: 0.4988

Loss after epoch 1:
Global step: 1408
0.6533082723617554
In-batch accuracy: 0.75
Validation accuracy: 0.4952

Loss after epoch 2:
Global step: 2112
0.6950268149375916
In-batch accuracy: 0.25
Validation accuracy: 0.5156

Loss after epoch 3:
Global step: 2816
0.684116780757904
In-batch accuracy: 0.5
Validation accuracy: 0.5128

Loss after epoch 4:
Global step: 3520
0.6747205257415771
In-batch accuracy: 0.75
Validation accuracy: 0.5168

Loss after epoch 5:
Global step: 4224
0.7066308259963989
In-batch accuracy: 0.0
Validation accuracy: 0.4924

Loss after epoch 6:
Global step: 4928
0.7365478873252869
In-batch accuracy: 0.0
Validation accuracy: 0.5036

Loss after epoch 7:
Global step: 5632
0.6840904355049133
In-batch accuracy: 0.5
Validation accuracy: 0.5028

Loss after epoch 8:
Global step: 6336
0.6969524025917053
In-batch accuracy: 0.5
Validation accuracy: 0.5076

Loss after epoch 9

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv6): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=8704, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc2): Se

In [29]:
# Experiment: same setup with noise_level=0.175.
run_model_with(noise_level=0.175, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6842697262763977
In-batch accuracy: 0.75
Validation accuracy: 0.5052

Loss after epoch 1:
Global step: 1408
0.6580076217651367
In-batch accuracy: 0.75
Validation accuracy: 0.5148

Loss after epoch 2:
Global step: 2112
0.7000224590301514
In-batch accuracy: 0.5
Validation accuracy: 0.5056

Loss after epoch 3:
Global step: 2816
0.685793936252594
In-batch accuracy: 0.25
Validation accuracy: 0.5172

Loss after epoch 4:
Global step: 3520
0.703365683555603
In-batch accuracy: 0.25
Validation accuracy: 0.504

Loss after epoch 5:
Global step: 4224
0.7062662839889526
In-batch accuracy: 0.5
Validation accuracy: 0.504

Loss after epoch 6:
Global step: 4928
0.7318582534790039
In-batch accuracy: 0.25
Validation accuracy: 0.54

Loss after epoch 7:
Global step: 5632
0.6022824048995972
In-batch accuracy: 0.75
Validation accuracy: 0.6156

Loss after epoch 8:
Global step: 6336
0.6868293881416321
In-batch accuracy: 0.5
Validation accuracy: 0.596

Loss after epoch 9:
G

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv6): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=8704, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc2): Se

In [31]:
# Experiment: same setup with noise_level=0.16.
run_model_with(noise_level=0.16, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6821818351745605
In-batch accuracy: 0.75
Validation accuracy: 0.5096

Loss after epoch 1:
Global step: 1408
0.6878247261047363
In-batch accuracy: 0.75
Validation accuracy: 0.5044

Loss after epoch 2:
Global step: 2112
0.6898511052131653
In-batch accuracy: 0.75
Validation accuracy: 0.5056

Loss after epoch 3:
Global step: 2816
0.6872284412384033
In-batch accuracy: 0.75
Validation accuracy: 0.5164

Loss after epoch 4:
Global step: 3520
0.6889880299568176
In-batch accuracy: 0.25
Validation accuracy: 0.514

Loss after epoch 5:
Global step: 4224
0.6952470541000366
In-batch accuracy: 0.5
Validation accuracy: 0.516

Loss after epoch 6:
Global step: 4928
0.695263147354126
In-batch accuracy: 0.75
Validation accuracy: 0.6144

Loss after epoch 7:
Global step: 5632
0.3912643790245056
In-batch accuracy: 1.0
Validation accuracy: 0.5828

Loss after epoch 8:
Global step: 6336
0.832965075969696
In-batch accuracy: 0.5
Validation accuracy: 0.63

Loss after epoch 9:


CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv6): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=8704, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc2): Se