In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable

from random import random, choice

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score

import torchtext
from collections import Counter

In [None]:
MAXLEN = 1024

BATCH_SIZE = 32
TEST_SIZE = 100

NOISE_LEVEL = 0.1

In [2]:
text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

# train, test = torchtext.datasets.IMDB.splits(text_field, label_field)
# c = Counter(''.join([' '.join(t.text_field) for t in train]))
# ALPHABET = [char[0] for char in c.most_common(62)]  # all other chars used less ~ 100 times in a test
# ALPHABET.append('UNK')
# ALPHABET.append('PAD')
ALPHABET = [' ', 'e', 't', 'a', 'i', 'o', 's', 'n', 'r', 'h', 'l', 'd', 'c', 'm', 'u', 'f', 'g', 'y', 'b', 'w', 'p',\
            '.', 'v', ',', 'k', "'", '/', '>', '<', '-', '"', 'j', 'x', ')', '(', '!', 'z', 'q', '0', '1', '?', ':',\
            '9', '2', '*', ';', '3', '5', '8', '4', '7', '&', '6', 'é', '\x96', '`', '$', '\x85', '_', '%', '=', '#',\
            'UNK', 'PAD']

ALPHABET_LEN = len(ALPHABET)

char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

In [3]:
def one_hot(char):
    zeros = np.zeros(ALPHABET_LEN)
    if char in char2int:
        zeros[char2int[char]] = 1.
    else:
        zeros[char2int['UNK']] = 1.

def preprocess_text_nobatch(text, maxlen=MAXLEN):
    one_hotted_text = np.zeros((maxlen, ALPHABET_LEN))
    for i, char in enumerate(text):
        if i >= MAXLEN:
            break
        one_hotted_text[i, char2int.get(char, char2int['UNK'])] = 1.
    if i < MAXLEN:
        for j in range(i+1, MAXLEN):
            one_hotted_text[j, char2int['PAD']] = 1.

    return torch.FloatTensor(one_hotted_text)

def onehot2text(one_hotted_text, batch_size=None):
    if batch_size is None:
        text = ''
        _, idx = torch.max(one_hotted_text, 1)
        for i in idx:
            symb = ALPHABET[i]
            if symb == 'PAD':
                break
            else:
                text += symb
        return text
    else:
        texts = []
        for text in one_hotted_text:
            texts.append(onehot2text(one_hotted_text, batch_size=None))
        return texts


def noise_generator(string, noise_level, chars=ALPHABET+['']):
    noised = ""
    for c in string:
        if random() > noise_level:
            noised += c
        if random() < noise_level:
            noised += choice(chars)
    return noised

class CharIMDB(torchtext.datasets.imdb.IMDB):
    noise_level = 0

    def __getitem__(self, idx):
        item = super(CharIMDB, self).__getitem__(idx)
        text = item.text
        text = noise_generator(text, self.noise_level)
        label = int(item.label == 'pos')
        return preprocess_text_nobatch(text), label

CharIMDB.noise_level = NOISE_LEVEL
train, test = CharIMDB.splits(text_field, label_field)

dataloader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Model

In [4]:
class CharCNN(nn.Module):
    
    def __init__(self, dropout=0.5):  #, hidden_dim=256, kernel_size=16):
        super(CharCNN, self).__init__()
        
        self.dropout = dropout
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(ALPHABET_LEN, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=7, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )               
        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()    
        )
        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=3, stride=3)
        )   

        self.fc1 = nn.Sequential(
            nn.Linear(8704, 1024),  # MAXLEN = 1024
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )
        
        self.fc2 = nn.Sequential(
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.Dropout(p=dropout)
        )

        self.fc3 = nn.Linear(1024, 2)

    
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)

        # collapse
        x = x.view(x.size(0), -1)
        # linear layer
        x = self.fc1(x)
        # linear layer
        x = self.fc2(x)
        # linear layer
        x = self.fc3(x)
        return x
    
#     def describe(self):
#         return '_char_cnn_%s_%s' % (self.hidden_dim, self.kernel_size)

In [10]:
NOISE_LEVEL = 0
CharIMDB.noise_level = NOISE_LEVEL

model = CharCNN()
if CUDA:
    model.cuda()
model.train()

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv1d(256, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv4): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv5): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
  )
  (conv6): Sequential(
    (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=8704, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc2): Se

# Training

In [11]:
writer = SummaryWriter(comment='_charCNN_BIG_lr4_noise%s' % NOISE_LEVEL)

In [12]:
optimizer = optim.Adam(params=model.parameters(), lr=10**-4)
optimizer.zero_grad()

In [13]:
global_step = 0

In [14]:
%%time

N_EPOCHS = 30

loss_f = F.cross_entropy

for epoch in range(N_EPOCHS):
#     if epoch == 10:
#         optimizer = optim.Adam(params=model.parameters(), lr=10**-5)

    for batch_idx, (text, label) in enumerate(dataloader):

        if CUDA:
            text = Variable(text.cuda())
            label = Variable(torch.LongTensor(label).cuda())
        else:
            text = Variable(text)
            label = Variable(torch.LongTensor(label))

        text = text.permute(0, 2, 1)  # (1, 0, 2) for RNN
        prediction = model(text)

        loss = loss_f(prediction, label)

        writer.add_scalar('loss', loss.data[0], global_step=global_step)

        loss.backward()        
        torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
        optimizer.step()
        
        if CUDA:
            torch.cuda.synchronize()
        global_step += 1

    # evaluation
    print('Loss after epoch %s:' % epoch)
    print(loss.data[0])
        
    _, idx = torch.max(prediction, 1)
    acc = accuracy_score(label.data.tolist(), idx.data.tolist())
    writer.add_scalar('accuracy_train', acc, global_step=global_step)
    print('In-batch accuracy:', acc)
    
    model.eval()

    predictions = []
    lables = []

    for i in np.random.choice(range(25000), TEST_SIZE, replace=False):
        _test = test[i]
    #     test_texts.append(_test[0])
    #     test_labels.append(_test[1])
        _text = _test[0].unsqueeze(0).permute(0, 2, 1)
        _text = Variable(_text.cuda()) if CUDA else Variable(_text)
        lables.append(_test[1])
        pred = model(_text)
        _, idx = torch.max(pred, 1)
        predictions.append(idx.data[0])

    acc = accuracy_score(lables, predictions)
    print('Test accuracy:', acc)
    writer.add_scalar('accuracy_test', acc, global_step=global_step)
    print()
    model.train()


Loss after epoch 0:
0.6992976665496826
In-batch accuracy: 0.375
Test accuracy: 0.45

Loss after epoch 1:
0.7079662084579468
In-batch accuracy: 0.5
Test accuracy: 0.68

Loss after epoch 2:
0.7390520572662354
In-batch accuracy: 0.75
Test accuracy: 0.77

Loss after epoch 3:
0.20012804865837097
In-batch accuracy: 1.0
Test accuracy: 0.78

Loss after epoch 4:
0.228943333029747
In-batch accuracy: 1.0
Test accuracy: 0.87

Loss after epoch 5:
0.17180489003658295
In-batch accuracy: 0.875
Test accuracy: 0.84

Loss after epoch 6:
0.20836986601352692
In-batch accuracy: 0.875
Test accuracy: 0.86

Loss after epoch 7:
0.2620856463909149
In-batch accuracy: 0.875
Test accuracy: 0.83

Loss after epoch 8:
0.09418275207281113
In-batch accuracy: 1.0
Test accuracy: 0.84

Loss after epoch 9:
0.020548567175865173
In-batch accuracy: 1.0
Test accuracy: 0.86

Loss after epoch 10:
0.08662045001983643
In-batch accuracy: 1.0
Test accuracy: 0.76

Loss after epoch 11:
0.41919851303100586
In-batch accuracy: 0.875
Test 

In [18]:
def run_model_with(noise_level, lr=1e-4, dropout=0.5, epochs=30):
    CharIMDB.noise_level = noise_level

    model = CharCNN()
    if CUDA:
        model.cuda()
    model.train()
    
    writer = SummaryWriter(comment='_charCNN_BIG_lr%s_noise%s_dropout%s' % (
        int(-np.log10(lr)), noise_level, dropout
    ))
    
    optimizer = optim.Adam(params=model.parameters(), lr=lr)
    optimizer.zero_grad()
    
    global_step = 0

    loss_f = F.cross_entropy

    for epoch in range(epochs):
    #     if epoch == 10:
    #         optimizer = optim.Adam(params=model.parameters(), lr=10**-5)

        for batch_idx, (text, label) in enumerate(dataloader):

            if CUDA:
                text = Variable(text.cuda())
                label = Variable(torch.LongTensor(label).cuda())
            else:
                text = Variable(text)
                label = Variable(torch.LongTensor(label))

            text = text.permute(0, 2, 1)  # (1, 0, 2) for RNN
            prediction = model(text)

            loss = loss_f(prediction, label)

            writer.add_scalar('loss', loss.data[0], global_step=global_step)

            loss.backward()        
            torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
            optimizer.step()

            if CUDA:
                torch.cuda.synchronize()
            global_step += 1

        # evaluation
        print('Loss after epoch %s:' % epoch)
        print(loss.data[0])

        _, idx = torch.max(prediction, 1)
        acc = accuracy_score(label.data.tolist(), idx.data.tolist())
        writer.add_scalar('accuracy_train', acc, global_step=global_step)
        print('In-batch accuracy:', acc)

        model.eval()

        predictions = []
        lables = []

        for i in np.random.choice(range(25000), TEST_SIZE, replace=False):
            _test = test[i]
        #     test_texts.append(_test[0])
        #     test_labels.append(_test[1])
            _text = _test[0].unsqueeze(0).permute(0, 2, 1)
            _text = Variable(_text.cuda()) if CUDA else Variable(_text)
            lables.append(_test[1])
            pred = model(_text)
            _, idx = torch.max(pred, 1)
            predictions.append(idx.data[0])

        acc = accuracy_score(lables, predictions)
        print('Test accuracy:', acc)
        writer.add_scalar('accuracy_test', acc, global_step=global_step)
        print()
        model.train()

    model.eval()

    for i in range(len(test)):
        _test = test[i]
        _text = _test[0].unsqueeze(0).permute(0, 2, 1)
        _text = Variable(_text.cuda()) if CUDA else Variable(_text)
        lables.append(_test[1])
        pred = model(_text)
        _, idx = torch.max(pred, 1)
        predictions.append(idx.data[0])

    acc = accuracy_score(lables, predictions)
    print('Final test accuracy:', acc)
    writer.add_scalar('accuracy_test', acc, global_step=global_step)
    print()

    # model is in EVAL mode!
    return model

In [20]:
%%time

model = run_model_with(noise_level=0.02)

Loss after epoch 0:
0.693831205368042
In-batch accuracy: 0.5
Test accuracy: 0.45

Loss after epoch 1:
0.693220853805542
In-batch accuracy: 0.5
Test accuracy: 0.52

Loss after epoch 2:
0.694312334060669
In-batch accuracy: 0.5
Test accuracy: 0.47

Loss after epoch 3:
0.6909643411636353
In-batch accuracy: 0.625
Test accuracy: 0.57

Loss after epoch 4:
0.6980196833610535
In-batch accuracy: 0.25
Test accuracy: 0.56

Loss after epoch 5:
0.6982015371322632
In-batch accuracy: 0.25
Test accuracy: 0.59

Loss after epoch 6:
0.6916560530662537
In-batch accuracy: 0.625
Test accuracy: 0.49

Loss after epoch 7:
0.6996750831604004
In-batch accuracy: 0.375
Test accuracy: 0.46

Loss after epoch 8:
0.6863850951194763
In-batch accuracy: 0.5
Test accuracy: 0.54

Loss after epoch 9:
0.6915104389190674
In-batch accuracy: 0.5
Test accuracy: 0.48

Loss after epoch 10:
0.6835496425628662
In-batch accuracy: 0.75
Test accuracy: 0.47

Loss after epoch 11:
0.6930607557296753
In-batch accuracy: 0.5
Test accuracy: 0.

In [23]:
%%time

model = run_model_with(noise_level=0.015)

Loss after epoch 0:
0.6855185627937317
In-batch accuracy: 0.75
Test accuracy: 0.47

Loss after epoch 1:
0.6913802623748779
In-batch accuracy: 0.625
Test accuracy: 0.49

Loss after epoch 2:
0.6922506093978882
In-batch accuracy: 0.375
Test accuracy: 0.43

Loss after epoch 3:
0.6922696828842163
In-batch accuracy: 0.5
Test accuracy: 0.48

Loss after epoch 4:
0.6893981695175171
In-batch accuracy: 0.625
Test accuracy: 0.51

Loss after epoch 5:
0.697161078453064
In-batch accuracy: 0.375
Test accuracy: 0.43

Loss after epoch 6:
0.7012726068496704
In-batch accuracy: 0.125
Test accuracy: 0.53

Loss after epoch 7:
0.6930920481681824
In-batch accuracy: 0.625
Test accuracy: 0.43

Loss after epoch 8:
0.6983960866928101
In-batch accuracy: 0.5
Test accuracy: 0.48

Loss after epoch 9:
0.6918072700500488
In-batch accuracy: 0.625
Test accuracy: 0.48

Loss after epoch 10:
0.6950687170028687
In-batch accuracy: 0.625
Test accuracy: 0.5

Loss after epoch 11:
0.6959728002548218
In-batch accuracy: 0.5
Test acc

In [24]:
%%time

model = run_model_with(noise_level=0.011)

Loss after epoch 0:
0.6988425254821777
In-batch accuracy: 0.5
Test accuracy: 0.5

Loss after epoch 1:
0.7124646902084351
In-batch accuracy: 0.5
Test accuracy: 0.51

Loss after epoch 2:
0.5418098568916321
In-batch accuracy: 0.75
Test accuracy: 0.85

Loss after epoch 3:
0.4217878580093384
In-batch accuracy: 0.875
Test accuracy: 0.84

Loss after epoch 4:
0.5336219668388367
In-batch accuracy: 0.75
Test accuracy: 0.7

Loss after epoch 5:
0.31769514083862305
In-batch accuracy: 0.875
Test accuracy: 0.73

Loss after epoch 6:
0.39250969886779785
In-batch accuracy: 0.75
Test accuracy: 0.83

Loss after epoch 7:
1.3161165714263916
In-batch accuracy: 0.625
Test accuracy: 0.76

Loss after epoch 8:
0.7381945848464966
In-batch accuracy: 0.75
Test accuracy: 0.83

Loss after epoch 9:
0.5788499116897583
In-batch accuracy: 0.75
Test accuracy: 0.84

Loss after epoch 10:
0.08647662401199341
In-batch accuracy: 1.0
Test accuracy: 0.77

Loss after epoch 11:
0.47070184350013733
In-batch accuracy: 0.75
Test accu