In [1]:
from time import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.nn import init
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

from random import random, choice

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score, f1_score

import torchtext
from collections import Counter

In [2]:
MAXLEN = 512  # number of characters encoded per text (see preprocess_text_nobatch): longer texts are cut, shorter ones stay zero-padded

BATCH_SIZE = 32  # batch size used for the train/validation loaders and for ad-hoc test loaders in get_metrics
VALID_SIZE = 0.1  # fraction of the training split held out for validation

NOISE_LEVEL = 0.1  # NOTE(review): not referenced by any of the visible cells

NOISE_LEVELS = [0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2]  # test-time noise levels evaluated after each training run

In [3]:
# TODO: use preprocessing for noise?

# Fields handed to CharIMDB.splits. CharIMDB.__getitem__ one-hot encodes the raw text itself
# (preprocess_text_nobatch), so NOTE(review): fix_length / tensor_type / batch_first set here
# look unused by that path — confirm against the torchtext version in use.
text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

# TODO: remove PAD for noise function
# TODO: use more adequate alphabet
# Character vocabulary: index 0 is the '<UNK>' fallback, index 1 is the newline, then one entry
# per character of the literal below.
# NOTE(review): the literal repeats some characters (e.g. '-' and '’'), and `\|` is not a valid
# escape in a non-raw string, so it contributes both a backslash and a pipe.
ALPHABET = ['<UNK>'] + ['\n'] + [s for s in """ abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}"""]

# Width of every one-hot vector and the model's input dimension. Duplicated characters are
# counted too, so a few columns can never be set.
ALPHABET_LEN = len(ALPHABET)

# Character -> column index. For a duplicated character the later position wins.
char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

In [4]:
def one_hot(char):
    """
    One-hot encode a single character over ALPHABET.

    :param char: character to encode; anything missing from ``char2int`` is
        mapped to the '<UNK>' column
    :return: numpy array of length ALPHABET_LEN with exactly one entry set to 1.
    """
    encoded = np.zeros(ALPHABET_LEN)
    # The fallback key is '<UNK>' (as declared in ALPHABET), not 'UNK'.
    encoded[char2int.get(char, char2int['<UNK>'])] = 1.
    return encoded

def preprocess_text_nobatch(text, maxlen=MAXLEN):
    """
    One-hot encode a single text into a fixed-size tensor.

    :param text: string to encode; characters beyond ``maxlen`` are dropped
    :param maxlen: number of rows of the result; texts shorter than this leave
        the remaining rows all-zero (padding)
    :return: torch.FloatTensor of shape (maxlen, ALPHABET_LEN)
    """
    one_hotted_text = np.zeros((maxlen, ALPHABET_LEN))
    unk_index = char2int['<UNK>']
    # Truncate on the `maxlen` argument (not the global MAXLEN) so the loop can
    # never index past the array that was just allocated.
    for i, char in enumerate(text[:maxlen]):
        one_hotted_text[i, char2int.get(char, unk_index)] = 1.

    return torch.FloatTensor(one_hotted_text)

def onehot2text(one_hotted_text, batch_size=None, show_pad=False):
    """
    Decode one-hot encoded text back into a string.

    :param one_hotted_text: tensor of shape (length, ALPHABET_LEN) when
        ``batch_size`` is None, otherwise an iterable of such tensors
    :param batch_size: None to decode a single text; any other value switches
        to batch mode (the value itself is not used)
    :param show_pad: if True, all-zero rows are rendered as '<PAD>' instead of
        being skipped
    :return: a string in single mode, a list of strings in batch mode
    """
    if batch_size is not None:
        # Decode every item of the batch on its own and keep the caller's
        # padding preference.
        return [
            onehot2text(single_text, batch_size=None, show_pad=show_pad)
            for single_text in one_hotted_text
        ]

    pad_symbol = '<PAD>' if show_pad else ''
    symbols = []
    max_values, idx = torch.max(one_hotted_text, 1)
    for position, char_index in enumerate(idx):
        if max_values[position] == 0:
            # An all-zero row means padding (no character was encoded there).
            symbols.append(pad_symbol)
        else:
            symbols.append(ALPHABET[char_index])
    return ''.join(symbols)

def noise_generator(string, noise_level, chars=ALPHABET+['']):
    """
    Corrupt ``string`` with random character deletions and insertions.

    For every character two independent draws are made: the character is kept
    when the first draw exceeds ``noise_level``, and a random element of
    ``chars`` is appended when the second draw is below ``noise_level``.

    :param string: text to corrupt
    :param noise_level: threshold compared against ``random()`` for both draws
    :param chars: pool of strings an inserted element is chosen from
    :return: the corrupted text
    """
    pieces = []
    for symbol in string:
        keep_symbol = random() > noise_level
        if keep_symbol:
            pieces.append(symbol)
        insert_extra = random() < noise_level
        if insert_extra:
            pieces.append(choice(chars))
    return "".join(pieces)

class CharIMDB(torchtext.datasets.imdb.IMDB):
    """IMDB dataset whose items are noised, one-hot encoded character tensors."""

    # Noise level passed to noise_generator; set on the class to affect all splits.
    noise_level = 0

    def __getitem__(self, idx):
        """Return ``(one_hot_text, label)`` where label is 1 for 'pos', else 0."""
        example = super().__getitem__(idx)
        # Original author's note: "this is bad" — the noise is re-drawn on every access.
        noisy_text = noise_generator(example.text, self.noise_level)
        is_positive = example.label == 'pos'
        return preprocess_text_nobatch(noisy_text), int(is_positive)

def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):
    """
    Split ``dataset`` into train and validation loaders.

    :param dataset: any sized, indexable dataset accepted by DataLoader
    :param valid_size: fraction of the dataset (0..1) used for validation
    :param batch_size: batch size of both loaders
    :param random_seed: seed for the index shuffle (re-seeds numpy's global RNG)
    :param shuffle: whether to shuffle the indices before splitting
    :param num_workers: number of worker processes for both loaders
    :return: tuple ``(train_loader, valid_loader)``
    :raises ValueError: if ``valid_size`` is outside [0, 1]
    """
    if not 0 <= valid_size <= 1:
        raise ValueError('valid_size must be in [0, 1], got %s' % valid_size)

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)

    # Split on an explicit position: slicing with `-val_actual_size` breaks when
    # the validation part is empty (`[:-0]` is an empty list).
    split_at = len_dataset - val_actual_size
    train_idx, valid_idx = indices[:split_at], indices[split_at:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers
    )

    return train_loader, valid_loader


def get_metrics(model, test_data, noise_level=None):
    """
    Evaluate ``model`` on ``test_data`` and return accuracy and F1.

    :param model: network taking input of shape (seq_len, batch, ALPHABET_LEN)
    :param test_data: dataset or dataloader
    :param noise_level: if given, the underlying dataset's ``noise_level`` is
        overridden for the duration of the evaluation and restored afterwards
    :return: dict with keys 'accuracy' and 'f1'

    Model will be in TRAIN mode after that
    """
    if isinstance(test_data, torch.utils.data.DataLoader):
        test_dataloader = test_data
        dataset = test_data.dataset
    else:
        test_dataloader = torch.utils.data.DataLoader(
            test_data, batch_size=BATCH_SIZE
        )
        dataset = test_data

    # The noise level lives on the dataset (see CharIMDB), not on the loader.
    # Remember whether the instance already had its own value so the override
    # does not permanently shadow the class-level attribute.
    override_noise = noise_level is not None
    if override_noise:
        had_own_noise = 'noise_level' in vars(dataset)
        previous_noise = vars(dataset).get('noise_level')
        dataset.noise_level = noise_level

    predictions = []
    labels = []

    model.eval()
    try:
        for text, label in test_dataloader:
            if CUDA:
                text = Variable(text.cuda())
            else:
                text = Variable(text)

            text = text.permute(1, 0, 2)  # (1, 0, 2) for RNN
            prediction = model(text)

            _, idx = torch.max(prediction, 1)
            predictions += idx.data.tolist()
            labels += label.tolist()
    finally:
        if override_noise:
            if had_own_noise:
                dataset.noise_level = previous_noise
            else:
                del dataset.noise_level
        model.train()

    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1}


In [5]:
# Start from noise-free data; run_model_with overwrites this class attribute per experiment.
CharIMDB.noise_level = 0
# Train/test datasets built through the IMDB `splits` constructor with the fields defined above.
train, test = CharIMDB.splits(text_field, label_field)

# Both loaders draw from `train`: VALID_SIZE of its indices go to validation, the rest to training.
dataloader, val_dataloader = get_train_valid_loader(
    train, valid_size=VALID_SIZE, batch_size=BATCH_SIZE
)

In [6]:
onehot2text(train[0][0])  # no spaces is onehot2text problem, not a data one

"this musical is decidedly mixed, and none of the elements really fit together, but it somehow manages to be mostly enjoyable. the plot contains some of the elements of wodehouse's novel, but none of its virtues, though he co-wrote the script. the songs, though charming, have nothing to do with this particular film, and are unusually crudely squeezed into the plot, even by pre-oklahoma standards. burns and allen do their usual shtick quite competently, but it misses the tone of the rest of the film by about "

# Model

In [6]:
class CharCNN(nn.Module):
    """
    Character-level CNN classifier with two output classes.

    Pipeline: linear "embedding" of the one-hot characters -> Conv1d -> ReLU
    -> MaxPool1d -> flatten -> linear layer producing 2 logits.
    """

    def __init__(self, init_function, dropout=0.5):
        """
        :param init_function: initializer applied to the convolution weight;
            its return value is assigned back as the weight
        :param dropout: stored on the instance only. NOTE: no Dropout layer is
            built from it, so it currently has no effect on the network.
        :raises ValueError: if MAXLEN is too short for the conv + pooling stack
        """
        super(CharCNN, self).__init__()
        self.init_function = init_function
        self.dropout = dropout
        self.n_filters = 256
        self.cnn_kernel_size = 15
        self.cnn_stride = 2
        self.pool_kernel_size = 64
        self.pool_stride = 32

        self.embedding = nn.Linear(ALPHABET_LEN, ALPHABET_LEN)
        self.conv = nn.Sequential(
            nn.Conv1d(ALPHABET_LEN, self.n_filters, kernel_size=self.cnn_kernel_size, stride=self.cnn_stride),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=self.pool_kernel_size, stride=self.pool_stride)
        )
        self.conv[0].weight = init_function(self.conv[0].weight)

        # Output lengths of Conv1d and MaxPool1d (no padding, dilation 1):
        # floor((L_in - kernel) / stride) + 1, in integer arithmetic at each stage.
        conv_len = (MAXLEN - self.cnn_kernel_size) // self.cnn_stride + 1
        pool_len = (conv_len - self.pool_kernel_size) // self.pool_stride + 1
        if pool_len < 1:
            raise ValueError(
                'MAXLEN=%s is too short for the conv/pool configuration' % MAXLEN
            )
        conv_dim = self.n_filters * pool_len
        self.fc = nn.Linear(conv_dim, 2)  # conv_dim is 1536 for MAXLEN=512

    def forward(self, x):
        """
        :param x: tensor of shape (seq_len, batch_size, signal_dim)
        :return: tensor of shape (batch_size, 2) with class logits
        """
        x = self.embedding(x)  # applied to the last (signal) dimension
        x = x.permute(1, 2, 0)  # -> (batch_size, signal_dim, seq_len) for Conv1d
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


In [7]:
def model_params_num(model):
    """Return the total number of scalar entries over all parameters of ``model``."""
    total = 0
    for param in model.parameters():
        # Product of the parameter's dimensions = number of elements it holds.
        total = total + np.prod(list(param.size()))
    return total

def mk_dataline(model_type, epochs, lr, noise_level_train, noise_level_test, acc_train, acc_test,
                f1_train, f1_test, dropout, model, init_function=None):
    """
    Build one result record (a flat dict) describing a finished experiment.

    :param model_type: short model name, e.g. 'charCNN'
    :param epochs: number of training epochs
    :param lr: learning rate
    :param noise_level_train: noise level used during training
    :param noise_level_test: noise level used for this test evaluation
    :param acc_train: accuracy on the training data
    :param acc_test: accuracy on the test data
    :param f1_train: F1 on the training data
    :param f1_test: F1 on the test data
    :param dropout: dropout value the model was created with
    :param model: the trained model; its parameter count and ``str()`` are recorded
    :param init_function: weight initializer used for the model
    :return: dict suitable as one row of the results table
    """
    return {
        'task': 'IMDB binary classification',
        'model_type': model_type,
        'trainable_params': model_params_num(model), 'dropout': dropout, 'init_function': init_function,
        'epochs': epochs, 'lr': lr,
        'noise_level_train': noise_level_train, 'noise_level_test': noise_level_test,
        'acc_train': acc_train, 'acc_test': acc_test,
        'f1_train': f1_train, 'f1_test': f1_test,
        'model_desc': str(model),
        # Derived from the constant so the record stays correct when MAXLEN changes.
        'data_desc': 'Maxlen %s' % MAXLEN
    }

In [8]:
results = []

In [9]:
def run_model_with(noise_level, init_function, lr=1e-4, dropout=0.5, epochs=30, _model=None):
    """
    Train a CharCNN at a given noise level and evaluate it on all NOISE_LEVELS.

    Uses the module-level ``dataloader``, ``val_dataloader`` and ``test``, logs
    to TensorBoard and appends one record per test noise level to ``results``.

    :param noise_level: noise level applied to the data during training
    :param init_function: initializer for the convolution weights
    :param lr: Adam learning rate
    :param dropout: dropout value forwarded to CharCNN
    :param epochs: number of passes over ``dataloader``
    :param _model: existing model to continue training; a new CharCNN is
        created when None
    :return: the trained model, left in EVAL mode
    """
    start_time = time()
    CharIMDB.noise_level = noise_level

    if _model is None:
        model = CharCNN(
            init_function=init_function, dropout=dropout
        )
        if CUDA:
            model.cuda()
    else:
        model = _model
    # A model returned by an earlier call is in eval mode, so always switch back.
    model.train()

    model_name = '_charCNN_embed_smaller2_lr%s_noise%s_dropout%s' % (
        int(-np.log10(lr)), noise_level, dropout
    )

    writer = SummaryWriter(comment=model_name)
    print('Writer: %s' % list(writer.all_writers.keys()))

    try:
        optimizer = optim.Adam(params=model.parameters(), lr=lr)
        optimizer.zero_grad()

        global_step = 0

        loss_f = F.cross_entropy

        for epoch in range(epochs):

            for batch_idx, (text, label) in enumerate(dataloader):
                optimizer.zero_grad()

                if CUDA:
                    text = Variable(text.cuda())
                    label = Variable(torch.LongTensor(label).cuda())
                else:
                    text = Variable(text)
                    label = Variable(torch.LongTensor(label))

                text = text.permute(1, 0, 2)  # (1, 0, 2) for RNN
                prediction = model(text)

                loss = loss_f(prediction, label)

                writer.add_scalar('loss', loss.data[0], global_step=global_step)

                loss.backward()
                torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
                optimizer.step()

                if CUDA:
                    torch.cuda.synchronize()
                global_step += 1

            # evaluation (console output only every 10th epoch)
            verbose = epoch % 10 == 0
            if verbose:
                print('Epoch %s. Global step %s. T=%s min' % (epoch, global_step, (time() - start_time) / 60.))
                print('Loss               : %s' % loss.data[0])

            # in-batch: metrics of the last batch of the epoch
            _, idx = torch.max(prediction, 1)
            _labels = label.data.tolist()
            _predictions = idx.data.tolist()
            acc = accuracy_score(_labels, _predictions)
            f1 = f1_score(_labels, _predictions)
            writer.add_scalar('accuracy_train', acc, global_step=global_step)
            writer.add_scalar('f1_train', f1, global_step=global_step)
            if verbose:
                print('In-batch accuracy  :', acc)

            # validation
            metrics = get_metrics(model, val_dataloader)
            if verbose:
                print('Validation accuracy: %s, f1: %s' % (metrics['accuracy'], metrics['f1']))
                print()

            writer.add_scalar('accuracy_val', metrics['accuracy'], global_step=global_step)
            writer.add_scalar('f1_val', metrics['f1'], global_step=global_step)

        # Test
        metrics_test = None

        print('Calculating train and test metrics... Time %s min' % ((time() - start_time) / 60.))
        metrics_train = get_metrics(model, dataloader)
        acc_train = metrics_train['accuracy']
        f1_train = metrics_train['f1']

        for test_noise in NOISE_LEVELS:
            # The datasets read the noise level from the class attribute, so
            # switch it for each evaluation (restored in the `finally` below).
            CharIMDB.noise_level = test_noise
            metrics = get_metrics(model, test)
            if np.isclose(test_noise, noise_level):
                metrics_test = metrics

            acc_test = metrics['accuracy']
            f1_test = metrics['f1']
            results.append(mk_dataline(
                model_type='charCNN', epochs=epochs, lr=lr,
                noise_level_train=noise_level, acc_train=acc_train, f1_train=f1_train,
                noise_level_test=test_noise, acc_test=acc_test, f1_test=f1_test,
                dropout=dropout, model=model,
                init_function=init_function
            ))

        # metrics_test stays None when the training noise is not one of NOISE_LEVELS.
        print('Final test metrics: %s, Time %s min' % (metrics_test, ((time() - start_time) / 60.)))
        if metrics_test is not None:
            writer.add_scalar('accuracy_test_final', metrics_test['accuracy'], global_step=global_step)
            writer.add_scalar('f1_test_final', metrics_test['f1'], global_step=global_step)
        print()
    finally:
        CharIMDB.noise_level = noise_level
        # Flush and release the event file even if training was interrupted.
        writer.close()

    model.eval()
    # model is in EVAL mode!
    return model

In [10]:
from tqdm import tqdm

# Main experiment

In [11]:
results = []

In [15]:
# searching for phase transition

for noise_level in tqdm(np.arange(0, .01, 0.001)):
    run_model_with(noise_level=noise_level, init_function=init.xavier_normal)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0. Global step 704. T=0.1436752398808797 min
Loss               : 0.6803672313690186
In-batch accuracy  : 0.5


  'precision', 'predicted', average, warn_for)


Validation accuracy: 0.512, f1: 0.061538461538461535

Epoch 10. Global step 7744. T=1.1542007724444072 min
Loss               : 0.23422397673130035
In-batch accuracy  : 1.0
Validation accuracy: 0.7436, f1: 0.75599543205177



  'recall', 'true', average, warn_for)


Epoch 20. Global step 14784. T=2.1592104117075603 min
Loss               : 0.013622313737869263
In-batch accuracy  : 1.0
Validation accuracy: 0.7332, f1: 0.7106290672451193

Calculating validation metrics... Time 3.076084923744202 min


 10%|█         | 1/10 [05:15<47:17, 315.31s/it]

Final test metrics: {'f1': 0.7376722695769934, 'accuracy': 0.73656}, Time 5.255144357681274 min

Epoch 0. Global step 704. T=0.14133394956588746 min
Loss               : 0.7314321994781494
In-batch accuracy  : 0.25
Validation accuracy: 0.508, f1: 0.6693548387096774

Epoch 10. Global step 7744. T=1.3562750816345215 min
Loss               : 0.22567851841449738
In-batch accuracy  : 1.0
Validation accuracy: 0.7432, f1: 0.7092391304347827

Epoch 20. Global step 14784. T=2.437349478403727 min
Loss               : 0.10743339359760284
In-batch accuracy  : 1.0
Validation accuracy: 0.7676, f1: 0.762953896368829

Calculating validation metrics... Time 3.448778168360392 min


 20%|██        | 2/10 [10:48<43:12, 324.09s/it]

Final test metrics: None, Time 5.54760464032491 min

Epoch 0. Global step 704. T=0.0907844622929891 min
Loss               : 0.6932111978530884
In-batch accuracy  : 0.5
Validation accuracy: 0.5532, f1: 0.6781907231345433

Epoch 10. Global step 7744. T=1.1276414434115092 min
Loss               : 0.4192439913749695
In-batch accuracy  : 0.75
Validation accuracy: 0.7496, f1: 0.7517842981760507

Epoch 20. Global step 14784. T=2.176605765024821 min
Loss               : 0.24921046197414398
In-batch accuracy  : 1.0
Validation accuracy: 0.7652, f1: 0.7583367641004527

Calculating validation metrics... Time 3.1391729354858398 min


 30%|███       | 3/10 [16:01<37:22, 320.41s/it]

Final test metrics: None, Time 5.217387167612712 min

Epoch 0. Global step 704. T=0.09537695248921713 min
Loss               : 0.6874804496765137
In-batch accuracy  : 0.75
Validation accuracy: 0.6132, f1: 0.5883354618986802

Epoch 10. Global step 7744. T=1.1471372763315837 min
Loss               : 0.3565017580986023
In-batch accuracy  : 0.75
Validation accuracy: 0.7444, f1: 0.737792367665162

Epoch 20. Global step 14784. T=2.2060555458068847 min
Loss               : 0.27578625082969666
In-batch accuracy  : 1.0
Validation accuracy: 0.7548, f1: 0.7615713730066123

Calculating validation metrics... Time 3.176018941402435 min


 40%|████      | 4/10 [21:17<31:56, 319.42s/it]

Final test metrics: None, Time 5.27445619503657 min

Epoch 0. Global step 704. T=0.09673238595326741 min
Loss               : 0.6823359131813049
In-batch accuracy  : 0.5
Validation accuracy: 0.5204, f1: 0.12926652142338416

Epoch 10. Global step 7744. T=1.1470781882603964 min
Loss               : 0.06243807077407837
In-batch accuracy  : 1.0
Validation accuracy: 0.7524, f1: 0.7618314736437091

Epoch 20. Global step 14784. T=2.2017578721046447 min
Loss               : 0.2650335729122162
In-batch accuracy  : 1.0
Validation accuracy: 0.7612, f1: 0.7534076827757126

Calculating validation metrics... Time 3.1687230110168456 min


 50%|█████     | 5/10 [26:35<26:35, 319.01s/it]

Final test metrics: None, Time 5.289030599594116 min

Epoch 0. Global step 704. T=0.09391260147094727 min
Loss               : 0.70283043384552
In-batch accuracy  : 0.5
Validation accuracy: 0.5024, f1: 0.0048000000000000004

Epoch 10. Global step 7744. T=1.1467703143755594 min
Loss               : 0.4105784595012665
In-batch accuracy  : 0.75
Validation accuracy: 0.762, f1: 0.7637951568082572

Epoch 20. Global step 14784. T=2.194654281934102 min
Loss               : 0.1121649369597435
In-batch accuracy  : 1.0
Validation accuracy: 0.752, f1: 0.7199638663053297

Calculating validation metrics... Time 3.165289080142975 min


 60%|██████    | 6/10 [31:53<21:15, 318.86s/it]

Final test metrics: None, Time 5.302026172478993 min

Epoch 0. Global step 704. T=0.09194004535675049 min
Loss               : 0.6559637188911438
In-batch accuracy  : 1.0
Validation accuracy: 0.5588, f1: 0.6731851851851852

Epoch 10. Global step 7744. T=1.147641658782959 min
Loss               : 0.27374106645584106
In-batch accuracy  : 1.0
Validation accuracy: 0.7448, f1: 0.763001485884101

Epoch 20. Global step 14784. T=2.209592616558075 min
Loss               : 0.2525128126144409
In-batch accuracy  : 0.75
Validation accuracy: 0.7668, f1: 0.7833519137866963

Calculating validation metrics... Time 3.181969145933787 min


 70%|███████   | 7/10 [37:11<15:56, 318.76s/it]

Final test metrics: None, Time 5.302956835428874 min

Epoch 0. Global step 704. T=0.09604016939798991 min
Loss               : 0.7281623482704163
In-batch accuracy  : 0.25
Validation accuracy: 0.6008, f1: 0.6815571155073389

Epoch 10. Global step 7744. T=1.156672199567159 min
Loss               : 0.5288417339324951
In-batch accuracy  : 0.75
Validation accuracy: 0.7516, f1: 0.7678504672897195

Epoch 20. Global step 14784. T=2.2299110293388367 min
Loss               : 0.15664401650428772
In-batch accuracy  : 1.0
Validation accuracy: 0.766, f1: 0.7692307692307693

Calculating validation metrics... Time 3.201933662096659 min


 80%|████████  | 8/10 [42:31<10:37, 318.90s/it]

Final test metrics: None, Time 5.331070137023926 min

Epoch 0. Global step 704. T=0.09274621407190958 min
Loss               : 0.6521236300468445
In-batch accuracy  : 0.75
Validation accuracy: 0.518, f1: 0.11201179071481208

Epoch 10. Global step 7744. T=1.1543944160143533 min
Loss               : 0.9379855394363403
In-batch accuracy  : 0.0
Validation accuracy: 0.7452, f1: 0.7308829742289819

Epoch 20. Global step 14784. T=2.2232012271881105 min
Loss               : 0.4122418761253357
In-batch accuracy  : 0.75
Validation accuracy: 0.7528, f1: 0.7306015693112466

Calculating validation metrics... Time 3.1914063771565755 min


 90%|█████████ | 9/10 [47:48<05:18, 318.67s/it]

Final test metrics: None, Time 5.280966401100159 min

Epoch 0. Global step 704. T=0.09717000325520833 min
Loss               : 0.7716481685638428
In-batch accuracy  : 0.25
Validation accuracy: 0.4988, f1: 0.6655991459834534

Epoch 10. Global step 7744. T=1.176574718952179 min
Loss               : 0.2566969096660614
In-batch accuracy  : 1.0
Validation accuracy: 0.7124, f1: 0.6463354648303001

Epoch 20. Global step 14784. T=2.2474002043406167 min
Loss               : 0.19343164563179016
In-batch accuracy  : 1.0
Validation accuracy: 0.77, f1: 0.7689835275210929

Calculating validation metrics... Time 3.21843079328537 min


100%|██████████| 10/10 [53:06<00:00, 318.66s/it]

Final test metrics: None, Time 5.309645819664001 min






In [25]:
for noise_level in tqdm(NOISE_LEVELS):
    run_model_with(noise_level=noise_level, init_function=init.xavier_normal)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0. Global step 704. T=0.09757444063822428 min
Loss               : 0.6833441853523254
In-batch accuracy  : 0.5
Validation accuracy: 0.594, f1: 0.6644628099173554



  'recall', 'true', average, warn_for)


Epoch 10. Global step 7744. T=1.0710898200670878 min
Loss               : 0.3576161563396454
In-batch accuracy  : 0.75
Validation accuracy: 0.7436, f1: 0.725010725010725

Epoch 20. Global step 14784. T=2.0572694261868794 min
Loss               : 0.1215888112783432
In-batch accuracy  : 1.0
Validation accuracy: 0.74, f1: 0.7207903780068728

Calculating validation metrics... Time 2.9548836310704547 min


 10%|█         | 1/10 [05:02<45:25, 302.83s/it]

Final test metrics: {'accuracy': 0.73676, 'f1': 0.7368547322963732}, Time 5.047053643067678 min

Epoch 0. Global step 704. T=0.08685567378997802 min
Loss               : 0.6945825815200806
In-batch accuracy  : 0.5
Validation accuracy: 0.5936, f1: 0.6788874841972187

Epoch 10. Global step 7744. T=1.0606398701667785 min
Loss               : 0.9090694189071655
In-batch accuracy  : 0.5


  'precision', 'predicted', average, warn_for)


Validation accuracy: 0.7456, f1: 0.7170818505338079

Epoch 20. Global step 14784. T=2.0351781884829205 min
Loss               : 0.9719800353050232
In-batch accuracy  : 0.75
Validation accuracy: 0.7704, f1: 0.774548311076198

Calculating validation metrics... Time 2.9230235616366067 min


 20%|██        | 2/10 [10:04<40:17, 302.23s/it]

Final test metrics: {'accuracy': 0.7544, 'f1': 0.7788343779266623}, Time 5.027114776770274 min

Epoch 0. Global step 704. T=0.08835642337799073 min
Loss               : 0.6829354763031006
In-batch accuracy  : 0.5
Validation accuracy: 0.5132, f1: 0.6695628563670921

Epoch 10. Global step 7744. T=1.0660610636075338 min
Loss               : 1.0264668464660645
In-batch accuracy  : 0.25
Validation accuracy: 0.7284, f1: 0.7064418504107219

Epoch 20. Global step 14784. T=2.046401325861613 min
Loss               : 0.23428675532341003
In-batch accuracy  : 1.0
Validation accuracy: 0.7544, f1: 0.7688253012048194

Calculating validation metrics... Time 2.931548313299815 min


 30%|███       | 3/10 [15:06<35:15, 302.16s/it]

Final test metrics: {'accuracy': 0.75008, 'f1': 0.7710013194546254}, Time 5.033909809589386 min

Epoch 0. Global step 704. T=0.08631937503814698 min
Loss               : 0.6909242868423462
In-batch accuracy  : 0.5
Validation accuracy: 0.5112, f1: 0.6693722943722944

Epoch 10. Global step 7744. T=1.083217167854309 min
Loss               : 0.3939869999885559
In-batch accuracy  : 0.75
Validation accuracy: 0.7104, f1: 0.7457865168539326

Epoch 20. Global step 14784. T=2.0790613214174907 min
Loss               : 0.3692682981491089
In-batch accuracy  : 0.75
Validation accuracy: 0.7464, f1: 0.7416462917685411

Calculating validation metrics... Time 2.9949778079986573 min


 40%|████      | 4/10 [20:12<30:18, 303.13s/it]

Final test metrics: {'accuracy': 0.74996, 'f1': 0.7519739713526168}, Time 5.100250689188639 min

Epoch 0. Global step 704. T=0.09309595028559367 min
Loss               : 0.7203581929206848
In-batch accuracy  : 0.0
Validation accuracy: 0.5096, f1: 0.6686486486486486

Epoch 10. Global step 7744. T=1.1207369367281597 min
Loss               : 0.7041387557983398
In-batch accuracy  : 0.5
Validation accuracy: 0.6788, f1: 0.6100048567265663

Epoch 20. Global step 14784. T=2.13401597738266 min
Loss               : 0.3560851514339447
In-batch accuracy  : 1.0
Validation accuracy: 0.7144, f1: 0.677797833935018

Calculating validation metrics... Time 3.0663119554519653 min


 50%|█████     | 5/10 [25:22<25:22, 304.55s/it]

Final test metrics: {'accuracy': 0.73496, 'f1': 0.7482714079477243}, Time 5.170687929789225 min

Epoch 0. Global step 704. T=0.0915231982866923 min
Loss               : 0.7024847269058228
In-batch accuracy  : 0.5
Validation accuracy: 0.5128, f1: 0.08558558558558559

Epoch 10. Global step 7744. T=1.1404865423838297 min
Loss               : 0.394155889749527
In-batch accuracy  : 1.0
Validation accuracy: 0.6812, f1: 0.7227826086956521

Epoch 20. Global step 14784. T=2.1909544388453166 min
Loss               : 0.8203105926513672
In-batch accuracy  : 0.25
Validation accuracy: 0.7056, f1: 0.6626947754353805

Calculating validation metrics... Time 3.150494122505188 min


 60%|██████    | 6/10 [30:38<20:25, 306.36s/it]

Final test metrics: {'accuracy': 0.7244, 'f1': 0.73599509540961}, Time 5.25678657690684 min

Epoch 0. Global step 704. T=0.09529105424880982 min
Loss               : 0.6908730268478394
In-batch accuracy  : 0.5
Validation accuracy: 0.4988, f1: 0.6655991459834534

Epoch 10. Global step 7744. T=1.1661256869633994 min
Loss               : 0.6972641944885254
In-batch accuracy  : 0.5
Validation accuracy: 0.6656, f1: 0.6620856911883589

Epoch 20. Global step 14784. T=2.235698322455088 min
Loss               : 0.6497185826301575
In-batch accuracy  : 0.5
Validation accuracy: 0.6992, f1: 0.6845637583892618

Calculating validation metrics... Time 3.2094284256299335 min


 70%|███████   | 7/10 [35:57<15:24, 308.24s/it]

Final test metrics: {'accuracy': 0.70756, 'f1': 0.729992244340215}, Time 5.3249284783999125 min

Epoch 0. Global step 704. T=0.0985660990079244 min
Loss               : 0.7428643107414246
In-batch accuracy  : 0.25
Validation accuracy: 0.5096, f1: 0.08370702541106129

Epoch 10. Global step 7744. T=1.2230181296666462 min
Loss               : 0.7159014940261841
In-batch accuracy  : 0.25
Validation accuracy: 0.6468, f1: 0.5860290670417253

Epoch 20. Global step 14784. T=2.3315184394518536 min
Loss               : 0.6266547441482544
In-batch accuracy  : 0.75
Validation accuracy: 0.696, f1: 0.6657871591908532

Calculating validation metrics... Time 3.3505717277526856 min


 80%|████████  | 8/10 [41:25<10:21, 310.66s/it]

Final test metrics: {'accuracy': 0.70232, 'f1': 0.703080114905841}, Time 5.46062300602595 min

Epoch 0. Global step 704. T=0.1013597846031189 min
Loss               : 0.6744129061698914
In-batch accuracy  : 0.75
Validation accuracy: 0.4988, f1: 0.6655991459834534

Epoch 10. Global step 7744. T=1.241614584128062 min
Loss               : 0.5243411660194397
In-batch accuracy  : 1.0
Validation accuracy: 0.6204, f1: 0.5044386422976501

Epoch 20. Global step 14784. T=2.3675535559654235 min
Loss               : 0.31338781118392944
In-batch accuracy  : 1.0
Validation accuracy: 0.6856, f1: 0.712719298245614

Calculating validation metrics... Time 3.446236224969228 min


 90%|█████████ | 9/10 [46:58<05:13, 313.19s/it]

Final test metrics: {'accuracy': 0.6814, 'f1': 0.6337425851841634}, Time 5.556990742683411 min

Epoch 0. Global step 704. T=0.10616544882456462 min
Loss               : 0.7120844125747681
In-batch accuracy  : 0.25
Validation accuracy: 0.5092, f1: 0.09312638580931264

Epoch 10. Global step 7744. T=1.304196302096049 min
Loss               : 0.5625705718994141
In-batch accuracy  : 0.75
Validation accuracy: 0.6392, f1: 0.6015901060070671

Epoch 20. Global step 14784. T=2.4941790223121645 min
Loss               : 0.6240410208702087
In-batch accuracy  : 0.5
Validation accuracy: 0.6772, f1: 0.6783579115185332

Calculating validation metrics... Time 3.575537435213725 min


100%|██████████| 10/10 [52:41<00:00, 316.16s/it]

Final test metrics: {'accuracy': 0.68192, 'f1': 0.6674194897532414}, Time 5.715057114760081 min






In [16]:
import pandas as pd

In [17]:
pd.DataFrame(results).to_csv('results/CharCNN_embed2.csv')

In [22]:
%%time
run_model_with(noise_level=0.1, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.6670222878456116
In-batch accuracy: 0.75
Validation accuracy: 0.5112

Loss after epoch 1:
Global step: 1408
0.6755492091178894
In-batch accuracy: 0.5
Validation accuracy: 0.5172

Loss after epoch 2:
Global step: 2112
0.699364960193634
In-batch accuracy: 0.25
Validation accuracy: 0.5364

Loss after epoch 3:
Global step: 2816
0.6891615390777588
In-batch accuracy: 0.75
Validation accuracy: 0.6676

Loss after epoch 4:
Global step: 3520
0.5262390375137329
In-batch accuracy: 0.5
Validation accuracy: 0.7088

Loss after epoch 5:
Global step: 4224
0.41326266527175903
In-batch accuracy: 0.75
Validation accuracy: 0.7372

Loss after epoch 6:
Global step: 4928
0.5759977102279663
In-batch accuracy: 0.5
Validation accuracy: 0.7484

Loss after epoch 7:
Global step: 5632
0.4313564896583557
In-batch accuracy: 0.5
Validation accuracy: 0.746

Loss after epoch 8:
Global step: 6336
0.5776655673980713
In-batch accuracy: 0.75
Validation accuracy: 0.7544

Loss after epoch

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(7,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=4, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=61184, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc3): Linear(in_features=1024, out_features=2, bias=True)
)

In [32]:
%%time
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.7154096364974976
In-batch accuracy: 0.5
Validation accuracy: 0.6276

Loss after epoch 1:
Global step: 1408
0.3147427439689636
In-batch accuracy: 1.0
Validation accuracy: 0.7424

Loss after epoch 2:
Global step: 2112
0.43052104115486145
In-batch accuracy: 0.75
Validation accuracy: 0.7732

Loss after epoch 3:
Global step: 2816
0.14214535057544708
In-batch accuracy: 1.0
Validation accuracy: 0.8052

Loss after epoch 4:
Global step: 3520
0.16783644258975983
In-batch accuracy: 1.0
Validation accuracy: 0.8112

Loss after epoch 5:
Global step: 4224
0.5878106951713562
In-batch accuracy: 0.5
Validation accuracy: 0.82

Loss after epoch 6:
Global step: 4928
0.1835513859987259
In-batch accuracy: 1.0
Validation accuracy: 0.8224

Loss after epoch 7:
Global step: 5632
0.036214977502822876
In-batch accuracy: 1.0
Validation accuracy: 0.8056

Loss after epoch 8:
Global step: 6336
0.036427706480026245
In-batch accuracy: 1.0
Validation accuracy: 0.8236

Loss after epo

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=30464, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc3): Linear(in_features=1024, out_features=2, bias=True)
)

In [38]:
%%time
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.785068154335022
In-batch accuracy: 0.25
Validation accuracy: 0.6084

Loss after epoch 1:
Global step: 1408
0.3312198519706726
In-batch accuracy: 1.0
Validation accuracy: 0.7292

Loss after epoch 2:
Global step: 2112
0.2843222916126251
In-batch accuracy: 1.0
Validation accuracy: 0.7424

Loss after epoch 3:
Global step: 2816
0.1944502741098404
In-batch accuracy: 1.0
Validation accuracy: 0.7604

Loss after epoch 4:
Global step: 3520
0.24288120865821838
In-batch accuracy: 0.75
Validation accuracy: 0.8008

Loss after epoch 5:
Global step: 4224
0.36140114068984985
In-batch accuracy: 0.75
Validation accuracy: 0.8036

Loss after epoch 6:
Global step: 4928
0.2669448256492615
In-batch accuracy: 0.75
Validation accuracy: 0.8048

Loss after epoch 7:
Global step: 5632
0.18363156914710999
In-batch accuracy: 1.0
Validation accuracy: 0.8092

Loss after epoch 8:
Global step: 6336
0.48155519366264343
In-batch accuracy: 0.75
Validation accuracy: 0.8184

Loss after e

CharCNN(
  (conv): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=30464, out_features=2, bias=True)
)

In [12]:
%%time
model = run_model_with(noise_level=0, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.8411439657211304
In-batch accuracy: 0.25
Validation accuracy: 0.5012

Loss after epoch 1:
Global step: 1408
0.5809738636016846
In-batch accuracy: 0.75
Validation accuracy: 0.66

Loss after epoch 2:
Global step: 2112
0.4452686309814453
In-batch accuracy: 1.0
Validation accuracy: 0.6824

Loss after epoch 3:
Global step: 2816
0.23656554520130157
In-batch accuracy: 1.0
Validation accuracy: 0.7136

Loss after epoch 4:
Global step: 3520
0.5787070393562317
In-batch accuracy: 0.5
Validation accuracy: 0.7248

Loss after epoch 5:
Global step: 4224
0.502457857131958
In-batch accuracy: 0.75
Validation accuracy: 0.726

Loss after epoch 6:
Global step: 4928
0.12519408762454987
In-batch accuracy: 1.0
Validation accuracy: 0.7332

Loss after epoch 7:
Global step: 5632
0.731816291809082
In-batch accuracy: 0.75
Validation accuracy: 0.736

Loss after epoch 8:
Global step: 6336
0.12233468890190125
In-batch accuracy: 1.0
Validation accuracy: 0.7276

Loss after epoch 9:

In [13]:
%%time
model = run_model_with(noise_level=0, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.7073005437850952
In-batch accuracy: 0.5
Validation accuracy: 0.6404

Loss after epoch 1:
Global step: 1408
0.5612970590591431
In-batch accuracy: 0.75
Validation accuracy: 0.7176

Loss after epoch 2:
Global step: 2112
0.9215638637542725
In-batch accuracy: 0.25
Validation accuracy: 0.7228

Loss after epoch 3:
Global step: 2816
0.3698350191116333
In-batch accuracy: 1.0
Validation accuracy: 0.728

Loss after epoch 4:
Global step: 3520
0.584014356136322
In-batch accuracy: 0.5
Validation accuracy: 0.7556

Loss after epoch 5:
Global step: 4224
0.19489029049873352
In-batch accuracy: 1.0
Validation accuracy: 0.758

Loss after epoch 6:
Global step: 4928
0.41615644097328186
In-batch accuracy: 0.75
Validation accuracy: 0.7776

Loss after epoch 7:
Global step: 5632
0.2622656226158142
In-batch accuracy: 1.0
Validation accuracy: 0.7752

Loss after epoch 8:
Global step: 6336
0.16676472127437592
In-batch accuracy: 1.0
Validation accuracy: 0.7824

Loss after epoch 

In [10]:
def predict(model, text):
    """Classify a single raw string with a trained character-level model.

    :param model: callable (normally an ``nn.Module``) that accepts a batch shaped
        (batch, alphabet, maxlen) and returns per-class scores shaped (batch, n_classes)
    :param text: raw input string; it is one-hot encoded by ``preprocess_text_nobatch``
    :return: tensor of size 1 holding the index of the highest-scoring class

    NOTE(review): the model is used in whatever mode it is currently in; if it contains
    dropout or batch-norm layers, call ``model.eval()`` before predicting.
    """
    encoded = preprocess_text_nobatch(text)
    # (maxlen, alphabet) -> (1, alphabet, maxlen): add a batch axis and move the
    # alphabet axis in front of the sequence axis.
    encoded = encoded.unsqueeze(0).permute(0, 2, 1)

    # Put the input on the same device as the model instead of always calling .cuda(),
    # which crashes on CPU-only machines or when the model itself lives on the CPU.
    params = list(model.parameters()) if hasattr(model, 'parameters') else []
    if params:
        use_cuda = any(p.is_cuda for p in params)
    else:
        # Nothing to inspect: fall back to "use the GPU if there is one".
        use_cuda = torch.cuda.is_available()
    if use_cuda:
        encoded = encoded.cuda()

    scores = model(Variable(encoded))
    # Keep only the argmax over the class axis; the max values themselves are not needed.
    _, prediction = torch.max(scores, 1)
    return prediction

In [61]:
# Probe: very short, clearly positive input.
predict(model, 'I love it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [62]:
# Probe: very short, clearly negative input.
predict(model, 'I hate it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [68]:
# Probe: longer, review-style positive sentence (the typo 'awersome' is part of the input).
predict(model, 'I have seen this film as I was a child and it was awersome! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [74]:
# Probe: the same positive phrase repeated, to see whether sheer length changes the prediction.
predict(model, 'Love it! Love it!  Love it! Love it! Love it! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [80]:
# Probe: longer sentence with neutral wording.
predict(model, "Maybe just long enough text if really suficcient so let's write something neutral")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [81]:
# Probe: longer positive sentence that mentions an explicit rating.
predict(model, "We need more emotions! Like when film is cool you are so happy to rank it 10")

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [82]:
# Probe: short neutral input.
predict(model, "So only long texts")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [85]:
# Probe: short input containing a negated positive word ("not good").
predict(model, "This is not good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [87]:
# Probe: short positive input; differs from the "not good" probe by a single word.
predict(model, "This is very good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

Результат так себе: короткие тексты модель почти всегда относит к классу 0.

Попробуем обучить модель на текстах малой длины (MAXLEN = 140).

In [11]:
%%time
# Same call as the earlier baseline run (no noise, Xavier-normal init).
# NOTE(review): the preceding note says this run uses length 140, so MAXLEN was presumably
# changed out-of-band before re-executing; confirm, and pass the length explicitly so the
# notebook survives Restart & Run All.
model = run_model_with(noise_level=0, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.70023113489151
In-batch accuracy: 0.5
Validation accuracy: 0.5332

Loss after epoch 1:
Global step: 1408
0.7055613398551941
In-batch accuracy: 0.5
Validation accuracy: 0.6228

Loss after epoch 2:
Global step: 2112
0.46998968720436096
In-batch accuracy: 1.0
Validation accuracy: 0.6316

Loss after epoch 3:
Global step: 2816
0.7275577783584595
In-batch accuracy: 0.5
Validation accuracy: 0.6564

Loss after epoch 4:
Global step: 3520
0.33533045649528503
In-batch accuracy: 1.0
Validation accuracy: 0.6628

Loss after epoch 5:
Global step: 4224
0.6466774940490723
In-batch accuracy: 0.5
Validation accuracy: 0.6712

Loss after epoch 6:
Global step: 4928
0.3631550967693329
In-batch accuracy: 1.0
Validation accuracy: 0.6668

Loss after epoch 7:
Global step: 5632
0.3672349154949188
In-batch accuracy: 0.75
Validation accuracy: 0.6776

Loss after epoch 8:
Global step: 6336
0.18997327983379364
In-batch accuracy: 1.0
Validation accuracy: 0.6776

Loss after epoch 9

In [16]:
# Probe on the retrained model: very short, clearly positive input.
predict(model, 'I love it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [17]:
# Probe on the retrained model: very short, clearly negative input.
predict(model, 'I hate it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [18]:
# Probe on the retrained model: longer, review-style positive sentence (typo is part of the input).
predict(model, 'I have seen this film as I was a child and it was awersome! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [19]:
# Probe on the retrained model: the same positive phrase repeated several times.
predict(model, 'Love it! Love it!  Love it! Love it! Love it! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [20]:
# Probe on the retrained model: longer sentence with neutral wording.
predict(model, "Maybe just long enough text if really suficcient so let's write something neutral")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [21]:
# Probe on the retrained model: longer positive sentence that mentions an explicit rating.
predict(model, "We need more emotions! Like when film is cool you are so happy to rank it 10") # Prediction changed from 1 to 0 compared with the earlier run

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [22]:
# Probe on the retrained model: short neutral input.
predict(model, "So only long texts")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [23]:
# Probe on the retrained model: short input containing a negated positive word ("not good").
predict(model, "This is not good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [24]:
# Probe on the retrained model: short positive input; one word away from the "not good" probe.
predict(model, "This is very good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

# Twitter

In [25]:
import pandas as pd

In [26]:
class OneHotDataset(torch.utils.data.Dataset):
    """Character-level one-hot dataset backed by a pandas dataframe.

    Each item is a pair ``(text, label)`` where ``text`` is a ``FloatTensor`` of shape
    ``(maxlen, len(alphabet))`` with one row per character position.

    The alphabet may spell its special symbols either ``'UNK'`` / ``'PAD'`` or
    ``'<UNK>'`` / ``'<PAD>'``. When it contains no padding symbol, positions past the
    end of the text are left as all-zero rows.

    NOTE(review): characters are looked up as-is (no lower-casing); with a lower-case
    only alphabet every upper-case letter is encoded as the unknown symbol. Confirm this
    matches how the training data was preprocessed.
    """

    # Accepted spellings of the special symbols, in order of preference.
    _UNK_SYMBOLS = ('UNK', '<UNK>')
    _PAD_SYMBOLS = ('PAD', '<PAD>')

    def __init__(self, dataframe, alphabet=None, noise_level=0, maxlen=512):
        """
        :param dataframe: pandas dataframe with fields "text": str and "label": int
        :param alphabet: sequence of symbols; the position of a symbol is its one-hot index
        :param noise_level: must be 0 for now; noising is not wired into item loading yet
        :param maxlen: number of character positions every encoded text is cut/padded to
        :raises NotImplementedError: if ``alphabet`` is None or ``noise_level`` > 0
        """
        if alphabet is None:
            raise NotImplementedError()
        self.alphabet = alphabet
        # For duplicated symbols the last occurrence wins, as with a plain dict build.
        self.char2int = {symbol: index for index, symbol in enumerate(self.alphabet)}

        # Index of the unknown / padding symbol, or None when the alphabet has none.
        self.unk_idx = self._find_symbol(self._UNK_SYMBOLS)
        self.pad_idx = self._find_symbol(self._PAD_SYMBOLS)

        self.maxlen = maxlen
        self.dataframe = dataframe
        self.noise_level = noise_level
        if self.noise_level > 0:
            raise NotImplementedError()

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(one_hot_text, label)`` for the row at integer position ``idx``."""
        line = self.dataframe.iloc[idx]
        text = self._preprocess_text_nobatch(line.text)
        label = line.label
        return text, label

    def _find_symbol(self, candidates):
        """Return the index of the first candidate present in the alphabet, else None."""
        for symbol in candidates:
            if symbol in self.char2int:
                return self.char2int[symbol]
        return None

    def _char_index(self, char):
        """Map a character to its one-hot index, falling back to the unknown symbol.

        :raises KeyError: if ``char`` is not in the alphabet and the alphabet has no
            unknown symbol
        """
        index = self.char2int.get(char, self.unk_idx)
        if index is None:
            raise KeyError(
                'character %r is not in the alphabet and the alphabet has no UNK symbol' % (char,)
            )
        return index

    def _noise_generator(self, string):
        """Randomly drop characters and insert random alphabet symbols.

        For every character two independent draws are made: the character is kept when
        the first draw exceeds ``noise_level``, and a random alphabet symbol is appended
        when the second draw is below ``noise_level``.
        """
        pieces = []
        for c in string:
            if random() > self.noise_level:
                pieces.append(c)
            if random() < self.noise_level:
                pieces.append(choice(self.alphabet))
        return ''.join(pieces)

    def _one_hot(self, char):
        """Return a 1-D numpy one-hot vector for ``char`` (unknown symbol if not in the alphabet)."""
        zeros = np.zeros(len(self.alphabet))
        zeros[self._char_index(char)] = 1.
        return zeros

    def _preprocess_text_nobatch(self, text):
        """One-hot encode ``text`` into a ``(maxlen, len(alphabet))`` FloatTensor.

        Text longer than ``maxlen`` is truncated; shorter text (including the empty
        string) is padded with the padding symbol, or with zero rows if there is none.
        """
        one_hotted_text = np.zeros((self.maxlen, len(self.alphabet)))

        indices = []
        for char in text:
            if len(indices) >= self.maxlen:
                break
            indices.append(self._char_index(char))

        # Explicit integer dtype so that an empty text yields a valid (empty) index array.
        columns = np.asarray(indices, dtype=np.int64)
        one_hotted_text[np.arange(len(indices)), columns] = 1.
        if self.pad_idx is not None:
            one_hotted_text[len(indices):, self.pad_idx] = 1.

        return torch.FloatTensor(one_hotted_text)

    def onehot2text(self, one_hotted_text):
        """Decode a ``(positions, len(alphabet))`` one-hot tensor back to a string.

        Decoding stops at the first padding position: either the padding symbol or an
        all-zero row.
        """
        pieces = []
        max_values, indices = torch.max(one_hotted_text, 1)
        for value, index in zip(max_values, indices):
            index = int(index)
            if float(value) == 0 or index == self.pad_idx:
                break
            pieces.append(self.alphabet[index])
        return ''.join(pieces)



In [27]:
# Load the Twitter sentiment validation split.
# NOTE(review): absolute, machine-specific path; consider a configurable DATA_DIR.
twitter_df = pd.read_csv('/media/data/nlp/data/twitter_sentiment/twitter_sentiment_valid.csv')

In [28]:
# Show one random row to inspect the column layout.
twitter_df.sample()

Unnamed: 0.1,Unnamed: 0,ItemID,Sentiment,SentimentText
282205,294537,294549,1,@julianna12369 Hon those two tweets together w...


In [29]:
# Rename the columns so the frame exposes the `text` and `label` fields that
# OneHotDataset.__getitem__ reads.
twitter_df.columns = ['idxx', 'ItemID', 'label', 'text']

In [32]:
# Encode tweets over the shared ALPHABET, cut/padded to 140 character positions,
# and serve them in shuffled batches of BATCH_SIZE using 4 loader workers.
twitter_ds = OneHotDataset(twitter_df, alphabet=ALPHABET, maxlen=140)
twitter_dl = torch.utils.data.DataLoader(twitter_ds, batch_size=BATCH_SIZE, num_workers=4, shuffle=True)

In [49]:
%%time
# Accuracy on the Twitter data for the model trained with MAXLEN = 512.
# NOTE(review): execution count 49 is out of order relative to the neighbouring cells,
# so `model` and `twitter_dl` here presumably come from an earlier kernel state; confirm.
get_accuracy(model, twitter_dl)

CPU times: user 33 s, sys: 13.8 s, total: 46.8 s
Wall time: 48.9 s


0.5117747288379131

In [33]:
%%time
# Accuracy on the Twitter data for the model trained with MAXLEN = 140.
get_accuracy(model, twitter_dl)

CPU times: user 14.9 s, sys: 4.83 s, total: 19.8 s
Wall time: 25.8 s


0.5006580369517948