In [1]:
from time import time
from collections import Counter
from random import random, choice

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.nn import init
from torch.autograd import Variable
from torch.utils.data.sampler import SubsetRandomSampler

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

from sklearn.metrics import accuracy_score, f1_score

import torchtext

# True when a CUDA device is available; later cells use this flag to decide
# whether the model and the input batches are moved to the GPU.
CUDA = torch.cuda.is_available()
CUDA

True

In [2]:
# Batch size used for the training/validation loaders and for evaluation.
BATCH_SIZE = 32

NOISE_LEVEL = 0.1  # possibly not used (not referenced in the visible cells)

# Noise probabilities swept in the experiments: each value is used both as a
# training noise level and as a test noise level.
NOISE_LEVELS = [0, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2]

### IMDB

In [3]:
# Number of characters kept per text by the one-hot preprocessing helpers.
MAXLEN = 512
# Fraction of the training split held out for validation.
VALID_SIZE = 0.1

# use preprocessing for noise?

# Raw-text field: the identity tokenizer with sequential=False / use_vocab=False
# keeps each review as a single lower-cased string.
# NOTE(review): fix_length=2048 is independent of MAXLEN above — confirm which one is intended.
text_field = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label_field = torchtext.data.Field(sequential=False, use_vocab=False)

# Character inventory; index 0 is the '<UNK>' bucket for characters outside the alphabet.
# NOTE: some characters occur twice in the literal below (e.g. '-' and '’'), so
# ALPHABET_LEN counts them twice while char2int keeps only the last index of each.
ALPHABET = ['<UNK>'] + ['\n'] + [s for s in """ abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}"""]

ALPHABET_LEN = len(ALPHABET)

# Character -> index of its one-hot dimension.
char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

class CharIMDB(torchtext.datasets.imdb.IMDB):
    """IMDB dataset whose items are ``(one_hot_text, label)`` pairs.

    ``noise_level`` is a class attribute that is forwarded to
    ``noise_generator`` every time an item is fetched; ``label`` is 1 for
    'pos' examples and 0 otherwise.
    """
    noise_level = 0

    def __getitem__(self, idx):
        example = super().__getitem__(idx)
        # Noise is re-sampled on every access (original author's remark: "this is bad").
        noisy_text = noise_generator(example.text, self.noise_level)
        is_positive = int(example.label == 'pos')
        return preprocess_text_nobatch(noisy_text), is_positive


### One-hot dataset (Mokoron, SST, IMDB)

In [4]:
# Exactly one MAXLEN must be active; its value also selects the dataset below.
# MAXLEN = 170  # for Mokoron
# MAXLEN = 200  # for SST
MAXLEN = 512  # for IMDB

# Dataset-specific switches, derived from MAXLEN:
#   russian  - add the Cyrillic letters to the alphabet
#   no_emoji - drop '(' and ')' from the alphabet
russian = False
no_emoji = False
if MAXLEN == 170:
    print('MOKORON!')
    russian = True
    no_emoji = True
elif MAXLEN == 512:
    print('IMDB')
elif MAXLEN == 200:
    print('SST!')
else:
    print("I don't know which dataset is used =(")

# Rebuilds ALPHABET (overriding the one from the IMDB cell above): digits and
# punctuation first, then optional Cyrillic letters, then Latin letters.
# Index 0 stays the '<UNK>' bucket.
ALPHABET = ['<UNK>'] + ['\n'] + [s for s in """ 0123456789-,;.!?:'’’/\|_@#$%ˆ&* ̃‘+-=<>()[]{}"""]
if russian:
    ALPHABET += [s for s in 'абвгдеёжзийклмнопрстуфхцчщъыьэюя']

ALPHABET += [s for s in 'abcdefghijklmnopqrstuvwxyz']

if no_emoji:
    ALPHABET = [s for s in ALPHABET if s not in ('(', ')')]

ALPHABET_LEN = len(ALPHABET)
# Character -> index of its one-hot dimension (for duplicated characters the last index wins).
char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

class MokoronDatasetOneHot(torch.utils.data.Dataset):
    """
    Character-level one-hot dataset backed by a CSV file.

    Each item is ``(text, label)`` where ``text`` is a FloatTensor of shape
    ``(maxlen, len(alphabet))`` and ``label`` is 1 when the row's
    ``sentiment`` column equals 1, else 0.

    Zero vector for padding.

    ``noise_level`` is a class attribute (it can be overridden per instance);
    when it is greater than 0 the text is noised on every access.
    """
    noise_level = 0

    def __init__(self, filepath, text_field, maxlen=MAXLEN):
        """
        :param filepath: path of the CSV file to read
        :param text_field: name of the CSV column holding the text
        :param maxlen: number of characters kept per text
        """
        self.alphabet = ALPHABET

        self.data = pd.read_csv(filepath)
        self.text_field = text_field
        self.maxlen = maxlen
        self.char2int = {s: i for s, i in zip(self.alphabet, range(len(self.alphabet)))}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        line = self.data.iloc[idx]
        text = line[self.text_field]
        label = int(line.sentiment == 1.)

        if self.noise_level > 0:
            text = self._noise_generator(text)
        text = self._preprocess(text)
        return text, label

    def _noise_generator(self, string):
        """Randomly drop characters and insert random alphabet symbols.

        Two independent draws are made per character: the first decides
        whether the character is kept, the second whether a random symbol
        is inserted after it.
        """
        noised = []
        for c in string:
            if random() > self.noise_level:
                noised.append(c)
            if random() < self.noise_level:
                noised.append(choice(self.alphabet))
        return "".join(noised)

    def _one_hot(self, char):
        """Return the one-hot vector of ``char``; unknown characters map to '<UNK>'."""
        zeros = np.zeros(len(self.alphabet))
        zeros[self.char2int.get(char, self.char2int['<UNK>'])] = 1.
        # Fix: the original built the vector but never returned it.
        return zeros

    def _preprocess(self, text):
        """Lower-case ``text`` and encode its first ``maxlen`` characters as one-hot rows."""
        text = text.lower()
        one_hotted_text = np.zeros((self.maxlen, len(self.alphabet)))
        for i, char in enumerate(text):
            if i >= self.maxlen:
                break
            one_hotted_text[i, self.char2int.get(char, self.char2int['<UNK>'])] = 1.

        return torch.FloatTensor(one_hotted_text)

    def onehot2text(self, one_hotted_text, show_pad=False):
        """Decode a single one-hot matrix back to a string.

        All-zero rows are padding: they are rendered as '<PAD>' when
        ``show_pad`` is true and skipped otherwise.
        """
        text = ''
        max_values, idx = torch.max(one_hotted_text, 1)
        for c, i in enumerate(idx):
            if max_values[c] == 0:
                if show_pad:
                    symb = '<PAD>'
                else:
                    symb = ''
            else:
                # Fix: decode with the alphabet this instance was encoded with,
                # not with whatever the global ALPHABET currently is.
                symb = self.alphabet[int(i)]
            text += symb
        return text


IMDB


In [5]:
def one_hot(char):
    """Return the one-hot vector (length ``ALPHABET_LEN``) for ``char``.

    Characters missing from ``char2int`` are mapped to the '<UNK>' bucket.

    Fixes over the previous version: the unknown-character key was spelled
    'UNK' (not a key of ``char2int``, so it raised KeyError) and the vector
    was never returned.
    """
    zeros = np.zeros(ALPHABET_LEN)
    zeros[char2int.get(char, char2int['<UNK>'])] = 1.
    return zeros


def preprocess_text_nobatch(text, maxlen=MAXLEN):
    """One-hot encode a single text.

    :param text: string to encode (it is not lower-cased here)
    :param maxlen: number of characters kept; longer texts are truncated,
        shorter ones are padded with all-zero rows
    :return: FloatTensor of shape ``(maxlen, ALPHABET_LEN)``
    """
    one_hotted_text = np.zeros((maxlen, ALPHABET_LEN))
    unk_index = char2int['<UNK>']
    # Fix: truncate by the ``maxlen`` argument. The original compared against
    # the global MAXLEN, which overflowed the matrix for maxlen < MAXLEN and
    # left rows unfilled for maxlen > MAXLEN.
    for i, char in enumerate(text[:maxlen]):
        one_hotted_text[i, char2int.get(char, unk_index)] = 1.

    return torch.FloatTensor(one_hotted_text)


def onehot2text(one_hotted_text, batch_size=None, show_pad=False):
    """Decode one-hot encoded text back to a string.

    :param one_hotted_text: a single one-hot matrix, or a batch of them when
        ``batch_size`` is given
    :param batch_size: ``None`` to decode a single text; any other value
        treats the input as a batch and returns a list of strings
    :param show_pad: render all-zero (padding) rows as '<PAD>' instead of
        skipping them
    :return: a string, or a list of strings in batch mode
    """
    if batch_size is not None:
        # Fix: decode each item of the batch. The original passed the whole
        # batch to every recursive call instead of the current item.
        return [
            onehot2text(item, batch_size=None, show_pad=show_pad)
            for item in one_hotted_text
        ]

    max_values, idx = torch.max(one_hotted_text, 1)
    symbols = []
    for position, char_index in enumerate(idx):
        if max_values[position] == 0:
            # An all-zero row is padding.
            symbols.append('<PAD>' if show_pad else '')
        else:
            symbols.append(ALPHABET[int(char_index)])
    return ''.join(symbols)


def noise_generator(string, noise_level, chars=ALPHABET+['']):
    """Return a noised copy of ``string``.

    For every character two independent random draws are made, in this order:
    the first decides whether the character is kept, the second whether a
    random element of ``chars`` is appended after it.

    :param string: text to corrupt
    :param noise_level: threshold the two draws are compared against
    :param chars: pool of symbols that may be inserted
    """
    pieces = []
    for symbol in string:
        keep = random() > noise_level
        if keep:
            pieces.append(symbol)
        insert = random() < noise_level
        if insert:
            pieces.append(choice(chars))
    return "".join(pieces)


def get_train_valid_loader(dataset, valid_size, batch_size, random_seed=42, shuffle=True, num_workers=4):
    """Split ``dataset`` into a training and a validation DataLoader.

    :param dataset: dataset to split
    :param valid_size: fraction of the dataset used for validation, in [0, 1]
    :param batch_size: batch size of both loaders
    :param random_seed: seed for the shuffle of the indices before splitting
    :param shuffle: shuffle the indices before splitting
    :param num_workers: worker processes for each loader
    :return: ``(train_loader, valid_loader)``
    :raises ValueError: if ``valid_size`` is outside [0, 1]
    """
    if not 0 <= valid_size <= 1:
        raise ValueError('valid_size must be in [0, 1], got %s' % valid_size)

    len_dataset = len(dataset)
    indices = list(range(len_dataset))

    if shuffle:
        # Seeds numpy's global RNG, as the previous implementation did.
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    val_actual_size = int(len_dataset * valid_size)
    # Fix: split at an explicit position. With a zero-sized validation part the
    # old ``indices[:-0]`` slice produced an empty training set and put every
    # sample into validation.
    split = len_dataset - val_actual_size
    train_idx, valid_idx = indices[:split], indices[split:]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    # Fix: honour the ``num_workers`` argument (it used to be hard-coded to 4).
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=train_sampler, num_workers=num_workers
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=num_workers
    )

    return train_loader, valid_loader


def get_metrics(model, test_data, noise_level=None):
    """
    Compute accuracy and F1 of ``model`` on ``test_data``.

    :param model: model called on batches permuted to (seq_len, batch, dim)
    :param test_data: dataset or dataloader
    :param noise_level: when ``test_data`` is a dataset and this is not None,
        it is stored on the dataset as ``noise_level`` before evaluation;
        it is not used when a dataloader is passed
    :return: dict with the keys 'accuracy' and 'f1'

    The model will be in TRAIN mode after this call.
    """
    model.eval()

    if isinstance(test_data, torch.utils.data.Dataset):
        if noise_level is not None:
            test_data.noise_level = noise_level
        loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE)
    else:
        assert isinstance(test_data, torch.utils.data.DataLoader)
        loader = test_data

    y_pred = []
    y_true = []
    for text, label in loader:
        text = Variable(text.cuda()) if CUDA else Variable(text)

        # (1, 0, 2) for RNN: batch-first -> sequence-first
        scores = model(text.permute(1, 0, 2))

        predicted_idx = torch.max(scores, 1)[1]
        y_pred.extend(predicted_idx.data.tolist())
        y_true.extend(label.tolist())

    result = {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }
    model.train()
    return result


### IMDB

In [5]:
# Load IMDB without noise and carve a validation part out of the training split.
CharIMDB.noise_level = 0
train, test = CharIMDB.splits(text_field, label_field)

# `dataloader` / `val_dataloader` are the globals read by run_model_with().
dataloader, val_dataloader = get_train_valid_loader(
    train, valid_size=VALID_SIZE, batch_size=BATCH_SIZE
)

downloading aclImdb_v1.tar.gz


In [6]:
onehot2text(train[0][0])  # if spaces are missing in the decoded text, that is an onehot2text issue, not a data issue

"this musical is decidedly mixed, and none of the elements really fit together, but it somehow manages to be mostly enjoyable. the plot contains some of the elements of wodehouse's novel, but none of its virtues, though he co-wrote the script. the songs, though charming, have nothing to do with this particular film, and are unusually crudely squeezed into the plot, even by pre-oklahoma standards. burns and allen do their usual shtick quite competently, but it misses the tone of the rest of the film by about "

### Mokoron or SST

In [6]:
basepath = '/media/data/nlp/sentiment/IMDB/splits/'  # IMDB

In [5]:
basepath = '/media/data/nlp/sentiment/ru-mokoron/splits/'  # Mokoron

In [5]:
basepath = '/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/'  # SST (run only one of the basepath cells)

In [33]:
# Build the CSV-backed datasets from the selected `basepath`; the text is read
# from the 'text_original' column.
train = MokoronDatasetOneHot(basepath + 'train.csv', 'text_original')
valid = MokoronDatasetOneHot(basepath + 'validation.csv', 'text_original')
test = MokoronDatasetOneHot(basepath + 'test.csv', 'text_original')

# Second dataset over the same test file and column; run_model_with() evaluates
# it without passing an explicit test noise level.
test_original = MokoronDatasetOneHot(basepath + 'test.csv', 'text_original')

# `dataloader` / `val_dataloader` are the globals read by run_model_with().
dataloader = torch.utils.data.DataLoader(train, BATCH_SIZE, shuffle=True, num_workers=4)
val_dataloader = torch.utils.data.DataLoader(valid, BATCH_SIZE, shuffle=True, num_workers=4)

In [10]:
len(train)

21250

# Model

In [11]:
class CharCNN(nn.Module):
    """Character-level CNN classifier with two output classes.

    Pipeline: per-character linear "embedding" -> Conv1d -> ReLU ->
    MaxPool1d -> flatten -> dropout -> linear layer.
    The input length is expected to equal the global ``MAXLEN``, because the
    size of the final linear layer is derived from it.
    """

    def __init__(self, init_function, n_filters, cnn_kernel_size, dropout=0.5):  #, hidden_dim=256, kernel_size=16):
        """
        :param init_function: torch.nn.init function applied to the convolution weights
        :param n_filters: number of convolution filters (output channels)
        :param cnn_kernel_size: kernel size of the convolution
        :param dropout: dropout zero probability (1 - keep probability)
        :raises ValueError: if the sequence is too short for the conv/pool stack
        """
        super(CharCNN, self).__init__()
        self.init_function = init_function
        self.dropout_prob = dropout
        self.n_filters = n_filters
        self.cnn_kernel_size = cnn_kernel_size  # 15
        self.cnn_stride = 2
        self.pool_kernel_size = 64  # MAXLEN  # 64
        self.pool_stride = 32  # self.pool_kernel_size  # 32

        self.embedding = nn.Linear(ALPHABET_LEN, ALPHABET_LEN)
        self.conv = nn.Sequential(
            nn.Conv1d(ALPHABET_LEN, self.n_filters, kernel_size=self.cnn_kernel_size, stride=self.cnn_stride),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=self.pool_kernel_size, stride=self.pool_stride)
        )
        self.conv[0].weight = init_function(self.conv[0].weight)

        # Output lengths of the conv and pool layers (no padding, dilation 1):
        #   out = floor((in - kernel) / stride) + 1
        # Fix: the previous single-expression formula dropped the "+ 1" of the
        # convolution and floored only once, which gives a wrong size for some
        # MAXLEN / kernel combinations.
        conv_len = (MAXLEN - self.cnn_kernel_size) // self.cnn_stride + 1
        pool_len = (conv_len - self.pool_kernel_size) // self.pool_stride + 1
        if pool_len < 1:
            raise ValueError(
                'MAXLEN=%s is too short for cnn_kernel_size=%s and pool_kernel_size=%s'
                % (MAXLEN, self.cnn_kernel_size, self.pool_kernel_size)
            )
        conv_dim = self.n_filters * pool_len
        self.dropout = nn.Dropout(self.dropout_prob)
        self.fc = nn.Linear(conv_dim, 2)

    def forward(self, x):
        """
        :param x: tensor of shape (seq_len, batch_size, signal_dim)
        :return: tensor of shape (batch_size, 2) with the class scores
        """
        x = self.embedding(x)
        # Conv1d expects (batch_size, channels, seq_len)
        x = x.permute(1, 2, 0)
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.fc(x)
        return x


In [12]:
def model_params_num(model):
    """Return the total number of elements over all of ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += np.prod(list(param.size()))
    return total

def mk_dataline(model_type, epochs, lr, noise_level_train, noise_level_test, acc_train, acc_test,
                f1_train, f1_test, dropout, model, run_name, task, init_function=None):
    """Build one result record (a dict) describing a single train/test evaluation.

    Besides the values passed in, the record stores the parameter count of
    ``model`` under 'trainable_params', ``str(model)`` under 'model_desc' and
    the global ``MAXLEN`` under 'data_desc'.
    """
    record = {'task': task, 'model_type': model_type}

    record['trainable_params'] = model_params_num(model)
    record['dropout'] = dropout
    record['init_function'] = init_function

    record['epochs'] = epochs
    record['lr'] = lr

    record['noise_level_train'] = noise_level_train
    record['noise_level_test'] = noise_level_test
    record['acc_train'] = acc_train
    record['acc_test'] = acc_test
    record['f1_train'] = f1_train
    record['f1_test'] = f1_test

    record['model_desc'] = str(model)
    record['run_name'] = run_name
    record['data_desc'] = 'Maxlen %s' % MAXLEN
    return record

In [13]:
# Global list of result records; run_model_with() appends to it.
results = []

In [14]:
def run_model_with(noise_level, n_filters, cnn_kernel_size, init_function, lr=1e-4, dropout=0.5, epochs=30, log_every=1, comment='', _model=None):
    """Train a CharCNN at one noise level, evaluate it, and record the results.

    Relies on notebook globals: ``dataloader``, ``val_dataloader``, ``test``,
    ``test_original``, ``results``, ``NOISE_LEVELS`` and ``CUDA``.
    Side effects: sets ``MokoronDatasetOneHot.noise_level`` (class attribute),
    writes TensorBoard scalars, saves the model under ``models/`` and appends
    one record per test noise level (plus one for ``test_original``) to
    ``results``.

    :param noise_level: noise level used for training
    :param n_filters: number of convolution filters of the CharCNN
    :param cnn_kernel_size: convolution kernel size of the CharCNN
    :param init_function: torch.nn.init function for the convolution weights
    :param lr: learning rate of the Adam optimizer
    :param dropout: dropout probability of the CharCNN
    :param epochs: number of passes over ``dataloader``
    :param log_every: print progress every ``log_every`` epochs
    :param comment: suffix appended to the run/model name
    :param _model: existing model to continue training; it is used as is
        (it is not moved to the GPU or switched to train mode here)
    :return: the trained model
    """
    start_time = time()
    # Alternative setup for the IMDB task (kept disabled):
#     CharIMDB.noise_level = noise_level
#     task='IMDB binary classification'
    # Class-level attribute: affects every MokoronDatasetOneHot instance that
    # has no instance-level noise_level of its own.
    MokoronDatasetOneHot.noise_level = noise_level
    task='Mokoron binary classification'

    if _model is None:
        model = CharCNN(
            n_filters=n_filters, cnn_kernel_size=cnn_kernel_size,
            init_function=init_function, dropout=dropout
        )
        if CUDA:
            model.cuda()
        model.train()
    
    else:
        model = _model

    # lr is encoded as its negative decimal exponent (1e-4 -> "lr4").
    model_name = '_charCNN_embed_smaller2_lr%s_noise%s_dropout%s_filters%s_cnn_kernel%s' % (
        int(-np.log10(lr)), noise_level, dropout, n_filters, cnn_kernel_size
    ) + comment
    
    if '(' not in ALPHABET:
        model_name += '_no_emoji'

    writer = SummaryWriter(comment=model_name)
    if len(list(writer.all_writers.keys())) > 1:
        print('More than one writer! 0_o')
        print(list(writer.all_writers.keys()))

    # The first writer key is used as the run name (it is also the basis of the saved model's file name).
    run_name = list(writer.all_writers.keys())[0]
    print('Writer: %s' % run_name)

    optimizer = optim.Adam(params=model.parameters(), lr=lr)
    optimizer.zero_grad()
    
    global_step = 0

    loss_f = F.cross_entropy

    for epoch in range(epochs):

        for batch_idx, (text, label) in enumerate(dataloader):
            optimizer.zero_grad()

            if CUDA:
                text = Variable(text.cuda())
                label = Variable(torch.LongTensor(label).cuda())
            else:
                text = Variable(text)
                label = Variable(torch.LongTensor(label))

            text = text.permute(1, 0, 2)  # (1, 0, 2) for RNN
            prediction = model(text)
            loss = loss_f(prediction, label)

            writer.add_scalar('loss', loss.data[0], global_step=global_step)

            loss.backward()        
            # Clip the gradient norm to 0.1 before the optimizer step.
            torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
            optimizer.step()

            if CUDA:
                torch.cuda.synchronize()
            global_step += 1

        # evaluation
        if epoch % log_every == 0:
            print('Epoch %s. Global step %s. T=%s min' % (epoch, global_step, (time() - start_time) / 60.))
            print('Loss               : %s' % loss.data[0])

        # in-batch: metrics of the LAST training batch of this epoch only
        _, idx = torch.max(prediction, 1)
        _labels = label.data.tolist()
        _predictions = idx.data.tolist()
        acc = accuracy_score(_labels, _predictions)
        f1 = f1_score(_labels, _predictions)
        writer.add_scalar('accuracy_train', acc, global_step=global_step)
        writer.add_scalar('f1_train', f1, global_step=global_step)
        if epoch % log_every == 0:
            print('In-batch accuracy  :', acc)

        # validation (get_metrics leaves the model in train mode)
        metrics = get_metrics(model, val_dataloader)
        if epoch % log_every == 0:
            print('Validation accuracy: %s, f1: %s' % (metrics['accuracy'], metrics['f1']))
            print()

        writer.add_scalar('accuracy_val', metrics['accuracy'], global_step=global_step)
        writer.add_scalar('f1_val', metrics['f1'], global_step=global_step)

    # Best-effort save: the file is opened first, so it exists even when
    # torch.save fails; the error is printed and training results are kept.
    with open('models/%s.torch' % run_name.split('/')[-1], 'wb') as f:
        try:
            torch.save(model, f)
        except Exception as e:
            print(e)
            print('Continuing (probably) without saving')
        
    # Test
    model.eval()

    # Stays None when noise_level is not one of NOISE_LEVELS.
    metrics_test = None

    # NOTE: despite the message, the next call computes metrics on the TRAINING loader.
    print('Calculating validation metrics... Time %s min' % ((time() - start_time) / 60.))
    metrics_train = get_metrics(model, dataloader)
    acc_train = metrics_train['accuracy']
    f1_train = metrics_train['f1']

    # Evaluate on `test` at every noise level; get_metrics stores the level on
    # the `test` instance. The result at the training noise level is kept as
    # the "final" test metrics.
    for test_noise in NOISE_LEVELS:
        metrics = get_metrics(model, test, test_noise)
        if test_noise == noise_level:
            metrics_test = metrics

        acc_test = metrics['accuracy']
        f1_test = metrics['f1']
        results.append(mk_dataline(
            model_type='charCNN', epochs=epochs, lr=lr,
            noise_level_train=noise_level, acc_train=acc_train, f1_train=f1_train,
            noise_level_test=test_noise, acc_test=acc_test, f1_test=f1_test,
            dropout=dropout, model=model,
            init_function=init_function,
            run_name=run_name,
            task=task
        ))
    
    # test original (recorded with noise_level_test=-1)
    # NOTE(review): no noise level is passed here, so test_original appears to
    # fall back to the class-level noise_level set above (the training noise
    # level) rather than to zero noise — confirm this is intended.
    metrics = get_metrics(model, test_original)
    results.append(mk_dataline(
        model_type='charCNN', epochs=epochs, lr=lr,
        noise_level_train=noise_level, acc_train=acc_train, f1_train=f1_train,
        noise_level_test=-1, acc_test=metrics['accuracy'], f1_test=metrics['f1'],
        dropout=dropout, model=model,
        init_function=init_function,
        run_name=run_name,
        task=task
    ))
    
    print('Original dataset: acc %s, f1 %s' % (metrics['accuracy'], metrics['f1']))
    writer.add_scalar('accuracy_test_original', metrics['accuracy'], global_step=global_step)
    writer.add_scalar('f1_test_original', metrics['f1'], global_step=global_step)

    print('Final test metrics: %s, Time %s min' % (metrics_test, ((time() - start_time) / 60.)))
    if metrics_test is not None:
        writer.add_scalar('accuracy_test_final', metrics_test['accuracy'], global_step=global_step)
        writer.add_scalar('f1_test_final', metrics_test['f1'], global_step=global_step)
    print()
    # NOTE: every get_metrics() call ends with model.train(), so the returned
    # model is in TRAIN mode here; call model.eval() before using it for inference.
    return model

# Main experiment

In [20]:
# Reset the global result list before a new series of runs.
results = []

In [34]:
# Single run without training noise; progress is printed every 10 epochs.
model = run_model_with(
    noise_level=0, n_filters=64, cnn_kernel_size=5,
    init_function=init.xavier_normal, log_every=10
)

Writer: runs/May15_23-38-40_phobos-aijun_charCNN_embed_smaller2_lr4_noise0_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.26346145073572796 min
Loss               : 0.7305917739868164
In-batch accuracy  : 0.0


  'recall', 'true', average, warn_for)


Validation accuracy: 0.5077333333333334, f1: 0.6733899504600142



  'precision', 'predicted', average, warn_for)


Epoch 10. Global step 7315. T=3.226904360453288 min
Loss               : 0.580379068851471
In-batch accuracy  : 0.5
Validation accuracy: 0.6837333333333333, f1: 0.7291000456829604

Epoch 20. Global step 13965. T=6.092093384265899 min
Loss               : 0.5446941256523132
In-batch accuracy  : 0.5
Validation accuracy: 0.7282666666666666, f1: 0.7204389574759945

Calculating validation metrics... Time 8.783432698249817 min


  "type " + obj.__name__ + ". It won't be checked "


Original dataset: acc 0.7418, f1 0.7482940144277636
Final test metrics: {'accuracy': 0.7418, 'f1': 0.7482940144277636}, Time 13.341286142667135 min



In [21]:
# Train one model per noise level; every run appends its records to `results`.
for noise_level in tqdm(NOISE_LEVELS):
    model = run_model_with(noise_level=noise_level, n_filters=64, cnn_kernel_size=5,
                   init_function=init.xavier_normal, log_every=10)

# NOTE(review): the file name says IMDB while run_model_with labels the task
# as 'Mokoron binary classification' — confirm which dataset was loaded.
pd.DataFrame(results).to_csv('results/CharCNN_IMDB.csv')

  0%|          | 0/11 [00:00<?, ?it/s]

Writer: runs/May15_07-39-40_phobos-aijun_charCNN_embed_smaller2_lr4_noise0_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.07604155143102011 min
Loss               : 0.6920870542526245
In-batch accuracy  : 0.5
Validation accuracy: 0.5656, f1: 0.48595771536762383



  'precision', 'predicted', average, warn_for)


Epoch 10. Global step 7315. T=0.985636814435323 min
Loss               : 0.6295992136001587
In-batch accuracy  : 0.5


  'recall', 'true', average, warn_for)


Validation accuracy: 0.6946666666666667, f1: 0.7094646028926669

Epoch 20. Global step 13965. T=1.8904971639315287 min
Loss               : 0.3419972062110901
In-batch accuracy  : 1.0
Validation accuracy: 0.7226666666666667, f1: 0.7345584481878509

Calculating validation metrics... Time 2.7216787219047545 min


  "type " + obj.__name__ + ". It won't be checked "
  9%|▉         | 1/11 [05:57<59:30, 357.09s/it]

Original dataset: acc 0.73712, f1 0.7409947190037045
Final test metrics: {'accuracy': 0.73616, 'f1': 0.7406009123800534}, Time 5.9513659079869585 min

Writer: runs/May15_07-45-37_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.005_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.11152073939641317 min
Loss               : 0.7022352814674377
In-batch accuracy  : 0.0
Validation accuracy: 0.5552, f1: 0.5919765166340508

Epoch 10. Global step 7315. T=1.5042980035146079 min
Loss               : 0.5270780324935913
In-batch accuracy  : 1.0
Validation accuracy: 0.6981333333333334, f1: 0.7027310924369748

Epoch 20. Global step 13965. T=2.896902032693227 min
Loss               : 0.4095366895198822
In-batch accuracy  : 1.0
Validation accuracy: 0.7128, f1: 0.7448471926083866

Calculating validation metrics... Time 4.711442462603251 min


 18%|█▊        | 2/11 [15:46<1:10:58, 473.16s/it]

Original dataset: acc 0.74172, f1 0.7524061505425823
Final test metrics: {'accuracy': 0.74136, 'f1': 0.7520705521472393}, Time 9.820373165607453 min

Writer: runs/May15_07-55-26_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.01_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.1883750081062317 min
Loss               : 0.7170911431312561
In-batch accuracy  : 0.5
Validation accuracy: 0.5434666666666667, f1: 0.486502699460108

Epoch 10. Global step 7315. T=2.37151038646698 min
Loss               : 0.8055444955825806
In-batch accuracy  : 0.0
Validation accuracy: 0.6725333333333333, f1: 0.6381850324101356

Epoch 20. Global step 13965. T=4.513528688748678 min
Loss               : 0.3246620297431946
In-batch accuracy  : 1.0
Validation accuracy: 0.7210666666666666, f1: 0.7276041666666668

Calculating validation metrics... Time 6.500309022267659 min


 27%|██▋       | 3/11 [27:22<1:12:59, 547.40s/it]

Original dataset: acc 0.73864, f1 0.7465280471720072
Final test metrics: {'accuracy': 0.73692, 'f1': 0.7448104605595003}, Time 11.59800528685252 min

Writer: runs/May15_08-07-02_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.025_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.19005041519800822 min
Loss               : 0.6560627222061157
In-batch accuracy  : 1.0
Validation accuracy: 0.5261333333333333, f1: 0.6587286345304397

Epoch 10. Global step 7315. T=2.3852654337882995 min
Loss               : 0.6179137229919434
In-batch accuracy  : 0.5
Validation accuracy: 0.6613333333333333, f1: 0.6547036432843938

Epoch 20. Global step 13965. T=4.554977452754974 min
Loss               : 1.0299016237258911
In-batch accuracy  : 0.5
Validation accuracy: 0.7088, f1: 0.7245206861755803

Calculating validation metrics... Time 6.637977600097656 min


 36%|███▋      | 4/11 [39:32<1:09:12, 593.23s/it]

Original dataset: acc 0.72468, f1 0.7139116338999959
Final test metrics: {'accuracy': 0.72416, 'f1': 0.7132878762680857}, Time 12.178344980875652 min

Writer: runs/May15_08-19-12_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.05_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.1909635861714681 min
Loss               : 0.6384822130203247
In-batch accuracy  : 1.0
Validation accuracy: 0.5250666666666667, f1: 0.4601394361927857

Epoch 10. Global step 7315. T=2.5131717483202616 min
Loss               : 0.7300280332565308
In-batch accuracy  : 0.5
Validation accuracy: 0.6512, f1: 0.6414473684210525

Epoch 20. Global step 13965. T=4.803376424312591 min
Loss               : 0.6986559629440308
In-batch accuracy  : 0.5
Validation accuracy: 0.6890666666666667, f1: 0.7293407613741875

Calculating validation metrics... Time 6.891248905658722 min


 45%|████▌     | 5/11 [52:00<1:02:24, 624.04s/it]

Original dataset: acc 0.71352, f1 0.7213229571984435
Final test metrics: {'accuracy': 0.718, 'f1': 0.7265746199193299}, Time 12.454665855566661 min

Writer: runs/May15_08-31-40_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.075_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.18850224415461223 min
Loss               : 0.7039552927017212
In-batch accuracy  : 0.5
Validation accuracy: 0.4944, f1: 0.027692307692307693

Epoch 10. Global step 7315. T=2.49138286113739 min
Loss               : 0.6377542018890381
In-batch accuracy  : 0.5
Validation accuracy: 0.6290666666666667, f1: 0.5765601217656011

Epoch 20. Global step 13965. T=4.781709869702657 min
Loss               : 0.2550310492515564
In-batch accuracy  : 1.0
Validation accuracy: 0.6768, f1: 0.6992555831265509

Calculating validation metrics... Time 6.863145315647126 min


 55%|█████▍    | 6/11 [1:02:42<52:15, 627.03s/it]

Original dataset: acc 0.70052, f1 0.700052081246745
Final test metrics: {'accuracy': 0.69848, 'f1': 0.6970013666693464}, Time 10.69947506984075 min

Writer: runs/May15_08-42-22_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.1_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.13867592016855876 min
Loss               : 0.6907225847244263
In-batch accuracy  : 0.5
Validation accuracy: 0.4965333333333333, f1: 0.06069651741293532

Epoch 10. Global step 7315. T=1.846001962820689 min
Loss               : 0.7267296314239502
In-batch accuracy  : 0.5
Validation accuracy: 0.6381333333333333, f1: 0.6452287581699346

Epoch 20. Global step 13965. T=3.5540513396263123 min
Loss               : 0.43835192918777466
In-batch accuracy  : 0.5
Validation accuracy: 0.6690666666666667, f1: 0.6641407307171854

Calculating validation metrics... Time 5.107692666848501 min


 64%|██████▎   | 7/11 [1:12:59<41:42, 625.69s/it]

Original dataset: acc 0.68364, f1 0.7032826861752016
Final test metrics: {'accuracy': 0.68432, 'f1': 0.7030850263355906}, Time 10.29462761481603 min

Writer: runs/May15_08-52-39_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.125_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.19807033538818358 min
Loss               : 0.6840479969978333
In-batch accuracy  : 0.5
Validation accuracy: 0.512, f1: 0.49053452115812923

Epoch 10. Global step 7315. T=2.539535431067149 min
Loss               : 0.7711414694786072
In-batch accuracy  : 0.5
Validation accuracy: 0.6210666666666667, f1: 0.6148007590132827

Epoch 20. Global step 13965. T=4.8980390985806785 min
Loss               : 0.3088642954826355
In-batch accuracy  : 1.0
Validation accuracy: 0.6429333333333334, f1: 0.6181921870544625

Calculating validation metrics... Time 7.056734299659729 min


 73%|███████▎  | 8/11 [1:25:33<32:05, 641.72s/it]

Original dataset: acc 0.64968, f1 0.7045209176788125
Final test metrics: {'accuracy': 0.64888, 'f1': 0.7034860154033239}, Time 12.564697790145875 min

Writer: runs/May15_09-05-13_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.15_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.2069850961367289 min
Loss               : 0.6446655988693237
In-batch accuracy  : 1.0
Validation accuracy: 0.4925333333333333, f1: 0.011428571428571427

Epoch 10. Global step 7315. T=2.566296072800954 min
Loss               : 0.6213182806968689
In-batch accuracy  : 0.5
Validation accuracy: 0.572, f1: 0.682618153055171

Epoch 20. Global step 13965. T=4.950214680035909 min
Loss               : 1.0378543138504028
In-batch accuracy  : 0.0
Validation accuracy: 0.6488, f1: 0.6651411136536994

Calculating validation metrics... Time 7.120599893728892 min


 82%|████████▏ | 9/11 [1:38:14<21:49, 654.90s/it]

Original dataset: acc 0.64324, f1 0.5904018369690012
Final test metrics: {'accuracy': 0.64836, 'f1': 0.5950154328096928}, Time 12.672563286622365 min

Writer: runs/May15_09-17-54_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.175_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.20301238298416138 min
Loss               : 0.6737090349197388
In-batch accuracy  : 0.5
Validation accuracy: 0.5053333333333333, f1: 0.24501424501424499

Epoch 10. Global step 7315. T=2.6100611686706543 min
Loss               : 0.9757543802261353
In-batch accuracy  : 0.0
Validation accuracy: 0.5677333333333333, f1: 0.6680319475732132

Epoch 20. Global step 13965. T=4.964823528130849 min
Loss               : 0.6471542716026306
In-batch accuracy  : 0.5
Validation accuracy: 0.6426666666666667, f1: 0.6712463199214915

Calculating validation metrics... Time 7.133025042215983 min


 91%|█████████ | 10/11 [1:50:58<11:05, 665.90s/it]

Original dataset: acc 0.6402, f1 0.5819195909830351
Final test metrics: {'accuracy': 0.636, 'f1': 0.5778829204935523}, Time 12.74777292807897 min

Writer: runs/May15_09-30-39_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.2_dropout0.5_filters64_cnn_kernel5
Epoch 0. Global step 665. T=0.20064572095870972 min
Loss               : 0.7360313534736633
In-batch accuracy  : 0.0
Validation accuracy: 0.4984, f1: 0.11648661343353689

Epoch 10. Global step 7315. T=2.6507937868436175 min
Loss               : 0.6601804494857788
In-batch accuracy  : 0.5
Validation accuracy: 0.5901333333333333, f1: 0.6233766233766234

Epoch 20. Global step 13965. T=5.0814787228902185 min
Loss               : 0.7808637619018555
In-batch accuracy  : 0.5
Validation accuracy: 0.6293333333333333, f1: 0.6249325418240691

Calculating validation metrics... Time 7.280388518174489 min


100%|██████████| 11/11 [2:02:25<00:00, 667.79s/it]

Original dataset: acc 0.63912, f1 0.6476331823152632
Final test metrics: {'accuracy': 0.64376, 'f1': 0.653139118242717}, Time 11.445397675037384 min






AttributeError: 'list' object has no attribute 'to_csv'

In [24]:
# Write the collected result records to CSV.
pd.DataFrame(results).to_csv('results/CharCNN_IMDB.csv')

In [32]:
# Show the run names with their first 40 characters cut off.
pd.DataFrame(results)['run_name'].str[40:]

0      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
1      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
2      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
3      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
4      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
5      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
6      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
7      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
8      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
9      _embed_smaller2_lr4_noise0_dropout0.5_filters6...
10     _embed_smaller2_lr4_noise0_dropout0.5_filters6...
11     _embed_smaller2_lr4_noise0_dropout0.5_filters6...
12     _embed_smaller2_lr4_noise0.005_dropout0.5_filt...
13     _embed_smaller2_lr4_noise0.005_dropout0.5_filt...
14     _embed_smaller2_lr4_noise0.005_dropout0.5_filt...
15     _embed_smaller2_lr4_noise0.005_dropout0.5_filt...
16     _embed_smaller2_lr4_noise0.005_dropout0.5_filt...
17     _embed_smaller2_lr4_nois

In [None]:
pd.

In [None]:
# Train one model per noise level; every run appends its records to `results`.
# run_model_with() requires n_filters and cnn_kernel_size; the values below
# match the other sweep cells of this notebook.
for noise_level in tqdm(NOISE_LEVELS):
    run_model_with(
        noise_level=noise_level, n_filters=64, cnn_kernel_size=5,
        init_function=init.xavier_normal
    )
# Fix: build the DataFrame first, then export it. The original called
# `.to_csv` on the plain `results` list, which raises AttributeError.
pd.DataFrame(results).to_csv('results/CharCNN_mokoron_noemoji.csv')

  0%|          | 0/11 [00:00<?, ?it/s]

Writer: runs/May13_20-49-28_phobos-aijun_charCNN_embed_smaller2_lr4_noise0_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3843465248743693 min
Loss               : 0.4333800673484802
In-batch accuracy  : 0.8125
Validation accuracy: 0.7009443218052489, f1: 0.7026400829217732

Epoch 1. Global step 9514. T=0.7659179250399272 min
Loss               : 0.5420602560043335
In-batch accuracy  : 0.75
Validation accuracy: 0.7258707382879568, f1: 0.7406376004409249

Epoch 2. Global step 14271. T=1.1522807518641154 min
Loss               : 0.3842434287071228
In-batch accuracy  : 0.875
Validation accuracy: 0.7401888643610498, f1: 0.7459679836920678

Epoch 3. Global step 19028. T=1.5361450552940368 min
Loss               : 0.5443617105484009
In-batch accuracy  : 0.6875
Validation accuracy: 0.7494787834191807, f1: 0.753194188540188

Epoch 4. Global step 23785. T=1.9242071827252707 min
Loss               : 0.5320109128952026
In-batch accuracy  : 0.6875
Validation accuracy: 0.7582474858964925, f1: 0

  "type " + obj.__name__ + ". It won't be checked "
  9%|▉         | 1/11 [13:23<2:13:50, 803.00s/it]

Original dataset: acc 0.7708793230316409, f1 0.7708582467114341
Final test metrics: {'f1': 0.7698217578365089, 'accuracy': 0.7703581064508217}, Time 13.3831045627594 min

Writer: runs/May13_21-02-50_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.005_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.33614275058110554 min
Loss               : 0.5651463270187378
In-batch accuracy  : 0.8125
Validation accuracy: 0.7065857247976454, f1: 0.7015716602220281

Epoch 1. Global step 9514. T=0.730189315478007 min
Loss               : 0.44066038727760315
In-batch accuracy  : 0.875
Validation accuracy: 0.7267905322541084, f1: 0.7325389440826006

Epoch 2. Global step 14271. T=1.1234610994656882 min
Loss               : 0.7339755296707153
In-batch accuracy  : 0.4375
Validation accuracy: 0.7391464311994114, f1: 0.7312867159370855

Epoch 3. Global step 19028. T=1.5168570677439372 min
Loss               : 0.44902440905570984
In-batch accuracy  : 0.75
Validation accuracy: 0.7451250919793966, f1: 0.7341796

 18%|█▊        | 2/11 [27:02<2:01:40, 811.18s/it]

Original dataset: acc 0.7758462104488595, f1 0.7813500014953495
Final test metrics: {'f1': 0.7857743097238895, 'accuracy': 0.7811503556536669}, Time 13.655587859948476 min

Writer: runs/May13_21-16-29_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.01_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3356864054997762 min
Loss               : 0.42460814118385315
In-batch accuracy  : 0.9375
Validation accuracy: 0.6925741967132696, f1: 0.7160857377466943

Epoch 1. Global step 9514. T=0.7333456357320149 min
Loss               : 0.4754043519496918
In-batch accuracy  : 0.6875
Validation accuracy: 0.7175925925925926, f1: 0.7123629891015832

Epoch 2. Global step 14271. T=1.1282407283782958 min
Loss               : 0.5356807708740234
In-batch accuracy  : 0.6875
Validation accuracy: 0.7306230071130734, f1: 0.7163793660016785

Epoch 3. Global step 19028. T=1.5222609519958497 min
Loss               : 0.45465928316116333
In-batch accuracy  : 0.8125
Validation accuracy: 0.7419671326956095, f1: 0.732

 27%|██▋       | 3/11 [40:40<1:48:28, 813.61s/it]

Original dataset: acc 0.7776857983811626, f1 0.7763210661072893
Final test metrics: {'f1': 0.7766152317675185, 'accuracy': 0.7794027471179789}, Time 13.64095369974772 min

Writer: runs/May13_21-30-07_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.025_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.33512759606043496 min
Loss               : 0.4829114079475403
In-batch accuracy  : 0.75
Validation accuracy: 0.6939845474613686, f1: 0.6782709602552944

Epoch 1. Global step 9514. T=0.7303391377131144 min
Loss               : 0.7523029446601868
In-batch accuracy  : 0.5625
Validation accuracy: 0.719401520726024, f1: 0.7205496183206106

Epoch 2. Global step 14271. T=1.121863806247711 min
Loss               : 0.5633825063705444
In-batch accuracy  : 0.6875
Validation accuracy: 0.7240618101545254, f1: 0.7209821428571429

Epoch 3. Global step 19028. T=1.515214463075002 min
Loss               : 0.2992437481880188
In-batch accuracy  : 0.9375
Validation accuracy: 0.7322173166544027, f1: 0.743162971

 36%|███▋      | 4/11 [54:19<1:35:03, 814.81s/it]

Original dataset: acc 0.7736080451312239, f1 0.7806689241371116
Final test metrics: {'f1': 0.7788470132578579, 'accuracy': 0.7724123129752268}, Time 13.639712619781495 min

Writer: runs/May13_21-43-46_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.05_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3341816544532776 min
Loss               : 0.729953408241272
In-batch accuracy  : 0.4375
Validation accuracy: 0.6847559480009812, f1: 0.6779023870684794

Epoch 1. Global step 9514. T=0.7405523379643758 min
Loss               : 0.5235426425933838
In-batch accuracy  : 0.8125
Validation accuracy: 0.7006990434142752, f1: 0.7061585696225392

Epoch 2. Global step 14271. T=1.1382336338361105 min
Loss               : 0.5956830978393555
In-batch accuracy  : 0.625
Validation accuracy: 0.7155077262693157, f1: 0.7156385032637677

Epoch 3. Global step 19028. T=1.5346531748771668 min
Loss               : 0.725429892539978
In-batch accuracy  : 0.625
Validation accuracy: 0.7201680156978171, f1: 0.698490304

 45%|████▌     | 5/11 [1:07:58<1:21:34, 815.73s/it]

Original dataset: acc 0.7624172185430463, f1 0.7570465590217903
Final test metrics: {'f1': 0.7541622164980838, 'accuracy': 0.760056414029924}, Time 13.656627666950225 min

Writer: runs/May13_21-57-25_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.075_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3359651565551758 min
Loss               : 0.6112332344055176
In-batch accuracy  : 0.6875
Validation accuracy: 0.6780107922492028, f1: 0.6744575325480471

Epoch 1. Global step 9514. T=0.7363389531771342 min
Loss               : 0.7393947839736938
In-batch accuracy  : 0.625
Validation accuracy: 0.6967745891586952, f1: 0.7033415321854941

Epoch 2. Global step 14271. T=1.135967202981313 min
Loss               : 0.4217904806137085
In-batch accuracy  : 0.9375
Validation accuracy: 0.7073522197694383, f1: 0.7064612356613464

Epoch 3. Global step 19028. T=1.537773108482361 min
Loss               : 0.46832719445228577
In-batch accuracy  : 0.6875
Validation accuracy: 0.7131162619573216, f1: 0.7046681

 55%|█████▍    | 6/11 [1:21:42<1:08:05, 817.10s/it]

Original dataset: acc 0.7520848663232769, f1 0.7467268057382698
Final test metrics: {'f1': 0.747085370440015, 'accuracy': 0.7525754231052244}, Time 13.732470726966858 min

Writer: runs/May13_22-11-09_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.1_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3339157819747925 min
Loss               : 0.6387908458709717
In-batch accuracy  : 0.625
Validation accuracy: 0.6652563159185676, f1: 0.6376128518321826

Epoch 1. Global step 9514. T=0.7288359483083089 min
Loss               : 0.5573965907096863
In-batch accuracy  : 0.75
Validation accuracy: 0.6880978660779985, f1: 0.6895730981660614

Epoch 2. Global step 14271. T=1.1228139638900756 min
Loss               : 0.56484055519104
In-batch accuracy  : 0.625
Validation accuracy: 0.6984608780966397, f1: 0.7121491497643926

Epoch 3. Global step 19028. T=1.5152902722358703 min
Loss               : 0.5416707992553711
In-batch accuracy  : 0.6875
Validation accuracy: 0.7037650233014472, f1: 0.6862987012987

 64%|██████▎   | 7/11 [1:35:23<54:30, 817.65s/it]  

Original dataset: acc 0.7443279372087319, f1 0.7473413119224359
Final test metrics: {'f1': 0.7461721466479594, 'accuracy': 0.7448491537895512}, Time 13.681911969184876 min

Writer: runs/May13_22-24-50_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.125_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3358120242754618 min
Loss               : 0.5713484287261963
In-batch accuracy  : 0.75
Validation accuracy: 0.6546786853078244, f1: 0.6060097247000386

Epoch 1. Global step 9514. T=0.7310911774635315 min
Loss               : 0.4563905894756317
In-batch accuracy  : 0.8125
Validation accuracy: 0.6788999264164827, f1: 0.6470528763522394

Epoch 2. Global step 14271. T=1.1284239172935486 min
Loss               : 0.5607260465621948
In-batch accuracy  : 0.625
Validation accuracy: 0.6932180524895757, f1: 0.681601221918157

Epoch 3. Global step 19028. T=1.5249507109324136 min
Loss               : 0.47985750436782837
In-batch accuracy  : 0.8125
Validation accuracy: 0.6982769193034094, f1: 0.7001066

 73%|███████▎  | 8/11 [1:49:02<40:53, 817.85s/it]

Original dataset: acc 0.7375521216580819, f1 0.7441874364951288
Final test metrics: {'f1': 0.7404779798587104, 'accuracy': 0.7353139563404464}, Time 13.654931231339772 min

Writer: runs/May13_22-38-29_phobos-aijun_charCNN_embed_smaller2_lr4_noise0.15_dropout0.5_no_emoji
Epoch 0. Global step 4757. T=0.3329009493192037 min
Loss               : 0.622446596622467
In-batch accuracy  : 0.5625
Validation accuracy: 0.6607799852832965, f1: 0.6651737077835613

Epoch 1. Global step 9514. T=0.7281931877136231 min
Loss               : 0.6457914113998413
In-batch accuracy  : 0.5
Validation accuracy: 0.6765391219033603, f1: 0.6775869445632907

Epoch 2. Global step 14271. T=1.123043922583262 min
Loss               : 0.5680097341537476
In-batch accuracy  : 0.5625
Validation accuracy: 0.6869941133186166, f1: 0.6813769857370245

Epoch 3. Global step 19028. T=1.518110485871633 min
Loss               : 0.7019524574279785
In-batch accuracy  : 0.5625
Validation accuracy: 0.6930954132940887, f1: 0.68422712933

In [None]:
# Run one training/evaluation per entry of NOISE_LEVELS, with a tqdm progress bar over the sweep.
for noise_level in tqdm(NOISE_LEVELS):
    run_model_with(noise_level=noise_level, init_function=init.xavier_normal)

In [None]:
# Searching for a phase transition: fine-grained sweep over noise levels starting at 0
# in steps of 0.001; np.arange treats the stop value 0.01 as exclusive.

for noise_level in tqdm(np.arange(0, .01, 0.001)):
    run_model_with(noise_level=noise_level, init_function=init.xavier_normal)

In [None]:
# NOISE_LEVELS sweep for the '_spellchecked' experiment: 30 epochs per run, log_every=5.
for noise_level in tqdm(NOISE_LEVELS):
    run_model_with(
        noise_level=noise_level, init_function=init.xavier_normal, log_every=5, epochs=30, comment='_spellchecked'
    )

# Dump the collected metrics next to the notebook.
# `results` is not defined in this cell - presumably filled by run_model_with; TODO confirm.
pd.DataFrame(results).to_csv('CharCNN_spellchecked.csv')

  0%|          | 0/11 [00:00<?, ?it/s]

Writer: runs/May12_19-31-46_madrugado_charCNN_embed_smaller2_lr4_noise0_dropout0.5_spellchecked


In [15]:
# Save the metrics currently held in `results` (defined in an earlier cell) for the with-emoji experiment.
pd.DataFrame(results).to_csv('CharCNN_with_emoji.csv')

# Exp (other)

In [35]:
# One-off run without noise and with dropout=0, using n_filters=256 and cnn_kernel_size=15;
# metrics are logged every 10 epochs. The returned model is kept in `model` for later cells.
model = run_model_with(
    noise_level=0, n_filters=256, cnn_kernel_size=15, init_function=init.xavier_normal, dropout=0, log_every=10
)

Writer: runs/May14_00-37-34_phobos-aijun_charCNN_embed_smaller2_lr4_noise0_dropout0_filters256_cnn_kernel15
Epoch 0. Global step 217. T=0.01633059581120809 min
Loss               : 0.6889715790748596
In-batch accuracy  : 0.5
Validation accuracy: 0.5114678899082569, f1: 0.6753048780487805

Epoch 10. Global step 2387. T=0.21625566482543945 min
Loss               : 0.5218710899353027
In-batch accuracy  : 0.875
Validation accuracy: 0.6731651376146789, f1: 0.7046632124352331

Epoch 20. Global step 4557. T=0.4164242744445801 min
Loss               : 0.3434000015258789
In-batch accuracy  : 0.875
Validation accuracy: 0.6869266055045872, f1: 0.7135362014690451

Calculating validation metrics... Time 0.5979397575060527 min


  "type " + obj.__name__ + ". It won't be checked "


Original dataset: acc 0.700164744645799, f1 0.7077087794432547
Final test metrics: {'accuracy': 0.6974190005491488, 'f1': 0.7039226222461042}, Time 0.6913856983184814 min



In [25]:
%%time
# Timed single-epoch smoke run with a small network (n_filters=16, cnn_kernel_size=5).
# NOTE(review): the run name recorded below reads "dropout16_filters5_cnn_kernel0.5", i.e. the
# values appear under the wrong labels - the name template inside run_model_with looks
# misordered; confirm there.
run_model_with(noise_level=0, n_filters=16, cnn_kernel_size=5,
               init_function=init.xavier_normal, comment='_makaron_test', epochs=1)

Writer: runs/May14_00-29-01_phobos-aijun_charCNN_embed_smaller2_lr4_noise0_dropout16_filters5_cnn_kernel0.5_makaron_test
Epoch 0. Global step 217. T=0.014664558569590251 min
Loss               : 0.7041813135147095
In-batch accuracy  : 0.25
Validation accuracy: 0.5126146788990825, f1: 0.6748278500382554

Calculating validation metrics... Time 0.018363038698832195 min


  "type " + obj.__name__ + ". It won't be checked "


Original dataset: acc 0.500823723228995, f1 0.6654398233345602
Final test metrics: {'accuracy': 0.500274574409665, 'f1': 0.6651949963208241}, Time 0.11014003753662109 min

CPU times: user 5.62 s, sys: 765 ms, total: 6.39 s
Wall time: 6.61 s


CharCNN(
  (embedding): Linear(in_features=74, out_features=74, bias=True)
  (conv): Sequential(
    (0): Conv1d(74, 16, kernel_size=(5,), stride=(2,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=64, padding=0, dilation=1, ceil_mode=False)
  )
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=16, out_features=2, bias=True)
)

In [14]:
%%time
# Timed single-epoch smoke run with a small network (n_filters=16, cnn_kernel_size=5).
# The call is not assigned, so the returned model is shown as the cell output.
run_model_with(noise_level=0, n_filters=16, cnn_kernel_size=5,
               init_function=init.xavier_normal, comment='_makaron_test', epochs=1)

Writer: runs/May14_00-09-42_phobos-aijun_charCNN_embed_smaller2_lr4_noise0_dropout0.5_makaron_test
Epoch 0. Global step 4757. T=0.3195761243502299 min
Loss               : 0.034621015191078186
In-batch accuracy  : 1.0
Validation accuracy: 0.984271523178808, f1: 0.9844068208760144

Calculating validation metrics... Time 0.3735865354537964 min


  "type " + obj.__name__ + ". It won't be checked "


Original dataset: acc 0.9840569045867059, f1 0.9841839527951821
Final test metrics: {'accuracy': 0.9840569045867059, 'f1': 0.9841897233201581}, Time 2.071729278564453 min

CPU times: user 1min 49s, sys: 11.1 s, total: 2min
Wall time: 2min 4s


CharCNN(
  (embedding): Linear(in_features=74, out_features=74, bias=True)
  (conv): Sequential(
    (0): Conv1d(74, 16, kernel_size=(5,), stride=(2,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=64, padding=0, dilation=1, ceil_mode=False)
  )
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=16, out_features=2, bias=True)
)

In [14]:
%%time
# Timed single-epoch run with default architecture settings and no noise; keeps the model in `model`.
model = run_model_with(noise_level=0, init_function=init.xavier_normal, epochs=1)

Writer: runs/May12_18-45-28_madrugado_charCNN_embed_smaller2_lr4_noise0_dropout0.5
Epoch 0. Global step 4757. T=0.34553041458129885 min
Loss               : 1.6093254089355469e-06
In-batch accuracy  : 1.0
Validation accuracy: 0.998589649252, f1: 0.998609851919

Calculating validation metrics... Time 0.3979764183362325 min


  "type " + obj.__name__ + ". It won't be checked "


Final test metrics: {'accuracy': 0.9977005150846211, 'f1': 0.99773311168203116}, Time 1.8905688643455505 min

CPU times: user 1min 37s, sys: 11 s, total: 1min 48s
Wall time: 1min 53s


In [41]:
%%time
# Timed 30-epoch run without noise, tagged 'no_emoji'.
# NOTE(review): unlike other runs ('_spellchecked', '_enabled_dropout') this comment has no leading
# underscore, so the recorded run name ends in "dropout0.5no_emoji".
model = run_model_with(noise_level=0, init_function=init.xavier_normal, log_every=5, epochs=30, comment='no_emoji')

Writer: runs/May12_14-42-03_madrugado_charCNN_embed_smaller2_lr4_noise0_dropout0.5no_emoji
Epoch 0. Global step 4962. T=0.32575590213139854 min
Loss               : 0.5092490911483765
In-batch accuracy  : 0.625
Validation accuracy: 0.716708302719, f1: 0.698149249992

Epoch 1. Global step 9924. T=0.705766232808431 min
Loss               : 0.44099992513656616
In-batch accuracy  : 0.84375
Validation accuracy: 0.744217487142, f1: 0.737125079288

Epoch 2. Global step 14886. T=1.085922352472941 min
Loss               : 0.3741908669471741
In-batch accuracy  : 0.90625
Validation accuracy: 0.75664952241, f1: 0.750962463908

Epoch 3. Global step 19848. T=1.4666020234425863 min
Loss               : 0.41205355525016785
In-batch accuracy  : 0.78125
Validation accuracy: 0.766789125643, f1: 0.765797939848

Epoch 4. Global step 24810. T=1.847240976492564 min
Loss               : 0.3329382538795471
In-batch accuracy  : 0.8125
Validation accuracy: 0.770345334313, f1: 0.761739236492

Epoch 5. Global step

In [59]:
%%time
# Timed 30-epoch run without noise, tagged 'no_emoji_enabled_dropout'; metrics logged every 5 epochs.
# NOTE(review): the comment has no leading underscore, so the recorded run name ends in
# "dropout0.5no_emoji_enabled_dropout".
model = run_model_with(noise_level=0, init_function=init.xavier_normal, log_every=5, epochs=30, comment='no_emoji_enabled_dropout')

Writer: runs/May12_15-47-15_madrugado_charCNN_embed_smaller2_lr4_noise0_dropout0.5no_emoji_enabled_dropout
Epoch 0. Global step 4962. T=0.32687731583913165 min
Loss               : 0.6044824123382568
In-batch accuracy  : 0.65625
Validation accuracy: 0.70407053637, f1: 0.690041557642

Epoch 5. Global step 29772. T=2.251354956626892 min
Loss               : 0.3281029462814331
In-batch accuracy  : 0.90625
Validation accuracy: 0.762762674504, f1: 0.764183464797

Epoch 10. Global step 54582. T=4.174359858036041 min
Loss               : 0.396010160446167
In-batch accuracy  : 0.8125
Validation accuracy: 0.780014695077, f1: 0.785302469667

Epoch 15. Global step 79392. T=6.104336130619049 min
Loss               : 0.4849695861339569
In-batch accuracy  : 0.78125
Validation accuracy: 0.780044085231, f1: 0.785706104684

Epoch 20. Global step 104202. T=8.03821431795756 min
Loss               : 0.357676237821579
In-batch accuracy  : 0.75
Validation accuracy: 0.782806759735, f1: 0.789110210604

Epoch 

  "type " + obj.__name__ + ". It won't be checked "


Final test metrics: {'accuracy': 0.78104335047758999, 'f1': 0.7861898748708529}, Time 13.131581568717957 min

CPU times: user 9min 32s, sys: 2min 41s, total: 12min 13s
Wall time: 13min 7s


In [73]:
%%time
# Timed 30-epoch run without noise, tagged '_enabled_dropout'; metrics logged every 5 epochs.
model = run_model_with(noise_level=0, init_function=init.xavier_normal, log_every=5, epochs=30, comment='_enabled_dropout')

Writer: runs/May12_16-20-08_madrugado_charCNN_embed_smaller2_lr4_noise0_dropout0.5_enabled_dropout
Epoch 0. Global step 4962. T=0.3338169852892558 min
Loss               : 9.800493717193604e-05
In-batch accuracy  : 1.0
Validation accuracy: 0.999000734754, f1: 0.999017624964

Epoch 5. Global step 29772. T=2.276704251766205 min
Loss               : 8.940696716308594e-08
In-batch accuracy  : 1.0
Validation accuracy: 0.999764878766, f1: 0.999768906349

Epoch 10. Global step 54582. T=4.223753829797109 min
Loss               : 8.940696716308594e-08
In-batch accuracy  : 1.0
Validation accuracy: 0.999764878766, f1: 0.999768906349

Epoch 15. Global step 79392. T=6.173992296059926 min
Loss               : 0.0
In-batch accuracy  : 1.0
Validation accuracy: 0.999764878766, f1: 0.9997689197

Epoch 20. Global step 104202. T=8.125758945941925 min
Loss               : 0.0
In-batch accuracy  : 1.0
Validation accuracy: 0.999823659074, f1: 0.999826699786

Epoch 25. Global step 129012. T=10.077120367685954

  "type " + obj.__name__ + ". It won't be checked "


Final test metrics: {'accuracy': 0.99994121969140337, 'f1': 0.99994241621559377}, Time 13.275928223133088 min

CPU times: user 9min 41s, sys: 2min 44s, total: 12min 25s
Wall time: 13min 16s


In [30]:
%%time
# Timed 30-epoch run at noise_level=0.1, tagged '_spellchecked'; metrics logged every 5 epochs.
model = run_model_with(noise_level=0.1, init_function=init.xavier_normal, log_every=5, epochs=30, comment='_spellchecked')

Writer: runs/May12_19-15-08_madrugado_charCNN_embed_smaller2_lr4_noise0.1_dropout0.5_spellchecked
Epoch 0. Global step 4757. T=0.32279518445332844 min
Loss               : 0.23855933547019958
In-batch accuracy  : 0.875
Validation accuracy: 0.960111601668, f1: 0.961088679527

Epoch 5. Global step 28542. T=2.233179748058319 min
Loss               : 0.11236453801393509
In-batch accuracy  : 0.9375
Validation accuracy: 0.965538386068, f1: 0.965873208647

Epoch 10. Global step 52327. T=4.14134658575058 min
Loss               : 0.05149190500378609
In-batch accuracy  : 0.9375
Validation accuracy: 0.966918077017, f1: 0.967321844998

Epoch 15. Global step 76112. T=6.052044069766998 min
Loss               : 0.006237946450710297
In-batch accuracy  : 1.0
Validation accuracy: 0.966887417219, f1: 0.967575357272

Epoch 20. Global step 99897. T=7.961504407723745 min
Loss               : 0.032453540712594986
In-batch accuracy  : 1.0
Validation accuracy: 0.970903850871, f1: 0.971349213537

Epoch 25. Glob

  "type " + obj.__name__ + ". It won't be checked "


Final test metrics: {'accuracy': 0.96894162374294823, 'f1': 0.96946863980228459}, Time 13.229927579561869 min

CPU times: user 9min 35s, sys: 2min 33s, total: 12min 9s
Wall time: 13min 13s


In [34]:
# Save the metrics currently held in `results` (defined in an earlier cell) for the spell-checked, with-emoji experiment.
pd.DataFrame(results).to_csv('CharCNN_with_emoji_spellchecked.csv')

In [13]:
# Fetch training example 2 and decode its one-hot tensor back to a string to eyeball the encoding and label.
text, label = train[2]
train.onehot2text(text), label

('@envoyatthenet и мы под предводительтвом пу боль<UNK>ими скачками несемся в совок,а потом и в деспотию(((',
 0)

In [55]:
# Fetch training example 20000 and decode its one-hot tensor back to a string together with its label.
text, label = train[20000]
train.onehot2text(text), label

('ахахахаха пипец, прикольная фигня :d http://t.co/tsvhwraala', 1)

In [16]:
# Manual probe: encode one sentence, add a leading axis and swap the first two axes
# (unsqueeze(0).permute(1, 0, 2)), move it to the GPU and run the model.
# torch.max(..., 1) returns both the top score and its index (the predicted class).
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('ахахахаха пипец, прикольная фигня').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)

(Variable containing:
  1.2101
 [torch.cuda.FloatTensor of size 1 (GPU 0)], Variable containing:
  0
 [torch.cuda.LongTensor of size 1 (GPU 0)])

In [22]:
# Manual probe: predicted class index ([1] of torch.max) for "The new Avengers are amazing!" with no emoticon.
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('Новые Мстители потрясающие!').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)[1].data


 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [21]:
# Manual probe: "The new Avengers are amazing!" followed by a '=)' emoticon; shows the predicted class index.
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('Новые Мстители потрясающие! =)').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)[1].data


 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [32]:
# Manual probe: "It was awful!" followed by a '=)' emoticon; shows the predicted class index.
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('Это было ужасно! =)').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)[1].data


 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [30]:
# Manual probe: "It was awful!" followed by '=()' (both bracket characters); shows the predicted class index.
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('Это было ужасно! =()').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)[1].data


 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [33]:
# Manual probe: "It was awful!" followed by a '=(' emoticon; shows the predicted class index.
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('Это было ужасно! =(').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)[1].data


 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [34]:
# Manual probe: "It was awful!" with no emoticon; shows the predicted class index.
torch.max(
    model.forward(Variable(test._preprocess_text_nobatch('Это было ужасно!').unsqueeze(0).permute(1, 0, 2).cuda())),
    1
)[1].data


 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [56]:
# Manual probe: show the raw two-class output scores for one sentence (no argmax applied).
model.forward(Variable(test._preprocess_text_nobatch('ахахахаха пипец, прикольная фигня').unsqueeze(0).permute(1, 0, 2).cuda()))

Variable containing:
 1.2101 -1.1104
[torch.cuda.FloatTensor of size 1x2 (GPU 0)]

In [32]:
%%time
# Timed run at noise_level=0.01 with default settings; the returned model is displayed as the cell output.
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.7154096364974976
In-batch accuracy: 0.5
Validation accuracy: 0.6276

Loss after epoch 1:
Global step: 1408
0.3147427439689636
In-batch accuracy: 1.0
Validation accuracy: 0.7424

Loss after epoch 2:
Global step: 2112
0.43052104115486145
In-batch accuracy: 0.75
Validation accuracy: 0.7732

Loss after epoch 3:
Global step: 2816
0.14214535057544708
In-batch accuracy: 1.0
Validation accuracy: 0.8052

Loss after epoch 4:
Global step: 3520
0.16783644258975983
In-batch accuracy: 1.0
Validation accuracy: 0.8112

Loss after epoch 5:
Global step: 4224
0.5878106951713562
In-batch accuracy: 0.5
Validation accuracy: 0.82

Loss after epoch 6:
Global step: 4928
0.1835513859987259
In-batch accuracy: 1.0
Validation accuracy: 0.8224

Loss after epoch 7:
Global step: 5632
0.036214977502822876
In-batch accuracy: 1.0
Validation accuracy: 0.8056

Loss after epoch 8:
Global step: 6336
0.036427706480026245
In-batch accuracy: 1.0
Validation accuracy: 0.8236

Loss after epo

CharCNN(
  (conv1): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Sequential(
    (0): Linear(in_features=30464, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
  )
  (fc3): Linear(in_features=1024, out_features=2, bias=True)
)

In [38]:
%%time
# Timed run at noise_level=0.01 with default settings. The model printed below has a single
# linear layer after the convolution, so CharCNN was presumably redefined before this run.
run_model_with(noise_level=0.01, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.785068154335022
In-batch accuracy: 0.25
Validation accuracy: 0.6084

Loss after epoch 1:
Global step: 1408
0.3312198519706726
In-batch accuracy: 1.0
Validation accuracy: 0.7292

Loss after epoch 2:
Global step: 2112
0.2843222916126251
In-batch accuracy: 1.0
Validation accuracy: 0.7424

Loss after epoch 3:
Global step: 2816
0.1944502741098404
In-batch accuracy: 1.0
Validation accuracy: 0.7604

Loss after epoch 4:
Global step: 3520
0.24288120865821838
In-batch accuracy: 0.75
Validation accuracy: 0.8008

Loss after epoch 5:
Global step: 4224
0.36140114068984985
In-batch accuracy: 0.75
Validation accuracy: 0.8036

Loss after epoch 6:
Global step: 4928
0.2669448256492615
In-batch accuracy: 0.75
Validation accuracy: 0.8048

Loss after epoch 7:
Global step: 5632
0.18363156914710999
In-batch accuracy: 1.0
Validation accuracy: 0.8092

Loss after epoch 8:
Global step: 6336
0.48155519366264343
In-batch accuracy: 0.75
Validation accuracy: 0.8184

Loss after e

CharCNN(
  (conv): Sequential(
    (0): Conv1d(64, 256, kernel_size=(16,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=64, stride=8, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=30464, out_features=2, bias=True)
)

In [12]:
%%time
# Timed run without noise; keeps the trained model in `model` for the probing cells.
model = run_model_with(noise_level=0, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.8411439657211304
In-batch accuracy: 0.25
Validation accuracy: 0.5012

Loss after epoch 1:
Global step: 1408
0.5809738636016846
In-batch accuracy: 0.75
Validation accuracy: 0.66

Loss after epoch 2:
Global step: 2112
0.4452686309814453
In-batch accuracy: 1.0
Validation accuracy: 0.6824

Loss after epoch 3:
Global step: 2816
0.23656554520130157
In-batch accuracy: 1.0
Validation accuracy: 0.7136

Loss after epoch 4:
Global step: 3520
0.5787070393562317
In-batch accuracy: 0.5
Validation accuracy: 0.7248

Loss after epoch 5:
Global step: 4224
0.502457857131958
In-batch accuracy: 0.75
Validation accuracy: 0.726

Loss after epoch 6:
Global step: 4928
0.12519408762454987
In-batch accuracy: 1.0
Validation accuracy: 0.7332

Loss after epoch 7:
Global step: 5632
0.731816291809082
In-batch accuracy: 0.75
Validation accuracy: 0.736

Loss after epoch 8:
Global step: 6336
0.12233468890190125
In-batch accuracy: 1.0
Validation accuracy: 0.7276

Loss after epoch 9:

In [13]:
%%time
# Timed run without noise (repeat of the same call); overwrites `model` with the newly trained network.
model = run_model_with(noise_level=0, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.7073005437850952
In-batch accuracy: 0.5
Validation accuracy: 0.6404

Loss after epoch 1:
Global step: 1408
0.5612970590591431
In-batch accuracy: 0.75
Validation accuracy: 0.7176

Loss after epoch 2:
Global step: 2112
0.9215638637542725
In-batch accuracy: 0.25
Validation accuracy: 0.7228

Loss after epoch 3:
Global step: 2816
0.3698350191116333
In-batch accuracy: 1.0
Validation accuracy: 0.728

Loss after epoch 4:
Global step: 3520
0.584014356136322
In-batch accuracy: 0.5
Validation accuracy: 0.7556

Loss after epoch 5:
Global step: 4224
0.19489029049873352
In-batch accuracy: 1.0
Validation accuracy: 0.758

Loss after epoch 6:
Global step: 4928
0.41615644097328186
In-batch accuracy: 0.75
Validation accuracy: 0.7776

Loss after epoch 7:
Global step: 5632
0.2622656226158142
In-batch accuracy: 1.0
Validation accuracy: 0.7752

Loss after epoch 8:
Global step: 6336
0.16676472127437592
In-batch accuracy: 1.0
Validation accuracy: 0.7824

Loss after epoch 

In [10]:
def predict(model, text):
    """Classify a single raw string with a trained model.

    :param model: trained torch module that maps a (1, channels, length) batch to per-class scores
    :param text: raw input string; encoded by the notebook-level ``preprocess_text_nobatch``
        (expected to return a 2-D tensor, since it is unsqueezed to 3-D and permuted below)
    :return: tensor/Variable of size 1 holding the index of the highest-scoring class
    """
    # Predict in eval mode so dropout layers are inactive and repeated calls agree;
    # the previous mode is restored afterwards so training can continue unaffected.
    was_training = model.training
    model.eval()
    try:
        encoded = preprocess_text_nobatch(text)
        # Add a batch axis, then swap the last two axes before feeding the model.
        batch = encoded.unsqueeze(0).permute(0, 2, 1)
        # Follow the model's device instead of unconditionally requiring a GPU.
        first_param = next(model.parameters(), None)
        if first_param is not None and first_param.is_cuda:
            batch = batch.cuda()
        scores = model(Variable(batch))
        _, prediction = torch.max(scores, 1)
    finally:
        model.train(was_training)
    return prediction

In [61]:
# Probe: very short positive phrase.
predict(model, 'I love it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [62]:
# Probe: very short negative phrase.
predict(model, 'I hate it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [68]:
# Probe: longer positive sentence (typos in the input are intentional test data, left as typed).
predict(model, 'I have seen this film as I was a child and it was awersome! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [74]:
# Probe: the short positive phrase repeated several times to make the input longer.
predict(model, 'Love it! Love it!  Love it! Love it! Love it! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [80]:
# Probe: a long but neutral sentence, to check whether length alone drives the prediction.
predict(model, "Maybe just long enough text if really suficcient so let's write something neutral")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [81]:
# Probe: a longer sentence with explicit positive wording.
predict(model, "We need more emotions! Like when film is cool you are so happy to rank it 10")

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [82]:
# Probe: short neutral remark.
predict(model, "So only long texts")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [85]:
# Probe: tweet-length negative statement.
predict(model, "This is not good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [87]:
# Probe: tweet-length positive statement (counterpart of the "not good" probe).
predict(model, "This is very good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

The results are mediocre: short inputs are classified as 0 regardless of their sentiment.

Let's try training with a shorter maximum text length (140).

In [11]:
%%time
# Retrain without noise. Per the note above this run is meant to use the shorter maximum
# text length (140); the length itself is configured outside this cell.
model = run_model_with(noise_level=0, init_function=init.xavier_normal)

Loss after epoch 0:
Global step: 704
0.70023113489151
In-batch accuracy: 0.5
Validation accuracy: 0.5332

Loss after epoch 1:
Global step: 1408
0.7055613398551941
In-batch accuracy: 0.5
Validation accuracy: 0.6228

Loss after epoch 2:
Global step: 2112
0.46998968720436096
In-batch accuracy: 1.0
Validation accuracy: 0.6316

Loss after epoch 3:
Global step: 2816
0.7275577783584595
In-batch accuracy: 0.5
Validation accuracy: 0.6564

Loss after epoch 4:
Global step: 3520
0.33533045649528503
In-batch accuracy: 1.0
Validation accuracy: 0.6628

Loss after epoch 5:
Global step: 4224
0.6466774940490723
In-batch accuracy: 0.5
Validation accuracy: 0.6712

Loss after epoch 6:
Global step: 4928
0.3631550967693329
In-batch accuracy: 1.0
Validation accuracy: 0.6668

Loss after epoch 7:
Global step: 5632
0.3672349154949188
In-batch accuracy: 0.75
Validation accuracy: 0.6776

Loss after epoch 8:
Global step: 6336
0.18997327983379364
In-batch accuracy: 1.0
Validation accuracy: 0.6776

Loss after epoch 9

In [16]:
# Probe on the retrained model: very short positive phrase.
predict(model, 'I love it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [17]:
# Probe on the retrained model: very short negative phrase.
predict(model, 'I hate it')

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [18]:
# Probe on the retrained model: longer positive sentence (input typos are intentional test data).
predict(model, 'I have seen this film as I was a child and it was awersome! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [19]:
# Probe on the retrained model: the short positive phrase repeated several times.
predict(model, 'Love it! Love it!  Love it! Love it! Love it! Love it! Love it!')

Variable containing:
 1
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [20]:
# Probe on the retrained model: a long but neutral sentence.
predict(model, "Maybe just long enough text if really suficcient so let's write something neutral")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [21]:
# Probe on the retrained model: a longer sentence with explicit positive wording.
predict(model, "We need more emotions! Like when film is cool you are so happy to rank it 10") # changed from 1 to 0

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [22]:
# Probe on the retrained model: short neutral remark.
predict(model, "So only long texts")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [23]:
# Probe on the retrained model: tweet-length negative statement.
predict(model, "This is not good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

In [24]:
# Probe on the retrained model: tweet-length positive statement.
predict(model, "This is very good for tweets")

Variable containing:
 0
[torch.cuda.LongTensor of size 1 (GPU 0)]

# Twitter

In [25]:
import pandas as pd

In [26]:
class OneHotDataset(torch.utils.data.Dataset):
    """Character-level one-hot dataset backed by a pandas dataframe.

    Every item is a pair ``(tensor, label)`` where ``tensor`` is a
    ``(maxlen, len(alphabet))`` FloatTensor with exactly one 1 per row.
    Texts longer than ``maxlen`` are truncated, shorter ones are filled up
    with the padding symbol, and characters missing from the alphabet are
    mapped to the unknown symbol.
    """

    def __init__(self, dataframe, alphabet=None, noise_level=0, maxlen=512,
                 unk_token='UNK', pad_token='PAD'):
        """
        :param dataframe: pandas dataframe with fields "text": str and "label": int
        :param alphabet: sequence of symbols; must contain ``unk_token`` and ``pad_token``
        :param noise_level: probability used by ``_noise_generator``; only 0 is supported for now
        :param maxlen: number of character positions in every encoded text
        :param unk_token: alphabet entry used for characters that are not in the alphabet
        :param pad_token: alphabet entry used to fill positions after the end of the text
        :raises NotImplementedError: if ``alphabet`` is None or ``noise_level`` > 0
        """
        if alphabet is None:
            raise NotImplementedError('no default alphabet is available; pass `alphabet` explicitly')
        self.alphabet = alphabet
        # With duplicated symbols the last occurrence wins, as in a plain dict build.
        self.char2int = {symbol: index for index, symbol in enumerate(self.alphabet)}
        self.unk_token = unk_token
        self.pad_token = pad_token

        self.maxlen = maxlen
        self.dataframe = dataframe
        self.noise_level = noise_level
        if self.noise_level > 0:
            # The noise helper exists but is not applied in __getitem__ yet.
            raise NotImplementedError('noise_level > 0 is not supported by this dataset yet')

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(one_hot_text, label)`` for the row at position ``idx``."""
        line = self.dataframe.iloc[idx]
        text = self._preprocess_text_nobatch(line.text)
        label = line.label
        return text, label

    def _noise_generator(self, string):
        """Return ``string`` with random character drops and random insertions.

        Each character is kept with probability ``1 - noise_level`` and, independently,
        followed by a random alphabet entry with probability ``noise_level``.
        """
        noised = []
        for c in string:
            if random() > self.noise_level:
                noised.append(c)
            if random() < self.noise_level:
                noised.append(choice(self.alphabet))
        return ''.join(noised)

    def _one_hot(self, char):
        """Return a 1-D numpy one-hot vector for ``char`` (unknown symbol if not in the alphabet)."""
        vector = np.zeros(len(self.alphabet))
        vector[self.char2int.get(char, self.char2int[self.unk_token])] = 1.
        return vector

    def _preprocess_text_nobatch(self, text):
        """Encode ``text`` as a ``(maxlen, len(alphabet))`` one-hot FloatTensor.

        An empty text yields a tensor consisting of padding rows only.
        """
        one_hotted_text = np.zeros((self.maxlen, len(self.alphabet)))
        unk_index = self.char2int[self.unk_token]

        # Count filled rows explicitly so the padding step also works for empty input.
        filled = 0
        for char in text:
            if filled >= self.maxlen:
                break
            one_hotted_text[filled, self.char2int.get(char, unk_index)] = 1.
            filled += 1

        if filled < self.maxlen:
            one_hotted_text[filled:, self.char2int[self.pad_token]] = 1.

        return torch.FloatTensor(one_hotted_text)

    def onehot2text(self, one_hotted_text):
        """Decode a one-hot encoded text back to a string, stopping at the first padding row."""
        _, idx = torch.max(one_hotted_text, 1)
        symbols = []
        for i in idx:
            symb = self.alphabet[int(i)]
            if symb == self.pad_token:
                break
            symbols.append(symb)
        return ''.join(symbols)



In [27]:
# Load the Twitter sentiment validation split.
# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
twitter_df = pd.read_csv('/media/data/nlp/data/twitter_sentiment/twitter_sentiment_valid.csv')

In [28]:
# Show one random row to check the column layout.
twitter_df.sample()

Unnamed: 0.1,Unnamed: 0,ItemID,Sentiment,SentimentText
282205,294537,294549,1,@julianna12369 Hon those two tweets together w...


In [29]:
# Rename the columns in place so the frame exposes the "label" and "text" fields OneHotDataset reads.
# NOTE(review): the sample output above lists five header names while four are assigned here -
# verify the column count against the CSV.
twitter_df.columns = ['idxx', 'ItemID', 'label', 'text']

In [32]:
# Wrap the Twitter frame as a one-hot dataset with 140 character positions per text,
# and serve it in shuffled batches of BATCH_SIZE using 4 loader workers.
twitter_ds = OneHotDataset(twitter_df, alphabet=ALPHABET, maxlen=140)
twitter_dl = torch.utils.data.DataLoader(twitter_ds, batch_size=BATCH_SIZE, num_workers=4, shuffle=True)

In [49]:
%%time
# MAXLEN 512: timed accuracy of `model` on the Twitter loader.
# NOTE(review): the loader cell shown above uses maxlen=140, so this number was presumably
# produced with a differently configured dataset/model - confirm the execution order.
get_accuracy(model, twitter_dl)

CPU times: user 33 s, sys: 13.8 s, total: 46.8 s
Wall time: 48.9 s


0.5117747288379131

In [33]:
%%time
# MAXLEN 140: timed accuracy of `model` on the Twitter loader built with maxlen=140.
get_accuracy(model, twitter_dl)

CPU times: user 14.9 s, sys: 4.83 s, total: 19.8 s
Wall time: 25.8 s


0.5006580369517948