In [28]:
train.download('/media/data/nlp/data/')

downloading aclImdb_v1.tar.gz


'/media/data/nlp/data/imdb/aclImdb'

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

import torchtext
from collections import Counter

# Dataset

In [2]:
text = torchtext.data.Field(
    lower=False, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label = torchtext.data.Field(sequential=False, use_vocab=False)

train, test = torchtext.datasets.IMDB.splits(text, label)

In [3]:
train[0].label

'pos'

In [4]:
train[444].text

'updated January 1st, 2006<br /><br />Parsifal is one of my two favorite Wagner operas or music dramas, to be more accurate, (Meistersinger is the other.) though it\'s hard to imagine it as the "top of anyone\'s pops". The libretto, by the composer as usual, is a muddle of religion, paganism, eroticism, and possibly even homo-eroticism, and its length may make it seem to the audience like hearing paint dry.<br /><br />Wagner, being a famous anti-Semite, (Klingsor may be one of his surrogate Jewish villains.) naturally entrusted the premiere to an unconverted (not for want of RW\'s trying!) Hermann Levi, who was his favorite conductor! (Go figure!) Kundry, a most mixed-up-gal and another likely Jewish surrogate, is both villainous or benevolent, depending on the scene.<br /><br />Considering that many video versions of Parsifal seem on the stodgy side, this film of the opera is, in comparison, a breath of fresh air. Hans-Jürgen Syberberg, the director, has brought considerable imaginati

In [5]:
sum(len(t.label) > 1 for t in train)

25000

In [6]:
print('Train: ', Counter([t.label[0] for t in train]))
print('Test:  ', Counter([t.label[0] for t in test]))

Train:  Counter({'n': 12500, 'p': 12500})
Test:   Counter({'n': 12500, 'p': 12500})


In [7]:
train_labels_counter

NameError: name 'train_labels_counter' is not defined

In [8]:
all_lens = [len(' '.join(t.text)) for t in train]

In [9]:
plt.hist(all_lens)

NameError: name 'plt' is not defined

# Preprocessing

* one hot all chars, UNK 
* pad sequences
* pytorchify it: FloatTensor
* make torch dataloader

In [4]:
c = Counter(''.join([' '.join(t.text) for t in train]))

In [5]:
len(c)

178

In [6]:
ALPHABET = [char[0] for char in c.most_common(126)]  # all other chars used less ~ 100 times in a test
ALPHABET.append('UNK')
ALPHABET.append('PAD')

ALPHABET_LEN = len(ALPHABET)

char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

MAXLEN = 2048

BATCH_SIZE = 32

In [7]:
def one_hot(char):
    zeros = np.zeros(ALPHABET_LEN)
    if char in char2int:
        zeros[char2int[char]] = 1.
    else:
        zeros[char2int['UNK']] = 1.

def preprocess_text(text, maxlen=MAXLEN):
    one_hotted_text = np.zeros((BATCH_SIZE, ALPHABET_LEN, maxlen))
    for bi, batch in enumerate(text):
        for i, char in enumerate(batch):
            if i >= MAXLEN:
                break
            one_hotted_text[bi, char2int.get(char, char2int['UNK']), i] = 1.
        if i < MAXLEN:
            for j in range(i+1, MAXLEN):
                one_hotted_text[bi, char2int['PAD'], j] = 1.

    return torch.FloatTensor(one_hotted_text)

In [8]:
all_texts = [t.text for t in train]
all_labels = [int(t.label == 'pos') for t in train]

In [9]:
from sklearn.utils import shuffle
X, y = shuffle(all_texts, all_labels)

In [10]:
del all_texts, all_labels

In [11]:
batch_idx = 0

def next_batch():
    global batch_idx
    return X[batch_idx:batch_idx+BATCH_SIZE], y[batch_idx:batch_idx+BATCH_SIZE]

In [141]:
next_batch()

(['It should be against the law not to experience this extremely funny stand up show with Eddie Murphy. I have never seen anything like it.<br /><br />Murphy goes on for almost 70 minutes about dicks, pussy, tits and insaults so many famous people including his own "family". Among the people who gets it by murphy are: Elvis, Mr.T, Michael Jackson, Stevie Wonder, Mick Jagger, Luther Vandross and James Brown. I have seriously never laughed so hard of anything my entire life. I mean, when a person doesn\'t know who Mr. T is, but still laughs so hard of Murphy as Mr. T, there\'s something about it. At the time I saw the show I couldn\'t remember who Mr T. was but still laughed. Now I know who he is and that just makes it so much more funny. Because that\'s what Eddie do - he can make those impressions so good that it don\'t matter who the hell he\'s trying to do, it\'s still hilarious. And on top of that, we learn that Murphy actually is a very good singer. Please watch it..',
  "ATTENTION

# Models

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable

from tensorboardX import SummaryWriter
from tqdm import tqdm_notebook as tqdm

CUDA = torch.cuda.is_available()

In [13]:
CUDA

True

In [14]:
BATCH_SIZE = 32

## CharCNN

In [15]:
class CharCNN(nn.Module):
    def __init__(self, kernel_size=5, hidden_size=256, label_size=1):
        super(CharCNN, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv1d(
                in_channels=ALPHABET_LEN,
                out_channels=hidden_size,
                kernel_size=kernel_size
            ),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=hidden_size)
        )
        self.projector = nn.Linear(256*7, label_size)
#         self.probs = nn.Softmax()

    def forward(self, inp):
        #       BATCH x ALPHABET_SIZE x MAXLEN
        convolved = self.conv(inp)
        #       BATCH x hidden_size x 1
        rshpd = convolved.view(BATCH_SIZE, -1)
        projected = self.projector(rshpd)
#         return self.probs(projected)
        return projected

In [16]:
writer = SummaryWriter(comment='_charCNN')

In [17]:
global_step = 0

In [18]:
model = CharCNN(kernel_size=5, hidden_size=256, label_size=1)

if CUDA:
    model.cuda()

model.train()

CharCNN(
  (conv): Sequential(
    (0): Conv1d (128, 256, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): MaxPool1d(kernel_size=256, stride=256, padding=0, dilation=1, ceil_mode=False)
  )
  (projector): Linear(in_features=1792, out_features=1)
)

In [19]:
optimizer = optim.Adam(params=model.parameters(), lr=10**-3)

optimizer.zero_grad()

In [20]:
text, label = next_batch()
label = Variable(torch.FloatTensor(label).view(-1, 1).cuda())

In [22]:
F.binary_cross_entropy(label, label)

Variable containing:
 0
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [29]:
F.binary_cross_entropy_with_logits(label, label)

Variable containing:
 0.5388
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [23]:
one_hotted_text = preprocess_text(text)

In [24]:
one_hotted_text = Variable(one_hotted_text).cuda()

In [26]:
prediction = model(one_hotted_text)

In [29]:
F.binary_cross_entropy(prediction, label)

RuntimeError: cudaEventSynchronize in future::wait: device-side assert triggered

In [35]:
loss = F.binary_cross_entropy_with_logits(prediction, label)

ValueError: Target size (torch.Size([32])) must be the same as input size (torch.Size([32, 1]))

In [23]:
N_EPOCHS = 10

for epoch in tqdm(range(N_EPOCHS)):
    global batch_idx
    batch_idx = 0
    X, y = shuffle(X, y)
    while batch_idx < len(X) - BATCH_SIZE:
        text, label = next_batch()
        
        label = Variable(torch.LongTensor(label).cuda()) # if CUDA else Variable(torch.LongTensor(label))

        global_step += 1

        one_hotted_text = preprocess_text(text)
        one_hotted_text = Variable(one_hotted_text).cuda() # if CUDA else Variable(ohe_hotted_text)
        prediction = model(one_hotted_text)

        print(prediction)
        
        loss = F.binary_cross_entropy_with_logits(prediction, label)
        
        print(loss)
        
        writer.add_scalar('loss', loss.data[0])
        
        loss.backward()        
        optimizer.step()





RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/torch/lib/THC/generic/THCTensorCopy.c:20

# Ээээээксперименты

In [14]:
isinstance(train, torch.utils.data.Dataset)

True

In [112]:
train[4].label

'pos'

In [40]:
for i in train.iters(batch_size=BATCH_SIZE, shuffle=True):
    print(i.data)
    print(i.)
    break

<bound method Iterator.data of <torchtext.data.iterator.BucketIterator object at 0x7fe78f0e80b8>>


In [42]:
iterator = train.iters(batch_size=BATCH_SIZE)

TypeError: __init__() got an unexpected keyword argument 'shuffle'

In [103]:
X, y = train.iters(batch_size=BATCH_SIZE)

In [55]:
X.create_batches()

In [113]:
for x in X:
    print(x.text)
    print(x.label)
    break

Variable containing:
 3.0390e+03  1.4994e+04  1.3300e+02  ...   1.3240e+04  9.0000e+00  1.3300e+02
 1.0722e+04  8.8797e+04  3.3900e+03  ...   5.0000e+00  2.8400e+02  4.7000e+01
 7.0000e+00  1.3000e+01  5.0000e+01  ...   3.6000e+01  1.1000e+01  1.0320e+03
                ...                   ⋱                   ...                
 1.0000e+00  1.0000e+00  1.0000e+00  ...   4.0000e+00  2.1060e+03  4.1700e+03
 1.0000e+00  1.0000e+00  1.0000e+00  ...   4.7000e+01  5.5000e+01  4.0000e+00
 1.0000e+00  1.0000e+00  1.0000e+00  ...   4.1644e+04  2.7662e+05  5.6432e+04
[torch.cuda.LongTensor of size 241x32 (GPU 0)]

Variable containing:
 1
 2
 1
 1
 1
 1
 1
 1
 1
 2
 2
 2
 1
 1
 2
 1
 1
 2
 1
 1
 1
 1
 1
 2
 2
 1
 2
 1
 1
 2
 1
 2
[torch.cuda.LongTensor of size 32 (GPU 0)]



In [66]:
for x in y:
    print(x.text)
    break

Variable containing:

Columns 0 to 5 
 6.7900e+02  1.1600e+03  1.9000e+01  2.3400e+02  4.9000e+01  1.3300e+02
 4.1000e+02  4.6500e+02  3.2369e+04  3.0000e+00  7.0000e+00  9.5000e+01
 1.1700e+02  9.5000e+01  1.9600e+03  2.3000e+02  2.0000e+00  2.4000e+01
 8.3000e+01  2.0910e+03  1.6000e+01  3.8500e+02  7.9700e+02  8.0000e+00
 4.0000e+00  1.3300e+02  3.0950e+03  5.0000e+00  2.1000e+01  9.2000e+01
 3.4300e+02  8.1290e+03  1.3600e+03  1.1263e+04  1.5290e+03  2.5950e+03
 2.0000e+01  1.2900e+02  8.5000e+01  0.0000e+00  1.5200e+02  2.0000e+00
 1.1400e+02  7.7000e+01  3.2000e+01  6.1086e+04  2.8000e+01  2.2090e+03
 1.2300e+02  2.4300e+02  2.8190e+03  9.0000e+00  2.5000e+01  6.1200e+02
 2.6000e+01  3.3700e+02  9.5100e+02  4.3900e+02  4.8800e+02  8.1000e+01
 1.7200e+02  6.0000e+00  1.9000e+01  1.1000e+01  1.2000e+01  2.9200e+02
 1.5000e+01  1.1800e+02  1.8540e+03  1.7154e+04  1.4200e+02  2.0000e+00
 9.9000e+01  4.0000e+00  2.2000e+01  5.6000e+01  1.6000e+01  1.1220e+03
 1.5000e+01  2.1980e+03  7

In [115]:
all_texts = [t.text[:MAXLEN] for t in train]

In [37]:
dataloader = torch.utils.data.DataLoader(train.iters(), batch_size=BATCH_SIZE, shuffle=True)