In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

import torchtext
from collections import Counter

In [2]:
text = torchtext.data.Field(
    lower=False, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label = torchtext.data.Field(sequential=False, use_vocab=False)

train, test = torchtext.datasets.IMDB.splits(text, label)

In [23]:
c = Counter(''.join([' '.join(t.text) for t in train]))

ALPHABET = [char[0] for char in c.most_common(126)]  # all other chars used less ~ 100 times in a test
ALPHABET.append('UNK')
ALPHABET.append('PAD')

ALPHABET_LEN = len(ALPHABET)

char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

MAXLEN = 512

BATCH_SIZE = 32

In [24]:
def one_hot(char):
    zeros = np.zeros(ALPHABET_LEN)
    if char in char2int:
        zeros[char2int[char]] = 1.
    else:
        zeros[char2int['UNK']] = 1.

def preprocess_text(text, maxlen=MAXLEN):
    one_hotted_text = np.zeros((BATCH_SIZE, ALPHABET_LEN, maxlen))
    for bi, batch in enumerate(text):
        for i, char in enumerate(batch):
            if i >= MAXLEN:
                break
            one_hotted_text[bi, char2int.get(char, char2int['UNK']), i] = 1.
        if i < MAXLEN:
            for j in range(i+1, MAXLEN):
                one_hotted_text[bi, char2int['PAD'], j] = 1.

    return torch.FloatTensor(one_hotted_text)

In [25]:
all_texts = [t.text for t in train]
all_labels = [int(t.label == 'pos') for t in train]

from sklearn.utils import shuffle
X, y = shuffle(all_texts, all_labels)

In [26]:
batch_idx = 0

def next_batch():
    global batch_idx
    return X[batch_idx:batch_idx+BATCH_SIZE], y[batch_idx:batch_idx+BATCH_SIZE]

In [27]:
class Perceptron(nn.Module):
    def __init__(self, hidden_size=128):
        super(Perceptron, self).__init__()
        
        self.hidden = nn.Linear(MAXLEN, hidden_size)
        self.res = nn.Linear(hidden_size*128, 1)
    
    def forward(self, inp):
        h = F.sigmoid(self.hidden(inp)).view(BATCH_SIZE, -1)
        return F.sigmoid(self.res(h))

In [28]:
model = Perceptron(1024)

model.cuda()

model.train()

Perceptron(
  (hidden): Linear(in_features=512, out_features=1024)
  (res): Linear(in_features=131072, out_features=1)
)

In [30]:
writer = SummaryWriter(comment='_perceptron_1024')

In [31]:
optimizer = optim.Adam(params=model.parameters(), lr=10**-3)

optimizer.zero_grad()

In [32]:
global_step = 0

In [33]:
N_EPOCHS = 10

for epoch in tqdm(range(N_EPOCHS)):
    global batch_idx
    batch_idx = 0
    X, y = shuffle(X, y)
    while batch_idx < len(X) - BATCH_SIZE:
        text, label = next_batch()
        
        label = Variable(torch.FloatTensor(label).cuda()) # if CUDA else Variable(torch.LongTensor(label))

        global_step += 1

        one_hotted_text = preprocess_text(text)
        one_hotted_text = Variable(one_hotted_text).cuda() # if CUDA else Variable(ohe_hotted_text)
        prediction = model(one_hotted_text)
        
        loss = F.binary_cross_entropy(prediction, label)
                
        writer.add_scalar('loss', loss.data[0], global_step=global_step)
        
        loss.backward()        
        optimizer.step()


  "Please ensure they have the same size.".format(target.size(), input.size()))


KeyboardInterrupt: 

По крайней мере теперь видно, что не плохо сделанный батчинг убивает pytorch, а что-то не так было с CNN-моделью 