In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score

import torchtext
from collections import Counter

In [61]:
text = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label = torchtext.data.Field(sequential=False, use_vocab=False)

train, test = torchtext.datasets.IMDB.splits(text, label)

c = Counter(''.join([' '.join(t.text) for t in train]))

ALPHABET = [char[0] for char in c.most_common(62)]  # all other chars used less ~ 100 times in a test
ALPHABET.append('UNK')
ALPHABET.append('PAD')

ALPHABET_LEN = len(ALPHABET)

char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

MAXLEN = 128

BATCH_SIZE = 32
TEST_SIZE = 100

def one_hot(char):
    zeros = np.zeros(ALPHABET_LEN)
    if char in char2int:
        zeros[char2int[char]] = 1.
    else:
        zeros[char2int['UNK']] = 1.

def preprocess_text(text, maxlen=MAXLEN, batch_size=BATCH_SIZE):
    one_hotted_text = np.zeros((batch_size, maxlen, ALPHABET_LEN))
    assert len(text) == batch_size
    for bi, batch in enumerate(text):
        for i, char in enumerate(batch):
            if i >= MAXLEN:
                break
            one_hotted_text[bi, i, char2int.get(char, char2int['UNK'])] = 1.
        if i < MAXLEN:
            for j in range(i+1, MAXLEN):
                one_hotted_text[bi, j, char2int['PAD']] = 1.

    return torch.FloatTensor(one_hotted_text)

def onehot2text(one_hotted_text):
    texts = []

    for s in one_hotted_text:
        text = ''
        _, idx = torch.max(s, 1)
        for i in idx:
            symb = ALPHABET[i]
            if symb == 'PAD':
                break
            else:
                text += symb
        texts.append(text)
    return texts

all_texts = [t.text for t in train]
all_labels = [int(t.label == 'pos') for t in train]

from sklearn.utils import shuffle
X, y = shuffle(all_texts, all_labels)

batch_idx = 0

def next_batch():
    # BATCH_SIZE(32), ALPHABET_LEN(128), MAXLEN(512)
    global batch_idx
    batch = X[batch_idx:batch_idx+BATCH_SIZE], y[batch_idx:batch_idx+BATCH_SIZE]
    batch_idx += BATCH_SIZE
    return batch

def clip_gradient(optimizer, grad_clip):
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None and param.requires_grad:
                param.grad.data.clamp_(-grad_clip, grad_clip)

In [62]:
class CharRNN(nn.Module):
    
    def __init__(self, hidden_dim=256, dropout=0.5, num_layers=1):
        super(CharRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.rnn = nn.GRU(ALPHABET_LEN, hidden_dim, num_layers=num_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.projector = nn.Linear(hidden_dim, 2)
        
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        if batch_size is None:
            batch_size = BATCH_SIZE
        
        # 1 is num_layers
        if CUDA:
            h0 = Variable(torch.randn([self.num_layers, batch_size, self.hidden_dim]).cuda())
#             c0 = Variable(torch.randn([1, batch_size, self.hidden_dim]).cuda())
        else:
            h0 = Variable(torch.randn([self.num_layers, batch_size, self.hidden_dim]))
#             c0 = Variable(torch.randn([1, batch_size, self.hidden_dim]))

        return h0
    
    def forward(self, inp):
        self.hidden = self.init_hidden(inp.size()[1])
        rnn_out, rnn_hidden = self.rnn(inp, self.hidden)
        rnn_out_last = self.dropout(rnn_out[-1])
        out = self.projector(rnn_out_last)
        return out

In [63]:
model = CharRNN(512, dropout=0.5, num_layers=1)
if CUDA:
    model.cuda()
model.train()

CharRNN(
  (rnn): GRU(64, 512, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (projector): Linear(in_features=512, out_features=2, bias=True)
)

In [64]:
writer = SummaryWriter(comment='_char_GRU512_optimizer5_maxlen128_batch32_dropout05_layers1')

In [65]:
optimizer = optim.Adam(params=model.parameters(), lr=10**-5)
optimizer.zero_grad()

In [66]:
global_step = 0

In [14]:
len(X)

25000

In [3]:
test_texts = [t.text for t in test]
test_labels = [int(t.label == 'pos') for t in test]

test_texts, test_labels = shuffle(test_texts, test_labels)

In [16]:
print(len(test_labels))
sum(test_labels)

25000


12500

In [None]:
def evaluate(step, last_prediction, last_labels):

    _, idx = torch.max(last_prediction, 1)
    acc = accuracy_score(last_labels.data.tolist(), idx.data.tolist())
    writer.add_scalar('accuracy_train', acc, global_step=global_step)
    print('In-batch accuracy:', acc)

    model.eval()

    predictions = []

    test_texts, test_labels = shuffle(test_texts, test_labels)
    
    for t in test_texts[:TEST_SIZE]:

        ptex = preprocess_text([t], batch_size=1)
        ptex = Variable(ptex.cuda())
        ptex = ptex.permute(1, 0, 2)
        pred = model(ptex)
        _, idx = torch.max(pred, 1)

        predictions.append(idx.data[0])
    
    lables = test_labels[:TEST_SIZE]
    
    acc = accuracy_score(lables, predictions)
    print('Test accuracy:', acc)
    writer.add_scalar('accuracy_test', acc, global_step=global_step)

    model.train()



In [67]:
N_EPOCHS = 10

loss_f = F.cross_entropy

print('Batch size: ', BATCH_SIZE)
print('Optimizer: ', optimizer.defaults)

for epoch in tqdm(range(N_EPOCHS)):
    global batch_idx
    batch_idx = 0
    X, y = shuffle(X, y)
    while batch_idx < len(X) - BATCH_SIZE:
        text, label = next_batch()

        label = Variable(torch.LongTensor(label).cuda()) if CUDA else Variable(torch.LongTensor(label))

        global_step += 1

        one_hotted_text = preprocess_text(text)
        one_hotted_text = Variable(one_hotted_text.cuda()) if CUDA else Variable(one_hotted_text)
        one_hotted_text = one_hotted_text.permute(1, 0, 2)
        prediction = model(one_hotted_text)

        loss = loss_f(prediction, label)

        writer.add_scalar('loss', loss.data[0], global_step=global_step)

        loss.backward()        
        torch.nn.utils.clip_grad_norm(model.parameters(), 1e-1)
        optimizer.step()

    # evaluation
    print('Loss after epoch %s:' % epoch)
    print(loss.data[0])
        
    _, idx = torch.max(prediction, 1)
    acc = accuracy_score(label.data.tolist(), idx.data.tolist())
    writer.add_scalar('accuracy_train', acc, global_step=global_step)
    print('In-batch accuracy:', acc)
    
    model.eval()

    predictions = []

    test_texts, test_labels = shuffle(test_texts, test_labels)
    
    for t in test_texts[:TEST_SIZE]:

        ptex = preprocess_text([t], batch_size=1)
        ptex = Variable(ptex.cuda())
        ptex = ptex.permute(1, 0, 2)
        pred = model(ptex)
        _, idx = torch.max(pred, 1)

        predictions.append(idx.data[0])
    
    lables = test_labels[:TEST_SIZE]
    
    acc = accuracy_score(lables, predictions)
    print('Test accuracy:', acc)
    writer.add_scalar('accuracy_test', acc, global_step=global_step)

    model.train()



  0%|          | 0/10 [00:00<?, ?it/s][A

Batch size:  32
Optimizer:  {'lr': 1e-05, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0}



Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Loss after epoch 0:
0.6952921152114868
In-batch accuracy: 0.4375


 10%|█         | 1/10 [00:12<01:55, 12.82s/it]

Test accuracy: 0.49
Loss after epoch 1:
0.6852248311042786
In-batch accuracy: 0.65625


 20%|██        | 2/10 [00:25<01:43, 12.91s/it]

Test accuracy: 0.57
Loss after epoch 2:
0.6973998546600342
In-batch accuracy: 0.40625


 30%|███       | 3/10 [00:38<01:30, 12.90s/it]

Test accuracy: 0.48
Loss after epoch 3:
0.6884451508522034
In-batch accuracy: 0.53125


 40%|████      | 4/10 [00:51<01:17, 12.89s/it]

Test accuracy: 0.52
Loss after epoch 4:
0.7078744173049927
In-batch accuracy: 0.4375


 50%|█████     | 5/10 [01:04<01:04, 12.90s/it]

Test accuracy: 0.5
Loss after epoch 5:
0.6940731406211853
In-batch accuracy: 0.46875


 60%|██████    | 6/10 [01:17<00:51, 12.87s/it]

Test accuracy: 0.56
Loss after epoch 6:
0.6920180320739746
In-batch accuracy: 0.59375


 70%|███████   | 7/10 [01:29<00:38, 12.85s/it]

Test accuracy: 0.59
Loss after epoch 7:
0.7015429139137268
In-batch accuracy: 0.40625


 80%|████████  | 8/10 [01:42<00:25, 12.86s/it]

Test accuracy: 0.51
Loss after epoch 8:
0.6840828061103821
In-batch accuracy: 0.5


 90%|█████████ | 9/10 [01:55<00:12, 12.87s/it]

Test accuracy: 0.54
Loss after epoch 9:
0.6917307376861572
In-batch accuracy: 0.4375


100%|██████████| 10/10 [02:08<00:00, 12.88s/it]

Test accuracy: 0.41





In [16]:
BATCH_SIZE

32

In [140]:
test_texts[1], test_labels[1]

('I am a Sociologist/Anthropologist specializing in the field of Symbolic Interactionism, and I must say that this film exhibits high quality in the symbolic context throughout the entire film. To anyone who has not yet seen this, I recommend that you also read "Man\'s Search For Ultimate Meaning" by Victor E. Frankl. I think you will be able to draw some amazing correlations.<br /><br />That being said, I would like to say that despite the fact that the main characters are gay, this is not a story about being gay. This is a story about seeking out and finding meaning in life, despite the difficulties and challenges, the pain and terror that stand in your way. This is a story of seeking and finding balance and wholeness and happiness.',
 1)

In [157]:
model.eval()

predictions = []

for t in tqdm(test_texts[:1000], total=1000):

    ptex = preprocess_text([t], batch_size=1)
    ptex = Variable(ptex.cuda())
    ptex = ptex.permute(1, 0, 2)
    pred = model(ptex)
    _, idx = torch.max(pred, 1)

    predictions.append(idx.data[0])

model.train()

100%|██████████| 1000/1000 [00:10<00:00, 93.30it/s]


CharRNN(
  (rnn): LSTM(128, 256)
  (projector): Linear(in_features=256, out_features=2, bias=True)
)

In [158]:
lables = test_labels[:1000]

In [161]:
from sklearn.metrics import accuracy_score

In [162]:
accuracy_score(lables, predictions)

0.511

In [131]:
ptex = preprocess_text([test_texts[0]])

In [130]:
ptex.size()

torch.Size([512, 32, 128])

In [124]:
len(text)

32

In [7]:
t = test_texts[0]

In [8]:
t

'out of any category, this is one demented and over the edge film, even in todays standards. filmed entirely in crap-o-rama, this film will blow your mind (and something else too!)<br /><br />the amount of hilarious bad taste and sleaze is astonishing. the dialog is breathtakingly fast and campy. you\'ll either love or hate this film, but give it go. i\'ve seen it 4 times and absolutely love it. divine is in the quest for being the filthiest person alive, but so are her rivals too in this obscene and disgusting (but funny) and stylish little film. <br /><br />divine was phenomenal, and "she" will always be missed greatly. edith massey does the unforgettable performance as the "egglady" and don\'t forget the energetic mink stole!<br /><br />über crazy s**t! <br /><br />recommended also for you sick little puppies;<br /><br />female trouble <br /><br />desperate living <br /><br />polyester'

In [9]:
one_hotted_text = preprocess_text(text)
one_hotted_text = Variable(one_hotted_text.cuda())
one_hotted_text = one_hotted_text.permute(1, 0, 2)

pred = model(one_hotted_text)

AssertionError: 

In [126]:
pred.size()

torch.Size([32, 2])

In [10]:
one_hotted_text = preprocess_text([t], batch_size=1)

In [32]:
X, y = shuffle(X, y)

In [38]:
for i, text in enumerate(onehot2text(preprocess_text(X[:10], batch_size=10))):
    print(text)
    print('\n')
    print(y[i])
    print('\n\n')

"the true story of the friendship that shook south africa and awakened the world." <br /><br />richard attenborough, who directed "a bridge too far" and "gandhi", wanted to bring the story of steve biko to life, and the journey and trouble that journalist donald woods went through in order to get his story told. the films uses wood's two books for it's information and basis - "biko" and "asking for trouble".<br /><br />the film takes place in the late 1970's, in south africa. south africa is in the grip of the terrible apartheid, which keeps the blacks separated from the whites and classifies the whites as the superior race. the blacks are forced to live in shantytowns on the outskirts of the cities and towns, and they come under frequent harassment by the police and the army. we are shown a dawn raid on a shantytown, as bulldozers and armed police force their way through the camp beating and even killing the inhabitants. then we are introduced to donald woods (kevin kline), who is the

Ошибок в кодировке по-видимому нет. Почему сеть не учится всё ещё неясно