In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable

from tensorboardX import SummaryWriter
from tqdm import tqdm as tqdm

CUDA = torch.cuda.is_available()

import numpy as np

from sklearn.metrics import accuracy_score

import torchtext
from collections import Counter

In [6]:
text = torchtext.data.Field(
    lower=True, include_lengths=False, fix_length=2048, tensor_type=torch.FloatTensor, batch_first=True,
    tokenize=lambda x: x, use_vocab=False, sequential=False
)
label = torchtext.data.Field(sequential=False, use_vocab=False)

train, test = torchtext.datasets.IMDB.splits(text, label)

c = Counter(''.join([' '.join(t.text) for t in train]))

ALPHABET = [char[0] for char in c.most_common(62)]  # all other chars used less ~ 100 times in a test
ALPHABET.append('UNK')
ALPHABET.append('PAD')

ALPHABET_LEN = len(ALPHABET)

char2int = {s: i for s, i in zip(ALPHABET, range(ALPHABET_LEN))}

MAXLEN = 1024

BATCH_SIZE = 256
TEST_SIZE = 100

In [7]:
def one_hot(char):
    zeros = np.zeros(ALPHABET_LEN)
    if char in char2int:
        zeros[char2int[char]] = 1.
    else:
        zeros[char2int['UNK']] = 1.

def preprocess_text(text, maxlen=MAXLEN, batch_size=BATCH_SIZE):
    one_hotted_text = np.zeros((batch_size, maxlen, ALPHABET_LEN))
    assert len(text) == batch_size
    for bi, batch in enumerate(text):
        for i, char in enumerate(batch):
            if i >= MAXLEN:
                break
            one_hotted_text[bi, i, char2int.get(char, char2int['UNK'])] = 1.
        if i < MAXLEN:
            for j in range(i+1, MAXLEN):
                one_hotted_text[bi, j, char2int['PAD']] = 1.

    return torch.FloatTensor(one_hotted_text)

all_texts = [t.text for t in train]
all_labels = [int(t.label == 'pos') for t in train]

from sklearn.utils import shuffle
X, y = shuffle(all_texts, all_labels)

batch_idx = 0

def next_batch():
    # BATCH_SIZE(32), ALPHABET_LEN(128), MAXLEN(512)
    global batch_idx
    batch = X[batch_idx:batch_idx+BATCH_SIZE], y[batch_idx:batch_idx+BATCH_SIZE]
    batch_idx += BATCH_SIZE
    return batch

def clip_gradient(optimizer, grad_clip):
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None and param.requires_grad:
                param.grad.data.clamp_(-grad_clip, grad_clip)

In [8]:
class CharRNN(nn.Module):
    
    def __init__(self, hidden_dim=256):
        super(CharRNN, self).__init__()
        self.hidden_dim = hidden_dim

        self.rnn = nn.LSTM(ALPHABET_LEN, hidden_dim, num_layers=1)
        self.projector = nn.Linear(hidden_dim, 2)
        
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        if batch_size is None:
            batch_size = BATCH_SIZE
        
        # 1 is num_layers
        if CUDA:
            h0 = Variable(torch.randn([1, batch_size, self.hidden_dim]).cuda())
            c0 = Variable(torch.randn([1, batch_size, self.hidden_dim]).cuda())
        else:
            h0 = Variable(torch.randn([1, batch_size, self.hidden_dim]))
            c0 = Variable(torch.randn([1, batch_size, self.hidden_dim]))

        return h0, c0
    
    def forward(self, inp):
        self.hidden = self.init_hidden(inp.size()[1])
        rnn_out, rnn_hidden = self.rnn(inp, self.hidden)
        out = self.projector(rnn_out[-1])
        return out

In [10]:
model = CharRNN(256)
if CUDA:
    model.cuda()
model.train()

CharRNN(
  (rnn): LSTM(128, 256)
  (projector): Linear(in_features=256, out_features=2, bias=True)
)

In [11]:
writer = SummaryWriter(comment='_char_lstm_256_optimizer_5_maxlen_1024_batch_256')

In [12]:
optimizer = optim.Adam(params=model.parameters(), lr=10**-5)
optimizer.zero_grad()

In [13]:
global_step = 0

In [14]:
len(X)

25000

In [15]:
test_texts = [t.text for t in test]
test_labels = [int(t.label == 'pos') for t in test]

test_texts, test_labels = shuffle(test_texts, test_labels)

In [16]:
print(len(test_labels))
sum(test_labels)

25000


12500

In [None]:
def evaluate(step, last_prediction, last_labels):

    _, idx = torch.max(last_prediction, 1)
    acc = accuracy_score(last_labels.data.tolist(), idx.data.tolist())
    writer.add_scalar('accuracy_train', acc, global_step=global_step)
    print('In-batch accuracy:', acc)

    model.eval()

    predictions = []

    test_texts, test_labels = shuffle(test_texts, test_labels)
    
    for t in test_texts[:TEST_SIZE]:

        ptex = preprocess_text([t], batch_size=1)
        ptex = Variable(ptex.cuda())
        ptex = ptex.permute(1, 0, 2)
        pred = model(ptex)
        _, idx = torch.max(pred, 1)

        predictions.append(idx.data[0])
    
    lables = test_labels[:TEST_SIZE]
    
    acc = accuracy_score(lables, predictions)
    print('Test accuracy:', acc)
    writer.add_scalar('accuracy_test', acc, global_step=global_step)

    model.train()



In [17]:
N_EPOCHS = 50

loss_f = F.cross_entropy

for epoch in range(N_EPOCHS):
    global batch_idx
    batch_idx = 0
    X, y = shuffle(X, y)
    while batch_idx < len(X) - BATCH_SIZE:
        text, label = next_batch()

        label = Variable(torch.LongTensor(label).cuda()) if CUDA else Variable(torch.LongTensor(label))

        global_step += 1

        one_hotted_text = preprocess_text(text)
        one_hotted_text = Variable(one_hotted_text.cuda()) if CUDA else Variable(one_hotted_text)
        one_hotted_text = one_hotted_text.permute(1, 0, 2)
        prediction = model(one_hotted_text)

        loss = loss_f(prediction, label)

        writer.add_scalar('loss', loss.data[0], global_step=global_step)

        loss.backward()        
        clip_gradient(optimizer, 1e-1)
        optimizer.step()

    # evaluation
    print('Loss after epoch %s:' % epoch)
    print(loss.data[0])
        
    _, idx = torch.max(prediction, 1)
    acc = accuracy_score(label.data.tolist(), idx.data.tolist())
    writer.add_scalar('accuracy_train', acc, global_step=global_step)
    print('In-batch accuracy:', acc)
    
    model.eval()

    predictions = []

    test_texts, test_labels = shuffle(test_texts, test_labels)
    
    for t in test_texts[:TEST_SIZE]:

        ptex = preprocess_text([t], batch_size=1)
        ptex = Variable(ptex.cuda())
        ptex = ptex.permute(1, 0, 2)
        pred = model(ptex)
        _, idx = torch.max(pred, 1)

        predictions.append(idx.data[0])
    
    lables = test_labels[:TEST_SIZE]
    
    acc = accuracy_score(lables, predictions)
    print('Test accuracy:', acc)
    writer.add_scalar('accuracy_test', acc, global_step=global_step)

    model.train()


Loss after epoch 0:
0.6937292218208313
In-batch accuracy: 0.48046875
Test accuracy: 0.5
Loss after epoch 1:
0.6938817501068115
In-batch accuracy: 0.48046875
Test accuracy: 0.47
Loss after epoch 2:
0.6937626600265503
In-batch accuracy: 0.50390625
Test accuracy: 0.54
Loss after epoch 3:
0.6924145221710205
In-batch accuracy: 0.54296875
Test accuracy: 0.43
Loss after epoch 4:
0.6929919123649597
In-batch accuracy: 0.484375
Test accuracy: 0.48
Loss after epoch 5:
0.6946525573730469
In-batch accuracy: 0.46484375
Test accuracy: 0.56
Loss after epoch 6:
0.6942274570465088
In-batch accuracy: 0.46875
Test accuracy: 0.51
Loss after epoch 7:
0.6910460591316223
In-batch accuracy: 0.546875
Test accuracy: 0.57
Loss after epoch 8:
0.692038357257843
In-batch accuracy: 0.52734375
Test accuracy: 0.49
Loss after epoch 9:
0.691204309463501
In-batch accuracy: 0.5546875
Test accuracy: 0.55
Loss after epoch 10:
0.6895204186439514
In-batch accuracy: 0.58984375
Test accuracy: 0.51
Loss after epoch 11:
0.69194209

In [140]:
test_texts[1], test_labels[1]

('I am a Sociologist/Anthropologist specializing in the field of Symbolic Interactionism, and I must say that this film exhibits high quality in the symbolic context throughout the entire film. To anyone who has not yet seen this, I recommend that you also read "Man\'s Search For Ultimate Meaning" by Victor E. Frankl. I think you will be able to draw some amazing correlations.<br /><br />That being said, I would like to say that despite the fact that the main characters are gay, this is not a story about being gay. This is a story about seeking out and finding meaning in life, despite the difficulties and challenges, the pain and terror that stand in your way. This is a story of seeking and finding balance and wholeness and happiness.',
 1)

In [157]:
model.eval()

predictions = []

for t in tqdm(test_texts[:1000], total=1000):

    ptex = preprocess_text([t], batch_size=1)
    ptex = Variable(ptex.cuda())
    ptex = ptex.permute(1, 0, 2)
    pred = model(ptex)
    _, idx = torch.max(pred, 1)

    predictions.append(idx.data[0])

model.train()

100%|██████████| 1000/1000 [00:10<00:00, 93.30it/s]


CharRNN(
  (rnn): LSTM(128, 256)
  (projector): Linear(in_features=256, out_features=2, bias=True)
)

In [158]:
lables = test_labels[:1000]

In [161]:
from sklearn.metrics import accuracy_score

In [162]:
accuracy_score(lables, predictions)

0.511

In [131]:
ptex = preprocess_text([test_texts[0]])

In [130]:
ptex.size()

torch.Size([512, 32, 128])

In [124]:
len(text)

32

In [125]:
one_hotted_text = preprocess_text(text)
one_hotted_text = Variable(one_hotted_text.cuda())
one_hotted_text = one_hotted_text.permute(1, 0, 2)

pred = model(one_hotted_text)

In [126]:
pred.size()

torch.Size([32, 2])