In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local modules such as helpers.py are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import torchtext.experimental
import torchtext.experimental.vectors

import collections
import time
import random

In [3]:
from helpers import ep_time, predict

In [3]:
from torchtext.experimental.datasets.raw.text_classification import \
    RawTextIterableDataset
from torchtext.experimental.datasets.text_classification import \
    TextClassificationDataset

from helpers import *


def get_train_valid_split(raw_train, split_ratio=0.7):
    """Shuffle the examples and cut them into a training and a validation list.

    Args:
        raw_train: Iterable of examples. It is materialised into a new list
            before shuffling, so a list passed by the caller is not reordered.
        split_ratio: Fraction of the examples that goes to the training list.

    Returns:
        Tuple ``(train_data, valid_data)``. The training list holds the first
        ``int(len(examples) * split_ratio)`` shuffled examples, the validation
        list holds the remainder.
    """
    examples = list(raw_train)
    # Shuffling draws from the global ``random`` state, so the split is only
    # repeatable when that state has been seeded beforehand.
    random.shuffle(examples)

    cut = int(len(examples) * split_ratio)
    return examples[:cut], examples[cut:]


def gen_vocab(raw_data, tokenizer, **vocab_kwargs):
    """Build a torchtext vocabulary from the tokens of every text in ``raw_data``.

    Args:
        raw_data: Iterable of ``(label, text)`` pairs; the label is ignored.
        tokenizer: Object whose ``tokenize(text)`` returns the tokens of a text.
        **vocab_kwargs: Forwarded unchanged to ``torchtext.vocab.Vocab``.

    Returns:
        The ``torchtext.vocab.Vocab`` constructed from the token counts.
    """
    counts = collections.Counter()
    for _label, text in raw_data:
        counts.update(tokenizer.tokenize(text))

    return torchtext.vocab.Vocab(counts, **vocab_kwargs)


def process_raw(raw_data, tokenizer, vocab):
    """Wrap raw ``(label, text)`` pairs in a ``TextClassificationDataset``.

    Args:
        raw_data: Iterable of ``(label, text)`` pairs; it is materialised into
            a list of tuples first.
        tokenizer: Object whose ``tokenize`` method is the first text transform.
        vocab: Vocabulary handed to ``vocab_func`` and to the dataset.

    Returns:
        A ``TextClassificationDataset`` built from the pairs, the vocabulary and
        the ``(label_transform, text_transform)`` tuple.

    Note:
        ``sequential_transforms``, ``vocab_func`` and ``to_tensor`` are not
        defined in this file; presumably they come from ``helpers`` (star
        import) -- confirm there for their exact semantics.
    """
    examples = [(label, text) for label, text in raw_data]

    # Text: tokenize -> vocab_func(vocab) -> long tensor.
    text_pipeline = sequential_transforms(
        tokenizer.tokenize,
        vocab_func(vocab),
        to_tensor(dtype=torch.long),
    )
    # Label: converted straight to a long tensor.
    label_pipeline = sequential_transforms(to_tensor(dtype=torch.long))

    return TextClassificationDataset(
        examples, vocab, (label_pipeline, text_pipeline)
    )


def init_params(m: nn.Module):
    """Initialise the parameters of one module in place.

    Intended for ``model.apply(init_params)``, which calls it once per
    submodule. Modules of any other type are left untouched.

    * ``nn.Embedding``: weights drawn uniformly from ``[-0.05, 0.05]``.
    * ``nn.LSTM``: every ``weight_ih*`` / ``weight_hh*`` / ``bias*`` parameter is
      split into four equal chunks along dim 0 (PyTorch stacks the gates in the
      order input, forget, cell, output). Input-hidden chunks get Xavier-uniform
      values, hidden-hidden chunks get orthogonal values, and bias chunks are
      zero except the forget-gate chunk, which is set to one.
    * ``nn.Linear``: Xavier-uniform weights and, when the layer has a bias,
      a zero bias.

    Args:
        m: The module whose parameters should be initialised.
    """
    if isinstance(m, nn.Embedding):
        nn.init.uniform_(m.weight, -0.05, 0.05)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            # One initialiser per gate chunk, in (i, f, g, o) order.
            if "weight_ih" in name:
                gate_inits = (nn.init.xavier_uniform_,) * 4
            elif "weight_hh" in name:
                gate_inits = (nn.init.orthogonal_,) * 4
            elif "bias" in name:
                gate_inits = (
                    nn.init.zeros_,
                    nn.init.ones_,  # forget gate starts open
                    nn.init.zeros_,
                    nn.init.zeros_,
                )
            else:
                continue
            # chunk() returns views, so the in-place initialisers write
            # directly into the parameter's storage.
            for gate, init_fn in zip(param.chunk(4), gate_inits):
                init_fn(gate)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # Linear(..., bias=False) has bias=None; zeros_(None) would raise.
        if m.bias is not None:
            nn.init.zeros_(m.bias)


def get_pretrained_embedding(init_embed, pretrained_vectors, vocab, unk_token):
    """Build an embedding matrix whose rows are pretrained vectors.

    A detached copy of ``init_embed.weight`` is made (the layer itself is not
    modified) and row ``idx`` is overwritten with ``pretrained_vectors[token]``
    for every ``token`` at position ``idx`` of ``vocab.itos``.

    Args:
        init_embed: Embedding layer that supplies the shape, dtype and device
            of the result through its ``weight``.
        pretrained_vectors: Object indexable by token string that returns the
            vector for that token.
        vocab: Vocabulary exposing ``itos``, the index-to-token sequence.
        unk_token: Currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(pretrained_embedding, unk_tokens)``. ``unk_tokens`` is always
        an empty list: no membership check is performed, so every token is
        looked up and takes whatever ``pretrained_vectors[token]`` yields for
        it (for tokens missing from the pretrained set this is presumably the
        vectors object's default unknown vector -- TODO confirm).
    """
    # detach().clone() works for any dtype/device, whereas the legacy
    # torch.FloatTensor(tensor) constructor only accepts float32 CPU tensors.
    pretrained_embedding = init_embed.weight.detach().clone()

    unk_tokens = []

    for idx, token in enumerate(vocab.itos):
        pretrained_embedding[idx] = pretrained_vectors[token]

    return pretrained_embedding, unk_tokens


def train(model, iterator, optimizer, criterion, device):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: Module called as ``model(text, lengths)``; switched to train mode.
        iterator: Sized iterable yielding ``(labels, text, lengths)`` batches.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device that ``labels`` and ``text`` are moved to. ``lengths``
            is handed to the model as-is.

    Returns:
        Tuple ``(mean_loss, mean_acc)``: the per-batch loss and accuracy values
        summed and divided by ``len(iterator)`` (every batch weighs the same).

    Raises:
        ZeroDivisionError: If ``len(iterator)`` is 0.

    Note:
        ``calc_acc`` is not defined in this file (presumably it comes from
        ``helpers``); its result must support ``.item()``.
    """
    model.train()

    total_loss = 0
    total_acc = 0

    for labels, text, lengths in iterator:
        labels = labels.to(device)
        text = text.to(device)

        optimizer.zero_grad()

        logits = model(text, lengths)
        batch_loss = criterion(logits, labels)
        batch_acc = calc_acc(logits, labels)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches


def evaluate(model, iterator, criterion, device):
    """Score ``model`` on ``iterator`` without updating any parameters.

    Args:
        model: Module called as ``model(text, lengths)``; switched to eval mode.
        iterator: Sized iterable yielding ``(labels, text, lengths)`` batches.
        criterion: Loss called as ``criterion(predictions, labels)``.
        device: Device that ``labels`` and ``text`` are moved to. ``lengths``
            is handed to the model as-is.

    Returns:
        Tuple ``(mean_loss, mean_acc)``: the per-batch loss and accuracy values
        summed and divided by ``len(iterator)`` (every batch weighs the same).

    Raises:
        ZeroDivisionError: If ``len(iterator)`` is 0.

    Note:
        ``calc_acc`` is not defined in this file (presumably it comes from
        ``helpers``); its result must support ``.item()``.
    """
    model.eval()

    total_loss = 0
    total_acc = 0

    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for labels, text, lengths in iterator:
            labels = labels.to(device)
            text = text.to(device)

            logits = model(text, lengths)
            batch_loss = criterion(logits, labels)
            batch_acc = calc_acc(logits, labels)

            total_loss += batch_loss.item()
            total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches


class Tokenizer:
    """Wraps a torchtext tokenizer with optional lower-casing and truncation."""

    def __init__(self, fn="basic_english", lower=True, max_len=None):
        """Store the options and resolve the tokenizer function.

        Args:
            fn: Value passed to ``torchtext.data.utils.get_tokenizer``.
            lower: When true, every token is lower-cased.
            max_len: When not ``None``, token lists are cut to this many tokens.
        """
        self.tokenize_fn = torchtext.data.utils.get_tokenizer(fn)
        self.lower = lower
        self.max_len = max_len

    def tokenize(self, s):
        """Split ``s`` into tokens, then apply lower-casing and truncation.

        Args:
            s: The text to tokenize.

        Returns:
            The tokens produced by ``tokenize_fn``, lower-cased if ``lower`` is
            set and limited to the first ``max_len`` entries if ``max_len`` is
            not ``None``.
        """
        pieces = self.tokenize_fn(s)

        if self.lower:
            pieces = [piece.lower() for piece in pieces]

        if self.max_len is None:
            return pieces

        return pieces[: self.max_len]


class Collator:
    """Turns a list of ``(label, token_ids)`` examples into padded batch tensors."""

    def __init__(self, pad_idx):
        """Remember the value used to pad shorter sequences.

        Args:
            pad_idx: Padding value passed to ``pad_sequence``.
        """
        self.pad_idx = pad_idx

    def collate(self, batch):
        """Assemble one batch; suitable as a ``DataLoader`` ``collate_fn``.

        Args:
            batch: Sequence of ``(label, text)`` pairs where each ``text`` is a
                1-D tensor of token ids.

        Returns:
            Tuple ``(labels, text, lengths)``: a long tensor of labels, the
            sequences padded with ``pad_idx`` (``pad_sequence`` default layout,
            i.e. ``[max_len, batch_size]``), and a long tensor with the
            unpadded length of every sequence.
        """
        labels, sequences = zip(*batch)

        label_tensor = torch.LongTensor(labels)
        # Lengths are taken before padding so the model can pack the batch.
        lengths = torch.LongTensor(list(map(len, sequences)))
        padded = nn.utils.rnn.pad_sequence(sequences, padding_value=self.pad_idx)

        return label_tensor, padded, lengths


class BiLSTM(nn.Module):
    """Bidirectional multi-layer LSTM classifier over token-id sequences.

    Pipeline: embedding -> dropout -> packed bidirectional LSTM -> concatenation
    of the last layer's final forward and backward hidden states -> dropout ->
    linear layer producing ``output_dim`` scores per example.
    """

    def __init__(
        self, input_dim, emb_dim, hid_dim, output_dim, n_layer, dropout, pad_idx
    ):
        """Create the layers.

        Args:
            input_dim: Number of rows of the embedding table (vocabulary size).
            emb_dim: Embedding dimension.
            hid_dim: LSTM hidden size per direction.
            output_dim: Number of output scores.
            n_layer: Number of stacked LSTM layers.
            dropout: Dropout probability, used between LSTM layers and for the
                module's own dropout layer.
            pad_idx: Embedding ``padding_idx``.
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            emb_dim, hid_dim, num_layers=n_layer, bidirectional=True, dropout=dropout
        )
        self.fc = nn.Linear(2 * hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, lengths):
        """Score a padded batch.

        Args:
            text: Long tensor of token ids, ``[seq_len, batch_size]``.
            lengths: Unpadded length of every sequence, as a tensor (any
                device) or a list of ints.

        Returns:
            Tensor ``[batch_size, output_dim]`` of unnormalised scores.
        """
        # pack_padded_sequence requires tensor lengths to live on the CPU, so a
        # lengths tensor that was moved to the GPU by the caller is brought back.
        if isinstance(lengths, torch.Tensor):
            lengths = lengths.cpu()

        # [seq_len, batch_size, emb_dim]
        embedded = self.dropout(self.embedding(text))
        # https://discuss.pytorch.org/t/simple-working-example-how-to-use-packing-for-variable-length-sequence-inputs-for-rnn/2120
        packed_emb = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, enforce_sorted=False
        )
        # Only the final hidden state is used, so the packed per-step output is
        # not unpacked.
        # hidden : [n_layers * n_direction, batch_size, hid_dim]
        _, (hidden, _) = self.lstm(packed_emb)

        # Final forward / backward states of the last layer, each
        # [batch_size, hid_dim], concatenated to [batch_size, hid_dim * 2].
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        # pred : [batch_size, output_dim]
        return self.fc(self.dropout(hidden))


In [16]:
# Experiment-wide constants.
seed = 1234       # shared seed for Python's and PyTorch's random generators
max_len = 500     # given to Tokenizer(max_len=...) to truncate token lists
max_size = 25000  # given to gen_vocab(..., max_size=...) as a Vocab keyword

# Seed both generators and pin cuDNN to deterministic, non-benchmarking mode.
random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
# Load the raw IMDB train/test splits, then carve a validation set out of the
# training data. The unsplit training data keeps its own name so that
# `raw_train` always refers to the shuffled training list.
raw_train_full, raw_test = torchtext.experimental.datasets.raw.IMDB()
raw_train, raw_valid = get_train_valid_split(raw_train_full)

In [6]:
# Tokenizer with its default function and lower-casing; token lists are cut to max_len.
tokenizer = Tokenizer(max_len=max_len)

In [7]:
# Count tokens over the training split only; max_size is forwarded to torchtext's Vocab.
vocab = gen_vocab(raw_train, tokenizer, max_size=max_size)

In [8]:
# Convert every raw split with the shared tokenizer and vocabulary
# (processed in the order train, test, valid).
train_data, test_data, valid_data = (
    process_raw(raw_split, tokenizer, vocab)
    for raw_split in (raw_train, raw_test, raw_valid)
)

In [9]:
# Special tokens and the sizes derived from the vocabulary.
pad_token = '<pad>'
unk_token = '<unk>'
pad_idx = vocab[pad_token]
input_dim = len(vocab)

# Model architecture.
emb_dim = 100
hid_dim = 256
output_dim = 2
n_layers = 2
dropout = 0.5

# Training schedule; best_valid_loss is the checkpointing threshold that the
# training loop lowers whenever validation loss improves.
batch_size = 256
n_epochs = 10
best_valid_loss = float('inf')

In [10]:
# Batches are padded with the vocabulary index of the pad token.
collator = Collator(pad_idx)

In [11]:
# All three loaders batch through collator.collate; only the training loader
# reshuffles its examples.
train_iterator = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collator.collate,
)

valid_iterator = torch.utils.data.DataLoader(
    valid_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collator.collate,
)

test_iterator = torch.utils.data.DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collator.collate,
)

In [17]:
model = BiLSTM(
    input_dim=input_dim,
    emb_dim=emb_dim,
    hid_dim=hid_dim,
    output_dim=output_dim,
    n_layer=n_layers,
    dropout=dropout,
    pad_idx=pad_idx,
)

# GloVe "6B" vectors whose dimension matches the embedding layer.
glove = torchtext.experimental.vectors.GloVe(name='6B', dim=emb_dim)

# apply() runs init_params on every submodule and returns the model, which is
# what this cell displays.
model.apply(init_params)

BiLSTM(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (lstm): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [12]:
# Build the embedding matrix from GloVe lookups for every vocabulary token.
# unk_tokens comes back empty because get_pretrained_embedding performs no
# membership check.
pretrained_embedding, unk_tokens = get_pretrained_embedding(model.embedding, glove, vocab, unk_token)

In [13]:
# Overwrite the embedding weights in place. copy_ returns the destination
# tensor, which is why this cell displays it.
model.embedding.weight.data.copy_(pretrained_embedding)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2925,  0.1087,  0.7920,  ..., -0.3641,  0.1822, -0.4104],
        [-0.7250,  0.7545,  0.1637,  ..., -0.0144, -0.1761,  0.3418],
        [ 1.1753,  0.0460, -0.3542,  ...,  0.4510,  0.0485, -0.4015]])

In [14]:
# Set the padding token's embedding row to zeros.
model.embedding.weight.data[pad_idx] = torch.zeros(emb_dim)

In [15]:
# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [13]:
# Cross-entropy loss over the model's output scores.
criterion = nn.CrossEntropyLoss()
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
for epoch in range(n_epochs):
    start_time = time.monotonic()

    # One optimisation pass over the training data, then a validation pass.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)

    end_time = time.monotonic()
    epoch_mins, epoch_secs = ep_time(start_time, end_time)

    # Checkpoint only when the validation loss reaches a new minimum, so
    # 'bilstm.pt' always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bilstm.pt')

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

In [18]:
# Restore the weights saved in 'bilstm.pt' by the training loop, mapping the
# stored tensors onto the current device. The cell displays load_state_dict's
# return value.
model.load_state_dict(torch.load('bilstm.pt', map_location=device))



<All keys matched successfully>

In [None]:
# Score the restored model on the test split and report loss and accuracy.
test_loss, test_acc = evaluate(model, test_iterator, criterion, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [19]:
# Try the trained model on a single sentence. `predict` is imported from
# helpers; the cell displays whatever it returns (presumably a class
# probability -- confirm in helpers.py).
sent = 'I fucking hate you'
predict(tokenizer, vocab, model, device, sent)

0.4891466498374939

In [20]:
%%writefile bentodeploy.py
from bentoml import env, artifacts, api, BentoService
from bentoml.adapters import JsonInput, JsonOutput
from bentoml.artifact import PickleArtifact
from bentoml.frameworks.pytorch import PytorchModelArtifact
import torchtext
import torch
@env(infer_pip_packages=True)
@artifacts([PytorchModelArtifact('profanity_model'), PickleArtifact('tokenizer'), PickleArtifact('vocab'), PickleArtifact('device')])
class ProfanityClassifier(BentoService):
    """BentoML service that scores a sentence with the packed model.

    Artifacts: ``profanity_model`` (the PyTorch model), ``tokenizer`` (object
    with ``tokenize``), ``vocab`` (object with ``stoi``) and ``device`` (the
    device the input tensor is moved to).
    """

    def predict_actual(self, sentence):
        """Return the softmax probability of the last class for ``sentence``.

        Args:
            sentence: The text to score.

        Returns:
            A Python float: the last entry of the softmax over the model's
            output scores.

        Raises:
            ValueError: If tokenizing ``sentence`` yields no tokens; an empty
                sequence cannot be packed for the LSTM.
        """
        model = self.artifacts.profanity_model
        device = self.artifacts.device
        model.eval()

        tokens = self.artifacts.tokenizer.tokenize(sentence)
        if not tokens:
            raise ValueError("text must contain at least one token")

        stoi = self.artifacts.vocab.stoi
        # Shape [seq_len, 1]: a batch holding this single sentence.
        token_ids = torch.LongTensor([stoi[token] for token in tokens]).unsqueeze(-1).to(device)
        # The lengths tensor stays on the CPU: pack_padded_sequence, which the
        # model's forward pass uses, rejects lengths stored on a CUDA device.
        lengths = torch.LongTensor([len(tokens)])

        # Inference only, so skip building the autograd graph.
        with torch.no_grad():
            scores = model(token_ids, lengths)
            probabilities = torch.nn.functional.softmax(scores, dim=-1)
        return probabilities.squeeze()[-1].item()

    @api(input=JsonInput(), output=JsonOutput())
    def predict(self, input):
        """API endpoint: score the ``"text"`` field of the JSON payload.

        Args:
            input: Parsed JSON object; its ``"text"`` entry is the sentence.

        Returns:
            The float returned by ``predict_actual``.
        """
        return self.predict_actual(input["text"])

Overwriting bentodeploy.py


In [21]:
    from bentodeploy import ProfanityClassifier
    bento_service = ProfanityClassifier()
    # Pack every object under the artifact name declared in bentodeploy.py.
    for artifact_name, artifact in (
        ('profanity_model', model),
        ('tokenizer', tokenizer),
        ('vocab', vocab),
        ('device', device),
    ):
        bento_service.pack(artifact_name, artifact)
    # save() writes the bundle and returns the path it was stored under.
    saved_path = bento_service.save()

[2020-09-27 23:27:44,388] INFO - BentoService bundle 'ProfanityClassifier:20200927232742_9097E6' saved to: /home/kishoreganesh/bentoml/repository/ProfanityClassifier/20200927232742_9097E6
