### 1) Specify how preprocessing should be done (Fields)
### 2) Use Dataset to load the data -> TabularDataset
### 3) Construct an iterator to do batching and padding -> BucketIterator

In [17]:
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy

In [37]:
import torch  # used for device selection below; the notebook otherwise imports torch only in a later cell

# Load spaCy's small English pipeline by its full package name. The "en"
# shortcut link only exists on spaCy 2.x and was removed in spaCy 3.
spacy_en = spacy.load("en_core_web_sm")
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
def tokenize(text):
    """Split ``text`` into a list of token strings using the spaCy tokenizer."""
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [21]:
# Text column: tokenised with `tokenize`, lower-cased, and backed by a vocabulary.
quote = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize,
    lower=True,
)
# Label column: not a sequence and not looked up in any vocabulary.
score = Field(sequential=False, use_vocab=False)

In [22]:
# Specify which columns to use in Dataset:
# JSON key -> (attribute name on each example, Field that processes it)
fields = {
    "quote": ("q", quote),
    "score": ("s", score),
}
train_data, test_data = TabularDataset.splits(
    path=".",
    format="json",
    train="train.json",
    # validation="validation.json",
    test="test.json",
    fields=fields,
)

In [23]:
# # Specify which columns to use in Dataset
# fields = {"quote": ("q", quote), "score": ("s", score)}
# train_data, test_data = TabularDataset.splits(
#     path=".",
#     train="train.csv",
#     test="test.csv",
#     format="csv",
#     fields=fields
# )

In [24]:
# # Specify which columns to use in Dataset
# fields = {"quote": ("q", quote), "score": ("s", score)}
# train_data, test_data = TabularDataset.splits(
#     path=".",
#     train="train.tsv",
#     test="test.tsv",
#     format="tsv",
#     fields=fields
# )

In [25]:
# Inspect the first training example: attribute names first, then their values
first_example = vars(train_data[0])
print(first_example.keys())
print(first_example.values())

dict_keys(['q', 's'])
dict_values([['you', 'must', 'own', 'everything', 'in', 'your', 'world', '.', 'there', 'is', 'no', 'one', 'else', 'to', 'blame', '.'], 1])


In [30]:
# Build the vocabulary for the quote field from the training split.
# Passing `vectors` also attaches pretrained embeddings to the vocabulary;
# without it, only a plain index vocabulary is built.
quote.build_vocab(
    train_data,
    max_size=10000,
    min_freq=1,
    vectors="glove.6B.100d",
)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 399999/400000 [00:21<00:00, 18621.10it/s]


In [38]:
# Does padding automatically: every batch is padded to its longest sequence.
# A sort_key is supplied explicitly because the datasets above do not define
# one; the non-training iterator orders its examples by this key (here: the
# number of tokens in the quote) before batching.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=3,
    sort_key=lambda example: len(example.q),
    device=device,
)

In [39]:
# Show every training batch: the padded token-index tensor, then the labels
for batch in train_iterator:
    quotes, labels = batch.q, batch.s
    print(quotes)
    print(labels)

tensor([[14, 35, 29],
        [25, 23, 31],
        [ 7, 26,  3],
        [ 5, 18, 11],
        [10, 19, 28],
        [15, 36, 22],
        [21, 34,  4],
        [ 3,  2, 27],
        [ 7, 33,  9],
        [ 5, 20,  1],
        [32, 24,  1],
        [30,  6,  1],
        [ 8, 16,  1],
        [17,  8,  1],
        [ 4, 12,  1],
        [13,  2,  1],
        [ 6,  1,  1],
        [ 2,  1,  1]], device='cuda:0')
tensor([1, 1, 0], device='cuda:0')


In [40]:
import torch
import torch.nn as nn
import torch.optim as optim

In [41]:
######### Training a simple LSTM on this toy data of ours #########
class RNN_LSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear head giving one raw score per sequence.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        embed_size: Dimension of each token embedding.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices laid out as (seq_len, batch);
                the batch size is read from ``x.size(1)``.

        Returns:
            Tensor of shape (batch, 1) holding the un-activated output of the
            linear head for each sequence.
        """
        embedded = self.embedding(x)

        # Zero initial hidden and cell states. They are allocated from
        # `embedded` so they always share its device and dtype, instead of
        # relying on a module-level `device` variable and float32.
        batch_size = x.size(1)
        h0 = embedded.new_zeros(self.num_layers, batch_size, self.hidden_size)
        c0 = embedded.new_zeros(self.num_layers, batch_size, self.hidden_size)

        outputs, _ = self.rnn(embedded, (h0, c0))
        # Only the last time step feeds the classifier.
        # NOTE(review): with padded batches the last step of a shorter
        # sequence is a padding position — consider packing if that matters.
        prediction = self.fc_out(outputs[-1, :, :])

        return prediction

In [42]:
# Hyperparameters
# -- model shape
input_size = len(quote.vocab)  # one embedding row per vocabulary entry
embedding_size = 100
hidden_size = 512
num_layers = 2
# -- optimisation
learning_rate = 0.005
num_epochs = 10

In [43]:
# Initialize network, then move its parameters to the selected device
model = RNN_LSTM(
    input_size=input_size,
    embed_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
model = model.to(device)

In [44]:
# Load the pretrained embeddings onto our model
# `quote.vocab.vectors` is read from the vocabulary built earlier with
# `vectors="glove.6B.100d"`.
pretrained_embeddings = quote.vocab.vectors
# In-place copy into the embedding table. As the last expression of the cell,
# the call's return value is echoed as the cell output.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 0.4918,  1.1164,  1.1424,  ..., -0.5088,  0.6256,  0.4392],
        [-0.4989,  0.7660,  0.8975,  ..., -0.4118,  0.4054,  0.7850],
        [-0.5718,  0.0463,  0.8673,  ..., -0.3566,  0.9293,  0.8995]],
       device='cuda:0')

In [45]:
# Loss and optimizer
# The model emits one un-activated score per example, hence the
# "with logits" variant of binary cross-entropy.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
)

In [47]:
from tqdm.notebook import tqdm

In [50]:
# Train Network
for epoch in tqdm(range(num_epochs)):
    for batch_idx, batch in enumerate(train_iterator):
        # Put inputs and labels on the training device
        data = batch.q.to(device=device)
        targets = batch.s.to(device=device)

        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass; drop the trailing size-1 dimension of the scores and
        # cast the labels to the scores' type before computing the loss
        scores = model(data)
        labels = targets.type_as(scores)
        loss = criterion(scores.squeeze(1), labels)

        # Backward pass followed by one optimizer update
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} is done")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 is done
Epoch 1 is done
Epoch 2 is done
Epoch 3 is done
Epoch 4 is done
Epoch 5 is done
Epoch 6 is done
Epoch 7 is done
Epoch 8 is done
Epoch 9 is done
