In [22]:
import torchtext
import spacy
import torch
import torch.nn as nn
import torchtext.transforms as T
import torch.optim as optim
import pytorch_lightning as pl
import torch.nn.functional as F
import os


In [23]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc


class RNN(pl.LightningModule):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 train_datapipe, val_datapipe, test_datapipe, batch_size=32):
        super().__init__()
        
        # Required since our input vector represents each word as an index into
        # the vocabulary.
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=1)
        # Creates an RNN using tanh by default.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

        # LightningModule attributes
        self.lr = 1e-3
        self.batch_size = batch_size
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Datasets
        self.train_datapipe = train_datapipe
        self.val_datapipe = val_datapipe
        self.test_datapipe = test_datapipe

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)

        return self.fc(hidden.squeeze(0))

    def training_step(self, batch, batch_idx):
        input = batch["token_ids"].cuda()
        target = torch.tensor(batch["target"], dtype=torch.float).cuda()
        output = self(input).squeeze()
        loss = self.loss_fn(output, target)
        acc = binary_accuracy(output, target)
        
        self.log("train_loss", loss)
        self.log("train_acc", acc)

        return loss
    
    def validation_step(self, batch, batch_idx):
        input = batch["token_ids"].cuda()
        target = torch.tensor(batch["target"], dtype=torch.float).cuda()
        output = self(input).squeeze()
        loss = self.loss_fn(output, target)
        acc = binary_accuracy(output, target)
        
        self.log("val_loss", loss)
        self.log("val_acc", acc)

    def train_dataloader(self):
        loader = torch.utils.data.DataLoader(self.train_datapipe,
                                             batch_size=None,
                                             num_workers=8,
                                             shuffle=True)

        return loader

    def val_dataloader(self):
        loader = torch.utils.data.DataLoader(self.val_datapipe,
                                             batch_size=None,
                                             num_workers=8,
                                             shuffle=False)

        return loader

    def test_dataloader(self):
        loader = torch.utils.data.DataLoader(self.test_datapipe,
                                             batch_size=None,
                                             num_workers=8,
                                             shuffle=False)

        return loader

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=self.lr)

In [24]:
tokenizer = torchtext.data.utils.get_tokenizer("spacy", language="en_core_web_sm")
max_tokens = 25000

def make_vocabulary():
    train_dataset = torchtext.datasets.SST2(split="train")
    train_datapipe = train_dataset.map(lambda x: tokenizer(x[0]))
    v = torchtext.vocab.build_vocab_from_iterator(train_datapipe, specials=["<unk>"], max_tokens=max_tokens)
    v.set_default_index(0)

    return v

In [25]:
v = make_vocabulary()

In [26]:
padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256

text_transform = T.Sequential(
    T.VocabTransform(v),
    T.Truncate(max_seq_len - 2),
    T.AddToken(token=bos_idx, begin=True),
    T.AddToken(token=eos_idx, begin=False),
    T.ToTensor(padding_value=padding_idx)
)

batch_size = 32

train_datapipe = torchtext.datasets.SST2(split='train')
test_datapipe = torchtext.datasets.SST2(split='test')
val_datapipe = torchtext.datasets.SST2(split='dev')

train_datapipe = train_datapipe.map(lambda x: (tokenizer(x[0]), x[1]))
train_datapipe = train_datapipe.batch(batch_size).rows2columnar(["text", "label"])
train_datapipe = train_datapipe.map(lambda x: {"token_ids": text_transform(x["text"]), "target": x["label"]})

test_datapipe = test_datapipe.map(lambda x: (tokenizer(x[0]), x[1]))
test_datapipe = test_datapipe.batch(batch_size).rows2columnar(["text", "label"])
test_datapipe = test_datapipe.map(lambda x: {"token_ids": text_transform(x["text"]), "target": x["label"]})

val_datapipe = val_datapipe.map(lambda x: (tokenizer(x[0]), x[1]))
val_datapipe = val_datapipe.batch(batch_size).rows2columnar(["text", "label"])
val_datapipe = val_datapipe.map(lambda x: {"token_ids": text_transform(x["text"]), "target": x["label"]})

In [27]:
model = RNN(len(v), 100, 256, 1, train_datapipe, val_datapipe, test_datapipe)

In [28]:
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    mode="min"
)

trainer = pl.Trainer(accelerator="c=gpu", callbacks=[checkpoint_callback], max_epochs=5)
trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: C:\Users\Kiran\Untitled Folder 8\lightning_logs

  | Name      | Type              | Params
------------------------------------------------
0 | embedding | Embedding         | 1.4 M 
1 | rnn       | LSTM              | 366 K 
2 | fc        | Linear            | 257   
3 | loss_fn   | BCEWithLogitsLoss | 0     
------------------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.023     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

NameError: Caught NameError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\_utils\worker.py", line 287, in _worker_loop
    data = fetcher.fetch(index)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\_utils\fetch.py", line 39, in fetch
    data = next(self.dataset_iter)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\datapipes\iter\callable.py", line 112, in __iter__
    for data in self.datapipe:
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torchdata\datapipes\iter\util\rows2columnar.py", line 53, in __iter__
    for batch in self.source_datapipe:
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\datapipes\iter\grouping.py", line 90, in __iter__
    for x in self.datapipe:
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\_typing.py", line 366, in wrap_generator
    response = gen.send(None)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\datapipes\iter\callable.py", line 113, in __iter__
    yield self._apply_fn(data)
  File "C:\Users\Kiran\anaconda3\envs\cse6363\lib\site-packages\torch\utils\data\datapipes\iter\callable.py", line 78, in _apply_fn
    return self.fn(data)
  File "C:\Users\Kiran\AppData\Local\Temp/ipykernel_15896/953267928.py", line 28, in <lambda>
NameError: name 'tokenizer' is not defined
