In [47]:
import sys
sys.path.append("..")
from pathlib import Path

from src.slot.data_manager import SlotDataManager

In [48]:
data_manager = SlotDataManager(
    cache_dir=Path("../cache/slot"),
    max_len=128,
    batch_size=32,
    num_workers=8,
    data_dir=Path("../data/slot"),
    test_file=Path("../data/slot/test.json")
)

2022-03-03 08:35:53 | INFO | Vocab loaded from /home/jacky/110-2_ADL/homeworks/hw01/cache/slot/vocab.pkl
2022-03-03 08:35:53 | INFO | Tag-2-Index loaded from /home/jacky/110-2_ADL/homeworks/hw01/cache/slot/tag2idx.json
2022-03-03 08:35:53 | INFO | Embeddings loaded from /home/jacky/110-2_ADL/homeworks/hw01/cache/slot/embeddings.pt


In [49]:
valid_dataloader = data_manager.get_valid_dataloader()
valid_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f879561aeb0>

In [50]:
x, length, y = next(iter(valid_dataloader))
x.shape, length.shape, y.shape

(torch.Size([32, 33]), torch.Size([32]), torch.Size([32, 128]))

In [51]:
from src.slot.models import SlotTagger

model = SlotTagger(
    embeddings=data_manager.embeddings,
    hidden_size=128,
    num_layers=2,
    dropout=.2,
    bidirectional=True,
    num_class=data_manager.num_class,
    lr=1e-3,
    weight_decay=1e-5
)
model

SlotTagger(
  (embedding): Embedding(4117, 300, padding_idx=0)
  (rnn): GRU(300, 128, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): Linear(in_features=256, out_features=256, bias=True)
    (2): PReLU(num_parameters=1)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=9, bias=True)
  )
  (loss): CrossEntropyLoss()
)

In [52]:
embeddings = model.embedding(x)
embeddings.shape

torch.Size([32, 33, 300])

In [53]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

packed_features = pack_padded_sequence(
    input=embeddings,
    lengths=length.cpu(),
    batch_first=True,
    enforce_sorted=False
)
packed_output_features, _ = model.rnn(packed_features)
output_features, _ = pad_packed_sequence(
    sequence=packed_output_features,
    batch_first=True
)
output_features.shape

torch.Size([32, 18, 256])

In [54]:
flatten_features = output_features.view(-1, 128*2)
flatten_features.shape

torch.Size([576, 256])

In [55]:
output = model.fc(flatten_features).view(output_features.shape[0], output_features.shape[1], -1)
output.shape

torch.Size([32, 18, 9])

In [56]:
import torch
flatten_output = torch.cat([
    sen_output[:sen_len, :]
    for sen_output, sen_len, in zip(output, length)
])
flatten_output.shape

torch.Size([213, 9])

In [57]:
flatten_y = torch.cat([
    sen_tags[:sen_len]
    for sen_tags, sen_len, in zip(y, length)
])
flatten_y.shape

torch.Size([213])

In [58]:
model.loss(input=flatten_output, target=flatten_y)

tensor(2.1592, grad_fn=<NllLossBackward0>)

In [59]:
output[0][:length[0]].argmax(dim=1)

tensor([6, 2, 2, 2, 6])

In [60]:
pred = [
    sen_val[:sen_len].argmax(dim=1)
    for sen_val, sen_len in zip(output, length)
]
len(pred)

32

In [61]:
pred[:10]


[tensor([6, 2, 2, 2, 6]),
 tensor([0, 2]),
 tensor([6, 7, 8]),
 tensor([4, 4, 6, 6, 7, 4]),
 tensor([7, 2, 4, 4, 4]),
 tensor([2, 4, 6, 7, 2, 2, 6, 6, 6]),
 tensor([8, 8, 4, 6, 2, 7, 6]),
 tensor([4, 6, 6, 4, 4, 4, 4, 6]),
 tensor([6, 4, 7, 6, 8, 1]),
 tensor([8, 8, 7, 6, 4, 6, 6, 4, 6, 7, 6, 6, 0])]

In [62]:
[
    {
        'id': i,
        'tags': ' '.join([
            data_manager.idx2tag[idx]
            for idx in p.tolist()
        ])
    }
    for i, p in zip(valid_dataloader.dataset.ids, pred)
][:3]

[{'id': 'eval-0', 'tags': 'O B-date B-date B-date O'},
 {'id': 'eval-1', 'tags': 'I-time B-date'},
 {'id': 'eval-2', 'tags': 'O B-people I-people'}]

In [63]:
data_manager.tag2idx.keys()

dict_keys(['I-time', 'B-first_name', 'B-date', 'B-last_name', 'B-time', 'I-date', 'O', 'B-people', 'I-people'])

In [72]:
from pytorch_lightning import Trainer

model = SlotTagger.load_from_checkpoint(Path("../ckpt/intent/20220303_1542/intent-epoch=13-val_acc=0.91-val_loss=0.52.ckpt"))
trainer = Trainer(
    devices=[7],
    accelerator="gpu",
    deterministic=True,
)
test_dataloader = data_manager.get_test_dataloader()
pred = trainer.predict(model, test_dataloader, return_predictions=True)

2022-03-03 08:37:49 | INFO | GPU available: True, used: True
2022-03-03 08:37:49 | INFO | TPU available: False, using: 0 TPU cores
2022-03-03 08:37:49 | INFO | IPU available: False, using: 0 IPUs
2022-03-03 08:37:49 | INFO | LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


Predicting: 100%|██████████| 117/117 [00:02<00:00, 41.22it/s]


In [73]:
pred = [sen for batch in pred for sen in batch]

In [74]:
result = [
        {
            'id': i,
            'tags': ' '.join([data_manager.idx2tag[idx] for idx in p.tolist()])
        }
        for i, p in zip(test_dataloader.dataset.ids, pred)
]

KeyError: 50

In [78]:
x, length = next(iter(test_dataloader))
embeddings = model.embedding(x)
packed_features = pack_padded_sequence(
    input=embeddings,
    lengths=length.cpu(),
    batch_first=True,
    enforce_sorted=False
)
packed_output_features, _ = model.rnn(packed_features)
output_features, _ = pad_packed_sequence(
    sequence=packed_output_features,
    batch_first=True
)
flatten_features = output_features.view(-1, model.fc_hidden_size)
flatten_output = model.fc(flatten_features)
output = flatten_output.view(output_features.shape[0], output_features.shape[1], -1)

In [79]:
output.shape

torch.Size([32, 22, 150])

In [80]:
data_manager.num_class

9

In [81]:
model

SlotTagger(
  (embedding): Embedding(5963, 300, padding_idx=0)
  (rnn): GRU(300, 512, num_layers=2, dropout=0.2, bidirectional=True)
  (fc): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): Linear(in_features=1024, out_features=1024, bias=True)
    (2): PReLU(num_parameters=1)
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=1024, out_features=150, bias=True)
  )
  (loss): CrossEntropyLoss()
)