In [115]:
import json, random, pathlib, torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Location of the JSON dataset, relative to the notebook's working directory.
DATA_FILE = "../datasets/intention/intent_dataset.json"

# Training hyper-parameters.
BATCH_SIZE = 4
EPOCHS = 20
EMB_DIM = 32  # default embedding width of LSTMIntent
HID_DIM = 32  # default LSTM hidden width of LSTMIntent
LR = 5e-3     # learning rate handed to the Adam optimiser

# Seed Python's and PyTorch's global RNGs.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1984601e0d0>

In [116]:
def tokenize(text: str):
    """Lowercase *text* and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

# Parse the whole dataset file; each record is read below via its "sentence" key.
with open(DATA_FILE, "r", encoding="utf-8") as fp:
    records = json.loads(fp.read())

# Build vocab: ids 0 and 1 are reserved for the padding / unknown markers,
# every distinct token then receives the following ids in sorted order.
tokens = set().union(*(tokenize(r["sentence"]) for r in records))
PAD, UNK = "<pad>", "<unk>"
vocab = {PAD: 0, UNK: 1}
vocab.update((tok, idx) for idx, tok in enumerate(sorted(tokens), start=2))

In [117]:

def numericalise(sentence: str):
    """Convert *sentence* to a list of vocab ids; unseen tokens get the <unk> id."""
    unk_id = vocab[UNK]
    ids = []
    for tok in tokenize(sentence):
        ids.append(vocab.get(tok, unk_id))
    return ids

class IntentDS(Dataset):
    """Dataset over a list of records exposing "sentence" and "label" keys."""

    def __init__(self, rows):
        # Keep a reference to the caller's list; nothing is copied or pre-encoded.
        self.rows = rows

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        """Return ``(token_ids, label)`` for row *i*.

        ``token_ids`` is a 1-D long tensor from ``numericalise``; ``label`` is
        a scalar tensor holding the row's label converted to float.
        """
        row = self.rows[i]
        token_ids = torch.tensor(numericalise(row["sentence"]), dtype=torch.long)
        label = torch.tensor(float(row["label"]))
        return token_ids, label

def collate(batch):
    """Merge ``(token_ids, label)`` samples into one right-padded batch.

    Args:
        batch: sequence of ``(seq, label)`` pairs as produced by
            ``IntentDS.__getitem__``: ``seq`` is a 1-D long tensor of token
            ids and ``label`` a scalar tensor.

    Returns:
        Tuple ``(padded, lens, labels)`` where ``padded`` has shape
        ``(len(batch), longest_seq)`` and is filled with the ``<pad>`` id past
        each sequence's end, ``lens`` holds the original sequence lengths, and
        ``labels`` is a 1-D tensor with one label per sample.
    """
    seqs, labels = zip(*batch)
    lens = torch.tensor([len(s) for s in seqs])
    # pad_sequence replaces the manual fill loop and avoids sizing a tensor
    # with the 0-dim tensor that builtin max() returns over `lens`.
    padded = nn.utils.rnn.pad_sequence(
        list(seqs), batch_first=True, padding_value=vocab[PAD])
    # Stack the per-sample labels directly instead of re-parsing a tuple of
    # tensors through torch.tensor(); as_tensor keeps plain numbers working.
    label_batch = torch.stack([torch.as_tensor(lbl) for lbl in labels])
    return padded, lens, label_batch

In [118]:
# Shuffle a *copy* with a dedicated RNG seeded from SEED. `records` itself is
# left untouched, so re-running this cell always reproduces the same 80/20
# split instead of reshuffling in place (which would move former test rows
# into the training set of an already-trained model).
shuffled = list(records)
random.Random(SEED).shuffle(shuffled)

split = int(0.8 * len(shuffled))
train_rows, test_rows = shuffled[:split], shuffled[split:]

# Only the training loader reshuffles between epochs; evaluation order is fixed.
train_dl = DataLoader(IntentDS(train_rows), batch_size=BATCH_SIZE,
                      shuffle=True, collate_fn=collate)
test_dl = DataLoader(IntentDS(test_rows), batch_size=BATCH_SIZE,
                     shuffle=False, collate_fn=collate)

In [119]:
class LSTMIntent(nn.Module):
    """Embedding -> LSTM -> linear head that emits one sigmoid score per sequence."""

    def __init__(self, vsz, emb=EMB_DIM, hid=HID_DIM):
        """Create the layers.

        Args:
            vsz: number of rows in the embedding table.
            emb: embedding width (also the LSTM input size).
            hid: LSTM hidden size (also the linear layer's input size).
        """
        super().__init__()
        pad_id = vocab[PAD]  # handed to the embedding as its padding_idx
        self.emb = nn.Embedding(num_embeddings=vsz, embedding_dim=emb, padding_idx=pad_id)
        self.lstm = nn.LSTM(input_size=emb, hidden_size=hid, batch_first=True)
        self.out = nn.Linear(in_features=hid, out_features=1)

    def forward(self, x, lengths):
        """Score a padded batch.

        Args:
            x: padded token ids, batch dimension first (as built by ``collate``).
            lengths: tensor with the unpadded length of every sequence in ``x``.

        Returns:
            Tensor of sigmoid outputs with dimension 1 squeezed away.
        """
        embedded = self.emb(x)
        # Pack so the LSTM stops at each sequence's real length; lengths must
        # live on the CPU and need not be sorted (enforce_sorted=False).
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        final_hidden = h_n[-1]
        logits = self.out(final_hidden)
        return torch.sigmoid(logits).squeeze(1)

In [120]:
# Classifier sized to the vocabulary, with its loss function and optimiser.
model = LSTMIntent(vsz=len(vocab))
criterion = nn.BCELoss()
optimiser = optim.Adam(model.parameters(), lr=LR)

In [121]:
def run_epoch(dl, train=False):
    """Run the global ``model`` once over every batch of *dl*.

    When *train* is truthy the model is put in training mode and the global
    ``optimiser`` takes one step per batch; otherwise the model is put in
    eval mode and gradients are disabled.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all samples seen, where a
        prediction counts as correct if thresholding the output at 0.5
        matches the label.
    """
    if train:
        model.train()
    else:
        model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    with torch.set_grad_enabled(train):
        for x, lens, y in dl:
            batch_n = y.size(0)
            if train:
                optimiser.zero_grad()
            p = model(x, lens)
            loss = criterion(p, y)
            if train:
                loss.backward()
                optimiser.step()
            hits = (p >= .5).float() == y
            # criterion returns a batch mean; weight by batch size so the
            # final average is per sample even with a short last batch.
            loss_sum += loss.item() * batch_n
            n_correct += hits.sum().item()
            n_seen += batch_n

    mean_loss = loss_sum / n_seen
    accuracy = n_correct / n_seen
    return mean_loss, accuracy


def predict_intent(sentence: str):
    """Classify a single sentence with the global ``model``.

    Args:
        sentence: raw text; it is tokenised and mapped to ids by
            ``numericalise``.

    Returns:
        ``(label, prob)`` where ``prob`` is the model output as a Python float
        and ``label`` is 1 when ``prob >= 0.5``, else 0.

    Raises:
        ValueError: if *sentence* yields no tokens (empty or whitespace only).
    """
    vec = numericalise(sentence)
    if not vec:
        # A zero-length sequence cannot be embedded/packed by the model; fail
        # here with a clear message instead of an opaque error inside PyTorch.
        raise ValueError("predict_intent() needs a sentence with at least one token")

    model.eval()
    lens = torch.tensor([len(vec)])
    # Batch of one; dtype is explicit because embedding lookups need integer ids.
    x = torch.tensor([vec], dtype=torch.long)
    with torch.no_grad():
        prob = model(x, lens).item()
    return (1 if prob >= 0.5 else 0), prob





In [122]:
# One training pass followed by one evaluation pass per epoch; only the
# accuracies are reported.
for epoch in range(1, EPOCHS + 1):
    tr_loss, tr_acc = run_epoch(train_dl, train=True)
    te_loss, te_acc = run_epoch(test_dl, train=False)
    print(f"Epoch {epoch:02d}: ",
          f"train acc={tr_acc:.2%}  test acc={te_acc:.2%}",
          sep="")
    

Epoch 01: train acc=67.74%  test acc=43.75%
Epoch 02: train acc=80.65%  test acc=56.25%
Epoch 03: train acc=91.94%  test acc=68.75%
Epoch 04: train acc=100.00%  test acc=62.50%
Epoch 05: train acc=98.39%  test acc=81.25%
Epoch 06: train acc=100.00%  test acc=81.25%
Epoch 07: train acc=100.00%  test acc=81.25%
Epoch 08: train acc=100.00%  test acc=81.25%
Epoch 09: train acc=100.00%  test acc=87.50%
Epoch 10: train acc=100.00%  test acc=81.25%
Epoch 11: train acc=100.00%  test acc=81.25%
Epoch 12: train acc=100.00%  test acc=81.25%
Epoch 13: train acc=100.00%  test acc=81.25%
Epoch 14: train acc=100.00%  test acc=81.25%
Epoch 15: train acc=100.00%  test acc=81.25%
Epoch 16: train acc=100.00%  test acc=81.25%
Epoch 17: train acc=100.00%  test acc=75.00%
Epoch 18: train acc=100.00%  test acc=75.00%
Epoch 19: train acc=100.00%  test acc=75.00%
Epoch 20: train acc=100.00%  test acc=75.00%


In [None]:
# Quick test: classify one hand-written sentence and show label + probability.
test_sentence = (
    "Please don't delete every temporary table after you finish the analysis."
)
pred_label, pred_prob = predict_intent(sentence=test_sentence)
print(f"\nTest sentence: {test_sentence}\nPredicted label: {pred_label} (prob={pred_prob:.3f})")


Test sentence:  don't remove any associated promotional discounts
Predicted label: 1 (prob=0.995)


In [134]:
# Second spot check: a long question-style sentence.
test_sentence = (
    "How many movies have a popularity of more than 400 but less than 500? Indicate the name of the movies and the highest rating score each movie has received."
)
pred_label, pred_prob = predict_intent(sentence=test_sentence)
print(f"\nTest sentence: {test_sentence}\nPredicted label: {pred_label} (prob={pred_prob:.3f})")


Test sentence: How many movies have a popularity of more than 400 but less than 500? Indicate the name of the movies and the highest rating score each movie has received.
Predicted label: 0 (prob=0.001)
