In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from copy import deepcopy

Mounted at /content/drive


In [15]:
def get_tags(text, variable_map):
    tokens = text.split()
    tags = []
    for token in tokens:
        matched = False
        for var in variable_map:
            if var in token:
                tags.append(var)
                matched = True
                break
        if not matched:
            tags.append("O")
    return tags

def preprocess(data):
    examples = []
    template_map = {}
    template_id_counter = 0

    for block in data:
        sql_templates = block["sql"]
        shortest_template = min(sql_templates, key=len).replace('\n', ' ')

        if shortest_template not in template_map:
            template_map[shortest_template] = template_id_counter
            template_id_counter += 1

        for sent in block["sentences"]:
            split = block.get("query-split", "train")
            text = sent["text"]
            variables = sent["variables"]
            filled_question = text
            for var, val in variables.items():
                filled_question = filled_question.replace(var, val)

            examples.append({
                "split": split,
                "question": filled_question,
                "tokens": text.split(),
                "tags": get_tags(text, variables),
                "template_sql": shortest_template,
                "template_id": template_map[shortest_template],
                "sql_gold": sql_templates
            })

    return examples, template_map

In [3]:
class ATISDataset(Dataset):
    def __init__(self, data, vocab, tag_vocab):
        self.data = data
        self.vocab = vocab
        self.tag_vocab = tag_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ex = self.data[idx]
        token_ids = [self.vocab.get(w.lower(), 0) for w in ex["tokens"]]
        tag_ids = [self.tag_vocab.get(t, 0) for t in ex["tags"]]
        return {
            "tokens": torch.tensor(token_ids),
            "tags": torch.tensor(tag_ids),
            "template_id": ex["template_id"]
        }

def collate_fn(batch):
    tokens = nn.utils.rnn.pad_sequence([b["tokens"] for b in batch], batch_first=True)
    tags = nn.utils.rnn.pad_sequence([b["tags"] for b in batch], batch_first=True)
    template_ids = torch.tensor([b["template_id"] for b in batch])
    return {"tokens": tokens, "tags": tags, "template": template_ids}

In [4]:
class LinearTagger(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.out = nn.Linear(64, tag_size)

    def forward(self, x):
        return self.out(self.emb(x))

class LinearClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.out = nn.Linear(64, num_templates)

    def forward(self, x):
        return self.out(self.emb(x).mean(dim=1))

In [5]:
class FFTagger(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.mlp = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, tag_size))

    def forward(self, x):
        return self.mlp(self.emb(x))

class FFClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.mlp = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, num_templates))

    def forward(self, x):
        return self.mlp(self.emb(x).mean(dim=1))

In [6]:
class LSTMTagger(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.lstm = nn.LSTM(128, 128, batch_first=True, num_layers=1, dropout=0, bidirectional=False)
        self.dropout = nn.Dropout(0)
        self.tagger = nn.Linear(128, tag_size)

    def forward(self, x):
        emb = self.emb(x)
        out, _ = self.lstm(emb)
        return self.tagger(self.dropout(out))

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.lstm = nn.LSTM(128, 128, batch_first=True, num_layers=1, dropout=0, bidirectional=False)
        self.dropout = nn.Dropout(0)
        self.classifier = nn.Linear(128, num_templates)

    def forward(self, x):
        emb = self.emb(x)
        _, (h_n, _) = self.lstm(emb)
        return self.classifier(self.dropout(h_n[-1]))

In [7]:
class TransformerTagger(nn.Module):
    def __init__(self, vocab_size, tag_size, max_len=128, nhead=8, num_layers=1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, 128)
        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=nhead, dim_feedforward=256, dropout=0.5, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.tagger = nn.Linear(128, tag_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1)
        x = self.emb(x) + self.pos_emb(positions)
        return self.tagger(self.transformer(x))

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates, max_len=128, nhead=8, num_layers=1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, 128)
        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=nhead, dim_feedforward=256, dropout=0.5, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(128, num_templates)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1)
        x = self.emb(x) + self.pos_emb(positions)
        return self.classifier(self.transformer(x).mean(dim=1))

In [13]:
def evaluate_template_and_tags(name, tagger, classifier, data, vocab, tag_vocab, template_map):
    tagger.eval()
    classifier.eval()
    tag_rev = {v: k for k, v in tag_vocab.items()}

    correct_template = 0
    correct_tags = 0
    total = len(data)

    with torch.no_grad():
        for ex in data:
            tokens = ex["tokens"]
            token_ids = torch.tensor([vocab.get(w.lower(), 0) for w in tokens]).unsqueeze(0)
            pred_template = classifier(token_ids).argmax(dim=1).item()
            if pred_template == ex["template_id"]:
                correct_template += 1

            pred_tags = tagger(token_ids).squeeze(0).argmax(dim=-1).cpu().tolist()
            target_tags = [tag_vocab[t] for t in ex["tags"]]
            correct_tags += sum([p == g for p, g in zip(pred_tags, target_tags)])

    tag_total = sum(len(ex["tags"]) for ex in data)
    tag_acc = correct_tags / tag_total
    template_acc = correct_template / total

    print(f"\nTagging Accuracy on \"{name.lower()}\": {correct_tags}/{tag_total} = {tag_acc:.5f}")
    print(f"Template Classification Accuracy on \"{name.lower()}\": {correct_template}/{total} = {template_acc:.5f}")

In [11]:
with open("atis.json") as dataset:
    atis_data = json.load(dataset)

In [16]:
def train_linear(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[Linear] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = LinearTagger(len(vocab), len(tag_vocab))
classifier = LinearClassifier(len(vocab), len(template_map))
train_linear(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[Linear] Epoch 1: Loss = 6.4668
[Linear] Epoch 2: Loss = 4.6847
[Linear] Epoch 3: Loss = 4.2751
[Linear] Epoch 4: Loss = 3.9803
[Linear] Epoch 5: Loss = 3.7053
[Linear] Epoch 6: Loss = 3.4566
[Linear] Epoch 7: Loss = 3.2271
[Linear] Epoch 8: Loss = 3.0082
[Linear] Epoch 9: Loss = 2.7976
[Linear] Epoch 10: Loss = 2.6385
[Linear] Epoch 11: Loss = 2.4338
[Linear] Epoch 12: Loss = 2.2908
[Linear] Epoch 13: Loss = 2.1414
[Linear] Epoch 14: Loss = 2.0041
[Linear] Epoch 15: Loss = 1.8747
[Linear] Epoch 16: Loss = 1.7531
[Linear] Epoch 17: Loss = 1.6453
[Linear] Epoch 18: Loss = 1.5510
[Linear] Epoch 19: Loss = 1.4452
[Linear] Epoch 20: Loss = 1.3505

Tagging Accuracy on "dev": 1384/1390 = 0.99568
Template Classification Accuracy on "dev": 0/121 = 0.00000

Tagging Accuracy on "test": 3933/3947 = 0.99645
Template Classification Accuracy on "test": 0/347 = 0.00000


In [17]:
def train_feedforward(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[Feedforward] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = FFTagger(len(vocab), len(tag_vocab))
classifier = FFClassifier(len(vocab), len(template_map))
train_feedforward(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[Feedforward] Epoch 1: Loss = 5.3478
[Feedforward] Epoch 2: Loss = 3.9680
[Feedforward] Epoch 3: Loss = 3.2969
[Feedforward] Epoch 4: Loss = 2.7282
[Feedforward] Epoch 5: Loss = 2.2296
[Feedforward] Epoch 6: Loss = 1.8039
[Feedforward] Epoch 7: Loss = 1.4488
[Feedforward] Epoch 8: Loss = 1.1710
[Feedforward] Epoch 9: Loss = 0.9376
[Feedforward] Epoch 10: Loss = 0.7727
[Feedforward] Epoch 11: Loss = 0.6603
[Feedforward] Epoch 12: Loss = 0.5550
[Feedforward] Epoch 13: Loss = 0.4780
[Feedforward] Epoch 14: Loss = 0.4188
[Feedforward] Epoch 15: Loss = 0.3655
[Feedforward] Epoch 16: Loss = 0.3302
[Feedforward] Epoch 17: Loss = 0.2900
[Feedforward] Epoch 18: Loss = 0.2624
[Feedforward] Epoch 19: Loss = 0.2390
[Feedforward] Epoch 20: Loss = 0.2142

Tagging Accuracy on "dev": 1390/1390 = 1.00000
Template Classification Accuracy on "dev": 0/121 = 0.00000

Tagging Accuracy on "test": 3947/3947 = 1.00000
Template Classification Accuracy on "test": 0/347 = 0.00000


In [18]:
def train_lstm(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[LSTM] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = LSTMTagger(len(vocab), len(tag_vocab))
classifier = LSTMClassifier(len(vocab), len(template_map))
train_lstm(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[LSTM] Epoch 1: Loss = 5.5461
[LSTM] Epoch 2: Loss = 3.9807
[LSTM] Epoch 3: Loss = 3.0319
[LSTM] Epoch 4: Loss = 2.3724
[LSTM] Epoch 5: Loss = 1.8882
[LSTM] Epoch 6: Loss = 1.5598
[LSTM] Epoch 7: Loss = 1.2455
[LSTM] Epoch 8: Loss = 1.0397
[LSTM] Epoch 9: Loss = 0.8302
[LSTM] Epoch 10: Loss = 0.6790
[LSTM] Epoch 11: Loss = 0.5680
[LSTM] Epoch 12: Loss = 0.4665
[LSTM] Epoch 13: Loss = 0.3812
[LSTM] Epoch 14: Loss = 0.3260
[LSTM] Epoch 15: Loss = 0.2667
[LSTM] Epoch 16: Loss = 0.2160
[LSTM] Epoch 17: Loss = 0.2324
[LSTM] Epoch 18: Loss = 0.2104
[LSTM] Epoch 19: Loss = 0.1997
[LSTM] Epoch 20: Loss = 0.1567

Tagging Accuracy on "dev": 1390/1390 = 1.00000
Template Classification Accuracy on "dev": 0/121 = 0.00000

Tagging Accuracy on "test": 3941/3947 = 0.99848
Template Classification Accuracy on "test": 0/347 = 0.00000


In [19]:
def train_transformer(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[Transformer] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = TransformerTagger(len(vocab), len(tag_vocab))
classifier = TransformerClassifier(len(vocab), len(template_map))
train_transformer(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[Transformer] Epoch 1: Loss = 4.8375
[Transformer] Epoch 2: Loss = 3.0456
[Transformer] Epoch 3: Loss = 2.2298
[Transformer] Epoch 4: Loss = 1.7113
[Transformer] Epoch 5: Loss = 1.3237
[Transformer] Epoch 6: Loss = 1.0222
[Transformer] Epoch 7: Loss = 0.7708
[Transformer] Epoch 8: Loss = 0.5752
[Transformer] Epoch 9: Loss = 0.4358
[Transformer] Epoch 10: Loss = 0.3149
[Transformer] Epoch 11: Loss = 0.2457
[Transformer] Epoch 12: Loss = 0.2047
[Transformer] Epoch 13: Loss = 0.1767
[Transformer] Epoch 14: Loss = 0.1482
[Transformer] Epoch 15: Loss = 0.1255
[Transformer] Epoch 16: Loss = 0.1136
[Transformer] Epoch 17: Loss = 0.1067
[Transformer] Epoch 18: Loss = 0.0971
[Transformer] Epoch 19: Loss = 0.0905
[Transformer] Epoch 20: Loss = 0.0903

Tagging Accuracy on "dev": 1383/1390 = 0.99496
Template Classification Accuracy on "dev": 0/121 = 0.00000

Tagging Accuracy on "test": 3947/3947 = 1.00000
Template Classification Accuracy on "test": 0/347 = 0.00000
