In [1]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from copy import deepcopy

Mounted at /content/drive


In [24]:
def get_tags(text, variable_map):
    tokens = text.split()
    tags = []
    for token in tokens:
        matched = False
        for var in variable_map:
            if var in token:
                tags.append(var)
                matched = True
                break
        if not matched:
            tags.append("O")
    return tags

def preprocess(data):
    examples = []
    template_map = {}
    template_id_counter = 0

    for block in data:
        sql_templates = block["sql"]
        shortest_template = min(sql_templates, key=len).replace('\n', ' ')

        if shortest_template not in template_map:
            template_map[shortest_template] = template_id_counter
            template_id_counter += 1

        for sent in block["sentences"]:
            split = sent.get("question-split", "train")
            text = sent["text"]
            variables = sent["variables"]
            filled_question = text
            for var, val in variables.items():
                filled_question = filled_question.replace(var, val)

            examples.append({
                "split": split,
                "question": filled_question,
                "tokens": text.split(),
                "tags": get_tags(text, variables),
                "template_sql": shortest_template,
                "template_id": template_map[shortest_template],
                "sql_gold": sql_templates
            })

    return examples, template_map

In [25]:
class ATISDataset(Dataset):
    def __init__(self, data, vocab, tag_vocab):
        self.data = data
        self.vocab = vocab
        self.tag_vocab = tag_vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ex = self.data[idx]
        token_ids = [self.vocab.get(w.lower(), 0) for w in ex["tokens"]]
        tag_ids = [self.tag_vocab.get(t, 0) for t in ex["tags"]]
        return {
            "tokens": torch.tensor(token_ids),
            "tags": torch.tensor(tag_ids),
            "template_id": ex["template_id"]
        }

def collate_fn(batch):
    tokens = nn.utils.rnn.pad_sequence([b["tokens"] for b in batch], batch_first=True)
    tags = nn.utils.rnn.pad_sequence([b["tags"] for b in batch], batch_first=True)
    template_ids = torch.tensor([b["template_id"] for b in batch])
    return {"tokens": tokens, "tags": tags, "template": template_ids}

In [26]:
class LinearTagger(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.out = nn.Linear(64, tag_size)

    def forward(self, x):
        return self.out(self.emb(x))

class LinearClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.out = nn.Linear(64, num_templates)

    def forward(self, x):
        return self.out(self.emb(x).mean(dim=1))

In [27]:
class FFTagger(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.mlp = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, tag_size))

    def forward(self, x):
        return self.mlp(self.emb(x))

class FFClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 64)
        self.mlp = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, num_templates))

    def forward(self, x):
        return self.mlp(self.emb(x).mean(dim=1))

In [28]:
class LSTMTagger(nn.Module):
    def __init__(self, vocab_size, tag_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.lstm = nn.LSTM(128, 128, batch_first=True, num_layers=1, dropout=0, bidirectional=False)
        self.dropout = nn.Dropout(0)
        self.tagger = nn.Linear(128, tag_size)

    def forward(self, x):
        emb = self.emb(x)
        out, _ = self.lstm(emb)
        return self.tagger(self.dropout(out))

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.lstm = nn.LSTM(128, 128, batch_first=True, num_layers=1, dropout=0, bidirectional=False)
        self.dropout = nn.Dropout(0)
        self.classifier = nn.Linear(128, num_templates)

    def forward(self, x):
        emb = self.emb(x)
        _, (h_n, _) = self.lstm(emb)
        return self.classifier(self.dropout(h_n[-1]))

In [29]:
class TransformerTagger(nn.Module):
    def __init__(self, vocab_size, tag_size, max_len=128, nhead=8, num_layers=2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, 128)
        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=nhead, dim_feedforward=256, dropout=0.5, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.tagger = nn.Linear(128, tag_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1)
        x = self.emb(x) + self.pos_emb(positions)
        return self.tagger(self.transformer(x))

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, num_templates, max_len=128, nhead=8, num_layers=2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, 128, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, 128)
        encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=nhead, dim_feedforward=256, dropout=0.5, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(128, num_templates)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1)
        x = self.emb(x) + self.pos_emb(positions)
        return self.classifier(self.transformer(x).mean(dim=1))

In [30]:
def evaluate_template_and_tags(name, tagger, classifier, data, vocab, tag_vocab, template_map):
    tagger.eval()
    classifier.eval()
    tag_rev = {v: k for k, v in tag_vocab.items()}

    correct_template = 0
    correct_tags = 0
    total = len(data)

    with torch.no_grad():
        for ex in data:
            tokens = ex["tokens"]
            token_ids = torch.tensor([vocab.get(w.lower(), 0) for w in tokens]).unsqueeze(0)
            pred_template = classifier(token_ids).argmax(dim=1).item()
            if pred_template == ex["template_id"]:
                correct_template += 1

            pred_tags = tagger(token_ids).squeeze(0).argmax(dim=-1).cpu().tolist()
            target_tags = [tag_vocab[t] for t in ex["tags"]]
            correct_tags += sum([p == g for p, g in zip(pred_tags, target_tags)])

    tag_total = sum(len(ex["tags"]) for ex in data)
    tag_acc = correct_tags / tag_total
    template_acc = correct_template / total

    print(f"\nTagging Accuracy on \"{name.lower()}\": {correct_tags}/{tag_total} = {tag_acc:.5f}")
    print(f"Template Classification Accuracy on \"{name.lower()}\": {correct_template}/{total} = {template_acc:.5f}")

In [9]:
with open("atis.json") as dataset:
    atis_data = json.load(dataset)

In [31]:
def train_linear(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[Linear] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = LinearTagger(len(vocab), len(tag_vocab))
classifier = LinearClassifier(len(vocab), len(template_map))
train_linear(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[Linear] Epoch 1: Loss = 6.7569
[Linear] Epoch 2: Loss = 4.9411
[Linear] Epoch 3: Loss = 4.5473
[Linear] Epoch 4: Loss = 4.2800
[Linear] Epoch 5: Loss = 4.0296
[Linear] Epoch 6: Loss = 3.8115
[Linear] Epoch 7: Loss = 3.5926
[Linear] Epoch 8: Loss = 3.3861
[Linear] Epoch 9: Loss = 3.1779
[Linear] Epoch 10: Loss = 3.0107
[Linear] Epoch 11: Loss = 2.8301
[Linear] Epoch 12: Loss = 2.6644
[Linear] Epoch 13: Loss = 2.5029
[Linear] Epoch 14: Loss = 2.3646
[Linear] Epoch 15: Loss = 2.2378
[Linear] Epoch 16: Loss = 2.1042
[Linear] Epoch 17: Loss = 1.9750
[Linear] Epoch 18: Loss = 1.8582
[Linear] Epoch 19: Loss = 1.7506
[Linear] Epoch 20: Loss = 1.6474

Tagging Accuracy on "dev": 5169/5179 = 0.99807
Template Classification Accuracy on "dev": 306/486 = 0.62963

Tagging Accuracy on "test": 4002/4015 = 0.99676
Template Classification Accuracy on "test": 192/447 = 0.42953


In [32]:
def train_feedforward(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[Feedforward] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = FFTagger(len(vocab), len(tag_vocab))
classifier = FFClassifier(len(vocab), len(template_map))
train_feedforward(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[Feedforward] Epoch 1: Loss = 5.5219
[Feedforward] Epoch 2: Loss = 4.2376
[Feedforward] Epoch 3: Loss = 3.6420
[Feedforward] Epoch 4: Loss = 3.1008
[Feedforward] Epoch 5: Loss = 2.6266
[Feedforward] Epoch 6: Loss = 2.1927
[Feedforward] Epoch 7: Loss = 1.8119
[Feedforward] Epoch 8: Loss = 1.4852
[Feedforward] Epoch 9: Loss = 1.2242
[Feedforward] Epoch 10: Loss = 1.0077
[Feedforward] Epoch 11: Loss = 0.8347
[Feedforward] Epoch 12: Loss = 0.7018
[Feedforward] Epoch 13: Loss = 0.5903
[Feedforward] Epoch 14: Loss = 0.5215
[Feedforward] Epoch 15: Loss = 0.4440
[Feedforward] Epoch 16: Loss = 0.4066
[Feedforward] Epoch 17: Loss = 0.3546
[Feedforward] Epoch 18: Loss = 0.3249
[Feedforward] Epoch 19: Loss = 0.2886
[Feedforward] Epoch 20: Loss = 0.2610

Tagging Accuracy on "dev": 5179/5179 = 1.00000
Template Classification Accuracy on "dev": 337/486 = 0.69342

Tagging Accuracy on "test": 4013/4015 = 0.99950
Template Classification Accuracy on "test": 215/447 = 0.48098


In [33]:
def train_lstm(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[LSTM] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = LSTMTagger(len(vocab), len(tag_vocab))
classifier = LSTMClassifier(len(vocab), len(template_map))
train_lstm(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[LSTM] Epoch 1: Loss = 5.7310
[LSTM] Epoch 2: Loss = 4.2079
[LSTM] Epoch 3: Loss = 3.3437
[LSTM] Epoch 4: Loss = 2.7265
[LSTM] Epoch 5: Loss = 2.2249
[LSTM] Epoch 6: Loss = 1.8099
[LSTM] Epoch 7: Loss = 1.4537
[LSTM] Epoch 8: Loss = 1.2132
[LSTM] Epoch 9: Loss = 0.9721
[LSTM] Epoch 10: Loss = 0.7859
[LSTM] Epoch 11: Loss = 0.6363
[LSTM] Epoch 12: Loss = 0.5125
[LSTM] Epoch 13: Loss = 0.4000
[LSTM] Epoch 14: Loss = 0.3313
[LSTM] Epoch 15: Loss = 0.2926
[LSTM] Epoch 16: Loss = 0.2932
[LSTM] Epoch 17: Loss = 0.2747
[LSTM] Epoch 18: Loss = 0.2054
[LSTM] Epoch 19: Loss = 0.1567
[LSTM] Epoch 20: Loss = 0.1355

Tagging Accuracy on "dev": 5177/5179 = 0.99961
Template Classification Accuracy on "dev": 325/486 = 0.66872

Tagging Accuracy on "test": 4013/4015 = 0.99950
Template Classification Accuracy on "test": 206/447 = 0.46085


In [34]:
def train_transformer(tagger, classifier, dataloader, tag_size, num_templates, epochs=20):
    criterion_tag = nn.CrossEntropyLoss()
    criterion_template = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(tagger.parameters()) + list(classifier.parameters()), lr=1e-3)

    for epoch in range(epochs):
        tagger.train()
        classifier.train()
        total_loss = 0
        for batch in dataloader:
            tokens, tags, templates = batch["tokens"], batch["tags"], batch["template"]
            tag_loss = criterion_tag(tagger(tokens).view(-1, tag_size), tags.view(-1))
            template_loss = criterion_template(classifier(tokens), templates)
            loss = tag_loss + template_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"[Transformer] Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")

examples, template_map = preprocess(atis_data)
train_data = [ex for ex in examples if ex["split"] == "train"]
dev_data = [ex for ex in examples if ex["split"] == "dev"]
test_data = [ex for ex in examples if ex["split"] == "test"]
vocab = {"<PAD>": 0}
tag_vocab = {"O": 0}
for ex in train_data + dev_data + test_data:
    for tok in ex["tokens"]:
        vocab.setdefault(tok.lower(), len(vocab))
    for tag in ex["tags"]:
        tag_vocab.setdefault(tag, len(tag_vocab))
dataset = ATISDataset(train_data, vocab, tag_vocab)
loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
tagger = TransformerTagger(len(vocab), len(tag_vocab))
classifier = TransformerClassifier(len(vocab), len(template_map))
train_transformer(tagger, classifier, loader, len(tag_vocab), len(template_map))
evaluate_template_and_tags("dev", tagger, classifier, dev_data, vocab, tag_vocab, template_map)
evaluate_template_and_tags("test", tagger, classifier, test_data, vocab, tag_vocab, template_map)

[Transformer] Epoch 1: Loss = 5.0490
[Transformer] Epoch 2: Loss = 3.1613
[Transformer] Epoch 3: Loss = 2.3737
[Transformer] Epoch 4: Loss = 1.8978
[Transformer] Epoch 5: Loss = 1.5472
[Transformer] Epoch 6: Loss = 1.2640
[Transformer] Epoch 7: Loss = 1.0397
[Transformer] Epoch 8: Loss = 0.8323
[Transformer] Epoch 9: Loss = 0.6844
[Transformer] Epoch 10: Loss = 0.5675
[Transformer] Epoch 11: Loss = 0.4571
[Transformer] Epoch 12: Loss = 0.3690
[Transformer] Epoch 13: Loss = 0.3135
[Transformer] Epoch 14: Loss = 0.2738
[Transformer] Epoch 15: Loss = 0.2628
[Transformer] Epoch 16: Loss = 0.2399
[Transformer] Epoch 17: Loss = 0.2106
[Transformer] Epoch 18: Loss = 0.1754
[Transformer] Epoch 19: Loss = 0.1703
[Transformer] Epoch 20: Loss = 0.1660

Tagging Accuracy on "dev": 5179/5179 = 1.00000
Template Classification Accuracy on "dev": 361/486 = 0.74280

Tagging Accuracy on "test": 4009/4015 = 0.99851
Template Classification Accuracy on "test": 232/447 = 0.51902
