In [1]:

import os

from torch.utils.tensorboard import SummaryWriter
import json
from pprint import pprint
import random
import torch
import torch.utils.data as data
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
from conll import evaluate
from sklearn.metrics import classification_report
import time

id_run = time.strftime("%d%m%y_%H%M%S")
writer = SummaryWriter(log_dir='runs/CustomIAS'+id_run)
device = 'cuda:0' # cuda:0 means we are using the GPU with id 0, if you have multiple GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # Used to report errors on CUDA side
PAD_TOKEN = 0

In [2]:
class CustomModelIAS(nn.Module):
    def __init__(self, hid_size, out_slot, out_int, emb_size, vocab_len, n_layer=1, pad_index=0):
        super(CustomModelIAS, self).__init__()
        
        self.embedding = nn.Embedding(vocab_len, emb_size, padding_idx=pad_index)
        
        self.utt_encoder = nn.LSTM(emb_size, hid_size, n_layer, 
                                   bidirectional=True, batch_first=True)
        
        self.dropout = nn.Dropout(0.3)  # Dropout rate can be tuned
        
        # Since bidirectional=True, hidden size is doubled (forward + backward)
        self.slot_out = nn.Linear(hid_size * 2, out_slot)
        self.intent_out = nn.Linear(hid_size * 2, out_int)

    def forward(self, utterance, seq_lengths):
        utt_emb = self.embedding(utterance)
        utt_emb = self.dropout(utt_emb)  # Dropout after embedding

        packed_input = pack_padded_sequence(utt_emb, seq_lengths.cpu().numpy(), batch_first=True)
        packed_output, (last_hidden, cell) = self.utt_encoder(packed_input)
        utt_encoded, input_sizes = pad_packed_sequence(packed_output, batch_first=True)

        # Concatenate last forward and last backward hidden states
        last_hidden_fwd = last_hidden[-2, :, :]  # Forward
        last_hidden_bwd = last_hidden[-1, :, :]  # Backward
        last_hidden_cat = torch.cat((last_hidden_fwd, last_hidden_bwd), dim=1)

        # Apply dropout before output layers
        utt_encoded = self.dropout(utt_encoded)
        last_hidden_cat = self.dropout(last_hidden_cat)

        slots = self.slot_out(utt_encoded)
        intent = self.intent_out(last_hidden_cat)
        
        slots = slots.permute(0, 2, 1)
        return slots, intent

In [3]:
def init_weights(mat):
    for m in mat.modules():
        if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in m.named_parameters():
                if 'weight_ih' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.xavier_uniform_(param[idx*mul:(idx+1)*mul])
                elif 'weight_hh' in name:
                    for idx in range(4):
                        mul = param.shape[0]//4
                        torch.nn.init.orthogonal_(param[idx*mul:(idx+1)*mul])
                elif 'bias' in name:
                    param.data.fill_(0)
        else:
            if type(m) in [nn.Linear]:
                torch.nn.init.uniform_(m.weight, -0.01, 0.01)
                if m.bias != None:
                    m.bias.data.fill_(0.01)

In [4]:
def train_loop(data, optimizer, criterion_slots, criterion_intents, model, clip=5):
    model.train()
    loss_array = []
    for sample in data:
        optimizer.zero_grad() # Zeroing the gradient
        slots, intent = model(sample['utterances'], sample['slots_len'])
        loss_intent = criterion_intents(intent, sample['intents'])
        loss_slot = criterion_slots(slots, sample['y_slots'])
        loss = loss_intent + loss_slot # In joint training we sum the losses. 
                                       # Is there another way to do that?
        loss_array.append(loss.item())
        loss.backward() # Compute the gradient, deleting the computational graph
        # clip the gradient to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step() # Update the weights
    return loss_array

def eval_loop(data, criterion_slots, criterion_intents, model, lang):
    model.eval()
    loss_array = []
    
    ref_intents = []
    hyp_intents = []
    
    ref_slots = []
    hyp_slots = []
    #softmax = nn.Softmax(dim=1) # Use Softmax if you need the actual probability
    with torch.no_grad(): # It used to avoid the creation of computational graph
        for sample in data:
            slots, intents = model(sample['utterances'], sample['slots_len'])
            loss_intent = criterion_intents(intents, sample['intents'])
            loss_slot = criterion_slots(slots, sample['y_slots'])
            loss = loss_intent + loss_slot 
            loss_array.append(loss.item())
            # Intent inference
            # Get the highest probable class
            out_intents = [lang.id2intent[x] 
                           for x in torch.argmax(intents, dim=1).tolist()] 
            gt_intents = [lang.id2intent[x] for x in sample['intents'].tolist()]
            ref_intents.extend(gt_intents)
            hyp_intents.extend(out_intents)
            
            # Slot inference 
            output_slots = torch.argmax(slots, dim=1)
            for id_seq, seq in enumerate(output_slots):
                length = sample['slots_len'].tolist()[id_seq]
                utt_ids = sample['utterance'][id_seq][:length].tolist()
                gt_ids = sample['y_slots'][id_seq].tolist()
                gt_slots = [lang.id2slot[elem] for elem in gt_ids[:length]]
                utterance = [lang.id2word[elem] for elem in utt_ids]
                to_decode = seq[:length].tolist()
                ref_slots.append([(utterance[id_el], elem) for id_el, elem in enumerate(gt_slots)])
                tmp_seq = []
                for id_el, elem in enumerate(to_decode):
                    tmp_seq.append((utterance[id_el], lang.id2slot[elem]))
                hyp_slots.append(tmp_seq)
    try:            
        results = evaluate(ref_slots, hyp_slots)
    except Exception as ex:
        # Sometimes the model predicts a class that is not in REF
        print("Warning:", ex)
        ref_s = set([x[1] for x in ref_slots])
        hyp_s = set([x[1] for x in hyp_slots])
        print(hyp_s.difference(ref_s))
        results = {"total":{"f":0}}
        
    report_intent = classification_report(ref_intents, hyp_intents, 
                                          zero_division=False, output_dict=True)
    return results, report_intent, loss_array

In [5]:
class IntentsAndSlots (data.Dataset):
    # Mandatory methods are __init__, __len__ and __getitem__
    def __init__(self, dataset, lang, unk='unk'):
        self.utterances = []
        self.intents = []
        self.slots = []
        self.unk = unk
        
        for x in dataset:
            self.utterances.append(x['utterance'])
            self.slots.append(x['slots'])
            self.intents.append(x['intent'])

        self.utt_ids = self.mapping_seq(self.utterances, lang.word2id)
        self.slot_ids = self.mapping_seq(self.slots, lang.slot2id)
        self.intent_ids = self.mapping_lab(self.intents, lang.intent2id)

    def __len__(self):
        return len(self.utterances)

    def __getitem__(self, idx):
        utt = torch.Tensor(self.utt_ids[idx])
        slots = torch.Tensor(self.slot_ids[idx])
        intent = self.intent_ids[idx]
        sample = {'utterance': utt, 'slots': slots, 'intent': intent}
        return sample
    
    # Auxiliary methods
    
    def mapping_lab(self, data, mapper):
        return [mapper[x] if x in mapper else mapper[self.unk] for x in data]
    
    def mapping_seq(self, data, mapper): # Map sequences to number
        res = []
        for seq in data:
            tmp_seq = []
            for x in seq.split():
                if x in mapper:
                    tmp_seq.append(mapper[x])
                else:
                    tmp_seq.append(mapper[self.unk])
            res.append(tmp_seq)
        return res

In [6]:

def load_data(path):
    '''
        input: path/to/data
        output: json 
    '''
    dataset = []
    with open(path) as f:
        dataset = json.loads(f.read())
    return dataset

tmp_train_raw = load_data(os.path.join('dataset','ATIS','train.json'))
test_raw = load_data(os.path.join('dataset','ATIS','test.json'))

# pprint(tmp_train_raw[0])

portion = 0.10

intents = [x['intent'] for x in tmp_train_raw] # We stratify on intents
count_y = Counter(intents)

labels = []
inputs = []
mini_train = []

for id_y, y in enumerate(intents):
    if count_y[y] > 1: # If some intents occurs only once, we put them in training
        inputs.append(tmp_train_raw[id_y])
        labels.append(y)
    else:
        mini_train.append(tmp_train_raw[id_y])
# Random Stratify
X_train, X_dev, y_train, y_dev = train_test_split(inputs, labels, test_size=portion, 
                                                    random_state=42, 
                                                    shuffle=True,
                                                    stratify=labels)
X_train.extend(mini_train)
train_raw = X_train
dev_raw = X_dev

y_test = [x['intent'] for x in test_raw]

# Intent distributions
# print('Train:')
# pprint({k:round(v/len(y_train),3)*100 for k, v in sorted(Counter(y_train).items())})
# print('Dev:'), 
# pprint({k:round(v/len(y_dev),3)*100 for k, v in sorted(Counter(y_dev).items())})
# print('Test:') 
# pprint({k:round(v/len(y_test),3)*100 for k, v in sorted(Counter(y_test).items())})
# print('='*89)

# Dataset size
print('TRAIN size:', len(train_raw))
print('DEV size:', len(dev_raw))
print('TEST size:', len(test_raw))

TRAIN size: 4480
DEV size: 498
TEST size: 893


In [7]:
class Lang():
    def __init__(self, words, intents, slots, cutoff=0):
        self.word2id = self.w2id(words, cutoff=cutoff, unk=True)
        self.slot2id = self.lab2id(slots)
        self.intent2id = self.lab2id(intents, pad=False)
        self.id2word = {v:k for k, v in self.word2id.items()}
        self.id2slot = {v:k for k, v in self.slot2id.items()}
        self.id2intent = {v:k for k, v in self.intent2id.items()}
        
    def w2id(self, elements, cutoff=None, unk=True):
        vocab = {'pad': PAD_TOKEN}
        if unk:
            vocab['unk'] = len(vocab)
        count = Counter(elements)
        for k, v in count.items():
            if v > cutoff:
                vocab[k] = len(vocab)
        return vocab
    
    def lab2id(self, elements, pad=True):
        vocab = {}
        if pad:
            vocab['pad'] = PAD_TOKEN
        for elem in elements:
                vocab[elem] = len(vocab)
        return vocab

In [8]:

words = sum([x['utterance'].split() for x in train_raw], []) # No set() since we want to compute 
                                                            # the cutoff
corpus = train_raw + dev_raw + test_raw # We do not wat unk labels, 
                                        # however this depends on the research purpose
slots = set(sum([line['slots'].split() for line in corpus],[]))
intents = set([line['intent'] for line in corpus])

lang = Lang(words, intents, slots, cutoff=0)

# Create our datasets
train_dataset = IntentsAndSlots(train_raw, lang)
dev_dataset = IntentsAndSlots(dev_raw, lang)
test_dataset = IntentsAndSlots(test_raw, lang)

In [9]:
def collate_fn(data):
    def merge(sequences):
        '''
        merge from batch * sent_len to batch * max_len 
        '''
        lengths = [len(seq) for seq in sequences]
        max_len = 1 if max(lengths)==0 else max(lengths)
        # Pad token is zero in our case
        # So we create a matrix full of PAD_TOKEN (i.e. 0) with the shape 
        # batch_size X maximum length of a sequence
        padded_seqs = torch.LongTensor(len(sequences),max_len).fill_(PAD_TOKEN)
        for i, seq in enumerate(sequences):
            end = lengths[i]
            padded_seqs[i, :end] = seq # We copy each sequence into the matrix
        # print(padded_seqs)
        padded_seqs = padded_seqs.detach()  # We remove these tensors from the computational graph
        return padded_seqs, lengths
    # Sort data by seq lengths
    data.sort(key=lambda x: len(x['utterance']), reverse=True) 
    new_item = {}
    for key in data[0].keys():
        new_item[key] = [d[key] for d in data]
        
    # We just need one length for packed pad seq, since len(utt) == len(slots)
    src_utt, _ = merge(new_item['utterance'])
    y_slots, y_lengths = merge(new_item["slots"])
    intent = torch.LongTensor(new_item["intent"])
    
    src_utt = src_utt.to(device) # We load the Tensor on our selected device
    y_slots = y_slots.to(device)
    intent = intent.to(device)
    y_lengths = torch.LongTensor(y_lengths).to(device)
    
    new_item["utterances"] = src_utt
    new_item["intents"] = intent
    new_item["y_slots"] = y_slots
    new_item["slots_len"] = y_lengths
    return new_item

# Dataloader instantiations
train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn,  shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=64, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=collate_fn)

In [10]:

n_epochs = 200
patience = 3
losses_train = []
losses_dev = []
sampled_epochs = []
best_f1 = 0

hid_size = 200
emb_size = 300
lr = 0.0001 # learning rate
clip = 5 # Clip the gradient
out_slot = len(lang.slot2id)
out_int = len(lang.intent2id)
vocab_len = len(lang.word2id)

# Re-initialize model to include new bidirectional + dropout changes
model = CustomModelIAS(hid_size, out_slot, out_int, emb_size, vocab_len, pad_index=PAD_TOKEN).to(device)
model.apply(init_weights)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion_slots = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
criterion_intents = nn.CrossEntropyLoss() # Because we do not have the pad token

for epoch in tqdm(range(1, n_epochs)):
    # Training loop
    loss = train_loop(train_loader, optimizer, criterion_slots, 
                      criterion_intents, model, clip=clip)
    
    if epoch % 5 == 0:  # Evaluate every 5 epochs
        sampled_epochs.append(epoch)
        losses_train.append(np.asarray(loss).mean())

        # Dev set evaluation
        results_dev, intent_res, loss_dev = eval_loop(dev_loader, 
                                                      criterion_slots, 
                                                      criterion_intents, 
                                                      model, lang)
        losses_dev.append(np.asarray(loss_dev).mean())
        
        f1 = results_dev['total']['f']
        print(f"Epoch {epoch} — Slot F1: {f1:.4f}, Intent Acc: {intent_res['accuracy']:.4f}")

        writer.add_scalar(tag="Loss/val", 
                          scalar_value=losses_dev[-1], 
                          global_step=epoch)
        writer.add_scalar(tag="Intent_Accuracy/val", 
                          scalar_value=intent_res['accuracy'], 
                          global_step=epoch)
        writer.add_scalar(tag="Slot_F1/val",
                          scalar_value=f1, 
                          global_step=epoch)
        writer.add_scalar(tag="Loss/train",
                          scalar_value=losses_train[-1], 
                          global_step=epoch)
        
        # Save best model (optional)
        if f1 > best_f1:
            best_f1 = f1
            best_model = model.state_dict()
            patience = 3
        else:
            patience -= 1
        
        if patience <= 0:
            print("Early stopping triggered.")
            break

# Load best model for final evaluation
model.load_state_dict(best_model)

# Test set evaluation
results_test, intent_test, _ = eval_loop(test_loader, criterion_slots, 
                                         criterion_intents, model, lang)
print('Final Test Results')
print('Slot F1:', results_test['total']['f'])
print('Intent Accuracy:', intent_test['accuracy'])

# Save the model
torch.save(model.state_dict(), 'model_bin/model_customias'+id_run+'.pth')

writer.close()

  3%|▎         | 5/199 [00:07<04:35,  1.42s/it]

Epoch 5 — Slot F1: 0.1473, Intent Acc: 0.7369


  5%|▌         | 10/199 [00:14<04:35,  1.46s/it]

Epoch 10 — Slot F1: 0.4289, Intent Acc: 0.7851


  8%|▊         | 15/199 [00:21<04:27,  1.45s/it]

Epoch 15 — Slot F1: 0.5894, Intent Acc: 0.8855


 10%|█         | 20/199 [00:28<04:04,  1.37s/it]

Epoch 20 — Slot F1: 0.7152, Intent Acc: 0.9116


 13%|█▎        | 25/199 [00:34<03:46,  1.30s/it]

Epoch 25 — Slot F1: 0.7823, Intent Acc: 0.9317


 15%|█▌        | 30/199 [00:40<03:42,  1.31s/it]

Epoch 30 — Slot F1: 0.8207, Intent Acc: 0.9337


 18%|█▊        | 35/199 [00:47<03:36,  1.32s/it]

Epoch 35 — Slot F1: 0.8362, Intent Acc: 0.9418


 20%|██        | 40/199 [00:54<03:27,  1.30s/it]

Epoch 40 — Slot F1: 0.8520, Intent Acc: 0.9558


 23%|██▎       | 45/199 [01:00<03:19,  1.29s/it]

Epoch 45 — Slot F1: 0.8613, Intent Acc: 0.9598


 25%|██▌       | 50/199 [01:06<03:12,  1.29s/it]

Epoch 50 — Slot F1: 0.8745, Intent Acc: 0.9618


 28%|██▊       | 55/199 [01:13<03:07,  1.30s/it]

Epoch 55 — Slot F1: 0.8846, Intent Acc: 0.9618


 30%|███       | 60/199 [01:19<03:00,  1.30s/it]

Epoch 60 — Slot F1: 0.8937, Intent Acc: 0.9719


 33%|███▎      | 65/199 [01:25<02:53,  1.29s/it]

Epoch 65 — Slot F1: 0.9055, Intent Acc: 0.9739


 35%|███▌      | 70/199 [01:32<02:46,  1.29s/it]

Epoch 70 — Slot F1: 0.9146, Intent Acc: 0.9759


 38%|███▊      | 75/199 [01:38<02:40,  1.29s/it]

Epoch 75 — Slot F1: 0.9282, Intent Acc: 0.9759


 40%|████      | 80/199 [01:45<02:34,  1.30s/it]

Epoch 80 — Slot F1: 0.9331, Intent Acc: 0.9759


 43%|████▎     | 85/199 [01:51<02:27,  1.30s/it]

Epoch 85 — Slot F1: 0.9368, Intent Acc: 0.9779


 45%|████▌     | 90/199 [01:57<02:22,  1.31s/it]

Epoch 90 — Slot F1: 0.9454, Intent Acc: 0.9799


 48%|████▊     | 95/199 [02:04<02:15,  1.30s/it]

Epoch 95 — Slot F1: 0.9508, Intent Acc: 0.9799


 50%|█████     | 100/199 [02:10<02:08,  1.30s/it]

Epoch 100 — Slot F1: 0.9538, Intent Acc: 0.9799


 53%|█████▎    | 105/199 [02:17<02:02,  1.30s/it]

Epoch 105 — Slot F1: 0.9582, Intent Acc: 0.9839


 55%|█████▌    | 110/199 [02:23<01:55,  1.29s/it]

Epoch 110 — Slot F1: 0.9624, Intent Acc: 0.9819


 58%|█████▊    | 115/199 [02:29<01:49,  1.31s/it]

Epoch 115 — Slot F1: 0.9657, Intent Acc: 0.9779


 60%|██████    | 120/199 [02:36<01:42,  1.30s/it]

Epoch 120 — Slot F1: 0.9668, Intent Acc: 0.9819


 63%|██████▎   | 125/199 [02:42<01:35,  1.30s/it]

Epoch 125 — Slot F1: 0.9680, Intent Acc: 0.9819


 65%|██████▌   | 130/199 [02:49<01:29,  1.29s/it]

Epoch 130 — Slot F1: 0.9684, Intent Acc: 0.9839


 68%|██████▊   | 135/199 [02:55<01:25,  1.34s/it]

Epoch 135 — Slot F1: 0.9701, Intent Acc: 0.9839


 70%|███████   | 140/199 [03:02<01:18,  1.33s/it]

Epoch 140 — Slot F1: 0.9713, Intent Acc: 0.9839


 73%|███████▎  | 145/199 [03:09<01:13,  1.35s/it]

Epoch 145 — Slot F1: 0.9716, Intent Acc: 0.9819


 75%|███████▌  | 150/199 [03:15<01:06,  1.35s/it]

Epoch 150 — Slot F1: 0.9713, Intent Acc: 0.9819


 78%|███████▊  | 155/199 [03:22<00:59,  1.35s/it]

Epoch 155 — Slot F1: 0.9722, Intent Acc: 0.9839


 80%|████████  | 160/199 [03:29<00:54,  1.39s/it]

Epoch 160 — Slot F1: 0.9740, Intent Acc: 0.9819


 83%|████████▎ | 165/199 [03:36<00:45,  1.34s/it]

Epoch 165 — Slot F1: 0.9731, Intent Acc: 0.9819


 85%|████████▌ | 170/199 [03:42<00:39,  1.37s/it]

Epoch 170 — Slot F1: 0.9760, Intent Acc: 0.9839


 88%|████████▊ | 175/199 [03:49<00:31,  1.31s/it]

Epoch 175 — Slot F1: 0.9766, Intent Acc: 0.9839


 90%|█████████ | 180/199 [03:55<00:24,  1.30s/it]

Epoch 180 — Slot F1: 0.9766, Intent Acc: 0.9839


 93%|█████████▎| 185/199 [04:01<00:18,  1.29s/it]

Epoch 185 — Slot F1: 0.9772, Intent Acc: 0.9839


 95%|█████████▌| 190/199 [04:08<00:11,  1.29s/it]

Epoch 190 — Slot F1: 0.9763, Intent Acc: 0.9839


 98%|█████████▊| 195/199 [04:14<00:05,  1.29s/it]

Epoch 195 — Slot F1: 0.9763, Intent Acc: 0.9839


100%|██████████| 199/199 [04:19<00:00,  1.30s/it]


Final Test Results
Slot F1: 0.941903584672435
Intent Accuracy: 0.9540873460246361
