# Intent Detection and Slot Filling

In [None]:
# Fetch the tutorial repository; the cells below read `requirements.txt`,
# `utils.py` and the `data/` files by relative path, presumably from this checkout.
!git clone https://github.com/GitYCC/bert-minimal-tutorial.git

Cloning into 'bert-minimal-tutorial'...
remote: Enumerating objects: 117, done.[K
remote: Counting objects: 100% (117/117), done.[K
remote: Compressing objects: 100% (96/96), done.[K
remote: Total 117 (delta 60), reused 63 (delta 19), pack-reused 0[K
Receiving objects: 100% (117/117), 38.87 MiB | 11.21 MiB/s, done.
Resolving deltas: 100% (60/60), done.


In [None]:
# Work from inside the cloned repository so the relative paths used later
# (requirements.txt, data/, models/) resolve against it.
%cd bert-minimal-tutorial

/content/bert-minimal-tutorial


In [None]:
!pip install -q -r requirements.txt

[K     |████████████████████████████████| 235kB 15.7MB/s 
[K     |████████████████████████████████| 829kB 24.3MB/s 
[K     |████████████████████████████████| 1.3MB 43.7MB/s 
[K     |████████████████████████████████| 225kB 44.3MB/s 
[K     |████████████████████████████████| 512kB 44.3MB/s 
[K     |████████████████████████████████| 727kB 42.3MB/s 
[K     |████████████████████████████████| 71kB 8.6MB/s 
[K     |████████████████████████████████| 890kB 43.7MB/s 
[K     |████████████████████████████████| 6.8MB 41.2MB/s 
[K     |████████████████████████████████| 25.9MB 124kB/s 
[K     |████████████████████████████████| 1.1MB 42.3MB/s 
[K     |████████████████████████████████| 51kB 7.4MB/s 
[K     |████████████████████████████████| 2.9MB 42.3MB/s 
[K     |████████████████████████████████| 1.3MB 38.3MB/s 
[K     |████████████████████████████████| 133kB 46.2MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l

In [None]:
import os

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, BertPreTrainedModel
from tqdm.notebook import tqdm
from sklearn import metrics
from seqeval.metrics import f1_score as seq_f1_score

from utils import RunningAverage, tokenize_and_map

# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'bert-base-chinese'
# Fixed seed so PyTorch-driven randomness (e.g. the train/valid split and
# shuffling further down) starts from the same state on every run.
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

## Dataloader

In [None]:
# One intent label per line; surrounding whitespace (including the newline) is dropped.
with open('data/cais.train.intent') as fr:
    intents = [raw_line.strip() for raw_line in fr]

In [None]:
# Closed set of intent classes. The position of a label in this list is the
# integer class id used as the training target, so the order must stay fixed.
INTENT_LABELS = [
    'AskKnowledge', 'Chat', 'ControlSystem',
    'GetNews', 'GetWeather',
    'PlayFMCrosstalk', 'PlayFMFiction', 'PlayFMJoke', 'PlayFMStory',
    'PlayMusic', 'PlayPoem',
    'Remind', 'Sing',
]

In [None]:
# Parse the character-level tagging file. Each non-empty line holds one
# character, a one-character separator, then its slot tag; an empty line ends
# the current utterance.
texts = []
tag_lists = []
# The file contains Chinese characters, so decode it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('data/cais.train', encoding='utf-8') as fr:
    text = ''
    tag_list = []
    for line in fr:
        line = line.strip('\n')
        if line == '':
            assert len(text) == len(tag_list)
            texts.append(text)
            tag_lists.append(tag_list)
            text = ''
            tag_list = []
        else:
            char = line[0]
            tag = line[2:]
            text += char
            tag_list.append(tag)
    # Flush the last utterance only if one is pending: when the file ends with
    # a blank line the loop has already stored it, and appending again would
    # add a spurious empty sample.
    if text:
        assert len(text) == len(tag_list)
        texts.append(text)
        tag_lists.append(tag_list)

In [None]:
# Slot types that can be tagged inside an utterance.
SLOTS = [
    'actor_name', 'album_name', 'author_name', 'crosstalk_name',
    'date', 'event',
    'fiction_name', 'fiction_tag',
    'joke_name', 'joke_tag',
    'list', 'location', 'movie_name',
    'news_tag', 'news_time',
    'poem_name', 'poem_tag',
    'ranking', 'singer_name',
    'song_language', 'song_name', 'song_type',
    'story_name', 'story_tag',
]

# Tag vocabulary: 'O' first (class id 0), then the four prefixes B/I/E/S for
# every slot type, grouped by slot in the order of SLOTS.
SLOT_LABELS = ['O']
SLOT_LABELS += [f'{prefix}-{slot}' for slot in SLOTS for prefix in 'BIES']

In [None]:
# NOTE(review): `LABELS` is not referenced by any other cell visible in this
# notebook, and its values do not match the intent/slot label sets defined
# above -- it looks like a leftover from a different task. Confirm and remove.
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
# The three parallel lists must describe the same number of utterances.
assert len({len(intents), len(texts), len(tag_lists)}) == 1

# Print the first sample as a quick visual check of the parsed data.
idx = 0
print('text:', texts[idx])
print('intent:', intents[idx])
print('tag list:', tag_lists[idx])

text: 我要听吴雨霏的歌
intent: PlayMusic
tag list: ['O', 'O', 'O', 'B-singer_name', 'I-singer_name', 'E-singer_name', 'O', 'O']


In [None]:
class IntentAndSlotDataset(Dataset):
    """Turns utterances into BERT inputs and, for training, intent/slot targets.

    Args:
        tokenizer: tokenizer passed to ``tokenize_and_map`` and used to convert
            tokens to ids.
        texts: sequence of raw utterance strings.
        intents: sequence of intent label strings (members of ``INTENT_LABELS``),
            parallel to ``texts``. Required when ``for_train`` is true.
        tag_lists: sequence of per-character slot tag lists (members of
            ``SLOT_LABELS``), parallel to ``texts``. Required when ``for_train``
            is true.
        max_len: maximum sequence length including ``[CLS]`` and ``[SEP]``.
        for_train: when true, items and batches also carry the targets.

    Raises:
        ValueError: if ``for_train`` is true and ``intents``/``tag_lists`` are
            missing or do not have the same length as ``texts``.
    """

    def __init__(self, tokenizer, texts, intents=None, tag_lists=None, max_len=512, for_train=True):
        # Fail at construction time rather than with an obscure error on the
        # first item access deep inside a DataLoader.
        if for_train:
            if intents is None or tag_lists is None:
                raise ValueError('intents and tag_lists are required when for_train=True')
            if not (len(texts) == len(intents) == len(tag_lists)):
                raise ValueError('texts, intents and tag_lists must have the same length')

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.for_train = for_train

        self.texts = texts
        self.intents = intents
        self.tag_lists = tag_lists

    def __getitem__(self, idx):
        """Build the tensors for one utterance.

        Returns:
            ``(input_ids, token_type_ids, attention_mask, info)`` in inference
            mode, or ``(input_ids, token_type_ids, attention_mask, intent,
            slots, info)`` in training mode. ``info`` is a dict with the
            (possibly truncated) ``text``, ``tokens`` and ``index_map``.
        """
        text = self.texts[idx].lower()

        # NOTE(review): `tokenize_and_map` comes from utils and is not visible
        # here; from its use below, `index_map` appears to give, for each
        # character of `text`, the index of its token (or None) -- confirm.
        tokens, index_map = tokenize_and_map(self.tokenizer, text)

        # Reserve two positions for [CLS] and [SEP]; truncate tokens, text and
        # the map together so they stay aligned.
        cut_index = self.max_len - 2
        if cut_index < len(tokens):
            cut_text_index = index_map.index(cut_index)
            tokens = tokens[:cut_index]
            text = text[:cut_text_index]
            index_map = index_map[:cut_text_index]

        processed_tokens = ['[CLS]'] + tokens + ['[SEP]']

        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(processed_tokens))
        token_type_ids = torch.tensor([0] * len(processed_tokens))
        attention_mask = torch.tensor([1] * len(processed_tokens))

        outputs = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            intent = INTENT_LABELS.index(self.intents[idx])
            intent = torch.tensor(intent)

            # Collapse the per-character tags to one label per token: a token's
            # label is the tag of the first character mapped to it.
            slots = []
            tag_list = self.tag_lists[idx]
            for tag, token_index in zip(tag_list, index_map):
                if token_index is None:
                    continue
                if token_index >= len(slots):
                    slots.append(SLOT_LABELS.index(tag))

            slots = [0] + slots + [0]  # for [CLS] and [SEP]
            slots = torch.tensor(slots)
            assert slots.size(0) == input_ids.size(0)

            outputs += (intent, slots, )

        info = {
            'text': text,
            'tokens': tokens,
            'index_map': index_map
        }
        outputs += (info, )
        return outputs

    def __len__(self):
        """Number of utterances in the dataset."""
        return len(self.texts)

    def create_mini_batch(self, samples):
        """Collate ``__getitem__`` outputs into a padded batch.

        Returns:
            ``(input_ids, token_type_ids, attention_mask, intents, slot_lists)``
            in training mode (the per-sample ``info`` dicts are dropped), or
            ``(input_ids, token_type_ids, attention_mask, infos)`` otherwise,
            where ``infos`` is a tuple of the per-sample ``info`` dicts.
        """
        outputs = list(zip(*samples))

        # Zero-pad every sequence in the batch to the same length.
        input_ids = pad_sequence(outputs[0], batch_first=True)
        token_type_ids = pad_sequence(outputs[1], batch_first=True)
        attention_mask = pad_sequence(outputs[2], batch_first=True)

        batch_output = (input_ids, token_type_ids, attention_mask)

        if self.for_train:
            intents = torch.stack(outputs[3])
            slot_lists = pad_sequence(outputs[4], batch_first=True)
            batch_output += (intents, slot_lists, )
        else:
            infos = outputs[3]
            batch_output += (infos, )

        return batch_output

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Ids of the special tokens whose positions are excluded when slot
# predictions are read back.
SKIP_TOKEN_IDS = [
    tokenizer.cls_token_id,
    tokenizer.sep_token_id,
    tokenizer.pad_token_id,
]

dataset = IntentAndSlotDataset(
    tokenizer,
    texts,
    intents=intents,
    tag_lists=tag_lists,
)

# Hold out the remainder after taking CUT_RATIO of the samples for training.
CUT_RATIO = 0.9
train_size = int(len(dataset) * CUT_RATIO)
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = random_split(dataset, lengths=[train_size, valid_size])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [None]:
batch_size = 2

# Both loaders collate with the dataset's own padding routine; only the
# training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=dataset.create_mini_batch,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    collate_fn=dataset.create_mini_batch,
)

## Model

In [None]:
class BertForIntentDetectionAndSlotFilling(BertPreTrainedModel):
    """BERT encoder with an utterance-level intent head and a token-level slot head.

    Args:
        config: BERT configuration; supplies ``hidden_size`` and
            ``hidden_dropout_prob`` for the two heads.
        num_intents: number of intent classes.
        num_slots: number of slot tag classes.
        slot_loss_coef: weight of the slot loss in the combined training loss.
    """

    def __init__(self, config, num_intents, num_slots, slot_loss_coef=1.0):
        super().__init__(config)
        self.num_intents = num_intents
        self.num_slots = num_slots
        self.slot_loss_coef = slot_loss_coef

        self.bert = BertModel(config)
        self.intent_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.intent_classifier = nn.Linear(config.hidden_size, num_intents)
        self.slot_dropout = nn.Dropout(config.hidden_dropout_prob)
        self.slot_classifier = nn.Linear(config.hidden_size, num_slots)

        self.init_weights()

    def forward(self, input_ids, token_type_ids, attention_mask, intents=None, slot_lists=None):
        """Compute intent and slot logits, plus the joint loss when targets are given.

        Returns:
            ``(intent_logits, slot_logits, loss)`` when both ``intents`` and
            ``slot_lists`` are provided, otherwise ``(intent_logits, slot_logits)``.
        """
        encoder_outputs = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        # Index instead of tuple-unpacking: depending on the installed
        # transformers version the encoder returns either a plain tuple or a
        # ModelOutput, and unpacking a ModelOutput yields its keys rather than
        # tensors. Integer indexing works for both.
        sequence_output = encoder_outputs[0]
        pooled_output = encoder_outputs[1]

        pooled_output = self.intent_dropout(pooled_output)
        intent_logits = self.intent_classifier(pooled_output)

        sequence_output = self.slot_dropout(sequence_output)
        slot_logits = self.slot_classifier(sequence_output)

        if intents is not None and slot_lists is not None:
            intent_loss_fct = nn.CrossEntropyLoss()
            intent_loss = intent_loss_fct(intent_logits, intents)

            slot_loss_fct = nn.CrossEntropyLoss()
            # Only positions with attention_mask == 1 contribute to the slot
            # loss, so padded positions are ignored.
            active_mask = attention_mask.view(-1) == 1
            active_logits = slot_logits.view(-1, self.num_slots)[active_mask]
            active_labels = slot_lists.view(-1)[active_mask]
            slot_loss = slot_loss_fct(active_logits, active_labels)

            loss = intent_loss + self.slot_loss_coef * slot_loss

            return intent_logits, slot_logits, loss
        else:
            return intent_logits, slot_logits

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device}')

# Build the joint model on top of the pretrained checkpoint; the head sizes
# follow the two label vocabularies.
model = BertForIntentDetectionAndSlotFilling.from_pretrained(
    MODEL_NAME,
    num_slots=len(SLOT_LABELS),
    num_intents=len(INTENT_LABELS),
)
model.to(device)

device: cuda


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForIntentDetectionAndSlotFilling: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForIntentDetectionAndSlotFilling from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForIntentDetectionAndSlotFilling from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForIntentDetectionAndSlotFilling were not initialized 

BertForIntentDetectionAndSlotFilling(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

## Train

In [None]:
def train_batch(model, data, optimizer, device):
    """Run a single optimisation step on one mini-batch.

    Args:
        model: callable returning ``(intent_logits, slot_logits, loss)`` when
            given inputs and targets as keyword arguments.
        data: iterable of five tensors in the order input_ids, token_type_ids,
            attention_mask, intents, slot_lists.
        optimizer: optimizer over the model's parameters.
        device: device the batch tensors are moved to.

    Returns:
        The batch loss as a Python number.
    """
    model.train()

    batch = tuple(tensor.to(device) for tensor in data)
    input_ids, token_type_ids, attention_mask, intents, slot_lists = batch

    _intent_logits, _slot_logits, batch_loss = model(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_mask,
        intents=intents,
        slot_lists=slot_lists,
    )

    # Clear stale gradients, back-propagate this batch, then update the weights.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

def evaluate(model, valid_loader, device):
    """Evaluate the model on a validation loader.

    Args:
        model: joint intent/slot model returning
            ``(intent_logits, slot_logits, loss)`` when given targets.
        valid_loader: loader yielding batches of
            (input_ids, token_type_ids, attention_mask, intents, slot_lists).
        device: device the batch tensors are moved to.

    Returns:
        ``(loss, intent_accuracy, slot_f1)``: the first two are whatever the
        ``RunningAverage`` accumulators report for the per-batch losses and the
        per-sample intent hits; ``slot_f1`` is the seqeval F1 over all
        non-special-token positions.
    """
    model.eval()

    loss_averager = RunningAverage()
    intent_acc_averager = RunningAverage()
    slot_all_labels, slot_all_preds = [], []
    skip_token_ids = set(SKIP_TOKEN_IDS)

    with torch.no_grad():
        for data in tqdm(valid_loader, desc='evaluate'):
            input_ids, token_type_ids, attention_mask, intents, slot_lists = [d.to(device) for d in data]

            intent_logits, slot_logits, loss = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                intents=intents,
                slot_lists=slot_lists
            )

            loss_averager.add(loss.item())

            intent_corrects = (intent_logits.argmax(dim=-1) == intents).cpu().tolist()
            intent_acc_averager.add_all(intent_corrects)

            # Move each tensor to host memory once per batch. Testing 0-d
            # device tensors for membership one token at a time would force a
            # device synchronisation for every token.
            token_id_rows = input_ids.cpu().tolist()
            label_rows = slot_lists.cpu().tolist()
            pred_rows = slot_logits.argmax(dim=-1).cpu().tolist()
            for token_ids, label_ids, pred_ids in zip(token_id_rows, label_rows, pred_rows):
                # Positions holding [CLS], [SEP] or [PAD] carry no slot label.
                keep = [token_id not in skip_token_ids for token_id in token_ids]
                slot_all_labels.append(
                    [SLOT_LABELS[i] for i, kept in zip(label_ids, keep) if kept])
                slot_all_preds.append(
                    [SLOT_LABELS[i] for i, kept in zip(pred_ids, keep) if kept])

    slot_f1 = seq_f1_score(slot_all_labels, slot_all_preds)
    return loss_averager.get(), intent_acc_averager.get(), slot_f1

In [None]:
# Training hyper-parameters and schedule (all counts are in optimisation steps).
lr = 0.00001
max_iter = 4000
show_per_iter = 200
valid_per_iter = 1000
save_per_iter = 2000
save_checkpoint_dir = 'models/'
model_prefix = 'en_intent_slots_'

# Checkpoint names embed the most recent validation loss, so every save step
# must also be a validation step.
assert save_per_iter % valid_per_iter == 0

optimizer = optim.Adam(model.parameters(), lr=lr)

i = 1
is_running = True
train_loss = RunningAverage()
model_paths = []
while is_running:
    # Keep cycling over the training loader until max_iter steps have run.
    for train_data in train_loader:
        batch_loss = train_batch(model, train_data, optimizer, device)
        train_loss.add(batch_loss)

        if i % show_per_iter == 0:
            print('train [{}]: loss={}'.format(i, train_loss.get()))
            train_loss.flush()

        if i % valid_per_iter == 0:
            # Kept in its own variable so the checkpoint name below can never
            # pick up the per-batch training loss by accident.
            valid_loss, intent_acc, slot_f1 = evaluate(model, valid_loader, device)
            print(f'valid: loss={valid_loss}, intent_acc={intent_acc}, slot_f1={slot_f1}')

        if i % save_per_iter == 0:
            path = os.path.join(save_checkpoint_dir, model_prefix + f'loss{valid_loss:.5}/')
            # Create the target directory up front; not every transformers
            # release creates it inside save_pretrained.
            os.makedirs(path, exist_ok=True)
            print(f'save model at {path}')
            model.save_pretrained(path)
            model_paths.append(path)

        if i == max_iter:
            is_running = False
            break

        i += 1

train [200]: loss=2.821393225193024
train [400]: loss=1.3827903934568166
train [600]: loss=1.035668111294508
train [800]: loss=0.9853589922189713
train [1000]: loss=0.7487957425229251


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=400.0, style=ProgressStyle(description_wid…


valid: loss=0.7197713376954198, intent_acc=0.9325, slot_f1=0.7278617710583154
train [1200]: loss=0.7724239754863084
train [1400]: loss=0.6736878294497728
train [1600]: loss=0.6827305120229721
train [1800]: loss=0.7269253741018474
train [2000]: loss=0.7416138973738998


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=400.0, style=ProgressStyle(description_wid…


valid: loss=0.5705913363490254, intent_acc=0.9425, slot_f1=0.7672273467173087
save model at models/en_intent_slots_loss0.57059/
train [2200]: loss=0.5761318169347942
train [2400]: loss=0.5193181126844137
train [2600]: loss=0.5947787444200366
train [2800]: loss=0.49476780821569266
train [3000]: loss=0.5441268782876432


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=400.0, style=ProgressStyle(description_wid…


valid: loss=0.46358252032427116, intent_acc=0.95375, slot_f1=0.8034744842562432
train [3200]: loss=0.3766982997581363
train [3400]: loss=0.503841726211831
train [3600]: loss=0.49588797326199713
train [3800]: loss=0.31539054627064617
train [4000]: loss=0.3830396037083119


HBox(children=(FloatProgress(value=0.0, description='evaluate', max=400.0, style=ProgressStyle(description_wid…


valid: loss=0.4086999019107316, intent_acc=0.9575, slot_f1=0.8159740960604425
save model at models/en_intent_slots_loss0.4087/


## Predict

In [None]:
# Reload the most recently saved checkpoint; fail with a clear message if the
# training cell never reached a save step.
if not model_paths:
    raise RuntimeError('no checkpoint was saved during training; nothing to reload')
reload_checkpoint = model_paths[-1]

# Separate name so the training corpus in `texts` is not overwritten and the
# earlier cells can still be re-run.
pred_texts = [
    '唱陈奕迅的歌给我听',
    '今天出门要带伞吗'
]

pred_dataset = IntentAndSlotDataset(tokenizer, pred_texts, for_train=False)

pred_loader = DataLoader(
    dataset=pred_dataset,
    batch_size=batch_size,
    collate_fn=pred_dataset.create_mini_batch,
)

model = BertForIntentDetectionAndSlotFilling.from_pretrained(
    reload_checkpoint,
    num_intents=len(INTENT_LABELS),
    num_slots=len(SLOT_LABELS)
)
model.to(device)
# Make inference mode explicit so dropout is disabled.
model.eval()

intent_preds = []
slot_results = []
with torch.no_grad():
    for data in tqdm(pred_loader, desc='predict'):
        input_ids, token_type_ids, attention_mask = [d.to(device) for d in data[:3]]
        infos = data[3]

        intent_logits, slot_logits = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )

        intent_preds += [INTENT_LABELS[i] for i in intent_logits.argmax(dim=-1).cpu().tolist()]

        # Convert to Python lists once per batch instead of comparing 0-d
        # device tensors token by token.
        token_id_rows = input_ids.cpu().tolist()
        slot_pred_rows = slot_logits.argmax(dim=-1).cpu().tolist()
        for token_ids, pred_ids, info in zip(token_id_rows, slot_pred_rows, infos):
            # Drop the predictions at [CLS]/[SEP]/[PAD] positions so the
            # remaining labels line up with the utterance's tokens.
            pred_labels = [SLOT_LABELS[i] for i, token_id in zip(pred_ids, token_ids)
                           if token_id not in SKIP_TOKEN_IDS]
            slot_results.append(list(zip(info['tokens'], pred_labels)))

print('predict result: ')
for text, intent, slot_result in zip(pred_texts, intent_preds, slot_results):
    print(f'  text: {text}\n  intent: {intent}\n  slots: {slot_result}\n')

HBox(children=(FloatProgress(value=0.0, description='predict', max=1.0, style=ProgressStyle(description_width=…


predict result: 
  text: 唱陈奕迅的歌给我听
  intent: PlayMusic
  slots: [('唱', 'O'), ('陈', 'B-singer_name'), ('奕', 'I-singer_name'), ('迅', 'E-singer_name'), ('的', 'O'), ('歌', 'O'), ('给', 'O'), ('我', 'O'), ('听', 'O')]

  text: 今天出门要带伞吗
  intent: GetWeather
  slots: [('今', 'B-date'), ('天', 'E-date'), ('出', 'O'), ('门', 'O'), ('要', 'O'), ('带', 'O'), ('伞', 'O'), ('吗', 'O')]

