# Practice 1. BERT를 이용한 NLU 실습

# 목차
## Step 1. 데이터 전처리 (Data Preprocessing)
## Step 2. BERT 세부조정하기 (Fine-tuning)
## Step 3. 학습한 모델 평가하기

최근 자연어처리 모듈에서 높은 성능을 보이는 대규모 사전학습 언어모델의 하나인 BERT를 활용하여 대화시스템의 모듈 중 하나인 NLU를 구현해보는 것이 이번 실습의 목표입니다.

## Step 0. 사전설정
아래 코드 블럭에서는 이 실습을 수행하기 위해 필요한 모듈들을 import합니다.

In [None]:
# For data preprocessing
import json
import os
import zipfile
import sys
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# For BERT fine-tuning and postprocessing
from torch.utils.tensorboard import SummaryWriter
import random
import numpy as np
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
from convlab2.nlu.jointBERT.dataloader import Dataloader
from convlab2.nlu.jointBERT.jointBERT import JointBERT

# GPU index, kept as a string because environment variable values must be strings.
CUDA_IDX = '0'
# Limit the GPUs visible to this process through the CUDA_VISIBLE_DEVICES environment variable.
os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_IDX

아래 코드 블럭에서는 다른 코드 구현을 간편하게 해주는 helper function들을 정의합니다.

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed unchanged to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


def read_zipped_json(filepath, filename):
    """Load a JSON document stored inside a zip archive.

    Args:
        filepath: Path to the zip archive.
        filename: Name of the JSON member inside the archive.

    Returns:
        The object decoded by ``json.load`` from that member.
    """
    # Context managers close both the archive and the member stream,
    # even when decoding fails.
    with zipfile.ZipFile(filepath, 'r') as archive:
        with archive.open(filename) as member:
            return json.load(member)


def phrase_in_utt(phrase, utt):
    """Case-insensitively test whether ``phrase`` starts ``utt`` or follows a space in it."""
    needle = phrase.lower()
    haystack = utt.lower()
    if haystack.startswith(needle):
        return True
    return ' ' + needle in haystack


def phrase_idx_utt(phrase, utt):
    """Locate ``phrase`` in ``utt`` (case-insensitively) as a span of word indices.

    A match has to start at the beginning of the utterance or right after a
    space, so the phrase can never begin in the middle of a word.

    Args:
        phrase: Phrase to look for.
        utt: Utterance whose words are separated by whitespace.

    Returns:
        ``(word_index_begin, word_index_end)``, both inclusive, for the first
        such match, or ``None`` when there is none.
    """
    # Prefixing both strings with a space turns "at the start or after a space"
    # into a single substring search, and makes the index lookup use the same
    # word-boundary match as the membership test. Searching for the bare phrase
    # could hit an earlier occurrence inside another word
    # (e.g. "east" inside "northeast") and yield a wrong span.
    padded_phrase = ' ' + phrase.lower()
    padded_utt = ' ' + utt.lower()
    if padded_phrase in padded_utt:
        return get_idx(padded_phrase, padded_utt)
    return None


def get_idx(phrase, utt):
    """Convert the first occurrence of ``phrase`` in ``utt`` to word indices.

    Args:
        phrase: Substring to find; the comparison is case-sensitive.
        utt: String to search in.

    Returns:
        ``(word_index_begin, word_index_end)`` obtained by counting the
        whitespace-separated words that precede the start and the end of
        the match.

    Raises:
        ValueError: If ``phrase`` does not occur in ``utt``.
    """
    char_index_begin = utt.index(phrase)
    char_index_end = char_index_begin + len(phrase)
    # Number of words before the match == index of the first matched word.
    word_index_begin = len(utt[:char_index_begin].split())
    # Number of words up to the end of the match, minus one == index of the last matched word.
    word_index_end = len(utt[:char_index_end].split()) - 1
    return word_index_begin, word_index_end


def da2triples(dialog_act):
    """Flatten a dialog-act mapping into a list of ``[intent, slot, value]`` triples.

    Args:
        dialog_act: Mapping from intent name to an iterable of ``(slot, value)`` pairs.

    Returns:
        One three-element list per pair, in the mapping's iteration order.
    """
    return [
        [intent, slot, value]
        for intent, slot_values in dialog_act.items()
        for slot, value in slot_values
    ]


def das2tags(sen, das):
    """Turn an utterance and its dialog acts into token-level labels.

    Args:
        sen: Raw utterance string.
        das: Mapping from dialog-act name to a list of ``[slot, value]`` pairs.

    Returns:
        A 4-tuple ``(tokens, tags, intents, triples)``:
        tokens is ``word_tokenize(sen)``; tags holds one
        ``"B-act+slot"`` / ``"I-act+slot"`` / ``"O"`` label per token;
        intents holds an ``"act+slot*value"`` label for every pair that was
        not located as a span; triples is the ``da2triples`` flattening of
        the (re-tokenized) dialog acts.
    """
    tokens = word_tokenize(sen)
    tokenized_sen = ' '.join(tokens)
    normalized_das = {}
    spans = []
    intents = []
    for act, slot_values in das.items():
        pairs = normalized_das.setdefault(act, [])
        for slot, value in slot_values:
            located = None
            if act == 'inform':
                # Re-tokenize the value so it is spaced like the tokenized sentence.
                value = ' '.join(word_tokenize(value))
                if value != 'dontcare':
                    located = phrase_idx_utt(value, tokenized_sen)
            if located is None:
                # Not an 'inform' span found in the sentence: keep it as a sentence-level label.
                intents.append(act + '+' + slot + '*' + value)
            else:
                begin, end = located
                spans.append((act, slot, value, begin, end))
            pairs.append([slot, value])

    def tag_of(position):
        # The first span (in insertion order) covering the position decides its tag.
        for span_act, span_slot, _value, begin, end in spans:
            if position == begin:
                return "B-" + span_act + "+" + span_slot
            if begin < position <= end:
                return "I-" + span_act + "+" + span_slot
        return "O"

    tags = [tag_of(position) for position in range(len(tokens))]

    return tokens, tags, intents, da2triples(normalized_das)

# Step 1. 데이터 전처리 (Data Preprocessing)

- BERT는 토큰화된 자연어(Natural Language)들을 입력받는 모델인데, NLU의 예측 타겟(prediction target)들은 모두 구조화된 데이터입니다. 
- 데이터 전처리 단계에서는 특수토큰(special token)들을 이용하여 구조화된 데이터를 자연어로 바꿔주는 처리를 합니다.
- 또한 train, validation, test 셋 각각을 전처리하여 별도의 json 파일로 저장합니다.

먼저 아래 코드블럭에서는 실습에서 사용할 데이터셋인 Camrest 데이터셋을 불러옵니다.

In [None]:
# Locate the raw CamRest archives and prepare the directory for the processed output.
cur_dir = os.path.abspath(os.curdir)
data_dir = "ConvLab-2/data/camrest"
processed_data_dir = os.path.join(cur_dir, 'data/all_data')
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(processed_data_dir, exist_ok=True)

# Names of the dataset splits; the later preprocessing cell iterates over this list.
data_key = ['train', 'val', 'test']
data = {}
for key in data_key:
    # Each split is read from the member "<split>.json" of the archive "<split>.json.zip".
    data[key] = read_zipped_json(os.path.join(data_dir, key + '.json.zip'), key + '.json')
    print('load {}, size {}'.format(key, len(data[key])))

아래 코드블럭에서는 본격적으로 전처리 작업을 진행합니다.

In [None]:
# Which speakers to build samples for: 'usr', 'sys' or 'all' (both).
mode = 'all'
processed_data = {}
all_da = []
all_intent = []
all_tag = []
# Number of most recent utterances stored with each sample as its context.
context_size = 3
for key in data_key:
    processed_data[key] = []
    for dialog in data[key]:
        context = []
        for turn in dialog['dial']:
            # A turn holds the user utterance followed by the system utterance;
            # the two differ only in the field that stores the text.
            for speaker, text_field in (('usr', 'transcript'), ('sys', 'sent')):
                utterance = turn[speaker][text_field]
                if mode == speaker or mode == 'all':
                    tokens, tags, intents, new_das = das2tags(utterance, turn[speaker]['dialog_act'])

                    # The context slice is taken before the current utterance is appended,
                    # so a sample never contains its own text as context.
                    processed_data[key].append([tokens, tags, intents, new_das, context[-context_size:]])

                    all_da += [da for da in turn[speaker]['dialog_act']]
                    all_intent += intents
                    all_tag += tags

                # The dialog history grows regardless of `mode`.
                context.append(utterance)

    # Deduplicate while keeping first-seen order.
    all_da = list(dict.fromkeys(all_da))
    all_intent = list(dict.fromkeys(all_intent))
    all_tag = list(dict.fromkeys(all_tag))

    print('loaded {}, size {}'.format(key, len(processed_data[key])))
    # The context manager guarantees the file is flushed and closed before later cells read it.
    with open(os.path.join(processed_data_dir, '{}_data.json'.format(key)), 'w') as f:
        json.dump(processed_data[key], f, indent=2)

아래 코드블럭에서는 전처리 과정에서 수집한 dialog act, intent, tag 목록(vocabulary)을 각각 json 파일에 저장합니다.

In [None]:
# Report the vocabulary sizes collected during preprocessing and persist each vocabulary.
print('dialog act num:', len(all_da))
print('sentence label num:', len(all_intent))
print('tag num:', len(all_tag))
for vocab_file, vocab in (('all_act.json', all_da),
                          ('intent_vocab.json', all_intent),
                          ('tag_vocab.json', all_tag)):
    # `with` closes (and therefore flushes) every file before the next cell reads it back.
    with open(os.path.join(processed_data_dir, vocab_file), 'w') as f:
        json.dump(vocab, f, indent=2)


# Step 2. BERT 세부조정하기 (Fine-tuning)
Pre-training된 BERT의 파라미터를 불러온 뒤 Camrest 데이터셋에 맞게 파라미터를 Fine-tuning 하는 부분을 실습해봅시다.

## Step 2.1 환경설정

In [None]:
### Configurations ###
# Everything produced by training (checkpoints, logs) lives under the processed-data directory.
main_dir = processed_data_dir
config = {
  "data_dir": main_dir,
  "output_dir": main_dir + "/output/all",
  "zipped_model_path": main_dir + "/output/all/bert_camrest_all.zip",
  "log_dir": main_dir + "/log/all",
  # CUDA_VISIBLE_DEVICES already restricts the process to the chosen GPU(s), and the
  # visible devices are renumbered from 0. "cuda:" + CUDA_IDX is therefore only valid
  # when CUDA_IDX == '0'. Always address the first visible GPU, and fall back to the
  # CPU when no GPU is available.
  "DEVICE": "cuda:0" if torch.cuda.is_available() else "cpu",
  "seed": 2019,
  "cut_sen_len": 40,
  "use_bert_tokenizer": True,
  "model": {
    # True: update all parameters with AdamW and a linear schedule;
    # False: freeze parameters whose name contains 'bert' (see the model-setup cell).
    "finetune": True,
    "context": False,
    "context_grad": False,
    "pretrained_weights": "bert-base-uncased",
    # Validate every `check_step` training steps, for `max_step` steps in total.
    "check_step": 1000,
    "max_step": 10000,
    "batch_size": 20,
    "learning_rate": 3e-5,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "weight_decay": 0.0,
    "dropout": 0.1,
    "hidden_units": 768
  }
}

# NOTE: this rebinds `data_dir` from the raw-data location to the processed-data location.
data_dir = config['data_dir']
output_dir = config['output_dir']
log_dir = config['log_dir']
DEVICE = config['DEVICE']

set_seed(config['seed'])

print('-' * 20 + 'dataset:camrest' + '-' * 20)
from convlab2.nlu.jointBERT.camrest.postprocess import is_slot_da, calculateF1, recover_intent

## Step 2.2 Pre-trained BERT 모델 불러오기

In [None]:
# Read the label vocabularies written by the preprocessing step.
with open(os.path.join(data_dir, 'intent_vocab.json')) as f:
    intent_vocab = json.load(f)
with open(os.path.join(data_dir, 'tag_vocab.json')) as f:
    tag_vocab = json.load(f)
dataloader = Dataloader(intent_vocab=intent_vocab, tag_vocab=tag_vocab,
                        pretrained_weights=config['model']['pretrained_weights'])
print('intent num:', len(intent_vocab))
print('tag num:', len(tag_vocab))
# A dedicated loop variable keeps the `data_key` list of the data-loading cell intact,
# so the preprocessing cell can still be re-run after this one.
for split in ['train', 'val', 'test']:
    print(split)
    with open(os.path.join(data_dir, '{}_data.json'.format(split))) as f:
        split_data = json.load(f)
    dataloader.load_data(split_data, split,
                         cut_sen_len=config['cut_sen_len'], use_bert_tokenizer=config['use_bert_tokenizer'])
    print('{} set size: {}'.format(split, len(dataloader.data[split])))

# exist_ok makes directory creation idempotent without a separate existence check.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

writer = SummaryWriter(log_dir)

model = JointBERT(config['model'], DEVICE, dataloader.tag_dim, dataloader.intent_dim, dataloader.intent_weight)
model.to(DEVICE)

if config['model']['finetune']:
    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if
                    not any(nd in n for nd in no_decay) and p.requires_grad],
            'weight_decay': config['model']['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
            'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config['model']['learning_rate'],
                        eps=config['model']['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config['model']['warmup_steps'],
                                                num_training_steps=config['model']['max_step'])
else:
    # Freeze every parameter whose name contains 'bert' and train only the remaining ones.
    # No scheduler is created on this path; the training loop only steps it when finetune is True.
    for n, p in model.named_parameters():
        if 'bert' in n:
            p.requires_grad = False
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                    lr=config['model']['learning_rate'])

# Print every parameter so the trainable/frozen split and device placement can be checked.
for name, param in model.named_parameters():
    print(name, param.shape, param.device, param.requires_grad)

## Step 2.3 불러온 BERT 모델을 Fine-tuning 하기

In [None]:
max_step = config['model']['max_step']
check_step = config['model']['check_step']
batch_size = config['model']['batch_size']
model.zero_grad()
# Running sums of the training losses since the last validation check.
train_slot_loss, train_intent_loss = 0, 0
best_val_f1 = 0.

writer.add_text('config', json.dumps(config))

for step in range(1, max_step + 1):
    model.train()
    batched_data = dataloader.get_train_batch(batch_size)
    batched_data = tuple(t.to(DEVICE) for t in batched_data)
    (word_seq_tensor, tag_seq_tensor, intent_tensor, word_mask_tensor, tag_mask_tensor,
     context_seq_tensor, context_mask_tensor) = batched_data
    if not config['model']['context']:
        context_seq_tensor, context_mask_tensor = None, None
    # Call the module itself rather than .forward() so registered hooks are honoured.
    _, _, slot_loss, intent_loss = model(word_seq_tensor, word_mask_tensor, tag_seq_tensor, tag_mask_tensor,
                                         intent_tensor, context_seq_tensor, context_mask_tensor)
    train_slot_loss += slot_loss.item()
    train_intent_loss += intent_loss.item()
    loss = slot_loss + intent_loss
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    if config['model']['finetune']:
        scheduler.step()  # Update learning rate schedule
    model.zero_grad()
    if step % check_step == 0:
        # Average the accumulated training losses over the steps since the last check.
        train_slot_loss = train_slot_loss / check_step
        train_intent_loss = train_intent_loss / check_step
        print('[%d|%d] step' % (step, max_step))
        print('\t slot loss:', train_slot_loss)
        print('\t intent loss:', train_intent_loss)

        predict_golden = {'intent': [], 'slot': [], 'overall': []}

        val_slot_loss, val_intent_loss = 0, 0
        model.eval()
        for pad_batch, ori_batch, real_batch_size in dataloader.yield_batches(batch_size, data_key='val'):
            pad_batch = tuple(t.to(DEVICE) for t in pad_batch)
            (word_seq_tensor, tag_seq_tensor, intent_tensor, word_mask_tensor, tag_mask_tensor,
             context_seq_tensor, context_mask_tensor) = pad_batch
            if not config['model']['context']:
                context_seq_tensor, context_mask_tensor = None, None

            with torch.no_grad():
                slot_logits, intent_logits, slot_loss, intent_loss = model(word_seq_tensor,
                                                                           word_mask_tensor,
                                                                           tag_seq_tensor,
                                                                           tag_mask_tensor,
                                                                           intent_tensor,
                                                                           context_seq_tensor,
                                                                           context_mask_tensor)
            # Weight each batch loss by its real size so the later division by the
            # number of validation samples yields a per-sample average.
            val_slot_loss += slot_loss.item() * real_batch_size
            val_intent_loss += intent_loss.item() * real_batch_size
            for j in range(real_batch_size):
                predicts = recover_intent(dataloader, intent_logits[j], slot_logits[j], tag_mask_tensor[j],
                                            ori_batch[j][0], ori_batch[j][-4])
                labels = ori_batch[j][3]

                predict_golden['overall'].append({
                    'predict': predicts,
                    'golden': labels
                })
                predict_golden['slot'].append({
                    'predict': [x for x in predicts if is_slot_da(x)],
                    'golden': [x for x in labels if is_slot_da(x)]
                })
                predict_golden['intent'].append({
                    'predict': [x for x in predicts if not is_slot_da(x)],
                    'golden': [x for x in labels if not is_slot_da(x)]
                })

        # Log up to ten samples; bounding by the list length avoids an IndexError
        # when the validation set holds fewer than ten utterances.
        for j in range(min(10, len(predict_golden['overall']))):
            writer.add_text('val_sample_{}'.format(j),
                            json.dumps(predict_golden['overall'][j], indent=2, ensure_ascii=False),
                            global_step=step)

        total = len(dataloader.data['val'])
        val_slot_loss /= total
        val_intent_loss /= total
        print('%d samples val' % total)
        print('\t slot loss:', val_slot_loss)
        print('\t intent loss:', val_intent_loss)

        writer.add_scalar('intent_loss/train', train_intent_loss, global_step=step)
        writer.add_scalar('intent_loss/val', val_intent_loss, global_step=step)

        writer.add_scalar('slot_loss/train', train_slot_loss, global_step=step)
        writer.add_scalar('slot_loss/val', val_slot_loss, global_step=step)

        val_f1 = {}
        for x in ['intent', 'slot', 'overall']:
            precision, recall, F1 = calculateF1(predict_golden[x])
            val_f1[x] = F1
            print('-' * 20 + x + '-' * 20)
            print('\t Precision: %.2f' % (100 * precision))
            print('\t Recall: %.2f' % (100 * recall))
            print('\t F1: %.2f' % (100 * F1))

            writer.add_scalar('val_{}/precision'.format(x), precision, global_step=step)
            writer.add_scalar('val_{}/recall'.format(x), recall, global_step=step)
            writer.add_scalar('val_{}/F1'.format(x), F1, global_step=step)

        # Model selection uses the overall F1 explicitly instead of whatever value
        # the loop variable happened to hold after the metric loop.
        overall_f1 = val_f1['overall']
        if overall_f1 > best_val_f1:
            best_val_f1 = overall_f1
            torch.save(model.state_dict(), os.path.join(output_dir, 'pytorch_model.bin'))
            print('best val F1 %.4f' % best_val_f1)
            print('save on', output_dir)

        # Restart the running training-loss sums for the next check window.
        train_slot_loss, train_intent_loss = 0, 0

writer.add_text('val overall F1', '%.2f' % (100 * best_val_f1))
writer.close()

# Package the best checkpoint into a zip archive.
model_path = os.path.join(output_dir, 'pytorch_model.bin')
zip_path = config['zipped_model_path']
print('zip model to', zip_path)

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
    zf.write(model_path)

# Step 3. 학습한 모델 평가하기
학습한 모델을 양적 (quantitatively), 질적(qualitatively)으로 각각 평가해봅시다.

## Step 3.1 양적 평가

In [None]:
# Rebuild the data loader for evaluation from the saved vocabularies.
with open(os.path.join(data_dir, 'intent_vocab.json')) as f:
    intent_vocab = json.load(f)
with open(os.path.join(data_dir, 'tag_vocab.json')) as f:
    tag_vocab = json.load(f)
dataloader = Dataloader(intent_vocab=intent_vocab, tag_vocab=tag_vocab,
                        pretrained_weights=config['model']['pretrained_weights'])
print('intent num:', len(intent_vocab))
print('tag num:', len(tag_vocab))
for split in ['val', 'test']:
    with open(os.path.join(data_dir, '{}_data.json'.format(split))) as f:
        split_data = json.load(f)
    # Unlike training, evaluation passes cut_sen_len=0.
    dataloader.load_data(split_data, split,
                         cut_sen_len=0, use_bert_tokenizer=config['use_bert_tokenizer'])
    print('{} set size: {}'.format(split, len(dataloader.data[split])))

# exist_ok makes directory creation idempotent without a separate existence check.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# Restore the checkpoint written by the training cell.
model = JointBERT(config['model'], DEVICE, dataloader.tag_dim, dataloader.intent_dim)
model.load_state_dict(torch.load(os.path.join(output_dir, 'pytorch_model.bin'), DEVICE))
model.to(DEVICE)
model.eval()

batch_size = config['model']['batch_size']
# Tokens and gold tags of every evaluated sample, kept for the qualitative exercise.
GT_sent = []
GT_slot = []
data_key = 'test'
predict_golden = {'intent': [], 'slot': [], 'overall': []}
slot_loss, intent_loss = 0, 0
for pad_batch, ori_batch, real_batch_size in dataloader.yield_batches(batch_size, data_key=data_key):
    pad_batch = tuple(t.to(DEVICE) for t in pad_batch)
    (word_seq_tensor, tag_seq_tensor, intent_tensor, word_mask_tensor, tag_mask_tensor,
     context_seq_tensor, context_mask_tensor) = pad_batch
    if not config['model']['context']:
        context_seq_tensor, context_mask_tensor = None, None

    with torch.no_grad():
        # Call the module itself rather than .forward() so registered hooks are honoured.
        slot_logits, intent_logits, batch_slot_loss, batch_intent_loss = model(word_seq_tensor,
                                                                               word_mask_tensor,
                                                                               tag_seq_tensor,
                                                                               tag_mask_tensor,
                                                                               intent_tensor,
                                                                               context_seq_tensor,
                                                                               context_mask_tensor)
    # Weight each batch loss by its real size so dividing by the sample count
    # afterwards gives a per-sample average.
    slot_loss += batch_slot_loss.item() * real_batch_size
    intent_loss += batch_intent_loss.item() * real_batch_size
    for j in range(real_batch_size):
        predicts = recover_intent(dataloader, intent_logits[j], slot_logits[j], tag_mask_tensor[j],
                                    ori_batch[j][0], ori_batch[j][-4])
        labels = ori_batch[j][3]
        GT_sent.append(ori_batch[j][0])
        GT_slot.append(ori_batch[j][1])
        predict_golden['overall'].append({
            'predict': predicts,
            'golden': labels
        })
        predict_golden['slot'].append({
            'predict': [x for x in predicts if is_slot_da(x)],
            'golden': [x for x in labels if is_slot_da(x)]
        })
        predict_golden['intent'].append({
            'predict': [x for x in predicts if not is_slot_da(x)],
            'golden': [x for x in labels if not is_slot_da(x)]
        })
    print('[%d|%d] samples' % (len(predict_golden['overall']), len(dataloader.data[data_key])))

total = len(dataloader.data[data_key])
slot_loss /= total
intent_loss /= total
print('%d samples %s' % (total, data_key))
print('\t slot loss:', slot_loss)
print('\t intent loss:', intent_loss)

for x in ['intent', 'slot', 'overall']:
    precision, recall, F1 = calculateF1(predict_golden[x])
    print('-' * 20 + x + '-' * 20)
    print('\t Precision: %.2f' % (100 * precision))
    print('\t Recall: %.2f' % (100 * recall))
    print('\t F1: %.2f' % (100 * F1))

# Save the per-sample predictions; `with` guarantees the file is flushed and closed.
output_file = os.path.join(output_dir, 'output.json')
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(predict_golden['overall'], f, indent=2, ensure_ascii=False)


## Step 3.2 질적 평가 (Exercise)

위 양적 평가에서 BERT가 slot과 intention 예측에서 뛰어난 정확도를 보이고 있는 것을 확인하셨을 것입니다.

이번 파트에서는 문장 별 예측 결과를 실제로 출력해보고, 얼마나 모델이 정확하게 예측하고 있는지 실제로 확인해 봅시다.

실습과제 구현을 시작하시기 전에, 다음 List와 Dictionary들을 각각 출력해보시기를 권장합니다.

- `GT_slot`, `GT_sent` : 각각 모든 Ground-truth 슬롯 값들과 문장들의 정보가 저장되어 있는 List
- `predict_golden` : slot/intent 예측값이 구조화되어 저장되어 있는 Dictionary

최종적으로 다음과 같은 형식으로 예측 결과를 출력하세요.

### Example #1

Query(slot_loc): Yes(O) .(O) what(O) type(O) of(O) food(O) do(O) you(O) want(O) ?(O) 

SLOT   predict : []    / label: []

INTENT predict : [['request', 'food', '?']]    / label: [['request', 'food', '?']]

### Example #2

Query(slot_loc): How(O) about(O) Italian(B-inform+food) ?(O) 

SLOT   predict : [['inform', 'food', 'Italian']]    / label: [['inform', 'food', 'italian']]

INTENT predict : []    / label: []

In [None]:
# Exercise stub: print the prediction for every evaluated test sentence.
# The evaluation cell filled these index-aligned containers, one entry per sample:
#   GT_sent[i]                  - first field of the sample (ori_batch[j][0])
#   GT_slot[i]                  - second field of the sample (ori_batch[j][1])
#   predict_golden['slot'][i]   - {'predict': [...], 'golden': [...]} restricted to slot acts
#   predict_golden['intent'][i] - {'predict': [...], 'golden': [...]} restricted to non-slot acts
print()
print('Qualitative results:')
for i in range(len(GT_sent)):
    # Implement Here
    # Intentionally left empty; the expected output format is shown in the examples above.
    pass
