In [47]:
import json
import os
import zipfile
import sys
from collections import Counter
from nltk.tokenize import word_tokenize

In [48]:
def read_zipped_json(filepath, filename):
    archive = zipfile.ZipFile(filepath, 'r')
    return json.load(archive.open(filename))


def phrase_in_utt(phrase, utt):
    phrase_low = phrase.lower()
    utt_low = utt.lower()
    return (' ' + phrase_low in utt_low) or utt_low.startswith(phrase_low)


def phrase_idx_utt(phrase, utt):
    phrase_low = phrase.lower()
    utt_low = utt.lower()
    if ' ' + phrase_low in utt_low or utt_low.startswith(phrase_low):
        return get_idx(phrase_low, utt_low)
    return None


def get_idx(phrase, utt):
    char_index_begin = utt.index(phrase)
    char_index_end = char_index_begin + len(phrase)
    word_index_begin = len(utt[:char_index_begin].split())
    word_index_end = len(utt[:char_index_end].split()) - 1
    return word_index_begin, word_index_end


def da2triples(dialog_act):
    triples = []
    for intent, svs in dialog_act.items():
        for slot, value in svs:
            triples.append([intent, slot, value])
    return triples


def das2tags(sen, das):
    tokens = word_tokenize(sen)
    new_sen = ' '.join(tokens)
    new_das = {}
    span_info = []
    intents = []
    for da, svs in das.items():
        new_das.setdefault(da, [])
        if da == 'inform':
            for s, v in svs:
                v = ' '.join(word_tokenize(v))
                if v != 'dontcare' and phrase_in_utt(v, new_sen):
                    word_index_begin, word_index_end = phrase_idx_utt(v, new_sen)
                    span_info.append((da, s, v, word_index_begin, word_index_end))
                else:
                    intents.append(da + '+' + s + '*' + v)
                new_das[da].append([s, v])
        else:
            for s, v in svs:
                new_das[da].append([s, v])
                intents.append(da + '+' + s + '*' + v)
    tags = []
    for i, _ in enumerate(tokens):
        for span in span_info:
            if i == span[3]:
                tag = "B-" + span[0] + "+" + span[1]
                tags.append(tag)
                break
            if span[3] < i <= span[4]:
                tag = "I-" + span[0] + "+" + span[1]
                tags.append(tag)
                break
        else:
            tags.append("O")

    return tokens, tags, intents, da2triples(new_das)

In [10]:
cur_dir = os.path.abspath(os.curdir)
data_dir = "ConvLab-2/data/camrest"
processed_data_dir = os.path.join(cur_dir, 'data/all_data')
if not os.path.exists(processed_data_dir):
    os.makedirs(processed_data_dir)

# Step 1. Data Preprocessing

In [11]:
data_key = ['train', 'val', 'test']
data = {}
for key in data_key:
    data[key] = read_zipped_json(os.path.join(data_dir, key + '.json.zip'), key + '.json')
    print('load {}, size {}'.format(key, len(data[key])))

load train, size 406
load val, size 135
load test, size 135


In [64]:
mode = 'all'
processed_data = {}
all_da = []
all_intent = []
all_tag = []
context_size = 3
for key in data_key:
    processed_data[key] = []
    for dialog in data[key]:
        context = []
        for turn in dialog['dial']:
            if mode == 'usr' or mode == 'all':
                tokens, tags, intents, new_das = das2tags(turn['usr']['transcript'], turn['usr']['dialog_act'])

                processed_data[key].append([tokens, tags, intents, new_das, context[-context_size:]])

                all_da += [da for da in turn['usr']['dialog_act']]
                all_intent += intents
                all_tag += tags

            context.append(turn['usr']['transcript'])

            if mode == 'sys' or mode == 'all':
                tokens, tags, intents, new_das = das2tags(turn['sys']['sent'], turn['sys']['dialog_act'])

                processed_data[key].append([tokens, tags, intents, new_das, context[-context_size:]])
                all_da += [da for da in turn['sys']['dialog_act']]
                all_intent += intents
                all_tag += tags

            context.append(turn['sys']['sent'])

    print('loaded {}, size {}'.format(key, len(processed_data[key])))
    json.dump(processed_data[key], 
              open(os.path.join(processed_data_dir, '{}_data.json'.format(key)), 'w'),
              indent=2)

loaded train, size 3342
loaded val, size 1076
loaded test, size 1070


In [65]:
print('dialog act num:', len(all_da))
print('sentence label num:', len(all_intent))
print('tag num:', len(all_tag))
json.dump(all_da, open(os.path.join(processed_data_dir, 'all_act.json'), 'w'), indent=2)
json.dump(all_intent, open(os.path.join(processed_data_dir, 'intent_vocab.json'), 'w'), indent=2)
json.dump(all_tag, open(os.path.join(processed_data_dir, 'tag_vocab.json'), 'w'), indent=2)


dialog act num: 4437
sentence label num: 2428
tag num: 68055


# Step 2. BERT Fine-tuning

In [66]:
print('-' * 20 + 'dataset:camrest' + '-' * 20)
from convlab2.nlu.jointBERT.camrest.postprocess import is_slot_da, calculateF1, recover_intent

--------------------dataset:camrest--------------------


In [73]:
data_dir = os.path.join(cur_dir, 'data/all_data')
output_dir = os.path.join(cur_dir, 'outputs')
log_dir = os.path.join(cur_dir, 'logs')

max_len = 40
pretrained_model_name = "bert-base-uncased"

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

intent_vocab = json.load(open(os.path.join(data_dir, 'intent_vocab.json')))
tag_vocab = json.load(open(os.path.join(data_dir, 'tag_vocab.json')))
dataloader = Dataloader(intent_vocab=intent_vocab, tag_vocab=tag_vocab,
                        pretrained_weights=pretrained_model_name)

print('intent num:', len(intent_vocab))
print('tag num:', len(tag_vocab))
for data_key in ['val', 'test']:
    dataloader.load_data(json.load(open(os.path.join(data_dir, '{}_data.json'.format(data_key)))), data_key,
                            cut_sen_len=max_len, use_bert_tokenizer=True)
    print('{} set size: {}'.format(data_key, len(dataloader.data[data_key])))


intent num: 2428
tag num: 68055
max sen bert len 50
[(1, 4), (2, 19), (3, 17), (4, 68), (5, 125), (6, 43), (7, 45), (8, 62), (9, 73), (10, 41), (11, 53), (12, 44), (13, 54), (14, 42), (15, 43), (16, 37), (17, 38), (18, 29), (19, 27), (20, 26), (21, 18), (22, 26), (23, 26), (24, 21), (25, 21), (26, 13), (27, 6), (28, 8), (29, 7), (30, 6), (31, 13), (32, 4), (33, 1), (34, 4), (35, 2), (36, 1), (39, 4), (40, 2), (43, 2), (50, 1)]
max context bert len 108
[(3, 135), (9, 3), (10, 3), (11, 3), (12, 7), (13, 11), (14, 8), (15, 12), (16, 10), (17, 12), (18, 15), (19, 11), (20, 16), (21, 4), (22, 10), (23, 9), (24, 10), (25, 14), (26, 13), (27, 15), (28, 8), (29, 10), (30, 15), (31, 22), (32, 18), (33, 17), (34, 12), (35, 23), (36, 17), (37, 21), (38, 17), (39, 29), (40, 19), (41, 18), (42, 25), (43, 28), (44, 25), (45, 23), (46, 18), (47, 11), (48, 19), (49, 27), (50, 20), (51, 21), (52, 26), (53, 22), (54, 12), (55, 16), (56, 15), (57, 13), (58, 15), (59, 9), (60, 6), (61, 11), (62, 15), (63,

In [97]:
# word_seq_tensor, tag_seq_tensor, intent_tensor, word_mask_tensor, tag_mask_tensor, context_seq_tensor, context_mask_tensor
dataloader.data['val'][0][3]

[['inform', 'food', 'brazilian'], ['inform', 'area', 'north']]