In [20]:
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot
!pip install -q -U trax

/content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot


In [21]:
import json
import random
from pprint import pprint

import numpy as np
import trax
from IPython.display import HTML, display

# Dataset

Get the [MultiWoz dataset](https://github.com/budzianowski/multiwoz/tree/master/data)

# Paths

In [22]:
# Single root for everything this notebook reads or writes; edit it here to relocate the project
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot'

# Directory that holds the MultiWOZ 2.1 files (data.json is loaded from here)
data = f'{BASE_DIR}/data/MultiWOZ_2.1'
# Subword vocabulary file given to the tokenizer
subwords = f'{BASE_DIR}/data/en_32k.subword'
# Directory passed to the training loop as its output_dir (trailing slash kept)
output_dir = f'{BASE_DIR}/models/reformer/'

In [23]:
def load_json(directory, file):
    """
    Reads a JSON file and returns its parsed content
    Args:
        directory: str. folder that contains the file
        file: str. name of the JSON file inside `directory`
    returns:
        db: the deserialized JSON document
    """
    # JSON text is UTF-8; stating it explicitly keeps the result independent
    # of the platform's default locale encoding
    with open(f'{directory}/{file}', encoding='utf-8') as f:
        db = json.load(f)
    return db

# Load the dialogue dataset: parse data.json from the dataset directory into `dialogue`
dialogue = load_json(data, 'data.json')

In [24]:
# Iterating a dict yields its keys, i.e. the dialogue filenames
dialogue_keys = list(dialogue)
print(f'The amount of dialogues is: {len(dialogue_keys)}')
# Show a small slice of the keys instead of all of them
print(f'These are some of the dialogue keys: {dialogue_keys[10:20]}')

The amount of dialogues is: 10438
These are some of the dialogue keys: ['PMUL1170.json', 'SNG01741.json', 'PMUL4899.json', 'MUL2261.json', 'SSNG0348.json', 'MUL0784.json', 'MUL0886.json', 'PMUL2512.json', 'SNG0548.json', 'MUL1474.json']


As we can see, the dataset is composed of multiple files, and the filenames are used as keys in our dictionary. Multi-domain dialogues have "MUL" in their filenames, while single-domain dialogues have either "SNG" or "WOZ".

In [25]:
# Get the keys of a file
# n is the position in dialogue_keys of the sample dialogue; the following cells reuse it
n = 15
print(dialogue[dialogue_keys[n]].keys())

dict_keys(['goal', 'log'])


Each file is a dictionary with 2 keys. The `goal` also points to a dictionary and it contains several keys pertaining to the objectives of the conversation.

In [26]:
# Pretty-print the goal dictionary of the sample dialogue
pprint(dialogue[dialogue_keys[n]]['goal'])

{'attraction': {},
 'hospital': {},
 'hotel': {'fail_info': {'area': 'north',
                         'parking': 'yes',
                         'pricerange': 'cheap',
                         'type': 'hotel'},
           'info': {'area': 'north',
                    'parking': 'yes',
                    'pricerange': 'cheap',
                    'type': 'guesthouse'},
           'reqt': ['postcode', 'internet']},
 'message': ['You are planning your trip in Cambridge',
             "You are looking for a <span class='emphasis'>train</span>. The "
             "train should <span class='emphasis'>arrive by 08:15</span> and "
             "should go to <span class='emphasis'>cambridge</span>",
             "The train should leave on <span class='emphasis'>monday</span> "
             "and should depart from <span class='emphasis'>bishops "
             'stortford</span>',
             'Once you find the train you want to make a booking for <span '
             "class='emphasis'>2 people

The `log` on the other hand contains the dialog. It is a list of dictionaries and each element of this list contains several descriptions as well. Let's look at an example:

In [27]:
# get first element of the log list of the sample dialogue (shown as the cell output)
dialogue[dialogue_keys[n]]['log'][0]

{'dialog_act': {'Train-Inform': [['Dest', 'cambridge'], ['Arrive', '08:15']]},
 'metadata': {},
 'span_info': [['Train-Inform', 'Dest', 'cambridge', 10, 10],
  ['Train-Inform', 'Arrive', '08:15', 12, 12]],
 'text': 'Hi I am looking for a train to arrive in Cambridge by 08:15.'}

We are only interested in the conversation, which is in the `text` field.
The conversation goes back and forth between two persons. Let's call them 'Person 1' and 'Person 2'. This implies that
`dialogue['SNG0073.json']['log'][0]['text']` is 'Person 1' and
`dialogue['SNG0073.json']['log'][1]['text']` is 'Person 2', and so on. The even offsets are 'Person 1' and the odd offsets are 'Person 2'.

In [28]:
# Show the first two turns of the sample dialogue: log index 0 is labelled Person 1, index 1 Person 2
print(' Person 1: ', dialogue[dialogue_keys[n]]['log'][0]['text'])
print(' Person 2: ',dialogue[dialogue_keys[n]]['log'][1]['text'])

 Person 1:  Hi I am looking for a train to arrive in Cambridge by 08:15.
 Person 2:  Certainly, where will you be departing from?


# Extract conversations

In [29]:
def get_conversation(dataset, filename):
    """
    Builds one string holding every turn of a single dialogue,
    each turn preceded by the label of its speaker
    Args:
        dataset: dict. maps a filename to a dict that has a 'log' list
        filename: str. key of the dialogue to extract
    returns:
        result: str. ' Person 1: <text> Person 2: <text> ...'
    """
    # Speaker labels indexed by turn parity: even -> Person 1, odd -> Person 2
    speakers = (' Person 1: ', ' Person 2: ')

    turns = dataset[filename]['log']

    # Label followed by the message text, for every turn in order
    pieces = [speakers[idx % 2] + turn['text'] for idx, turn in enumerate(turns)]

    return ''.join(pieces)

# Uncomment for testing

# # test
# n_file = 50
# filename = dialogue_keys[n_file]
# result_dialogue_test = get_conversation(dialogue, filename)
# print(result_dialogue_test)

# Process the conversations for the reformer inputs 

In [30]:
def get_all_conversations(dataset, files):
    """
    Extracts the conversation string of every requested dialogue
    Args:
        dataset: dict. the dialogue dataset
        files: list. filenames (keys of dataset) to extract, in the wanted order
    returns:
        all_conversations: list. one conversation string per entry of files
    """
    return [get_conversation(dataset, name) for name in files]

In [31]:
# One conversation string per dialogue, following the order of dialogue_keys
conversations = get_all_conversations(dialogue, dialogue_keys)

# Split data into train/test

In [32]:
# A fixed seed keeps the split identical on every run of this cell and across
# kernel restarts, so conversations held out for evaluation are never moved
# into the training set between runs.
SPLIT_SEED = 42

# Shuffle a copy: `conversations` itself stays untouched, which makes the cell idempotent
shuffled = list(conversations)
random.Random(SPLIT_SEED).shuffle(shuffled)

# 95% of the conversations for training, the remainder for evaluation
train_split = int(len(shuffled) * 0.95)
train = shuffled[:train_split]
test = shuffled[train_split:]

print(f'number of conversations in the data set: {len(conversations)}')
print(f'number of conversations in train set: {len(train)}')
print(f'number of conversations in test set: {len(test)}')



number of conversations in the data set: 10438
number of conversations in train set: 9916
number of conversations in test set: 522


# Tokenize data

First we will define a utility generator function to yield elements from our dataset. Then, we will define our data pipeline for tokenizing and batching our data. We will bucket by length and also have an upper bound on the token length.

In [33]:
def stream(data):
    """
    Endless generator of (input, target) pairs for language modelling
    Args:
        data: sequence of conversations to sample from
    yields:
        tuple. a randomly chosen conversation, used as both input and target
    """
    while True:
        picked = random.choice(data)
        yield picked, picked

# Pipeline applied to the (input, target) string pairs produced by stream()
data_pipeline = trax.data.Serial(
    # Randomize the stream
    trax.data.Shuffle(),

    # Tokenize the data with the subword vocabulary file
    trax.data.Tokenize(vocab_file=subwords),

    # Filter long sequences (upper bound passed: 2048)
    trax.data.FilterByLength(2048),

    # Bucket by length: 4 boundaries and 5 batch sizes, the batch size
    # shrinking as the boundary grows
    # NOTE(review): presumably the 5th batch size (1) serves sequences longer
    # than the last boundary — confirm against the trax documentation
    trax.data.BucketByLength(boundaries=[128, 256, 512, 1024],
                             batch_sizes=[16, 8, 4, 2, 1]),
    
    # Add loss weights but do not add it to the padding tokens
    trax.data.AddLossWeights(id_to_mask=0)
)

# Apply the data pipeline to our train and eval sets
train_stream = data_pipeline(stream(train))
eval_stream = data_pipeline(stream(test))


In [34]:
# Avoiding scrolling bars: inject a CSS rule for <pre> elements into the notebook output
display(HTML('''
<style>
  pre {
      white-space: normal;
  }
</style>
'''))

# Uncomment for test

# # The stream generators will yield (input, target, weights). let's just grab the input for inspection
# inp, _, _ = next(train_stream)

# # Print the shape. format is (batch size, token length)
# print("input shape: ", inp.shape)

# # Detokenize the first element
# print(trax.data.detokenize(inp[0], vocab_file=subwords))

# Reformer language model

In [35]:
def reformer_language_model(vocab_size=33000, n_layers=2, mode='train', attention_type=trax.layers.SelfAttention):
    """
    Thin factory around trax.models.ReformerLM
    Args:
        vocab_size: int. forwarded as vocab_size
        n_layers: int. number of decoder layers
        mode: str. forwarded as mode
        attention_type: class. attention class the model should use
    returns:
        model: the ReformerLM built from those arguments
    """
    return trax.models.ReformerLM(
        vocab_size=vocab_size,
        n_layers=n_layers,
        mode=mode,
        attention_type=attention_type,
    )


# Training loop

In [41]:
def training_loop(Reformer, train_gen, test_gen, output_dir, learning_rate=0.01,
                  n_warmup_steps=1000, n_steps_per_checkpoint=50):
    """
    Builds the trax training Loop for the Reformer language model
    Args:
        Reformer: callable. called as Reformer(mode='train') to create the model
        train_gen: generator. training batches
        test_gen: generator. evaluation batches
        output_dir: str. directory handed to the Loop as output_dir
        learning_rate: float. peak value of the learning rate schedule,
            also given to the Adam optimizer
        n_warmup_steps: int. warmup steps of the schedule
        n_steps_per_checkpoint: int. forwarded to the TrainTask
    returns:
        loop: trax.supervised.training.Loop ready to be run
    """
    # Use the warmup_and_rsqrt_decay learning rate schedule.
    # The schedule's peak follows `learning_rate`; previously it was fixed at
    # 0.01, so passing a different learning_rate did not change the schedule.
    lr_schedule = trax.lr.warmup_and_rsqrt_decay(
        n_warmup_steps=n_warmup_steps, max_value=learning_rate)

    train_task = trax.supervised.TrainTask(
        train_gen,
        trax.layers.CrossEntropyLoss(),
        trax.optimizers.Adam(learning_rate),
        lr_schedule,
        n_steps_per_checkpoint=n_steps_per_checkpoint
    )

    # Evaluation reports the loss and the accuracy
    eval_task = trax.supervised.EvalTask(
        test_gen,
        metrics=[trax.layers.CrossEntropyLoss(), trax.layers.Accuracy()]
    )

    loop = trax.supervised.training.Loop(
        Reformer(mode='train'),
        train_task,
        eval_tasks=[eval_task],
        output_dir=output_dir
    )

    return loop


In [48]:
# Rough estimate of n_steps for the wanted number of epochs. The batch size
# is not fixed (it depends on the length bucket), so the exact number of
# steps per epoch can't really be computed; 16, the largest batch size of
# the bucketing configuration, is used as the divisor.
epochs = 10
n_steps = (len(train) // 16) * epochs
loop = training_loop(reformer_language_model, train_stream, eval_stream, output_dir)
print(f'n_steps: {n_steps}')
loop.run(n_steps=n_steps)

n_steps: 6190

Step   1850: Ran 50 train steps in 64.28 secs
Step   1850: train CrossEntropyLoss |  4.61635876
Step   1850: eval  CrossEntropyLoss |  4.67196035
Step   1850: eval          Accuracy |  0.19824088

Step   1900: Ran 50 train steps in 36.21 secs
Step   1900: train CrossEntropyLoss |  4.67583561
Step   1900: eval  CrossEntropyLoss |  4.44042587
Step   1900: eval          Accuracy |  0.22091976

Step   1950: Ran 50 train steps in 35.65 secs
Step   1950: train CrossEntropyLoss |  4.52507496
Step   1950: eval  CrossEntropyLoss |  4.43591690
Step   1950: eval          Accuracy |  0.20526724

Step   2000: Ran 50 train steps in 35.94 secs
Step   2000: train CrossEntropyLoss |  4.50447607
Step   2000: eval  CrossEntropyLoss |  4.56526995
Step   2000: eval          Accuracy |  0.22515635

Step   2050: Ran 50 train steps in 35.98 secs
Step   2050: train CrossEntropyLoss |  4.46527195
Step   2050: eval  CrossEntropyLoss |  4.66826296
Step   2050: eval          Accuracy |  0.19634990

