In [1]:
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot
!pip install -q -U trax

/content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot
[K     |████████████████████████████████| 471kB 5.4MB/s 
[K     |████████████████████████████████| 3.4MB 10.8MB/s 
[K     |████████████████████████████████| 174kB 20.8MB/s 
[K     |████████████████████████████████| 1.5MB 25.4MB/s 
[K     |████████████████████████████████| 348kB 32.2MB/s 
[K     |████████████████████████████████| 1.1MB 31.8MB/s 
[K     |████████████████████████████████| 71kB 7.9MB/s 
[K     |████████████████████████████████| 3.7MB 33.7MB/s 
[K     |████████████████████████████████| 890kB 35.8MB/s 
[K     |████████████████████████████████| 2.9MB 35.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import json
import random
import numpy as np
import trax   

# Dataset

Get the [MultiWoz dataset](https://github.com/budzianowski/multiwoz/tree/master/data)

# Paths

In [35]:
data = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot/data/MultiWOZ_2.1'
subwords = '/content/drive/MyDrive/Colab Notebooks/nlp/apps/chatbot/data/en_32k.subword'

In [4]:
def load_json(directory, file):
    with open(f'{directory}/{file}') as f:
        db = json.load(f)
    return db

# Load the dialogue dataset
dialogue = load_json(data, 'data.json')

In [9]:
dialogue_keys = list(dialogue.keys())
print(f'The amount of dialogues is: {len(dialogue_keys)}')
print(f'These are some of the dialogue keys: {dialogue_keys[10:20]}')

The amount of dialogues is: 10438
These are some of the dialogue keys: ['PMUL1170.json', 'SNG01741.json', 'PMUL4899.json', 'MUL2261.json', 'SSNG0348.json', 'MUL0784.json', 'MUL0886.json', 'PMUL2512.json', 'SNG0548.json', 'MUL1474.json']


As we can see the dataset is composed of multiple files and the filenames are used as keys in our dictionary. Those with multi-domain dialogues have "MUL" in their filenames while single domain dialogues have either "SNG" or "WOZ".

In [14]:
# Get the keys of a file
n = 15
print(dialogue[dialogue_keys[n]].keys())

dict_keys(['goal', 'log'])


Each file is a dictionary with 2 keys. The `goal` also points to a dictionary and it contains several keys pertaining to the objectives of the conversation.

In [15]:
# print goal
from pprint import pprint
pprint(dialogue[dialogue_keys[n]]['goal'])

{'attraction': {},
 'hospital': {},
 'hotel': {'fail_info': {'area': 'north',
                         'parking': 'yes',
                         'pricerange': 'cheap',
                         'type': 'hotel'},
           'info': {'area': 'north',
                    'parking': 'yes',
                    'pricerange': 'cheap',
                    'type': 'guesthouse'},
           'reqt': ['postcode', 'internet']},
 'message': ['You are planning your trip in Cambridge',
             "You are looking for a <span class='emphasis'>train</span>. The "
             "train should <span class='emphasis'>arrive by 08:15</span> and "
             "should go to <span class='emphasis'>cambridge</span>",
             "The train should leave on <span class='emphasis'>monday</span> "
             "and should depart from <span class='emphasis'>bishops "
             'stortford</span>',
             'Once you find the train you want to make a booking for <span '
             "class='emphasis'>2 people

The `log` on the other hand contains the dialog. It is a list of dictionaries and each element of this list contains several descriptions as well. Let's look at an example:

In [17]:
# get first element of the log list
dialogue[dialogue_keys[n]]['log'][0]

{'dialog_act': {'Train-Inform': [['Dest', 'cambridge'], ['Arrive', '08:15']]},
 'metadata': {},
 'span_info': [['Train-Inform', 'Dest', 'cambridge', 10, 10],
  ['Train-Inform', 'Arrive', '08:15', 12, 12]],
 'text': 'Hi I am looking for a train to arrive in Cambridge by 08:15.'}

We are only interested in the conversation which is in the `text` field.
The conversation goes back and forth between two persons. Let's call them 'Person 1' and 'Person 2'. This implies that
```
data['SNG0073.json']['log'][0]['text']
``` is 'Person 1' and
```data['SNG0073.json']['log'][1]['text']``` is 'Person 2' and so on. The even offsets are 'Person 1' and the odd offsets are 'Person 2'.

In [20]:
print(' Person 1: ', dialogue[dialogue_keys[n]]['log'][0]['text'])
print(' Person 2: ',dialogue[dialogue_keys[n]]['log'][1]['text'])

 Person 1:  Hi I am looking for a train to arrive in Cambridge by 08:15.
 Person 2:  Certainly, where will you be departing from?


# Extract conversations

In [26]:
def get_conversation(dataset, filename):
    """
    Takes the dialogue dataset and extracts the
    dialogues for each log
    Args:
        dataset: dict
        filename:str
    returns:
        result: str
    """
    result = ''

    # Get length of file's log list
    message_len = len(dataset[filename]['log'])

    # Set delimiter strings for each person in the dialogue
    delimiter1 = ' Person 1: '
    delimiter2 = ' Person 2: '

    for i in range(message_len):
        current_log = dataset[filename]['log'][i]

        # check person, if even = person1
        if i % 2 == 0:
            result += delimiter1
        else:
            result += delimiter2

        # append message text from the log
        result += current_log['text']

    return result

# Uncomment for testing

# # test
# n_file = 50
# filename = dialogue_keys[n_file]
# result_dialogue_test = get_conversation(dialogue, filename)
# print(result_dialogue_test)

 Person 1: I need a restaurant to dine at in Cambridge on my upcoming trip. Person 2: There are lots to choose from. What type of cuisine are you looking for? Person 1: I don't care. It needs to be on the south side and moderately priced.  Person 2: There are 2 options, pizza hut cherry hinton which serves italian and restaurant alimentum which serves modern european.  Can I book you for those? Person 1: Yes please.  I also need a hotel with at least 3 stars and free parking. Person 2: There is no hotel in south side,do you want me to try different area? Person 1: How about a Guesthouse in South side instead? Person 2: I'm sorry, there is no guest house that meets those criteria, either. Would you like to try a different rating, or a different area? Person 1: Sure, what about in the city centre? Person 2: I am not finding a guesthouse that meets your criteria.  Might a hotel work? Person 1: Is there not one in the same area as the restaurant? Person 2: There are 2, the Alexander Bed an

# Process the conversations for the reformer inputs 

In [28]:
def get_all_conversations(dataset, files):
    """
    Takes the dialogue dataset and gets all the conversations
    available. Then returns each conversation in a list of
    strings
    Args:
        dataset: dict
        files: list
    returns:
        all_conversations: list
    """
    all_conversations = []

    for filename in files:
        conversation = get_conversation(dataset, filename)
        all_conversations.append(conversation)

    return all_conversations

In [29]:
conversations = get_all_conversations(dialogue, dialogue_keys)

# Split data into train/test

In [33]:
random.shuffle(conversations)
train_split = int(len(conversations) * 0.95)
train = conversations[:train_split]
test = conversations[train_split:]

print(f'number of conversations in the data set: {len(conversations)}')
print(f'number of conversations in train set: {len(train)}')
print(f'number of conversations in test set: {len(test)}')



number of conversations in the data set: 10438
number of conversations in train set: 9916
number of conversations in test set: 522


# Tokenize data

First we will define a utility generator function to yield elements from our dataset. Then, we will define our data pipeline for tokenizing and batching our data. We will bucket by length and also have an upper bound on the token length.

In [36]:
def stream(data):
    while True:
        conversation = random.choice(data)

        yield (conversation, conversation)

data_pipeline = trax.data.Serial(
    # Randomize the stream
    trax.data.Shuffle(),

    # Tokenize the data
    trax.data.Tokenize(vocab_file=subwords),

    # Filter long sequences
    trax.data.FilterByLength(2048),

    # Bucket by length
    trax.data.BucketByLength(boundaries=[128, 256, 512, 1024],
                             batch_size=[16, 8, 4, 2, 1]),
    
    # Add loss weights but do not add it to the padding tokens
    trax.data.AddLossWeights(id_to_mask=0)
)

# Apply the data pipeline to our train and eval sets
train_stream = data_pipeline(stream(train))
eval_stream = data_pipeline(stream(test))


TypeError: ignored