## 01 - Data Preprocessing
In this notebook, we will read in and pre-process data in train and dev set

In [1]:
import pandas as pd

In [2]:
path = "./data/MultiWOZ/"

In [3]:
utt_train = "./data/MultiWOZ/WOZ_train_utt.txt"
ans_train = "./data/MultiWOZ/WOZ_train_ans.txt"

In [4]:
# read in train set utterance
lines = []
with open(utt_train, encoding="utf-8") as f:
    lines = f.readlines()
    print("Length =", len(lines))

Length = 3760


In [5]:
df = pd.DataFrame({"text": lines})

# apply data cleaning
df["text"] = df["text"].apply(lambda text: text.strip())
df.head()

Unnamed: 0,text
0,"Guten Tag, I am staying overnight in Cambridge..."
1,Hi there! Can you give me some info on Cityroomz?
2,I am looking for a hotel named alyesbray lodge...
3,I am looking for a restaurant. I would like so...
4,I'm looking for an expensive restaurant in the...


In [6]:
df.iloc[110]["text"]

'Would you be able to suggest a steakhouse restaurant in Cambridge?'

In [7]:
# read in answers
lines = []
with open(ans_train, encoding="utf-8") as f:
    lines = f.readlines()
    print("Length =", len(lines))

Length = 3760


In [8]:
# df["answer"]
# order = hotel-area / hotel-internet

In [9]:
df["answer"] = lines
df["answer"] = df["answer"].apply(lambda text: text.strip())
df.head()

Unnamed: 0,text,answer
0,"Guten Tag, I am staying overnight in Cambridge...",find_hotel|hotel-area=centre|hotel-internet=ye...
1,Hi there! Can you give me some info on Cityroomz?,find_hotel|hotel-name=cityroomz
2,I am looking for a hotel named alyesbray lodge...,find_hotel|hotel-name=alyesbray lodge guest house
3,I am looking for a restaurant. I would like so...,find_restaurant|restaurant-food=chinese|restau...
4,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...


In [10]:
df.iloc[110]["answer"]

'find_restaurant|restaurant-food=steakhouse'

In [11]:
# split the slots
df["answer"] = df["answer"].apply(lambda text: text.split("|"))
df.head()

Unnamed: 0,text,answer
0,"Guten Tag, I am staying overnight in Cambridge...","[find_hotel, hotel-area=centre, hotel-internet..."
1,Hi there! Can you give me some info on Cityroomz?,"[find_hotel, hotel-name=cityroomz]"
2,I am looking for a hotel named alyesbray lodge...,"[find_hotel, hotel-name=alyesbray lodge guest ..."
3,I am looking for a restaurant. I would like so...,"[find_restaurant, restaurant-food=chinese, res..."
4,I'm looking for an expensive restaurant in the...,"[find_restaurant, restaurant-area=centre, rest..."


In [12]:
def list2dict(item_slots):
    """ 
    convert slot lists in format A=B to dict A:B
    """
    result = dict()
    # skip the first item (intent)
    for slot in item_slots[1:]:
        # print(slot)
        key, value = slot.split("=")
        result[key] = value
    
    return result

list2dict(["find_restaurant", "restaurant-food=asian oriental", "restaurant-pricerange=expensive"])

{'restaurant-food': 'asian oriental', 'restaurant-pricerange': 'expensive'}

In [13]:
# get the intent
df["intent"] = df["answer"].apply(lambda item:item[0])
df.head()

Unnamed: 0,text,answer,intent
0,"Guten Tag, I am staying overnight in Cambridge...","[find_hotel, hotel-area=centre, hotel-internet...",find_hotel
1,Hi there! Can you give me some info on Cityroomz?,"[find_hotel, hotel-name=cityroomz]",find_hotel
2,I am looking for a hotel named alyesbray lodge...,"[find_hotel, hotel-name=alyesbray lodge guest ...",find_hotel
3,I am looking for a restaurant. I would like so...,"[find_restaurant, restaurant-food=chinese, res...",find_restaurant
4,I'm looking for an expensive restaurant in the...,"[find_restaurant, restaurant-area=centre, rest...",find_restaurant


In [14]:
# build the slots into dictionary format
df["slots"] = df["answer"].apply(list2dict)
df.head()

Unnamed: 0,text,answer,intent,slots
0,"Guten Tag, I am staying overnight in Cambridge...","[find_hotel, hotel-area=centre, hotel-internet...",find_hotel,"{'hotel-area': 'centre', 'hotel-internet': 'ye..."
1,Hi there! Can you give me some info on Cityroomz?,"[find_hotel, hotel-name=cityroomz]",find_hotel,{'hotel-name': 'cityroomz'}
2,I am looking for a hotel named alyesbray lodge...,"[find_hotel, hotel-name=alyesbray lodge guest ...",find_hotel,{'hotel-name': 'alyesbray lodge guest house'}
3,I am looking for a restaurant. I would like so...,"[find_restaurant, restaurant-food=chinese, res...",find_restaurant,"{'restaurant-food': 'chinese', 'restaurant-pri..."
4,I'm looking for an expensive restaurant in the...,"[find_restaurant, restaurant-area=centre, rest...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."


In [15]:
# check percentage for 'dontcare' answer
w = 'dontcare'
sum(df["slots"].apply(lambda slots: w in slots.values())) * 100 / len(df)

2.898936170212766

In [16]:
output_file = "./data/train.csv"
print("Writing to ", output_file)
df.to_csv(output_file, index=False)
print("Completed")

Writing to  ./data/train.csv
Completed


In [17]:
# analyze the slots
slots = set() # empty set
restauant_slots = set()
hotel_slots = set()

n = len(df)
max_row = n # set to n for full set

print("Building slot sets...")
for item in df["answer"][:max_row]:
    # print(item)
    # item_set = set(item)
    # print(item_set)
    slots.update(item)
    restauant_slots.update([slot 
                            for slot in item
                            if slot.startswith("restaurant")
                           ])
    
    hotel_slots.update([slot 
                        for slot in item
                        if slot.startswith("hotel")
                        ])

print("Done.")
print("Slot count =", len(slots))
# print(slots)
print("Restaurant slots =", len(restauant_slots))
print("Hotel slots =", len(hotel_slots))

Building slot sets...
Done.
Slot count = 293
Restaurant slots = 218
Hotel slots = 73


In [18]:
# finding unique slots keys
from collections import defaultdict

restaurant_dict = defaultdict(list)
for item in restauant_slots:
    key, value = item.split("=")
    restaurant_dict[key].append(value)

print("Keys =", restaurant_dict.keys())


Keys = dict_keys(['restaurant-food', 'restaurant-name', 'restaurant-pricerange', 'restaurant-area'])


In [19]:
for k in restaurant_dict.keys():
    print(k, restaurant_dict[k])
    print("-------")

restaurant-food ['afghan', 'afternoon tea', 'turkish', 'mexican', 'swiss', 'modern european', 'barbeque', 'swedish', 'french', 'kosher', 'modern global', 'traditional', 'german', 'scandinavian', 'bbq', 'corsica', 'brazilian', 'eritrean', 'european', 'gastropub', 'steakhouse', 'unusual', 'english', 'australian', 'north indian', 'spanish', 'korean', 'morrocan', 'international', 'northern european', 'persian', 'vegetarian', 'south indian', 'danish', 'dontcare', 'singaporean', 'catalan', 'welsh', 'north african', 'modern', 'japanese', 'muslim', 'middle eastern', 'global', 'panasian', 'christmas', 'lebanese', 'hungarian', 'americas', 'jamaican', 'british', 'chinese', 'romanian', 'bistro', 'cuban', 'russian', 'cantonese', 'thai', 'mediterranean', 'fusion', 'greek', 'polynesian', 'latin american', 'asian oriental', 'australasian', 'sri lankan', 'irish', 'new zealand', 'pizza', 'belgian', 'venetian', 'creative', 'modern eclectic', 'basque', 'molecular gastronomy', 'caribbean', 'portuguese', 's

In [20]:
hotel_dict = defaultdict(list)
for item in hotel_slots:
    key, value = item.split("=")
    hotel_dict[key].append(value)

print("Keys =", hotel_dict.keys())

Keys = dict_keys(['hotel-name', 'hotel-stars', 'hotel-area', 'hotel-internet', 'hotel-pricerange', 'hotel-parking', 'hotel-type'])


In [21]:
for k in hotel_dict.keys():
    print(k, hotel_dict[k])
    print("-------")

hotel-name ['lovell lodge', 'hobsons house', 'arbury lodge guesthouse', 'finches bed and breakfast', 'limehouse', 'a and b quest house', 'carolina b&b', 'gonville', 'alpha milton guest house', 'university arms hotel', 'b guesthouse', 'hamilton lodge', 'allenbell', 'arbury guesthouse and lodge', 'express by holiday inn in cambridge', 'a and b guest house', 'express by holiday inn - cambridge', 'kirkwood house', 'avalon', 'aylesbray lodge guest house', 'lensfield hotel', 'acorn guest house', 'cityroomz', 'worth house', 'kirkwood', 'huntingdon marriott hotel', 'dontcare', 'alexander bed and breakfast', 'express by holiday inn', 'cambridge belfry', 'autumn house', 'warkworth house', 'rosas bed and breakfast', 'alyesbray lodge guest house', 'gonville hotel', 'bridge guest house', 'ashley hotel', 'leverton house', 'finches', 'home from home', 'city centre north b and b', 'archway house', 'alexeander b&b', 'carolina bed and breakfast', 'arbury lodge', 'el shaddai', 'alpha-milton guest house',

In [22]:
# process dev set

In [23]:
utt_dev = "./data/MultiWOZ/WOZ_dev_utt.txt"
ans_dev = "./data/MultiWOZ/WOZ_dev_ans.txt"

In [24]:
# read in train set utterance
lines_utt = []
with open(utt_dev, encoding="utf-8") as f:
    lines_utt = f.readlines()
    print("Length =", len(lines_utt))

Length = 413


In [25]:
lines_ans = []
with open(ans_dev, encoding="utf-8") as f:
    lines_ans = f.readlines()
    print("Length =", len(lines_ans))

Length = 413


In [26]:
dev_df = pd.DataFrame({"text": lines_utt,
                       "answer_raw": lines_ans
                      })

# apply data cleaning
dev_df["text"] = dev_df["text"].apply(lambda text: text.strip())
dev_df["answer_raw"] = dev_df["answer_raw"].apply(lambda text: text.strip())

dev_df.head()

Unnamed: 0,text,answer_raw
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...
1,My husband and I are celebrating our anniversa...,find_hotel
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...


In [27]:
# split the slots
dev_df["answer"] = dev_df["answer_raw"].apply(lambda text: text.split("|"))
# get the intent
dev_df["intent"] = dev_df["answer"].apply(lambda item:item[0])
# build the slots into dictionary format
dev_df["slots"] = dev_df["answer"].apply(list2dict)

dev_df.head()

Unnamed: 0,text,answer_raw,answer,intent,slots
0,I'm looking for a local place to dine in the c...,find_restaurant|restaurant-area=centre|restaur...,"[find_restaurant, restaurant-area=centre, rest...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-food..."
1,My husband and I are celebrating our anniversa...,find_hotel,[find_hotel],find_hotel,{}
2,I'm looking for an expensive restaurant in the...,find_restaurant|restaurant-area=centre|restaur...,"[find_restaurant, restaurant-area=centre, rest...",find_restaurant,"{'restaurant-area': 'centre', 'restaurant-pric..."
3,Are there any accommodations in the east part ...,find_hotel|hotel-area=east|hotel-parking=yes,"[find_hotel, hotel-area=east, hotel-parking=yes]",find_hotel,"{'hotel-area': 'east', 'hotel-parking': 'yes'}"
4,"I'm looking for a nice place to stay, somewher...",find_hotel|hotel-internet=yes|hotel-pricerange...,"[find_hotel, hotel-internet=yes, hotel-pricera...",find_hotel,"{'hotel-internet': 'yes', 'hotel-pricerange': ..."


In [28]:
output_file = "./data/dev.csv"
print("Writing to ", output_file)
dev_df.to_csv(output_file, index=False)
print("Completed")

Writing to  ./data/dev.csv
Completed


In [29]:
output_file = "./data/dev.json"
print("Writing to ", output_file)
dev_df.to_json(output_file, orient="records")
print("Completed")

Writing to  ./data/dev.json
Completed
