# ATIS Flight Reservations Dataset

Dataset download link: http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/




## Understanding the Data

In [104]:
import numpy as np
import pandas as pd
import nltk, pprint, os
import gzip, os, pickle
import matplotlib.pyplot as plt
import random

In [2]:
# Read the first part of the dataset.
# Each part (.gz file) contains train, validation and test sets, plus a dict,
# pickled together as a single 4-tuple.
# NOTE: unpickling can execute arbitrary code - only load files from a source you trust.

filename = 'atis.fold0.pkl.gz'
# The context manager closes the file handle even when unpickling fails.
with gzip.open(filename, 'rb') as f:
    try:
        train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')
    except TypeError:
        # pickle.load() without an `encoding` keyword (Python 2): rewind and
        # retry with the plain call instead of hiding every possible error.
        f.seek(0)
        train_set, valid_set, test_set, dicts = pickle.load(f)


In [3]:
# structure of the component data files: (number of lists, number of sentences)
# The sentences have different lengths, so np.shape() would first have to build
# a ragged array, which recent NumPy versions reject with a ValueError.
# Counting with len() prints the same tuples without that conversion.
for split in (train_set, valid_set, test_set):
    print((len(split), len(split[0])))

(3, 3983)
(3, 995)
(3, 893)


In [4]:
# each set is a tuple of length 3, each element being a list
# printed one per line: tuple length, type of the first element, its length
print(len(train_set), type(train_set[0]), len(train_set[0]), sep='\n')


3
<class 'list'>
3983


The first list has 3983 arrays, each array being a sentence. The words, however, are encoded as numbers (and have to be decoded using the dict provided).

Let's store the three lists into separate objects.

In [6]:
# Unpack the train and validation tuples in one statement: the first list of
# each goes to *_x, the third to *_label, and the middle list is discarded
# into the throwaway name `_`.
(train_x, _, train_label), (val_x, _, val_label) = train_set, valid_set

The first list represents the actual words (encoded), and the third list contains their labels (again, encoded).

In [8]:
# each entry of train_x is a numpy array of word indices (one sentence)
# display the first sentence in its encoded form
train_x[0]

array([554, 194, 268,  64,  62,  16,   8, 234, 481,  20,  40,  58, 234,
       415, 205])

In [81]:
# labels are stored in the third list of the tuple, held in train_label
# display the encoded labels of the first sentence
train_label[0]

array([126, 126, 126,  48, 126,  36,  35, 126, 126,  33, 126, 126, 126,
        78, 123])

In [29]:
# dicts: show the container type, then the names of the tables it holds
for info in (type(dicts), dicts.keys()):
    print(info)

<class 'dict'>
dict_keys(['labels2idx', 'tables2idx', 'words2idx'])


In [33]:
# the value stored under each key is itself a dict - print the type of each
for table_name in ('labels2idx', 'tables2idx', 'words2idx'):
    print(type(dicts[table_name]))


<class 'dict'>
<class 'dict'>
<class 'dict'>


In [10]:
# keep each of the three lookup tables (words, labels, tables) under its own name
words, labels, tables = (
    dicts[table_name] for table_name in ('words2idx', 'labels2idx', 'tables2idx')
)

In [11]:
# each key of the `words` dict is a word, each value its index
# display all the words in the vocabulary
words.keys()

dict_keys(['all', 'coach', 'cincinnati', 'people', 'month', 'four', 'code', 'go', 'show', 'thursday', 'to', 'restriction', 'dinnertime', 'under', 'sorry', 'include', 'midwest', 'worth', 'southwest', 'me', 'returning', 'far', 'vegas', 'airfare', 'ticket', 'difference', 'arrange', 'tickets', 'louis', 'cheapest', 'list', 'wednesday', 'leave', 'heading', 'ten', 'direct', 'turboprop', 'rate', 'cost', 'quebec', 'layover', 'air', 'what', 'stands', 'chicago', 'schedule', 'transcontinental', 'goes', 'new', 'transportation', 'here', 'hours', 'let', 'twentieth', 'along', 'thrift', 'passengers', 'great', 'thirty', 'canadian', 'leaves', 'alaska', 'leaving', 'amount', 'weekday', 'makes', 'midway', 'montreal', 'via', 'depart', 'county', 'names', 'stand', 'total', 'seventeenth', 'use', 'twa', 'from', 'would', 'abbreviations', 'destination', 'only', 'next', 'live', 'shortest', 'limousine', 'tell', 'today', 'more', 'DIGIT', 'm80', 'downtown', 'train', 'tampa', 'fly', 'f', 'this', 'car', 'anywhere', 'can

In [29]:
# Decode a sentence by matching each numeric value against the values of the
# `words` dict and keeping the word (key) stored under that value.
# train_x contains the list of training sentences; train_x[0] is the first one.
[
    word
    for word_index in train_x[0]
    for word, index in words.items()
    if index == word_index
]

['what',
 'flights',
 'leave',
 'atlanta',
 'at',
 'about',
 'DIGIT',
 'in',
 'the',
 'afternoon',
 'and',
 'arrive',
 'in',
 'san',
 'francisco']

In [37]:
# let's look at the first few sentences
# Invert the word -> index dict once, so each token is decoded with a single
# lookup instead of a scan over the whole vocabulary for every word.
# (Assumes every index belongs to exactly one word.)
index_to_word = {index: word for word, index in words.items()}
sents = [
    # indices without a dict entry are skipped, as the scan-based version did
    ' '.join(index_to_word[val] for val in train_x[i] if val in index_to_word)
    for i in range(30)
]

sents

['what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco',
 'what is the abbreviation for canadian airlines international',
 "i 'd like to know the earliest flight from boston to atlanta",
 'show me the us air flights from atlanta to boston',
 'show me the cheapest round trips from dallas to baltimore',
 "i 'd like to see all flights from denver to philadelphia",
 'explain fare code qx',
 "i 'd like a united airlines flight on wednesday from san francisco to boston",
 'what is the price of american airlines flight DIGITDIGIT from new york to los angeles',
 'what does the meal code s stand for',
 'what are all flights to denver from philadelphia on sunday',
 'what times does the late afternoon flight leave from washington for denver',
 'what flights are available monday from san francisco to pittsburgh',
 'what airlines have business class',
 'flights from atlanta to washington dc',
 'from new york to toronto on thursday morning',
 'show me all the direct

In [58]:
# labels dict contains IOB (inside-outside-beginning) labelled entities:
# 'B-' marks the first word of an entity, 'I-' a continuation word
labels.keys()

dict_keys(['B-time_relative', 'B-stoploc.state_code', 'B-depart_date.today_relative', 'B-arrive_date.date_relative', 'B-depart_date.date_relative', 'I-restriction_code', 'B-return_date.month_name', 'I-time', 'B-depart_date.day_name', 'I-arrive_time.end_time', 'B-fromloc.airport_code', 'B-cost_relative', 'B-connect', 'B-return_time.period_mod', 'B-arrive_time.period_mod', 'B-flight_number', 'B-depart_time.time_relative', 'I-toloc.city_name', 'B-arrive_time.period_of_day', 'B-depart_time.period_of_day', 'I-return_date.date_relative', 'I-depart_time.start_time', 'B-fare_amount', 'I-depart_time.time_relative', 'B-city_name', 'B-depart_date.day_number', 'I-meal_description', 'I-depart_date.today_relative', 'I-airport_name', 'I-arrive_date.day_number', 'B-toloc.state_code', 'B-arrive_date.month_name', 'B-stoploc.airport_code', 'I-depart_time.time', 'B-airport_code', 'B-arrive_time.start_time', 'B-period_of_day', 'B-arrive_time.time', 'I-flight_stop', 'B-toloc.state_name', 'B-booking_class', 

In [87]:
# Invert both lookup tables so that an index maps back to its word / label:
# words (word -> id) becomes id_to_words, labels (label -> id) becomes id_to_labels
id_to_words = {index: word for word, index in words.items()}
id_to_labels = {index: label for label, index in labels.items()}

In [100]:
# Show 20 randomly drawn training sentences as lists of (word, label) pairs,
# i.e. every word next to its decoded entity tag.

for i in random.sample(range(len(train_x)), 20):
    w = [id_to_words[x] for x in train_x[i]]
    l = [id_to_labels[x] for x in train_label[i]]
    print(list(zip(w, l)))
    print('\n')

[('i', 'O'), ('would', 'O'), ('like', 'O'), ('to', 'O'), ('see', 'O'), ('the', 'O'), ('flights', 'O'), ('from', 'O'), ('baltimore', 'B-fromloc.city_name'), ('to', 'O'), ('philadelphia', 'B-toloc.city_name'), ('again', 'O')]


[('what', 'O'), ('are', 'O'), ('the', 'O'), ('flights', 'O'), ('from', 'O'), ('nashville', 'B-fromloc.city_name'), ('to', 'O'), ('tacoma', 'B-toloc.city_name'), ('on', 'O'), ('tuesday', 'B-depart_date.day_name'), ('the', 'O'), ('eighteenth', 'B-depart_date.day_number'), ('of', 'O'), ('may', 'B-depart_date.month_name')]


[('what', 'O'), ('is', 'O'), ('the', 'O'), ('round', 'B-round_trip'), ('trip', 'I-round_trip'), ('fare', 'O'), ('on', 'O'), ('continental', 'B-airline_name'), ('DIGITDIGITDIGITDIGIT', 'B-flight_number'), ('from', 'O'), ('denver', 'B-fromloc.city_name'), ('to', 'O'), ('san', 'B-toloc.city_name'), ('francisco', 'I-toloc.city_name'), ('and', 'O'), ('return', 'B-round_trip')]


[('please', 'O'), ('list', 'O'), ('flights', 'O'), ('between', 'O'), ('den

## Extracting Structured Information

Once we have retrieved the named entities in each sentence, we can extract the information in a structured format. Let's first look at all the named entities that we have.

In [101]:
# list of all named entity labels (the keys of the labels dict)
labels.keys()

dict_keys(['B-time_relative', 'B-stoploc.state_code', 'B-depart_date.today_relative', 'B-arrive_date.date_relative', 'B-depart_date.date_relative', 'I-restriction_code', 'B-return_date.month_name', 'I-time', 'B-depart_date.day_name', 'I-arrive_time.end_time', 'B-fromloc.airport_code', 'B-cost_relative', 'B-connect', 'B-return_time.period_mod', 'B-arrive_time.period_mod', 'B-flight_number', 'B-depart_time.time_relative', 'I-toloc.city_name', 'B-arrive_time.period_of_day', 'B-depart_time.period_of_day', 'I-return_date.date_relative', 'I-depart_time.start_time', 'B-fare_amount', 'I-depart_time.time_relative', 'B-city_name', 'B-depart_date.day_number', 'I-meal_description', 'I-depart_date.today_relative', 'I-airport_name', 'I-arrive_date.day_number', 'B-toloc.state_code', 'B-arrive_date.month_name', 'B-stoploc.airport_code', 'I-depart_time.time', 'B-airport_code', 'B-arrive_time.start_time', 'B-period_of_day', 'B-arrive_time.time', 'I-flight_stop', 'B-toloc.state_name', 'B-booking_class', 

In [132]:
# grammar to extract the from-city and the to-city together as one chunk
# note: the '*' after each I-...city_name tag allows any number of continuation
# words, so multi-word names such as 'san francisco' are covered
grammar = r'''
        to_frm_city: {<O>?<B-fromloc.city_name><I-fromloc.city_name>*<O>?<B-toloc.city_name><I-toloc.city_name>*} #chunk from_city_to_city'''

# define chunk parser
cp = nltk.RegexpParser(grammar)

In [138]:
# decode one particular training sentence (index 3786) into (word, label) pairs
w = [id_to_words[x] for x in train_x[3786]]
l = [id_to_labels[x] for x in train_label[3786]]
sent = list(zip(w, l))
sent

[('i', 'O'),
 ("'d", 'O'),
 ('like', 'O'),
 ('a', 'O'),
 ('flight', 'O'),
 ('from', 'O'),
 ('burbank', 'B-fromloc.city_name'),
 ('to', 'O'),
 ('tacoma', 'B-toloc.city_name'),
 ('washington', 'B-toloc.state_name')]

In [139]:
# chunk the tagged sentence with the grammar and print the resulting tree
print(cp.parse(sent))

(S
  i/O
  'd/O
  like/O
  a/O
  flight/O
  (to_frm_city
    from/O
    burbank/B-fromloc.city_name
    to/O
    tacoma/B-toloc.city_name)
  washington/B-toloc.state_name)


Note that 'washington' (a state name here) is left out of the chunk, because our chunk regex does not include B-toloc.state_name. We should thus modify the regex. 

In [143]:
# adding state names to the regex
# NOTE(review): this version also puts a '?' on <B-toloc.city_name>, so the
# destination city is optional and a chunk containing only an origin city can
# match - confirm that this is intended
grammar = r'''
        to_frm_city: {<O>?<B-fromloc.city_name><I-fromloc.city_name>*<B-fromloc.state_name>*<O>?<B-toloc.city_name>?<I-toloc.city_name>*<B-toloc.state_name>*} #chunk from_city_to_city'''

# define chunk parser
cp = nltk.RegexpParser(grammar)
# re-parse the current sentence with the extended grammar
print(cp.parse(sent))

(S
  i/O
  'd/O
  like/O
  a/O
  flight/O
  (to_frm_city
    from/O
    burbank/B-fromloc.city_name
    to/O
    tacoma/B-toloc.city_name
    washington/B-toloc.state_name))


In [174]:
# Pick a random training sentence, decode it into (word, label) pairs and
# print its chunk tree.
# (run this block multiple times to see some random sentences)

i = random.randrange(len(train_x))
w = [id_to_words[x] for x in train_x[i]]
l = [id_to_labels[x] for x in train_label[i]]
sent = list(zip(w, l))
print(cp.parse(sent))

(S
  list/O
  all/O
  flights/O
  (to_frm_city from/O washington/B-fromloc.city_name)
  dc/B-fromloc.state_code
  to/O
  tampa/B-toloc.city_name
  florida/B-toloc.state_name)


Note that we still won't be able to extract both the from and to cities from sentences like this one - *leaving oakland on july 27th to boston* - where other words separate the two cities.


In [150]:
# decode training sentence 3925 into (word, label) pairs and print its chunk tree
w = [id_to_words[x] for x in train_x[3925]]
l = [id_to_labels[x] for x in train_label[3925]]
sent = list(zip(w, l))
print(cp.parse(sent))

(S
  on/O
  <UNK>/B-airline_name
  air/I-airline_name
  how/O
  many/O
  flights/O
  (to_frm_city leaving/O oakland/B-fromloc.city_name on/O)
  july/B-depart_date.month_name
  twenty/B-depart_date.day_number
  seventh/I-depart_date.day_number
  to/O
  boston/B-toloc.city_name
  nonstop/B-flight_stop)


Also, some queries specify stopover cities, such as this one.

In [160]:
# decode training sentence 3443 into (word, label) pairs and print its chunk tree
w = [id_to_words[x] for x in train_x[3443]]
l = [id_to_labels[x] for x in train_label[3443]]
sent = list(zip(w, l))
print(cp.parse(sent))

(S
  is/O
  there/O
  a/O
  flight/O
  (to_frm_city
    between/O
    oakland/B-fromloc.city_name
    and/O
    boston/B-toloc.city_name)
  with/O
  a/O
  stopover/O
  in/O
  dallas/B-stoploc.city_name
  fort/I-stoploc.city_name
  worth/I-stoploc.city_name
  on/O
  twa/B-airline_code)
