# ATIS Flight Reservations Dataset

Dataset download link: http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/




## Understanding the Data

In [1]:
import numpy as np
import pandas as pd
import nltk, pprint, os
import gzip, os, pickle
import matplotlib.pyplot as plt
import random

In [2]:
# read the first part of the dataset
# each part (.gz file) contains train, validation and test sets, plus a dict

filename = 'atis.fold0.pkl.gz'

# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# this file if it comes from a source you trust.
# The context manager closes the gzip handle even if unpickling fails.
with gzip.open(filename, 'rb') as f:
    try:
        # Python 3 path: decode byte strings stored in the pickle as latin1
        # (presumably the file was written by Python 2 -- TODO confirm)
        train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')
    except TypeError:
        # Python 2 path: pickle.load() does not accept the `encoding` keyword.
        # Only this error triggers the retry, so genuine load failures
        # (missing/corrupt file) are no longer silently swallowed.
        train_set, valid_set, test_set, dicts = pickle.load(f)


In [3]:
# structure of the component data files: (number of lists, entries per list)
# The sentences differ in length, so each set is a ragged nested sequence.
# np.shape() has to build an array from it first, which recent NumPy versions
# refuse to do (ValueError for inhomogeneous shapes); len() yields the same
# two dimensions without constructing an array.
for data_split in (train_set, valid_set, test_set):
    print((len(data_split), len(data_split[0])))

(3, 3983)
(3, 995)
(3, 893)


In [4]:
# each set is a 3-tuple whose elements are lists;
# show the tuple length, then the type and length of its first element
first_element = train_set[0]
print(len(train_set))
print(type(first_element))
print(len(first_element))


3
<class 'list'>
3983


The first list has 3983 arrays, each array being a sentence. The words are encoded by numbers (and have to be decoded using the dict provided).

Let's store the three lists into separate objects.

In [27]:
# storing the three elements of each tuple in separate objects
# only the first (encoded words) and third (encoded labels) lists are kept;
# the middle list is discarded into `_` since it is not used in this notebook
train_x, _, train_label = train_set
val_x, _, val_label = valid_set
test_x, _, test_label = test_set

The first list represents the actual words (encoded), and the third list contains their labels (again, encoded).

In [6]:
# train_x is a list of numpy arrays, one array of encoded words per sentence
# displaying the first sentence (first array of the list)
train_x[0]

array([554, 194, 268,  64,  62,  16,   8, 234, 481,  20,  40,  58, 234,
       415, 205], dtype=int32)

In [7]:
# labels are stored in the third list of the tuple (unpacked as train_label);
# each array holds one encoded label per word of the matching sentence
train_label[0]

array([126, 126, 126,  48, 126,  36,  35, 126, 126,  33, 126, 126, 126,
        78, 123], dtype=int32)

In [8]:
# dicts: the fourth object loaded from the pickle; show its type and its keys
print(type(dicts))
print(dicts.keys())

<class 'dict'>
dict_keys(['tables2idx', 'words2idx', 'labels2idx'])


In [9]:
# every value stored in dicts is itself a dict
for dict_name in ('labels2idx', 'tables2idx', 'words2idx'):
    print(type(dicts[dict_name]))


<class 'dict'>
<class 'dict'>
<class 'dict'>


In [10]:
# storing the word, label and table lookup dicts in separate variables
# (each maps a string to its numeric index, as the '...2idx' key names indicate)
words = dicts['words2idx']
labels = dicts['labels2idx']
tables = dicts['tables2idx']

In [11]:
# each key of the `words` dict is a word, each value its index
words.keys()

dict_keys(['nw', 'DIGIT', 'dc', 'louis', 'much', 'planes', 'departures', 'and', 'transcontinental', 'making', 'prices', 'three', 'some', 'midnight', 'define', 'atl', 'minneapolis', 'mco', 'find', 'database', 'now', 'philadelphia', 'night', 'than', 'dulles', 'serviced', 'up', 'latest', 'airline', 'pennsylvania', 'twenty', 'love', 'here', 'montreal', 'tuesdays', 'trying', '<UNK>', 'name', 'without', 'memphis', 'field', 'one', 'arrange', 'wednesday', 'friday', 'thirteenth', 'ua', 'cost', 'abbreviations', 'reservations', 'offer', 'serves', 'airport', 'arrives', 'rates', 'four', 'fifth', 'return', 'eighth', 'day', 'connect', 'milwaukee', 'thursday', 'limousine', 'ap80', 'see', 'cp', 'airports', 'live', 'over', 'list', 'quebec', 'business', 'weekday', 'sometime', 'california', 'meals', 'after', 'arrangements', 'tennessee', 'stands', 'listing', 'f', 'me', 'general', 'highest', 'airplane', 'friends', 'then', 'may', 'leaves', 'late', 'cheap', 'types', 'flight', 'least', 'there', 'leaving', 'mos

In [12]:
# decode a sentence by matching each numeric value in it against the
# values of the words dict (a reverse lookup: index -> word)
# train_x contains the list of training sentences; this is the first one
[word for word_idx in train_x[0] for word, idx in words.items() if idx == word_idx]

['what',
 'flights',
 'leave',
 'atlanta',
 'at',
 'about',
 'DIGIT',
 'in',
 'the',
 'afternoon',
 'and',
 'arrive',
 'in',
 'san',
 'francisco']

In [13]:
# let's look at the first 30 sentences, decoded and joined into plain strings
sents = [
    ' '.join(word for word_idx in train_x[i] for word, idx in words.items() if idx == word_idx)
    for i in range(30)
]

sents

['what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco',
 'what is the abbreviation for canadian airlines international',
 "i 'd like to know the earliest flight from boston to atlanta",
 'show me the us air flights from atlanta to boston',
 'show me the cheapest round trips from dallas to baltimore',
 "i 'd like to see all flights from denver to philadelphia",
 'explain fare code qx',
 "i 'd like a united airlines flight on wednesday from san francisco to boston",
 'what is the price of american airlines flight DIGITDIGIT from new york to los angeles',
 'what does the meal code s stand for',
 'what are all flights to denver from philadelphia on sunday',
 'what times does the late afternoon flight leave from washington for denver',
 'what flights are available monday from san francisco to pittsburgh',
 'what airlines have business class',
 'flights from atlanta to washington dc',
 'from new york to toronto on thursday morning',
 'show me all the direct

In [14]:
# labels dict contains IOB (inside-outside-beginning) labelled entities
labels.keys()

dict_keys(['I-meal_code', 'B-connect', 'B-depart_date.date_relative', 'B-state_code', 'B-fare_basis_code', 'I-arrive_time.time_relative', 'B-flight_time', 'I-arrive_time.start_time', 'I-depart_time.time', 'B-toloc.airport_name', 'I-economy', 'B-city_name', 'B-toloc.state_name', 'I-cost_relative', 'I-flight_number', 'B-arrive_time.start_time', 'B-depart_time.end_time', 'I-toloc.state_name', 'B-meal', 'B-arrive_time.time', 'B-time_relative', 'B-toloc.state_code', 'I-transport_type', 'B-fromloc.city_name', 'B-depart_time.start_time', 'B-meal_description', 'I-toloc.airport_name', 'I-return_date.day_number', 'B-depart_date.month_name', 'B-arrive_time.period_of_day', 'B-fare_amount', 'B-fromloc.state_name', 'B-return_date.day_name', 'B-stoploc.airport_name', 'B-fromloc.airport_name', 'I-class_type', 'B-flight_number', 'B-depart_date.day_name', 'I-depart_time.end_time', 'B-transport_type', 'I-arrive_time.end_time', 'I-flight_time', 'B-depart_date.year', 'B-fromloc.airport_code', 'B-flight_mod

There are 127 classes of labels (including the 'O' - tokens that do not fall into any entity).

In [16]:
# number of distinct labels (one dict entry per label)
print(len(labels))

127


Since the dicts 'words' and 'labels' are key:value pairs of word/label:index, let's reverse the dicts so that we don't have to do a reverse lookup every time.

In [17]:
# inverting the lookup tables:
# words (word -> id) becomes id_to_words (id -> word)
# labels (label -> id) becomes id_to_labels (id -> label)
id_to_words = {idx: word for word, idx in words.items()}
id_to_labels = {idx: label for label, idx in labels.items()}

Now we can print the words and corresponding labels simply by looking up the numeric index of each word, e.g.:

In [18]:
# printing 20 randomly chosen sentences as (word, label) pairs (tagged entities)
for sample_idx in random.sample(range(len(train_x)), 20):
    sample_words = [id_to_words[word_id] for word_id in train_x[sample_idx]]
    sample_tags = [id_to_labels[label_id] for label_id in train_label[sample_idx]]
    print(list(zip(sample_words, sample_tags)))
    print('\n')

[('show', 'O'), ('me', 'O'), ('all', 'O'), ('flights', 'O'), ('between', 'O'), ('boston', 'B-fromloc.city_name'), ('and', 'O'), ('washington', 'B-toloc.city_name')]


[('philadelphia', 'B-fromloc.city_name'), ('to', 'O'), ('boston', 'B-toloc.city_name'), ('monday', 'B-depart_date.day_name')]


[('find', 'O'), ('me', 'O'), ('a', 'O'), ('flight', 'O'), ('from', 'O'), ('boston', 'B-fromloc.city_name'), ('to', 'O'), ('san', 'B-toloc.city_name'), ('francisco', 'I-toloc.city_name'), ('with', 'O'), ('a', 'O'), ('layover', 'O'), ('in', 'O'), ('denver', 'B-stoploc.city_name')]


[('what', 'O'), ('are', 'O'), ('the', 'O'), ('flights', 'O'), ('from', 'O'), ('atlanta', 'B-fromloc.city_name'), ('to', 'O'), ('baltimore', 'B-toloc.city_name'), ('which', 'O'), ('arrive', 'O'), ('in', 'O'), ('baltimore', 'B-toloc.city_name'), ('at', 'O'), ('DIGIT', 'B-arrive_time.time'), ("o'clock", 'I-arrive_time.time'), ('pm', 'I-arrive_time.time')]


[('sure', 'O'), ('i', 'O'), ("'d", 'O'), ('like', 'O'), ('to', 'O'

Let's write a function which takes in an index and returns the corresponding query with its labels.

In [19]:
def print_query(index, sentences=None, tag_sequences=None):
    """Return one query as a list of (word, entity_label) pairs.

    Despite its name, the function prints nothing: it returns the decoded
    pairs so that the notebook can display them.

    Parameters
    ----------
    index : int
        Position of the query within `sentences`.
    sentences : sequence of sequences of word ids, optional
        Encoded queries to read from. Defaults to the training sentences
        `train_x`, which keeps existing `print_query(index)` calls unchanged.
    tag_sequences : sequence of sequences of label ids, optional
        Encoded labels aligned with `sentences`. Defaults to `train_label`.

    Returns
    -------
    list of (str, str)
        The decoded words of the query paired with their decoded labels.

    Raises
    ------
    IndexError
        If `index` is out of range for the chosen sentences.
    KeyError
        If an id is missing from `id_to_words` / `id_to_labels`.
    """
    # Resolve the defaults at call time so the function always sees the
    # current training data.
    if sentences is None:
        sentences = train_x
    if tag_sequences is None:
        tag_sequences = train_label
    decoded_words = [id_to_words[word_id] for word_id in sentences[index]]
    decoded_labels = [id_to_labels[label_id] for label_id in tag_sequences[index]]
    return list(zip(decoded_words, decoded_labels))
    

In [20]:
# a training query that contains an out-of-vocabulary token ('<UNK>')
print_query(3925)

[('on', 'O'),
 ('<UNK>', 'B-airline_name'),
 ('air', 'I-airline_name'),
 ('how', 'O'),
 ('many', 'O'),
 ('flights', 'O'),
 ('leaving', 'O'),
 ('oakland', 'B-fromloc.city_name'),
 ('on', 'O'),
 ('july', 'B-depart_date.month_name'),
 ('twenty', 'B-depart_date.day_number'),
 ('seventh', 'I-depart_date.day_number'),
 ('to', 'O'),
 ('boston', 'B-toloc.city_name'),
 ('nonstop', 'B-flight_stop')]

Also, some queries specify stopover cities, such as this.

In [21]:
# a training query that names a stopover city (B-/I-stoploc.city_name labels)
print_query(3443)

[('is', 'O'),
 ('there', 'O'),
 ('a', 'O'),
 ('flight', 'O'),
 ('between', 'O'),
 ('oakland', 'B-fromloc.city_name'),
 ('and', 'O'),
 ('boston', 'B-toloc.city_name'),
 ('with', 'O'),
 ('a', 'O'),
 ('stopover', 'O'),
 ('in', 'O'),
 ('dallas', 'B-stoploc.city_name'),
 ('fort', 'I-stoploc.city_name'),
 ('worth', 'I-stoploc.city_name'),
 ('on', 'O'),
 ('twa', 'B-airline_code')]

We can see that in this dataset, queries are far more complex (in terms of the number of labels, variety in sentence structures, etc.), and thus we cannot write simple hand-written rules to extract chunks such as to_from_city, types_of_meals etc.

Thus, we need to train probabilistic models such as CRFs, HMMs etc. to tag each word with its corresponding entity label.

We'll use the training and validation sets ```train_x``` and ```val_x``` to tune the model, and finally use the test set to measure the performance.

## Models for NER

Let's experiment with a few different models for labelling words with named entities.


In [29]:
# POS tagging sentences
def pos_tag(sent_list):
    """POS-tag every encoded sentence in `sent_list`.

    Parameters
    ----------
    sent_list : iterable of sequences of word ids
        Encoded sentences; each id is decoded to its word with `id_to_words`.

    Returns
    -------
    list of list of (str, str)
        One entry per sentence, each a list of (word, pos_tag) tuples.
    """
    decoded_sents = [[id_to_words[val] for val in sent] for sent in sent_list]
    # nltk.pos_tag_sents is the batch API that NLTK recommends for tagging
    # more than one sentence, instead of calling nltk.pos_tag per sentence.
    return nltk.pos_tag_sents(decoded_sents)

# POS-tag the training and validation sentences (the test set is not tagged here)
train_pos = pos_tag(train_x)
valid_pos = pos_tag(val_x)

In [42]:
# looking at the tags of a randomly chosen query (re-run the cell to see another)
# notice that most cities after 'TO' are tagged as VB
i = random.randrange(len(train_pos))
train_pos[i]

[('show', 'VB'),
 ('me', 'PRP'),
 ('a', 'DT'),
 ('list', 'NN'),
 ('of', 'IN'),
 ('flights', 'NNS'),
 ('from', 'IN'),
 ('san', 'JJ'),
 ('francisco', 'NN'),
 ('to', 'TO'),
 ('boston', 'VB'),
 ('for', 'IN'),
 ('august', 'NN'),
 ('thirtieth', 'NN')]

Creating list of tuples for each query of the form:
```[('New', 'NNP', u'B-GPE'), ('York', 'NNP', u'I-GPE'), ('is', 'VBZ', u'O'), ('my', 'PRP$', u'O'), ('favorite', 'JJ', u'O'), ('city', 'NN', u'O')]```

In [44]:
# converting each word in train sentences to 3-tuples
# of the form (word, tag, IOB_tag)
# NOTE: the per-sentence label ids are deliberately NOT bound to the name
# `labels`; doing so would overwrite the labels2idx dict stored in `labels`
# and break the earlier cells that use it when they are re-run.
train_labels = [
    [
        (word, pos, id_to_labels[label_id])
        for (word, pos), label_id in zip(tagged_sent, label_ids)
    ]
    for tagged_sent, label_ids in zip(train_pos, train_label)
]

In [55]:
# a randomly chosen training sentence as (word, POS tag, IOB label) tuples
train_labels[random.randrange(len(train_labels))]

[('what', 'WP', 'O'),
 ('flights', 'VBD', 'O'),
 ('does', 'VBZ', 'O'),
 ('delta', 'NNS', 'B-airline_name'),
 ('have', 'VBP', 'O'),
 ('between', 'IN', 'O'),
 ('dallas', 'NNS', 'B-fromloc.city_name'),
 ('and', 'CC', 'O'),
 ('denver', 'NN', 'B-toloc.city_name')]

In [46]:
# doing the same for validation data: (word, tag, IOB_tag) 3-tuples per sentence
# NOTE: the per-sentence label ids are deliberately NOT bound to the name
# `labels`; doing so would overwrite the labels2idx dict stored in `labels`
# and break the earlier cells that use it when they are re-run.
valid_labels = [
    [
        (word, pos, id_to_labels[label_id])
        for (word, pos), label_id in zip(tagged_sent, label_ids)
    ]
    for tagged_sent, label_ids in zip(valid_pos, val_label)
]

Converting to tree format.

In [53]:
# NOTE(review): conll2000 and tree2conlltags are imported but not used in the
# visible part of the notebook -- confirm whether later cells need them
from nltk.corpus import conll2000
from nltk import conlltags2tree, tree2conlltags

# converting a sample sentence (a list of (word, POS, IOB) tuples) to a tree
tree = conlltags2tree(train_labels[2])
print(tree)

(S
  i/JJ
  'd/MD
  like/VB
  to/TO
  know/VB
  the/DT
  (flight_mod earliest/JJS)
  flight/NN
  from/IN
  (fromloc.city_name boston/NN)
  to/TO
  (toloc.city_name atlanta/VB))


Let's now convert all training sentences to trees.

In [56]:
# one tree per sentence, for both the training and the validation data
train_trees = list(map(conlltags2tree, train_labels))
valid_trees = list(map(conlltags2tree, valid_labels))

In [117]:
# print one randomly chosen training tree (re-run the cell to see another)
print(train_trees[random.randrange(len(train_trees))])

(S
  flights/NNS
  from/IN
  (fromloc.city_name chicago/NN)
  to/TO
  (toloc.city_name denver/VB)
  on/IN
  (airline_name continental/NN)
  on/IN
  (depart_date.day_name saturday/JJ)
  (depart_time.period_of_day morning/NN))


Let's now try building some parsers. 

### Regex Based Parsers

Let's start with a dummy parser - one which tags every token as an 'O'.

In [125]:
# a dummy chunk parser - the grammar is empty, so it has no chunking rules
# and tags every word as 'O'
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour
# of accuracy() -- confirm against the installed version
cp = nltk.RegexpParser(r'')
print(cp.evaluate(valid_trees))

ChunkParse score:
    IOB Accuracy:  63.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [126]:
# NOTE(review): valid_trees[0] is the gold tree, which already contains chunks.
# The chunks printed below presumably come from that input rather than from
# the (rule-less) parser -- consider parsing the flat tagged sentence instead.
# TODO confirm
print(cp.parse(valid_trees[0]))

(S
  what/WP
  aircraft/NN
  is/VBZ
  used/VBN
  on/IN
  (airline_name delta/JJ)
  flight/NN
  (flight_number DIGITDIGITDIGITDIGIT/NNP)
  from/IN
  (fromloc.city_name kansas/NNP city/NN)
  to/TO
  (toloc.city_name salt/VB lake/JJ city/NN))


The above results tell us that about 63% of the tokens are tagged as 'O', i.e. they are not a named entity of any type. The precision, recall etc. are zero because we did not find any chunks at all.

### Using a Gazetteer to Lookup Cities

URL: https://raw.githubusercontent.com/grammakov/USA-cities-and-states/master/us_cities_states_counties.csv

In [74]:
# reading a file containing list of US cities, states and counties
# (the file is pipe-delimited, hence sep="|")
us_cities = pd.read_csv("us_cities_states_counties.csv", sep="|")
us_cities.head()


Unnamed: 0,City,State short,State full,County,City alias
0,Holtsville,NY,New York,SUFFOLK,Internal Revenue Service
1,Holtsville,NY,New York,SUFFOLK,Holtsville
2,Adjuntas,PR,Puerto Rico,ADJUNTAS,URB San Joaquin
3,Adjuntas,PR,Puerto Rico,ADJUNTAS,Jard De Adjuntas
4,Adjuntas,PR,Puerto Rico,ADJUNTAS,Colinas Del Gigante


In [98]:
# lower-cased city, state and county names, stored as sets for membership tests
cities, states, counties = (
    set(us_cities[column].str.lower())
    for column in ('City', 'State full', 'County')
)

In [97]:
# number of distinct cities, states and counties
for name_set in (cities, states, counties):
    print(len(name_set))

18854
62
1932


In [107]:
# look up a given word in the city, state and county sets
def gazetteer_lookup(word):
    """Return a 3-tuple of booleans: (is_city, is_state, is_county).

    Each flag tells whether `word` is a member of the corresponding
    module-level set (`cities`, `states`, `counties`). The word is matched
    as given; no case normalisation is applied here.
    """
    is_city = word in cities
    is_state = word in states
    is_county = word in counties
    return (is_city, is_state, is_county)

In [118]:
# sample lookups; each result is (is_city, is_state, is_county)
# washington is a city, a state and a county
# utah is not a city, but a state and county
# denver is a city and a county, but not a state
for place in ('washington', 'utah', 'denver'):
    print(gazetteer_lookup(place))


(True, True, True)
(False, True, True)
(True, False, True)


### TODO

We can now lookup each word in the gazetteer and assign a class (entity label) accordingly.