# ATIS Flight Reservations Dataset

Dataset download link: http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/




## Understanding the Data

In [290]:
import numpy as np
import pandas as pd
import nltk, pprint, os
import gzip, os, pickle
import matplotlib.pyplot as plt
import random

In [357]:
# read the first part of the dataset
# each part (.gz file) contains train, validation and test sets, plus a dict

filename = 'atis.fold0.pkl.gz'
f = gzip.open(filename, 'rb')
try:
    train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')
except:
    train_set, valid_set, test_set, dicts = pickle.load(f)


In [292]:
# structure of the component data files
print(np.shape(train_set))
print(np.shape(valid_set))
print(np.shape(test_set))

(3, 3983)
(3, 995)
(3, 893)


In [293]:
# each set is a 3-tuple, each element of the tuple being a list 
print(len(train_set))
print(type(train_set[0]))
print(len(train_set[0]))


3
<class 'list'>
3983


The first list has 3983 arrays, each array being a sentence. The words are encoded by numbers (and have to be decoded using the dict provided).

Let's store the three lists into separate objects.

In [294]:
# storing the three elements of the tuple in three objects
train_x, _, train_label = train_set
val_x, _, val_label = valid_set
test_x, _, test_label = test_set

The first list represents the actual words (encoded), and the third list contains their labels (again, encoded).

In [295]:
# each list in the tuple is a numpy array (a sentence)
# printing first list in the tuple's first element
train_x[0]

array([554,  23, 241, 534, 358, 136, 193,  11, 208, 251, 104, 502, 413,
       256, 104])

In [296]:
# labels are stored in the third list train_label
train_label[0]

array([126, 126, 126, 126, 126,   2, 126,  43, 126,  48, 109, 126,  78,
       123, 123])

In [297]:
# dicts 
print(type(dicts))
print(dicts.keys())

<class 'dict'>
dict_keys(['labels2idx', 'tables2idx', 'words2idx'])


In [298]:
# each key:value pair is itself a dict
print(type(dicts['labels2idx']))
print(type(dicts['tables2idx']))
print(type(dicts['words2idx']))


<class 'dict'>
<class 'dict'>
<class 'dict'>


In [299]:
# storing labels and words in separate variables
words = dicts['words2idx']
labels = dicts['labels2idx']
tables = dicts['tables2idx']

In [300]:
# each key of words_dict is a word, each value its index
words.keys()

dict_keys(['all', 'coach', 'cincinnati', 'people', 'month', 'four', 'code', 'go', 'show', 'thursday', 'to', 'restriction', 'dinnertime', 'under', 'sorry', 'include', 'midwest', 'worth', 'southwest', 'me', 'returning', 'far', 'vegas', 'airfare', 'ticket', 'difference', 'arrange', 'tickets', 'louis', 'cheapest', 'list', 'wednesday', 'leave', 'heading', 'ten', 'direct', 'turboprop', 'rate', 'cost', 'quebec', 'layover', 'air', 'what', 'stands', 'chicago', 'schedule', 'transcontinental', 'goes', 'new', 'transportation', 'here', 'hours', 'let', 'twentieth', 'along', 'thrift', 'passengers', 'great', 'thirty', 'canadian', 'leaves', 'alaska', 'leaving', 'amount', 'weekday', 'makes', 'midway', 'montreal', 'via', 'depart', 'county', 'names', 'stand', 'total', 'seventeenth', 'use', 'twa', 'from', 'would', 'abbreviations', 'destination', 'only', 'next', 'live', 'shortest', 'limousine', 'tell', 'today', 'more', 'DIGIT', 'm80', 'downtown', 'train', 'tampa', 'fly', 'f', 'this', 'car', 'anywhere', 'can

In [301]:
# now, we can map the numeric values v in a sentence with the k,v in the dict
# train_x contains the list of training sentences
# this is the first sentence
[k for val in train_x[0] for k,v in words.items() if v==val]

['what',
 'aircraft',
 'is',
 'used',
 'on',
 'delta',
 'flight',
 'DIGITDIGITDIGITDIGIT',
 'from',
 'kansas',
 'city',
 'to',
 'salt',
 'lake',
 'city']

In [302]:
# let's look at the first few sentences
sents = []
for i in range(30):
    sents.append(' '.join([k for val in train_x[i] for k,v in words.items() if v==val]))

sents

['what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city',
 'i want to go from boston to atlanta on monday',
 "i need a flight from atlanta to philadelphia and i 'm looking for the cheapest fare",
 'i need a flight from toronto to montreal reaching montreal early on friday',
 'show me the evening flights from philadelphia to baltimore',
 'tell me distance from orlando airport to the city',
 'what is restriction ap80',
 'what is the lowest cost fare that delta has between boston and san francisco',
 'flight DIGITDIGITDIGIT from cincinnati to dallas',
 'now i need a one way flight from pittsburgh to denver',
 'display all flights leaving from toronto to san diego on us air <UNK> over in washington dc',
 'list all nonstop flights on tuesday before noon from charlotte to baltimore',
 'show me the lowest <UNK> fare from dallas to baltimore',
 'what is the cheapest coach flight between dallas and baltimore leaving august tenth',
 'does midwest express s

In [303]:
# labels dict contains IOB (inside-out-beginning) labelled entities
labels.keys()

dict_keys(['B-time_relative', 'B-stoploc.state_code', 'B-depart_date.today_relative', 'B-arrive_date.date_relative', 'B-depart_date.date_relative', 'I-restriction_code', 'B-return_date.month_name', 'I-time', 'B-depart_date.day_name', 'I-arrive_time.end_time', 'B-fromloc.airport_code', 'B-cost_relative', 'B-connect', 'B-return_time.period_mod', 'B-arrive_time.period_mod', 'B-flight_number', 'B-depart_time.time_relative', 'I-toloc.city_name', 'B-arrive_time.period_of_day', 'B-depart_time.period_of_day', 'I-return_date.date_relative', 'I-depart_time.start_time', 'B-fare_amount', 'I-depart_time.time_relative', 'B-city_name', 'B-depart_date.day_number', 'I-meal_description', 'I-depart_date.today_relative', 'I-airport_name', 'I-arrive_date.day_number', 'B-toloc.state_code', 'B-arrive_date.month_name', 'B-stoploc.airport_code', 'I-depart_time.time', 'B-airport_code', 'B-arrive_time.start_time', 'B-period_of_day', 'B-arrive_time.time', 'I-flight_stop', 'B-toloc.state_name', 'B-booking_class', 

There are 127 classes of labels (including the 'O' - tokens that do not fall into any entity).

In [304]:
# number of labels
print(len(labels.keys()))

127


Since the dicts 'words' and 'labels' are key:value pairs of index:word/label, let's reverse the dicts so that we don't have to do a reverse lookup everytime.

In [305]:
# converting words_to_id to id_to_words
# and labels_to_id to id_to_labels
id_to_words = {words[k]:k for k in words}
id_to_labels = {labels[k]:k for k in labels}

Now we can print the words and corresponding labels simply by looking up the value of a numeric index of each word, for e.g.:

In [306]:
# printing a few randomly chosen sentences and the corresponding labels (tagged entities)
for i in random.sample(range(len(train_x)), 20):
    w = list(map(lambda x: id_to_words[x], train_x[i]))
    l = list(map(lambda x: id_to_labels[x], train_label[i]))
    print(list(zip(w, l)))
    print('\n')

[('tell', 'O'), ('me', 'O'), ('about', 'O'), ('twa', 'B-airline_code'), ('flight', 'O'), ('DIGITDIGITDIGIT', 'B-flight_number')]


[('i', 'O'), ("'d", 'O'), ('like', 'O'), ('to', 'O'), ('fly', 'O'), ('from', 'O'), ('denver', 'B-fromloc.city_name'), ('to', 'O'), ('atlanta', 'B-toloc.city_name'), ('with', 'O'), ('a', 'O'), ('stop', 'O'), ('in', 'O'), ('pittsburgh', 'B-stoploc.city_name')]


[('what', 'O'), ('is', 'O'), ('restriction', 'O'), ('ap57', 'B-restriction_code')]


[('show', 'O'), ('me', 'O'), ('all', 'O'), ('flights', 'O'), ('from', 'O'), ('montreal', 'B-fromloc.city_name'), ('to', 'O'), ('nashville', 'B-toloc.city_name')]


[('how', 'O'), ('do', 'O'), ('you', 'O'), ('get', 'O'), ('from', 'O'), ('the', 'O'), ('airport', 'O'), ('to', 'O'), ('downtown', 'O'), ('dallas', 'B-toloc.city_name'), ('please', 'O')]


[('i', 'O'), ('would', 'O'), ('like', 'O'), ('to', 'O'), ('fly', 'O'), ('from', 'O'), ('dallas', 'B-fromloc.city_name'), ('to', 'O'), ('san', 'B-toloc.city_name'), ('franci

Let's write a function which takes in an index and returns the corresponding query with its labels.

In [307]:
def print_query(index):
    w = list(map(lambda x: id_to_words[x], train_x[index]))
    l = list(map(lambda x: id_to_labels[x], train_label[index]))
    s = list(zip(w, l))
    return s

In [308]:
print_query(3925)

[('on', 'O'),
 ('<UNK>', 'B-airline_name'),
 ('air', 'I-airline_name'),
 ('how', 'O'),
 ('many', 'O'),
 ('flights', 'O'),
 ('leaving', 'O'),
 ('oakland', 'B-fromloc.city_name'),
 ('on', 'O'),
 ('july', 'B-depart_date.month_name'),
 ('twenty', 'B-depart_date.day_number'),
 ('seventh', 'I-depart_date.day_number'),
 ('to', 'O'),
 ('boston', 'B-toloc.city_name'),
 ('nonstop', 'B-flight_stop')]

Also, some queries specify stopover cities, such as this.

In [309]:
print_query(3443)

[('is', 'O'),
 ('there', 'O'),
 ('a', 'O'),
 ('flight', 'O'),
 ('between', 'O'),
 ('oakland', 'B-fromloc.city_name'),
 ('and', 'O'),
 ('boston', 'B-toloc.city_name'),
 ('with', 'O'),
 ('a', 'O'),
 ('stopover', 'O'),
 ('in', 'O'),
 ('dallas', 'B-stoploc.city_name'),
 ('fort', 'I-stoploc.city_name'),
 ('worth', 'I-stoploc.city_name'),
 ('on', 'O'),
 ('twa', 'B-airline_code')]

We can see that in this dataset, queries are far more complex (in terms of number of labels, variety in the sentence structures etc.) and thus we cannot  write simple hand-written rules to extract chunks such as to_from_city, types_of_meals etc. 

Thus, we need to train probabilistic models such as CRFs, HMMs etc. to tag each word with its corresponding entity label.

We'll use the training and validation sets ```train_x``` and ```valid_x``` as to tune the model, and finaly use test set to measure the performance.

## Models for NER

Let's experiment with a few different models for labelling words with named entities.


In [310]:
# POS tagging sentences
# takes in a list of sentences and returns a list of POS-tagged sentences
# in the form (word, tag)

def pos_tag(sent_list):
    pos_tags = []    
    for sent in sent_list:
        tagged_words = nltk.pos_tag([id_to_words[val] for val in sent])
        pos_tags.append(tagged_words)
    return pos_tags

train_pos = pos_tag(train_x)
valid_pos = pos_tag(val_x)

In [311]:
# looking at tags of some randomly chosen queries
# notice that most cities after 'TO' are tagged as VB
i = random.randrange(len(train_pos))
train_pos[i]

[('okay', 'NN'),
 ('could', 'MD'),
 ('you', 'PRP'),
 ('get', 'VB'),
 ('me', 'PRP'),
 ('a', 'DT'),
 ('round', 'NN'),
 ('trip', 'NN'),
 ('ticket', 'NN'),
 ('from', 'IN'),
 ('indianapolis', 'NN'),
 ('to', 'TO'),
 ('kansas', 'VB'),
 ('city', 'NN')]

To train a model, we need the entity labels of each word along with the POS tags, for e.g. in this format:
```[('New', 'NNP', u'B-GPE'), ('York', 'NNP', u'I-GPE'), ('is', 'VBZ', u'O'), ('my', 'PRP$', u'O'), ('favorite', 'JJ', u'O'), ('city', 'NN', u'O')]```

Let's convert the training and validation sentences to this form. 

In [312]:
# converting each word in train sentences to 3-tuples 
# of the form (word, tag, IOB_tag)

train_labels = []
for sent in list(zip(train_pos, train_label)):
    pos = sent[0]
    labels = sent[1]
    l = list(zip(pos, labels))
    tuple_3 = [(i[0][0], i[0][1], id_to_labels[i[1]]) for i in l]
    train_labels.append(tuple_3)

In [313]:
# some sample training sentences
train_labels[random.randrange(len(train_labels))]

[('may', 'MD', 'O'),
 ('i', 'VB', 'O'),
 ('fly', 'NN', 'O'),
 ('from', 'IN', 'O'),
 ('san', 'JJ', 'B-fromloc.city_name'),
 ('francisco', 'NN', 'I-fromloc.city_name'),
 ('to', 'TO', 'O'),
 ('baltimore', 'VB', 'B-toloc.city_name')]

In [314]:
# doing the same for validation and test data
valid_labels = []
for sent in list(zip(valid_pos, val_label)):
    pos = sent[0]
    labels = sent[1]
    l = list(zip(pos, labels))
    tuple_3 = [(i[0][0], i[0][1], id_to_labels[i[1]]) for i in l]
    valid_labels.append(tuple_3)

### Converting to Tree Format

Let's now convert the sentences into a tree format, which is needed by NLTK to train taggers.

In [315]:
from nltk.corpus import conll2000
from nltk import conlltags2tree, tree2conlltags

# converting a sample sentence to a tree
tree = conlltags2tree(train_labels[2])
print(tree)

(S
  i/NNS
  need/VBP
  a/DT
  flight/NN
  from/IN
  (fromloc.city_name atlanta/NN)
  to/TO
  (toloc.city_name philadelphia/VB)
  and/CC
  i/VB
  'm/VBP
  looking/VBG
  for/IN
  the/DT
  (cost_relative cheapest/JJS)
  fare/NN)


Let's now convert all training sentences to trees.

In [316]:
# converting training and validation data to tree format
train_trees = [conlltags2tree(sent) for sent in train_labels]
valid_trees = [conlltags2tree(sent) for sent in valid_labels]

In [317]:
# print some sample training trees
print(train_trees[random.randrange(len(train_trees))])

(S
  show/VB
  me/PRP
  the/DT
  (flight_mod last/JJ)
  flight/NN
  from/IN
  (fromloc.city_name denver/NN)
  to/TO
  (toloc.city_name boston/VB))


Let's now try building some parsers. 

### Regex Based Parsers

Let's start with a dummy parser - one which tags every token as an 'O'.

In [318]:
# a dummy chunk parser - tags every word as 'O'
cp = nltk.RegexpParser(r'')
print(cp.evaluate(valid_trees))

ChunkParse score:
    IOB Accuracy:  63.2%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


The above results tell us that about 63% of the tokens are tagged as 'O', i.e. they are not a named entity of any type. The precision, recall etc. are zero because we did not find any chunks at all.

### Unigram Chunker

Let's now try a unigram chunker.

In [319]:
# unigram chunker

from nltk import ChunkParserI

class UnigramChunker(ChunkParserI):    
    def __init__(self, train_sents):
        # convert train sents from tree format to tags
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)] 
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)
        
    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        
        # convert to tree again
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
        

In [320]:
# unigram chunker 
unigram_chunker = UnigramChunker(train_trees)
print(unigram_chunker.evaluate(valid_trees))

ChunkParse score:
    IOB Accuracy:  66.0%%
    Precision:     37.4%%
    Recall:        19.3%%
    F-Measure:     25.5%%


The accuracy, precision and recall have of course improved compared to the previous dummy parser. Let's also look at what the unigram parser has learnt.

In [321]:
# printing the most likely IOB tags for each POS tag

# extract the list of pos tags
postags = sorted(set([pos for sent in train_trees for (word, pos) in sent.leaves()]))

# for each tag, assign the most likely IOB label
print(unigram_chunker.tagger.tag(postags))

[('CC', 'O'), ('CD', 'B-round_trip'), ('DT', 'O'), ('EX', 'O'), ('FW', 'B-fromloc.city_name'), ('IN', 'O'), ('JJ', 'O'), ('JJR', 'B-cost_relative'), ('JJS', 'B-cost_relative'), ('MD', 'O'), ('NN', 'O'), ('NNP', 'B-depart_time.time'), ('NNS', 'O'), ('PDT', 'O'), ('POS', 'O'), ('PRP', 'O'), ('PRP$', 'O'), ('RB', 'O'), ('RBR', 'B-cost_relative'), ('RBS', 'B-cost_relative'), ('RP', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'B-toloc.city_name'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'O'), ('WP', 'O'), ('WRB', 'O')]


The unigram tagger has learnt that most pos tags are indeed an 'O', i.e. don't form an entity. Some interesting patterns it has learnt are:
- JJR, JJS (relative adjectives), are most likely B-cost_relative (e.g. cheapest, cheaper)
- NNP is most likely to be B-depart_time.time

### Bigram Chunker

Let's try a bigram chunker as well - we just need to change the ```UnigramTagger``` to ```BigramTagger```.

In [322]:
# bigram tagger

class BigramChunker(ChunkParserI):    
    def __init__(self, train_sents):
        # convert train sents from tree format to tags
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)] 
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)
        
    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        
        # convert to tree again
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
        

In [323]:
# unigram chunker 
bigram_chunker = BigramChunker(train_trees)
print(bigram_chunker.evaluate(valid_trees))

ChunkParse score:
    IOB Accuracy:  70.9%%
    Precision:     45.8%%
    Recall:        40.5%%
    F-Measure:     43.0%%


The metrics have improved significantly from unigram to bigram.

## Classifier Based Chunkers



In [324]:
class ConsecutiveNPChunkTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history) 
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI): 
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

In [325]:
# extracts features for a given word i in a given sentence 
# history refers to the previous POS tags in the sentence
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    
    # the first word has both previous word and previous tag undefined
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]

    # gazetteer lookup features (see section below)
    gazetteer = gazetteer_lookup(word)

    return {"pos": pos, "prevpos": prevpos, 'word':word,
           'word_is_city': gazetteer[0],
           'word_is_state': gazetteer[1],
           'word_is_county': gazetteer[2]}

In [326]:
# example features for a given sentence
sent_pos = train_pos[0]
sent_pos

[('what', 'WP'),
 ('aircraft', 'NN'),
 ('is', 'VBZ'),
 ('used', 'VBN'),
 ('on', 'IN'),
 ('delta', 'JJ'),
 ('flight', 'NN'),
 ('DIGITDIGITDIGITDIGIT', 'NNP'),
 ('from', 'IN'),
 ('kansas', 'NNP'),
 ('city', 'NN'),
 ('to', 'TO'),
 ('salt', 'VB'),
 ('lake', 'JJ'),
 ('city', 'NN')]

In [327]:
# example features for a sentence
for i in range(len(sent_pos)):
    print(npchunk_features(sent_pos, i, history=[]))
    print(' ')

{'pos': 'WP', 'prevpos': '<START>', 'word': 'what', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos': 'NN', 'prevpos': 'WP', 'word': 'aircraft', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos': 'VBZ', 'prevpos': 'NN', 'word': 'is', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos': 'VBN', 'prevpos': 'VBZ', 'word': 'used', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos': 'IN', 'prevpos': 'VBN', 'word': 'on', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos': 'JJ', 'prevpos': 'IN', 'word': 'delta', 'word_is_city': True, 'word_is_state': False, 'word_is_county': True}
 
{'pos': 'NN', 'prevpos': 'JJ', 'word': 'flight', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos': 'NNP', 'prevpos': 'NN', 'word': 'DIGITDIGITDIGITDIGIT', 'word_is_city': False, 'word_is_state': False, 'word_is_county': False}
 
{'pos

In [328]:
# training the chunker 
chunker = ConsecutiveNPChunker(train_trees)

In [329]:
# evaluate the chunker
print(chunker.evaluate(valid_trees))

ChunkParse score:
    IOB Accuracy:  91.6%%
    Precision:     75.6%%
    Recall:        82.3%%
    F-Measure:     78.8%%


The results have improved significantly compared to the basic unigram/bigram chunkers, and they may improve further if we create better features.

For example, if the word is 'DIGIT' (numbers are labelled as 'DIGIT' in this dataset), we can have a feature which indicates that (see example below). In this dataset, 4-digit numbers are encoded as 'DIGITDIGITDIGITDIGIT'.

In [330]:
# example of 'DIGITDIGIT'
train_pos[1326]

[('do', 'VBP'),
 ('you', 'PRP'),
 ('have', 'VB'),
 ('an', 'DT'),
 ('DIGITDIGITDIGIT', 'NNP'),
 ('flight', 'NN'),
 ('from', 'IN'),
 ('denver', 'NN'),
 ('to', 'TO'),
 ('san', 'VB'),
 ('francisco', 'NN')]

Let's add some of these features and see if the performance improves.

In [331]:
# extracts features for a given word i in a given sentence 
# history refers to the previous POS tags in the sentence
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    
    # the first word has both previous word and previous tag undefined
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
        
    if i == len(sentence)-1:
        nextword, nextpos = '<END>', '<END>'
    else:
        nextword, nextpos = sentence[i+1]

    # gazetteer lookup features (see section below)
    gazetteer = gazetteer_lookup(word)

    # adding word_is_digit feature (boolean)
    return {"pos": pos, "prevpos": prevpos, 'word':word, 
           'word_is_city': gazetteer[0],
           'word_is_state': gazetteer[1],
           'word_is_county': gazetteer[2],
           'word_is_digit': word in 'DIGITDIGITDIGIT', 
           'nextword': nextword, 
           'nextpos': nextpos}

In [332]:
# train and evaluate the chunker 
chunker = ConsecutiveNPChunker(train_trees)
print(chunker.evaluate(valid_trees))

ChunkParse score:
    IOB Accuracy:  91.6%%
    Precision:     75.6%%
    Recall:        84.7%%
    F-Measure:     79.9%%


In [333]:

# ChunkParse score:
#     IOB Accuracy:  92.7%%
#     Precision:     78.1%%
#     Recall:        84.9%%
#     F-Measure:     81.4%%

# ChunkParse score:
#     IOB Accuracy:  91.7%%
#     Precision:     75.5%%
#     Recall:        82.0%%
#     F-Measure:     78.6%%

### Using a Gazetteer to Lookup Cities and States

URL: https://raw.githubusercontent.com/grammakov/USA-cities-and-states/master/us_cities_states_counties.csv

In [334]:
# reading a file containing list of US cities, states and counties
us_cities = pd.read_csv("us_cities_states_counties.csv", sep="|")
us_cities.head()


Unnamed: 0,City,State short,State full,County,City alias
0,Holtsville,NY,New York,SUFFOLK,Internal Revenue Service
1,Holtsville,NY,New York,SUFFOLK,Holtsville
2,Adjuntas,PR,Puerto Rico,ADJUNTAS,URB San Joaquin
3,Adjuntas,PR,Puerto Rico,ADJUNTAS,Jard De Adjuntas
4,Adjuntas,PR,Puerto Rico,ADJUNTAS,Colinas Del Gigante


In [335]:
# storing cities, states and counties as sets
cities = set(us_cities['City'].str.lower())
states = set(us_cities['State full'].str.lower())
counties = set(us_cities['County'].str.lower())

In [336]:
print(len(cities))
print(len(states))
print(len(counties))

18854
62
1932


In [337]:
# define a function to look up a given word in cities, states, county
def gazetteer_lookup(word):
    return (word in cities, word in states, word in counties)

In [338]:
# sample lookups
print(gazetteer_lookup('washington'))
print(gazetteer_lookup('utah'))
print(gazetteer_lookup('philadelphia'))


(True, True, True)
(False, True, True)
(True, False, True)


### CRF Based Taggers


In [339]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

print(sklearn.__version__)

0.19.1


In [340]:
# structure of train/validation data
train_labels[0]

[('what', 'WP', 'O'),
 ('aircraft', 'NN', 'O'),
 ('is', 'VBZ', 'O'),
 ('used', 'VBN', 'O'),
 ('on', 'IN', 'O'),
 ('delta', 'JJ', 'B-airline_name'),
 ('flight', 'NN', 'O'),
 ('DIGITDIGITDIGITDIGIT', 'NNP', 'B-flight_number'),
 ('from', 'IN', 'O'),
 ('kansas', 'NNP', 'B-fromloc.city_name'),
 ('city', 'NN', 'I-fromloc.city_name'),
 ('to', 'TO', 'O'),
 ('salt', 'VB', 'B-toloc.city_name'),
 ('lake', 'JJ', 'I-toloc.city_name'),
 ('city', 'NN', 'I-toloc.city_name')]

Let's define a function to extract features from a given sentence. This is similar to the ```npchunk_features()``` function defined above, but we'll add some new features as well such as the suffix of the word (upto the last 4 characters), prefix (upto first 4 characters) etc.

The list of features we'll extract is as follows:
```
{
            'word':word,
            'pos': pos, 
            'prevword': prevword,
            'prevpos': prevpos,  
            'nextword': nextword, 
            'nextpos': nextpos,
            'word_is_city': gazetteer[0],
            'word_is_state': gazetteer[1],
            'word_is_county': gazetteer[2],
            'word_is_digit': word in 'DIGITDIGITDIGIT',
            'suff_1': suff_1,  
            'suff_2': suff_2,  
            'suff_3': suff_3,  
            'suff_4': suff_4, 
            'pref_1': pref_1,  
            'pref_2': pref_2,  
            'pref_3': pref_3, 
            'pref_4': pref_4 

}
```



In [341]:
## other features to consider

# airline code
# airline name
# day name (monday/tuesday etc.) i=1847, 2769
# o'clock (word shape): i=379

# i=random.randrange(len(train_labels))
# train_labels[i]

In [342]:
# extract features from a given sentence
def word_features(sent, i):
    word = sent[i][0]
    pos = sent[i][1]
    
    # first word
    if i==0:
        prevword = '<START>'
        prevpos = '<START>'
    else:
        prevword = sent[i-1][0]
        prevpos = sent[i-1][1]
    
    # last word
    if i == len(sent)-1:
        nextword = '<END>'
        nextpos = '<END>'
    else:
        nextword = sent[i+1][0]
        nextpos = sent[i+1][1]
    
    # word is in gazetteer
    gazetteer = gazetteer_lookup(word)
    
    # suffixes and prefixes
    suff_1, suff_2, suff_3, suff_4 = word[:1], word[:2], word[:3], word[:4]
    pref_1, pref_2, pref_3, pref_4 = word[-1:], word[-2:], word[-3:], word[-4:]
    
    return {'word':word,
            'pos': pos, 
            'prevword': prevword,
            'prevpos': prevpos,  
            'nextword': nextword, 
            'nextpos': nextpos,
            'word_is_city': gazetteer[0],
            'word_is_state': gazetteer[1],
            'word_is_county': gazetteer[2],
            'word_is_digit': word in 'DIGITDIGITDIGIT',
            'suff_1': suff_1,  
            'suff_2': suff_2,  
            'suff_3': suff_3,  
            'suff_4': suff_4, 
            'pref_1': pref_1,  
            'pref_2': pref_2,  
            'pref_3': pref_3, 
            'pref_4': pref_4 }  

In [343]:
# example features
word_features(train_labels[0], i=3)

{'nextpos': 'IN',
 'nextword': 'on',
 'pos': 'VBN',
 'pref_1': 'd',
 'pref_2': 'ed',
 'pref_3': 'sed',
 'pref_4': 'used',
 'prevpos': 'VBZ',
 'prevword': 'is',
 'suff_1': 'u',
 'suff_2': 'us',
 'suff_3': 'use',
 'suff_4': 'used',
 'word': 'used',
 'word_is_city': False,
 'word_is_county': False,
 'word_is_digit': False,
 'word_is_state': False}

In [344]:
# defining a few more functions to extract featrues, labels, words from sentences

def sent2features(sent):
    return [word_features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]    

In [345]:
# create training and validation sets
X_train = [sent2features(s) for s in train_labels]
y_train = [sent2labels(s) for s in train_labels]

X_valid = [sent2features(s) for s in valid_labels]
y_valid = [sent2labels(s) for s in valid_labels]

In [346]:
# X_train is a list of sentences within which each feature has a corresponding dict of features
# first sentence in X_train
X_train[0]

[{'nextpos': 'NN',
  'nextword': 'aircraft',
  'pos': 'WP',
  'pref_1': 't',
  'pref_2': 'at',
  'pref_3': 'hat',
  'pref_4': 'what',
  'prevpos': '<START>',
  'prevword': '<START>',
  'suff_1': 'w',
  'suff_2': 'wh',
  'suff_3': 'wha',
  'suff_4': 'what',
  'word': 'what',
  'word_is_city': False,
  'word_is_county': False,
  'word_is_digit': False,
  'word_is_state': False},
 {'nextpos': 'VBZ',
  'nextword': 'is',
  'pos': 'NN',
  'pref_1': 't',
  'pref_2': 'ft',
  'pref_3': 'aft',
  'pref_4': 'raft',
  'prevpos': 'WP',
  'prevword': 'what',
  'suff_1': 'a',
  'suff_2': 'ai',
  'suff_3': 'air',
  'suff_4': 'airc',
  'word': 'aircraft',
  'word_is_city': False,
  'word_is_county': False,
  'word_is_digit': False,
  'word_is_state': False},
 {'nextpos': 'VBN',
  'nextword': 'used',
  'pos': 'VBZ',
  'pref_1': 's',
  'pref_2': 'is',
  'pref_3': 'is',
  'pref_4': 'is',
  'prevpos': 'NN',
  'prevword': 'aircraft',
  'suff_1': 'i',
  'suff_2': 'is',
  'suff_3': 'is',
  'suff_4': 'is',
  'w

### Training the Model

In [347]:
# instantiate a CRF trainer from pycrfsuite
trainer = pycrfsuite.Trainer(verbose=False)

# zip the X and y sets
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [348]:
# Set training parameters - using L-BFGS training algorithm (default) with Elastic Net (L1 + L2) regularization.
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [349]:
# list of possible params
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [350]:
# saving the trained model to a file
trainer.train('atis.crfsuite')

### Make Predictions

In [351]:
# create a tagger object and open the trained file
tagger = pycrfsuite.Tagger()
tagger.open('atis.crfsuite')

<contextlib.closing at 0x11c5aa30>

In [352]:
# tagging a sample sentence
sample_sent = valid_labels[0]
print(' '.join(sent2tokens(sample_sent)), end='\n')

what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco


In [353]:
print("Predicted:", ' '.join(tagger.tag(sent2features(sample_sent))))
print('\n')
print("Correct:  ", ' '.join(sent2labels(sample_sent)))

Predicted: O O O B-fromloc.city_name O O O O O B-depart_time.period_of_day O O O B-toloc.city_name I-toloc.city_name


Correct:   O O O B-fromloc.city_name O B-depart_time.time_relative B-depart_time.time O O B-depart_time.period_of_day O O O B-toloc.city_name I-toloc.city_name


### Evaluating the Model

In [354]:
def iob_classification_report(y_true, y_pred):
    """
    Classification report for a list of IOB-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    # note that we are not including 'O' as a class
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [355]:
y_pred = [tagger.tag(xseq) for xseq in X_valid]

# predictions for first sentence
y_pred[0]

['O',
 'O',
 'O',
 'B-fromloc.city_name',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-depart_time.period_of_day',
 'O',
 'O',
 'O',
 'B-toloc.city_name',
 'I-toloc.city_name']

Let's now evaluate the model. Since we are dealing with a multiclass classification problem, we can use sklearn's ```LabelBinarizer``` to binarize labels in a one-versus-all manner.  

Read about LabelBinarizer here: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html.

In [356]:
print(iob_classification_report(y_valid, y_pred))

                              precision    recall  f1-score   support

             B-aircraft_code       1.00      0.44      0.62         9
              B-airline_code       0.83      0.79      0.81        19
              B-airline_name       1.00      0.98      0.99       141
              I-airline_name       0.99      1.00      0.99        79
              B-airport_code       0.90      1.00      0.95         9
              B-airport_name       0.71      0.56      0.63         9
              I-airport_name       0.71      0.45      0.56        11
 B-arrive_date.date_relative       0.00      0.00      0.00         1
      B-arrive_date.day_name       0.40      0.11      0.17        18
    B-arrive_date.day_number       0.33      0.40      0.36         5
    I-arrive_date.day_number       0.33      1.00      0.50         1
    B-arrive_date.month_name       0.33      0.40      0.36         5
      B-arrive_time.end_time       0.00      0.00      0.00         1
      I-arrive_time

  'precision', 'predicted', average, warn_for)
