In [23]:
# import libraries
import numpy as np
import pandas as pd
import nltk, pprint
import matplotlib.pyplot as plt
import random

import gzip, os, pickle # gzip for reading the gz files, pickle to save/dump trained model 
import _pickle as cPickle

import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

from nltk.corpus import conll2000
from nltk import conlltags2tree, tree2conlltags, ChunkParserI

from itertools import chain
from collections import Counter

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read the first part of the dataset
# each part (.gz file) contains train, validation and test sets, plus a dict
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a source you trust.

filename = './data/atis.fold0.pkl.gz'

# the context manager closes the file even when unpickling raises
with gzip.open(filename, 'rb') as f:
    try:
        # encoding='latin1' is what Python 3 needs to decode byte strings
        # stored in a Python 2 pickle (presumably how this file was written)
        train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')
    except TypeError:
        # an interpreter whose pickle.load() has no `encoding` keyword:
        # rewind so the retry starts at the beginning of the stream
        f.seek(0)
        train_set, valid_set, test_set, dicts = pickle.load(f)


In [3]:
# each split is unpacked as a 3-tuple; only the first and last elements are kept:
#   *_x     -> sentences as sequences of word ids (decoded later via id_to_words)
#   *_label -> the matching sequences of label ids (decoded later via id_to_labels)
# '_' is the conventional Python name for a value that is deliberately ignored
train_x, _, train_label = train_set
val_x, _, val_label = valid_set
test_x, _, test_label = test_set

In [4]:
# pull out the only two lookup tables this notebook needs from `dicts`:
# word -> index and label -> index
words, labels = dicts['words2idx'], dicts['labels2idx']

In [5]:
# invert both lookup tables so that ids can be decoded back to strings:
# index -> word and index -> label
id_to_words = {idx: word for word, idx in words.items()}
id_to_labels = {idx: label for label, idx in labels.items()}

In [6]:
# POS tagging sentences
# takes in a list of sentences and returns a list of POS-tagged sentences
# in the form (word, tag)
def pos_tag(sent_list):
    """POS-tag every sentence in ``sent_list``.

    Each sentence is an iterable of word ids. The ids are decoded with the
    module-level ``id_to_words`` mapping and the decoded words are handed to
    ``nltk.pos_tag``.

    Returns a list with one entry per input sentence, each entry being the
    result of ``nltk.pos_tag`` for that sentence (displayed in this notebook
    as a list of (word, tag) pairs).
    """
    return [
        nltk.pos_tag([id_to_words[word_id] for word_id in sentence])
        for sentence in sent_list
    ]

In [7]:
# POS-tag the train, validation and test sentences
# (pos_tag decodes the word ids to strings before tagging)
train_pos = pos_tag(train_x)
valid_pos = pos_tag(val_x)
test_pos = pos_tag(test_x)

In [8]:
# function to create (word, pos_tag, iob_label) tuples for a given dataset
def create_word_pos_label(pos_tagged_data, labels, label_lookup=None):
    """Attach IOB label strings to POS-tagged sentences.

    Parameters
    ----------
    pos_tagged_data : iterable of sentences, each a sequence of (word, pos) items
    labels : iterable of per-sentence label-id sequences, parallel to
        ``pos_tagged_data``
    label_lookup : optional mapping from label id to label string; when
        omitted, the module-level ``id_to_labels`` is used

    Returns
    -------
    list of sentences, each a list of (word, pos, label_string) tuples.

    Notes
    -----
    Sentences and tokens are paired with ``zip``, so any surplus items on the
    longer side are silently dropped. A label id missing from the lookup
    raises ``KeyError``.
    """
    if label_lookup is None:
        label_lookup = id_to_labels

    iob_labels = []
    # distinct loop names: the `labels` argument is no longer rebound per sentence
    for tagged_sent, sent_label_ids in zip(pos_tagged_data, labels):
        iob_labels.append([
            (word_pos[0], word_pos[1], label_lookup[label_id])
            for word_pos, label_id in zip(tagged_sent, sent_label_ids)
        ])
    return iob_labels

In [9]:
# build (word, pos, label) tuples for the training set
# and display two sample queries (indices 4 and 5)
train_labels = create_word_pos_label(train_pos, train_label)
train_labels[4:6]

[[('show', 'VB', 'O'),
  ('me', 'PRP', 'O'),
  ('the', 'DT', 'O'),
  ('cheapest', 'JJS', 'B-cost_relative'),
  ('round', 'NN', 'B-round_trip'),
  ('trips', 'NNS', 'I-round_trip'),
  ('from', 'IN', 'O'),
  ('dallas', 'NN', 'B-fromloc.city_name'),
  ('to', 'TO', 'O'),
  ('baltimore', 'VB', 'B-toloc.city_name')],
 [('i', 'JJ', 'O'),
  ("'d", 'MD', 'O'),
  ('like', 'VB', 'O'),
  ('to', 'TO', 'O'),
  ('see', 'VB', 'O'),
  ('all', 'DT', 'O'),
  ('flights', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('denver', 'NN', 'B-fromloc.city_name'),
  ('to', 'TO', 'O'),
  ('philadelphia', 'VB', 'B-toloc.city_name')]]

In [10]:
# build the same (word, pos, label) representation for the validation and test sets
valid_labels = create_word_pos_label(valid_pos, val_label)
test_labels = create_word_pos_label(test_pos, test_label)

In [11]:
# converting one training sentence (index 3) to tree format
# the sentence is passed as a list of (word, pos, iob_label) tuples
tree = conlltags2tree(train_labels[3])
print(tree)

(S
  show/VB
  me/PRP
  the/DT
  (airline_name us/PRP air/NN)
  flights/NNS
  from/IN
  (fromloc.city_name atlanta/NN)
  to/TO
  (toloc.city_name boston/VB))


In [12]:
# converting training, validation and test datasets to tree format (true labels)
# NOTE: valid_trees and test_trees are reassigned further down with trees built
# from the *predicted* labels, so these versions only exist until those cells run
train_trees = [conlltags2tree(sent) for sent in train_labels]
valid_trees = [conlltags2tree(sent) for sent in valid_labels]
test_trees = [conlltags2tree(sent) for sent in test_labels]

In [13]:
# reading a pipe-delimited file containing a list of US cities, states and counties
us_cities = pd.read_csv("./data/us_cities_states_counties.csv", sep="|")
us_cities.head()

Unnamed: 0,City,State short,State full,County,City alias
0,Holtsville,NY,New York,SUFFOLK,Internal Revenue Service
1,Holtsville,NY,New York,SUFFOLK,Holtsville
2,Adjuntas,PR,Puerto Rico,ADJUNTAS,URB San Joaquin
3,Adjuntas,PR,Puerto Rico,ADJUNTAS,Jard De Adjuntas
4,Adjuntas,PR,Puerto Rico,ADJUNTAS,Colinas Del Gigante


In [14]:
# lower-cased lookup sets of city names, full state names and county names
cities, states, counties = (
    set(us_cities[column].str.lower())
    for column in ('City', 'State full', 'County')
)

In [15]:
# define a function to look up a given word in cities, states, county
def gazetteer_lookup(word):
    """Check ``word`` against the module-level gazetteer sets.

    Returns a 3-tuple of booleans ``(is_city, is_state, is_county)``; each is
    an exact-membership test against ``cities``, ``states`` and ``counties``.
    """
    is_city = word in cities
    is_state = word in states
    is_county = word in counties
    return (is_city, is_state, is_county)

In [16]:
# extract features from a given sentence
def word_features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items hold the word at index 0 and the POS
    tag at index 1 (any further items, such as the label, are ignored).

    The returned dict contains the word and its POS tag, the previous and
    next word/POS ('<START>' / '<END>' sentinels at the sentence edges), the
    three gazetteer flags from ``gazetteer_lookup``, a digit-placeholder flag,
    and the word's suffixes and prefixes of length 1 to 4.
    """
    word, pos = sent[i][0], sent[i][1]

    # left neighbour, or a sentinel for the first token
    if i == 0:
        prevword = prevpos = '<START>'
    else:
        previous = sent[i - 1]
        prevword, prevpos = previous[0], previous[1]

    # right neighbour, or a sentinel for the last token
    if i == len(sent) - 1:
        nextword = nextpos = '<END>'
    else:
        following = sent[i + 1]
        nextword, nextpos = following[0], following[1]

    is_city, is_state, is_county = gazetteer_lookup(word)

    # leading / trailing character n-grams, n = 1..4 (shorter words just repeat)
    prefixes = [word[:n] for n in (1, 2, 3, 4)]
    suffixes = [word[-n:] for n in (1, 2, 3, 4)]

    return {
        'word': word,
        'pos': pos,
        'prevword': prevword,
        'prevpos': prevpos,
        'nextword': nextword,
        'nextpos': nextpos,
        'word_is_city': is_city,
        'word_is_state': is_state,
        'word_is_county': is_county,
        # NOTE(review): `in` on a string is a substring test, so this is True for
        # 'DIGIT', 'DIGITDIGIT', 'DIGITDIGITDIGIT' (and any other substring of the
        # literal, including '') but False for 'DIGITDIGITDIGITDIGIT'. Left
        # unchanged because the pickled model loaded below was presumably trained
        # with exactly this feature -- confirm and retrain before fixing.
        'word_is_digit': word in 'DIGITDIGITDIGIT',
        'suff_1': suffixes[0],
        'suff_2': suffixes[1],
        'suff_3': suffixes[2],
        'suff_4': suffixes[3],
        'pref_1': prefixes[0],
        'pref_2': prefixes[1],
        'pref_3': prefixes[2],
        'pref_4': prefixes[3],
    }

In [17]:
# defining a few more functions to extract features, labels, words from sentences

def sent2features(sent):
    """Return one ``word_features`` dict per token of ``sent``, in order."""
    features = []
    for position in range(len(sent)):
        features.append(word_features(sent, position))
    return features

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in ``sent``."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in ``sent``."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [18]:
 # create training, validation and test sets
# per split: X holds one list of feature dicts per sentence,
# y holds the matching list of label strings
X_train = list(map(sent2features, train_labels))
y_train = list(map(sent2labels, train_labels))

X_valid = list(map(sent2features, valid_labels))
y_valid = list(map(sent2labels, valid_labels))

X_test = list(map(sent2features, test_labels))
y_test = list(map(sent2labels, test_labels))

In [19]:
# load the trained model
# NOTE(review): unpickling can execute code embedded in the file -- only load
# a model file you produced yourself or otherwise trust
with open('tuned_crf_classifier.pkl', 'rb') as fid:
    crf = cPickle.load(fid)

### Making Predictions 

Let's now use the trained model to make predictions. 

In [20]:
# all labels known to the trained CRF; 'O' is NOT removed here, so the
# weighted F1 below also counts the 'O' class
# (this rebinds `labels`, which earlier held the labels2idx dict)
labels =list(crf.classes_)


# make predictions on validation data
y_pred = crf.predict(X_valid)
metrics.flat_f1_score(y_valid, y_pred,
                      average='weighted', labels=labels)

0.9742297341770172

The overall f1-score is comparable to that on training (cross-validation) data (which was about 93% on the test folds and 97% on training folds). Let's look at class-wise metrics as well.

In [21]:
# class-wise scores on validation data
# sort on the label text after its first character, then on that first
# character, so the B-xxx and I-xxx rows of one entity sit next to each other
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_valid, y_pred, labels=sorted_labels, digits=3
))

                              precision    recall  f1-score   support

                           O      0.995     0.998     0.996      7198
             B-aircraft_code      1.000     1.000     1.000         3
              B-airline_code      1.000     0.963     0.981        27
              B-airline_name      1.000     0.993     0.996       139
              I-airline_name      1.000     0.975     0.987        80
              B-airport_code      0.800     0.800     0.800         5
              B-airport_name      0.500     0.429     0.462         7
              I-airport_name      0.667     0.444     0.533         9
 B-arrive_date.date_relative      0.000     0.000     0.000         1
      B-arrive_date.day_name      0.250     0.071     0.111        14
    B-arrive_date.day_number      0.600     0.353     0.444        17
    I-arrive_date.day_number      0.000     0.000     0.000         2
    B-arrive_date.month_name      0.600     0.353     0.444        17
      B-arrive_time

Let's also make predictions on the test set and see the results. 

In [22]:
# test data predictions
# make predictions on the test data and compute the weighted F1
y_pred_test = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred_test,
                      average='weighted', labels=labels)

0.9603958062939824

### Understanding the CRF Classifier

Let's now understand what the classifier has learnt. The attribute ```crf.transition_features_``` is a dict of key:value = (label-1, label-2):coef pairs - each key-val pair representing the transition coefficient from label-1 to label-2.

The ```Counter``` class from the collections module provides a convenient way of counting and printing the frequency of items (the top-N or bottom counts). Read the <a href="https://docs.python.org/2/library/collections.html">docs of Counter class here.</a>

In [24]:
# look up more attributes and methods of sklearn crf
# help(crf)
def print_transitions(trans_features):
    """Print one line per transition: from-label, to-label and weight.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs, such as the list returned by ``Counter.most_common``.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for transition, weight in trans_features:
        label_from, label_to = transition
        print(row_format % (label_from, label_to, weight))

In [25]:
# most_common(20) -> the 20 transitions with the largest weights
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

# most_common() is ordered from largest to smallest, so its last 20 entries
# are the transitions with the smallest weights
print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
B-fromloc.airport_name -> I-fromloc.airport_name 6.139828
B-airport_name -> I-airport_name 6.040046
B-depart_date.month_name -> B-depart_date.day_number 5.786839
B-airline_name -> I-airline_name 5.694494
B-arrive_date.month_name -> B-arrive_date.day_number 5.679481
B-city_name -> I-city_name 5.639162
B-toloc.airport_name -> I-toloc.airport_name 5.543314
B-fromloc.city_name -> I-fromloc.city_name 5.271730
B-flight_time -> I-flight_time 5.235649
B-stoploc.city_name -> I-stoploc.city_name 5.072771
B-toloc.city_name -> I-toloc.city_name 4.924069
I-fromloc.airport_name -> I-fromloc.airport_name 4.912550
I-depart_date.today_relative -> I-depart_date.today_relative 4.905395
B-depart_time.time_relative -> B-depart_time.time 4.724515
B-depart_date.day_number -> I-depart_date.day_number 4.704134
B-depart_time.time -> I-depart_time.time 4.514564
B-depart_time.end_time -> I-depart_time.end_time 4.491012
I-flight_time -> I-flight_time 4.480120
B-arrive_time.end_time -> I-arr

The results show that the transitions ```B-fromloc.airport_name -> I-fromloc.airport_name```, ```B-depart_date.month_name -> B-depart_date.day_number``` are very likely. You can also see the unlikely transitions.

We can also see the most useful (discriminative) features.

In [26]:
# important features
def print_state_features(state_features):
    """Print one line per state feature: weight, label and attribute.

    ``state_features`` is an iterable of ``((attr, label), weight)`` pairs,
    such as the list returned by ``Counter.most_common``.
    """
    row_format = "%0.6f %-8s %s"
    for feature, weight in state_features:
        attr, label = feature
        print(row_format % (weight, label, attr))

In [27]:
# most_common(30) -> the 30 state features with the largest weights
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

# the tail of the descending list -> the 30 state features with the smallest weights
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.297233 B-fromloc.city_name prevword:from
5.104957 B-arrive_time.time_relative prevword:arriving
5.059366 B-arrive_time.time_relative prevword:arrive
4.949039 B-toloc.state_name word_is_state
4.017096 B-depart_time.start_time prevword:between
3.841796 B-stoploc.city_name prevword:through
3.826135 B-fromloc.city_name prevword:leaving
3.819081 O        pos:DT
3.725986 I-depart_date.day_number prevword:twenty
3.703128 B-toloc.city_name prevword:into
3.588173 B-arrive_time.start_time prevword:between
3.566806 B-fromloc.airport_name prevword:from
3.506935 B-fromloc.city_name prevword:between
3.481669 B-fromloc.city_name prevword:leave
3.457800 B-stoploc.city_name prevword:via
3.419583 B-toloc.city_name prevword:for
3.369874 B-flight_number prevword:flight
3.310759 B-toloc.city_name prevword:downtown
3.298952 B-city_name prevword:serve
3.286938 B-toloc.city_name prevword:arriving
3.281605 B-flight_mod nextword:flight
3.273857 O        suff_1:e
3.221108 B-fromloc.city_name prev

Now that we have a trained tagger and have looked at the overall metrics, let's zoom in and look at the actual and predicted labels of some sample sentences.

In [28]:
# pick a sample sentence (the first validation sentence) and show its words
i = 0
sample_sent = valid_labels[i]
print(' '.join(sent2tokens(sample_sent)))

what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city


In [29]:
# compare the predicted and actual labels for the sample query chosen above
# index the predictions with `i` (not a literal 0) so that they always belong
# to the same sentence as `sample_sent = valid_labels[i]`
print("Predicted:", "\n",' '.join(y_pred[i]))
print('\n')
print("Correct:", "\n" ,' '.join(sent2labels(sample_sent)))

Predicted: 
 O O O O O B-airline_name O B-flight_number O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name I-toloc.city_name I-toloc.city_name


Correct: 
 O O O O O B-airline_name O B-flight_number O B-fromloc.city_name I-fromloc.city_name O B-toloc.city_name I-toloc.city_name I-toloc.city_name


### Converting Predicted Labels to Trees and Traversing Them

Let's now see how we can use the trained model to make predictions on unseen data and extract useful entities from it.

Once we predict the labels of a sentence, we want to extract useful entities from it. In the following code, we first append the predicted labels to the (word, pos) sentence to get tuples of (word, pos, predicted_label). Then we convert the list to tree format, as seen earlier, and traverse the tree to extract useful entities. 



In [30]:
# function to create (word, pos_tag, predicted_iob_label) tuples for a given dataset
def append_predicted_tags(pos_tagged_data, labels):
    """Attach predicted label strings to POS-tagged sentences.

    ``pos_tagged_data`` holds sentences of (word, pos) items and ``labels``
    holds the parallel per-sentence sequences of predicted labels. Returns a
    list of sentences, each a list of (word, pos, label) tuples. Pairing is
    done with ``zip``, so surplus items on the longer side are dropped.
    """
    return [
        [(word_pos[0], word_pos[1], tag) for word_pos, tag in zip(tagged_sent, sent_tags)]
        for tagged_sent, sent_tags in zip(pos_tagged_data, labels)
    ]

In [31]:
# combine the validation (word, pos) pairs with the predicted IOB tags
# and display the first validation sentence
valid_tags = append_predicted_tags(valid_pos, y_pred)
valid_tags[0]

[('what', 'WP', 'O'),
 ('aircraft', 'NN', 'O'),
 ('is', 'VBZ', 'O'),
 ('used', 'VBN', 'O'),
 ('on', 'IN', 'O'),
 ('delta', 'JJ', 'B-airline_name'),
 ('flight', 'NN', 'O'),
 ('DIGITDIGITDIGITDIGIT', 'NNP', 'B-flight_number'),
 ('from', 'IN', 'O'),
 ('kansas', 'NNP', 'B-fromloc.city_name'),
 ('city', 'NN', 'I-fromloc.city_name'),
 ('to', 'TO', 'O'),
 ('salt', 'VB', 'B-toloc.city_name'),
 ('lake', 'JJ', 'I-toloc.city_name'),
 ('city', 'NN', 'I-toloc.city_name')]

Now, we can convert the predicted sentence labels to a tree format and traverse the tree to extract useful entities.

In [32]:
# create a tree using the assigned (predicted) iob labels
# NOTE: this rebinds valid_trees, which earlier held the trees built from the
# true validation labels
valid_trees = [conlltags2tree(sent) for sent in valid_tags]
print(valid_trees[0])

(S
  what/WP
  aircraft/NN
  is/VBZ
  used/VBN
  on/IN
  (airline_name delta/JJ)
  flight/NN
  (flight_number DIGITDIGITDIGITDIGIT/NNP)
  from/IN
  (fromloc.city_name kansas/NNP city/NN)
  to/TO
  (toloc.city_name salt/VB lake/JJ city/NN))


One can similarly make predictions on the test dataset and store them in tree format.

In [33]:
# append test data predicted tags and convert them to trees
# NOTE: this rebinds test_trees, which earlier held the trees built from the
# true test labels
test_tags = append_predicted_tags(test_pos, y_pred_test)
test_trees = [conlltags2tree(sent) for sent in test_tags]
print(test_trees[0])

(S
  i/NN
  would/MD
  like/VB
  to/TO
  find/VB
  a/DT
  flight/NN
  from/IN
  (fromloc.city_name charlotte/NN)
  to/TO
  (toloc.city_name las/VB vegas/NN)
  that/WDT
  makes/VBZ
  a/DT
  stop/NN
  in/IN
  (stoploc.city_name st./NN louis/NN))


### Traversing a Chunked Tree

Now that we have labelled the validation and test datasets, let's see how we can traverse the trees. The following code shows a sample predicted validation query and extracts all the 'chunks' from the tree.

In NLTK, trees are stored as objects of the ```Tree``` class, and each chunk of a tree is itself a subtree of the ```Tree``` class. Below, we traverse a sample tree, check whether a given object in the tree is a ```Tree``` object (i.e. a chunk), and print the *label* and the *leaves* of the chunk subtree.

In [34]:
# choose a random predicted tree
# (no random.seed call is visible in this notebook, so the chosen index
# presumably differs from run to run)
i = random.randrange(len(valid_trees))

# print the string version of the chosen sentence
chunked_tree = valid_trees[i]
print(' '.join([id_to_words[val] for val in val_x[i]]), '\n')

# traverse the tree and print labels of subtrees 
# only direct children that are Tree objects (the chunks) are printed;
# all other children are skipped
for n in chunked_tree:
    if isinstance(n, nltk.tree.Tree):
        print(n.label(), n.leaves())

what are the flights from boston to san francisco 

fromloc.city_name [('boston', 'NN')]
toloc.city_name [('san', 'VB'), ('francisco', 'NN')]


In [35]:
# correctly parsed complex queries - i= 771, 410, 25, 473, 23, 498, 893, 882, 694
# ambiguous queries: not many so far
# display the current value of i (the index of the sentence examined last)
i

469