## Building an Application

Now that we can extract the chunks from any given user query, we need to decide which chunks we actually need for our application and extract only those. 

Firstly, note that although there is a variety of labels/chunk types, only a few occur in the majority of user queries. 

Secondly, while building an application to make flight reservations, you may be using an API/database to fetch the requested flight data, and the API/database may only provide certain types of information. For example, if the API does not provide any information about *meals* on flights, there is no use in extracting the entity ```meal_description```.

Let's look at both the variety and frequency of entities. 

In [25]:
# import libraries
import numpy as np
import pandas as pd
import nltk, pprint
import matplotlib.pyplot as plt
import random

import gzip, os, pickle # gzip for reading the gz files, pickle to save/dump trained model 
import _pickle as cPickle

import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers

from nltk.corpus import conll2000
from nltk import conlltags2tree, tree2conlltags, ChunkParserI

from itertools import chain
from collections import Counter

import requests, json

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read the first part of the dataset
# each part (.gz file) contains train, validation and test sets, plus a dict
# NOTE(review): unpickling can execute arbitrary code; only load dataset files
# that come from a trusted source.

filename = './data/atis.fold0.pkl.gz'
# the context manager closes the file even if unpickling fails
with gzip.open(filename, 'rb') as f:
    try:
        # Python 3 path: byte strings stored in the pickle are decoded as latin1
        train_set, valid_set, test_set, dicts = pickle.load(f, encoding='latin1')
    except TypeError:
        # presumably a Python 2 fallback, where pickle.load() takes no
        # `encoding` argument; rewind first in case the failed attempt
        # consumed part of the stream
        f.seek(0)
        train_set, valid_set, test_set, dicts = pickle.load(f)


In [3]:
# Each split is unpacked into three parts; only the first and the third are kept.
# train_x / val_x / test_x hold the sentences as sequences of word ids (decoded
# with id_to_words further down) and *_label the matching label ids (decoded
# with id_to_labels). The middle element is not used in this notebook and is
# bound to '_', the conventional name for a throwaway value.
train_x, _, train_label = train_set
val_x, _, val_label = valid_set
test_x, _, test_label = test_set

In [4]:
# Of the lookup tables shipped with the dataset only two are needed:
# word -> id and label -> id.
words, labels = dicts['words2idx'], dicts['labels2idx']

In [5]:
# Invert both lookup tables so that ids can be decoded again:
# id -> word and id -> label.
id_to_words = {idx: word for word, idx in words.items()}
id_to_labels = {idx: label for label, idx in labels.items()}

In [6]:
# POS tagging sentences
def pos_tag(sent_list):
    """POS-tag every sentence of a dataset.

    Each sentence in ``sent_list`` is a sequence of word ids; the ids are
    decoded through ``id_to_words`` and the resulting words are passed to
    ``nltk.pos_tag``.  Returns one list of (word, tag) pairs per sentence,
    in input order.
    """
    return [
        nltk.pos_tag([id_to_words[word_id] for word_id in sentence])
        for sentence in sent_list
    ]

In [7]:
# POS-tag the three splits in one go (train, validation, test)
train_pos, valid_pos, test_pos = (
    pos_tag(split) for split in (train_x, val_x, test_x)
)

In [8]:
# function to create (word, pos_tag, iob_label) tuples for a given dataset
def create_word_pos_label(pos_tagged_data, labels):
    """Attach the IOB label name to every POS-tagged token.

    ``pos_tagged_data`` holds one list of (word, pos) pairs per sentence and
    ``labels`` the matching sequence of label ids for each sentence; the ids
    are translated to label names through ``id_to_labels``.  Returns one list
    of (word, pos, label) tuples per sentence.
    """
    sentences_out = []
    for tagged_sentence, label_ids in zip(pos_tagged_data, labels):
        triples = []
        for word_pos, label_id in zip(tagged_sentence, label_ids):
            triples.append((word_pos[0], word_pos[1], id_to_labels[label_id]))
        sentences_out.append(triples)
    return sentences_out

In [9]:
# build the (word, pos, label) version of the training set, then display
# two sample queries (indices 4 and 5)
train_labels = create_word_pos_label(train_pos, train_label)
train_labels[4:6]

[[('show', 'VB', 'O'),
  ('me', 'PRP', 'O'),
  ('the', 'DT', 'O'),
  ('cheapest', 'JJS', 'B-cost_relative'),
  ('round', 'NN', 'B-round_trip'),
  ('trips', 'NNS', 'I-round_trip'),
  ('from', 'IN', 'O'),
  ('dallas', 'NN', 'B-fromloc.city_name'),
  ('to', 'TO', 'O'),
  ('baltimore', 'VB', 'B-toloc.city_name')],
 [('i', 'JJ', 'O'),
  ("'d", 'MD', 'O'),
  ('like', 'VB', 'O'),
  ('to', 'TO', 'O'),
  ('see', 'VB', 'O'),
  ('all', 'DT', 'O'),
  ('flights', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('denver', 'NN', 'B-fromloc.city_name'),
  ('to', 'TO', 'O'),
  ('philadelphia', 'VB', 'B-toloc.city_name')]]

In [10]:
# same (word, pos, label) representation for the validation and test data
valid_labels, test_labels = (
    create_word_pos_label(valid_pos, val_label),
    create_word_pos_label(test_pos, test_label),
)

In [11]:
# converting one training sentence (index 3) to tree format
# NOTE: this is not one of the two sample queries (indices 4 and 5) displayed earlier
tree = conlltags2tree(train_labels[3])
print(tree)

(S
  show/VB
  me/PRP
  the/DT
  (airline_name us/PRP air/NN)
  flights/NNS
  from/IN
  (fromloc.city_name atlanta/NN)
  to/TO
  (toloc.city_name boston/VB))


In [12]:
# chunk trees for every sentence of the training, validation and test data
train_trees = list(map(conlltags2tree, train_labels))
valid_trees = list(map(conlltags2tree, valid_labels))
test_trees = list(map(conlltags2tree, test_labels))

In [13]:
# reading a file containing list of US cities, states and counties
# (the file is pipe-delimited, hence sep="|"); head() previews the first rows
us_cities = pd.read_csv("./data/us_cities_states_counties.csv", sep="|")
us_cities.head()

Unnamed: 0,City,State short,State full,County,City alias
0,Holtsville,NY,New York,SUFFOLK,Internal Revenue Service
1,Holtsville,NY,New York,SUFFOLK,Holtsville
2,Adjuntas,PR,Puerto Rico,ADJUNTAS,URB San Joaquin
3,Adjuntas,PR,Puerto Rico,ADJUNTAS,Jard De Adjuntas
4,Adjuntas,PR,Puerto Rico,ADJUNTAS,Colinas Del Gigante


In [14]:
# lower-cased gazetteer sets built from the 'City', 'State full' and 'County' columns
cities, states, counties = (
    set(us_cities[column].str.lower())
    for column in ('City', 'State full', 'County')
)

In [15]:
# look a word up in the gazetteer sets
def gazetteer_lookup(word):
    """Return the flags (is_city, is_state, is_county) for ``word``.

    Membership is tested against the lower-cased ``cities``, ``states`` and
    ``counties`` sets, so ``word`` is expected to be lower case as well.
    """
    return tuple(word in names for names in (cities, states, counties))

In [16]:
# extract features from a given sentence
def word_features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Parameters
    ----------
    sent : sequence of tuples whose first two items are (word, pos_tag);
        further items (such as an IOB label) are ignored.
    i : index of the token to describe.

    Returns
    -------
    dict with the word and its POS tag, the neighbouring words and tags
    ('<START>' / '<END>' at the sentence boundaries), the three gazetteer
    flags, the digit-placeholder flag and the prefixes/suffixes of
    length 1 to 4.
    """
    word = sent[i][0]
    pos = sent[i][1]

    # first word
    if i == 0:
        prevword = '<START>'
        prevpos = '<START>'
    else:
        prevword = sent[i-1][0]
        prevpos = sent[i-1][1]

    # last word
    if i == len(sent)-1:
        nextword = '<END>'
        nextpos = '<END>'
    else:
        nextword = sent[i+1][0]
        nextpos = sent[i+1][1]

    # word is in gazetteer
    gazetteer = gazetteer_lookup(word)

    # Numbers appear in the queries as runs of the placeholder 'DIGIT'
    # (e.g. "after DIGIT pm").  The previous substring test
    # `word in 'DIGITDIGITDIGIT'` also fired for '', 'I', 'IT', 'G', ...;
    # compare against the complete placeholders instead.  The set of accepted
    # placeholders (one to three digits) is deliberately unchanged.
    digit_placeholders = ('DIGIT', 'DIGITDIGIT', 'DIGITDIGITDIGIT')

    # suffixes and prefixes
    pref_1, pref_2, pref_3, pref_4 = word[:1], word[:2], word[:3], word[:4]
    suff_1, suff_2, suff_3, suff_4 = word[-1:], word[-2:], word[-3:], word[-4:]

    return {'word': word,
            'pos': pos,
            'prevword': prevword,
            'prevpos': prevpos,
            'nextword': nextword,
            'nextpos': nextpos,
            'word_is_city': gazetteer[0],
            'word_is_state': gazetteer[1],
            'word_is_county': gazetteer[2],
            'word_is_digit': word in digit_placeholders,
            'suff_1': suff_1,
            'suff_2': suff_2,
            'suff_3': suff_3,
            'suff_4': suff_4,
            'pref_1': pref_1,
            'pref_2': pref_2,
            'pref_3': pref_3,
            'pref_4': pref_4}

In [17]:
# defining a few more functions to extract features, labels, words from sentences

def sent2features(sent):
    """Return one ``word_features`` dict per token of ``sent``, in order."""
    features = []
    for position in range(len(sent)):
        features.append(word_features(sent, position))
    return features

def sent2labels(sent):
    """Return the label of every (token, postag, label) triple in ``sent``."""
    collected = []
    for _token, _postag, label in sent:
        collected.append(label)
    return collected

def sent2tokens(sent):
    """Return the token of every (token, postag, label) triple in ``sent``."""
    collected = []
    for token, _postag, _label in sent:
        collected.append(token)
    return collected

In [18]:
 # create training, validation and test sets
X_train = [sent2features(s) for s in train_labels]
y_train = [sent2labels(s) for s in train_labels]

X_valid = [sent2features(s) for s in valid_labels]
y_valid = [sent2labels(s) for s in valid_labels]

X_test = [sent2features(s) for s in test_labels]
y_test = [sent2labels(s) for s in test_labels]

In [19]:
# collect the label of every chunk (direct subtree) in the training trees;
# plain (word, pos) leaves are skipped
tree_labels = []
for tree in train_trees:
    for n in tree:
        if not isinstance(n, nltk.tree.Tree):
            continue
        tree_labels.append(n.label())

In [20]:
# distinct chunk labels found in the training trees, and how many there are
# (78 in the recorded run)
label_set = set(tree_labels)
len(label_set)

78

There are 78 types of labels/entities in the training set (there might be some more in validation / test as well, though not many). Let's now look at the frequency of chunk types.

In [21]:
# frequency of chunk types/labels; show the ten most common ones
# (Counter is imported together with the other libraries in the first cell)
c = Counter(tree_labels)
pprint.pprint(c.most_common(10))

[('toloc.city_name', 3483),
 ('fromloc.city_name', 3458),
 ('depart_date.day_name', 728),
 ('airline_name', 562),
 ('depart_time.period_of_day', 457),
 ('depart_date.day_number', 313),
 ('depart_date.month_name', 304),
 ('depart_time.time', 295),
 ('round_trip', 286),
 ('cost_relative', 284)]


The list above shows the ten most frequent chunk types in the training set together with their counts. As expected, the most frequent ones are ```toloc.city_name```, ```fromloc.city_name```, ```depart_date.day_name``` etc.

In the section below, we will extract some common entities from a given sentence and use <a href="https://developer.flightstats.com/">the flightstats API</a> to query flight schedules data. 

For now, we'll extract only ```fromloc.city_name```, ```toloc.city_name```, ```depart_date.day_name```, ```depart_time.period_of_day```, ```depart_date.day_number```, ```depart_date.month_name```, ```depart_time.time_relative```, ```depart_date.today_relative``` and ```depart_time.time```.

In [22]:
# pick a random validation query and print it as plain text
# (set i by hand, e.g. i=408, to look at one particular query)
i = random.randrange(len(valid_trees))
chunked_tree = valid_trees[i]
print(' '.join(id_to_words[val] for val in val_x[i]), '\n')

# chunk labels the application cares about
extract_labels = [
    "fromloc.city_name",
    "toloc.city_name",
    "depart_date.day_name",
    "depart_time.period_of_day",
    "depart_date.day_number",
    "depart_date.month_name",
    "depart_time.time",
    "depart_time.time_relative",
    "depart_date.today_relative",
]

# walk over the direct children of the tree and print the wanted chunks
for n in chunked_tree:
    if not isinstance(n, nltk.tree.Tree):
        continue
    if n.label() not in extract_labels:
        continue
    print(n.label(), n.leaves())

show me the evening flights from atlanta to washington on wednesdays 

depart_time.period_of_day [('evening', 'VBG')]
fromloc.city_name [('atlanta', 'NN')]
toloc.city_name [('washington', 'VB')]
depart_date.day_name [('wednesdays', 'NNS')]


### Querying Data from FlightStats API 

We'll use the flightstats API for getting data of flight schedules. <a href="https://developer.flightstats.com/api-docs/">The homepage of flightstats API</a> shows the list of all APIs they provide; we'll use <a href="https://developer.flightstats.com/api-docs/scheduledFlights/v1">the Schedules API</a>.

In [23]:
# Useful flightstats URLs
# https://developer.flightstats.com/api-docs/scheduledFlights/v1
# https://developer.flightstats.com/api-docs/
# https://developer.flightstats.com/api-docs/how_to

The flightstats Schedules API provides multiple API call types (mentioned on <a href="https://developer.flightstats.com/api-docs/scheduledFlights/v1">this page</a>). Some of these are also mentioned below.

However, we'll only use the following first type of API call, ```from dep_city to arr_city on dep_date```, since it covers the majority of query types.



The first step is to sign up and create an app ID and key. The example query below queries the Schedules API to get a list of all flights from airport_code_1 to airport_code_2 departing on a certain date (type-1 API call).

In [24]:
# querying flightstats API
# Credentials can be supplied through the FLIGHTSTATS_APP_ID and
# FLIGHTSTATS_APP_KEY environment variables so that real keys do not have to
# be written into the notebook.  The defaults are the sample values this
# notebook was originally run with.
app_id = os.environ.get('FLIGHTSTATS_APP_ID', '956f33a1')
app_key = os.environ.get('FLIGHTSTATS_APP_KEY', '86a7d2396340e11a2a02196eab6ea5ac')

base_url = 'https://api.flightstats.com/flex/schedules/rest/v1/json/from/'

# {departureAirportCode}/to/{arrivalAirportCode}/departing/{year}/{month}/{day}
# JFK in new york to LAX in los angeles
# make sure to enter a future date, else no data is returned
extended_url = 'JFK/to/LAX/departing/2018/7/18'

# credentials (sent as query-string parameters)
creds = '?appId={0}&appKey={1}'.format(app_id, app_key)

# complete url
# NOTE: the printed url contains the app key; avoid sharing the output when
# real credentials are in use
url = base_url + extended_url + creds
print(url)

https://api.flightstats.com/flex/schedules/rest/v1/json/from/JFK/to/LAX/departing/2018/7/18?appId=956f33a1&appKey=86a7d2396340e11a2a02196eab6ea5ac


In [29]:
# request data from the API and parse the JSON body
# (the timeout keeps the cell from blocking forever if the service does not answer)
data = requests.get(url, timeout=30).json()

In [30]:
# display the parsed JSON response
data

{'error': {'httpStatusCode': 403,
  'errorId': 'e73161e9-1048-432a-8f8d-8477a91d1a66',
  'errorMessage': 'application is not active',
  'errorCode': 'FORBIDDEN'}}

In [27]:
# sample flight details
# NOTE: raises KeyError when the response has no 'scheduledFlights' entry,
# e.g. when the API answered with an 'error' payload instead
data['scheduledFlights'][1]

KeyError: 'scheduledFlights'

In [None]:
# number of flights returned
# NOTE: needs a successful response; an 'error' payload has no
# 'scheduledFlights' entry and makes this raise KeyError
len(data['scheduledFlights'])

The query above uses airport codes to request the data, but our parser extracts the names of cities (san francisco, new york etc.) rather than airport codes. 

Thus, we need to convert the city names to flightstats airport codes. We can do that using another flightstats API - *airports*, which returns a list of all airports, their codes, and various other attributes.

In [None]:
# data of all active airports
base_url = 'https://api.flightstats.com/flex/airports/rest/v1/json/active/'
url = base_url + creds
# the timeout keeps the cell from blocking forever if the service does not answer
data = requests.get(url, timeout=30)
airports = data.json()

# convert to df and keep a local copy on disk
airports_df = pd.DataFrame(airports['airports'])
airports_df.to_csv('airports.csv')

In [None]:
# load the airports table from the local airports.csv file and preview it
airports_df = pd.read_csv("airports.csv")
airports_df.head()

In [None]:
# looking up 'new york' in the airports dataframe
# (the 'city' column is lower-cased before the comparison)
# fs refers to 'flightstats code' of the airport
airports_df[airports_df['city'].str.lower() =='new york']

The list above shows that New York has multiple airports and thus airport codes (you can check for other cities such as LA etc.). The main airports, however, have the value classification=1 or 2. 

Let's write a small function which takes in the city name and returns the codes of the main airports.

In [None]:
# extract main airport codes from city name
def city_to_airport_code(city):
    """Return the 'fs' codes of the main airports of ``city`` as a list.

    ``city`` is compared with the lower-cased 'city' column of
    ``airports_df``, so it has to be given in lower case.  Only airports
    whose 'classification' is 1 or 2 are returned; the list is empty when
    nothing matches.
    """
    same_city = airports_df['city'].str.lower() == city
    classification = airports_df['classification']
    is_main_airport = (classification == 1) | (classification == 2)
    return list(airports_df[same_city & is_main_airport]['fs'])


In [None]:
# sample lookups, one result list per line
print(*map(city_to_airport_code,
           ["new york", "baltimore", "chicago", "pittsburgh"]),
      sep='\n')


Now, after extracting the source and destination cities (the entities ```fromloc``` and ```toloc```), we can query the API to get the list of flights. We also have to parse the dates to get the year, month and day.

### Pipeline: From Tree to Flight Data

The following pipeline shows the steps to process a given sentence tree (predicted using CRF), extract entities from it, generate a relevant API call, and apply additional constraints to filter the data.

There are roughly three layers in the application:
1. Extracting structured entities from query tree
2. Making the API call to get JSON data
3. Applying additional constraints to the retrieved data 


The first layer processes the parsed tree and extracts structured entities so that we can make an API call. For example, queries mention dates as "seventh may", which needs to be passed to the API as integers (day=7, month=5). Similarly, a request for a "flight on Wednesday" should query the API for a flight which departs on the closest upcoming Wednesday from the current date. 

The second layer's task is to make the API call and fetch the data.

The third layer applies filters on arrival/departure time, such as 'evening', 'early morning', 'after DIGIT pm' etc. to the retrieved data and returns the filtered data to the user.


In [None]:
# display the index of the currently selected query
# i=776
i

In [None]:
## TODO:

## Layer-1: 
# get list of entities from tree
# get source and dest cities, dep date

def tree_to_dict(tree):
    """Collect the wanted chunks of a query tree as a label -> text dict.

    Only direct subtrees whose label is listed in ``extract_labels`` are
    kept; the words of a chunk are joined with single spaces.  If a label
    occurs more than once in the tree, the last occurrence wins.
    """
    wanted = {}
    for node in tree:
        if not isinstance(node, nltk.tree.Tree):
            continue
        if node.label() not in extract_labels:
            continue
        words_in_chunk = [leaf[0] for leaf in node.leaves()]
        wanted[node.label()] = ' '.join(words_in_chunk)
    return wanted
    

Let's now convert all queries in the train, validation and test trees to corresponding dicts.

In [None]:
# label -> text dict for every query of the three splits
train_dicts = list(map(tree_to_dict, train_trees))
valid_dicts = list(map(tree_to_dict, valid_trees))
test_dicts = list(map(tree_to_dict, test_trees))

Now, let's look at some typical entries in day_number, month_name etc., so that we can process them correctly. For example, how are day and months typically written - *seventh, twenty third, September (or Sept?)* etc. 

In [None]:
# For each entity of interest: gather the values seen in the training
# queries (queries without the entity are skipped) and print the distinct ones.

# depart_date.day_name
day_names = [v for v in (d.get("depart_date.day_name") for d in train_dicts)
             if v is not None]
print("day_names\n", set(day_names), '\n')

# depart_date.day_number
day_numbers = [v for v in (d.get("depart_date.day_number") for d in train_dicts)
               if v is not None]
print("day_numbers\n", set(day_numbers), '\n')

# depart_date.month_name
month_names = [v for v in (d.get("depart_date.month_name") for d in train_dicts)
               if v is not None]
print("month_name\n", set(month_names), '\n')

# depart_time.time_relative
time_relative = [v for v in (d.get("depart_time.time_relative") for d in train_dicts)
                 if v is not None]
print("time_relative\n", set(time_relative), '\n')

# depart_time.time
time = [v for v in (d.get("depart_time.time") for d in train_dicts)
        if v is not None]
print("time\n", set(time), '\n')

# depart_time.period_of_day
period_of_day = [v for v in (d.get("depart_time.period_of_day") for d in train_dicts)
                 if v is not None]
print("period_of_day\n", set(period_of_day), '\n')

# depart_date.today_relative
today_relative = [v for v in (d.get("depart_date.today_relative") for d in train_dicts)
                  if v is not None]
print("today_relative\n", set(today_relative), '\n')


Dealing with months is the easiest since there are just 12 varieties. Similarly, day names can be looked up from a manually created dict ```{sunday, sundays, tuesday, tuesdays, ...}``` etc.

For converting day numbers to numerics, although there are some nice third-party libraries such as ```word2num```, they only handle simple cases such as 'one', 'thirty two' etc. but not ordinal forms such as 'twentieth', 'thirty first' etc.

Credits: <a href="https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa">Stack Overflow</a>

In [None]:
# text2int
def text2int(textnum, numwords=None):
    """Convert a spelled-out cardinal or ordinal number to an int.

    Parameters
    ----------
    textnum : str
        Number words separated by spaces and/or hyphens, e.g. 'twenty',
        'twenty-third', 'one hundred and five'.  Matching is
        case-insensitive.
    numwords : dict, optional
        Lookup table word -> (scale, increment).  When omitted, or when an
        empty dict is passed, the standard English table is built (an empty
        dict supplied by the caller is filled in place); a non-empty table
        is used as given.

    Returns
    -------
    int
        The value of the phrase; 0 for an empty string.

    Raises
    ------
    ValueError
        If a word is not a known number word.  The message names the word
        as it appeared in the input.
    """
    # A fresh table per call instead of a shared mutable default argument.
    if numwords is None:
        numwords = {}

    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen",
        ]

        # the two leading blanks only keep index * 10 aligned with the value
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            # The blanks must not become table entries: an '' key made the
            # bare suffix "th" (stripped to '') parse as 10.
            if word:
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # hundred -> 10**2, thousand -> 10**3, million -> 10**6, ...
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    # irregular ordinals that cannot be derived by stripping a suffix
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    # regular ordinals: 'twentieth' -> 'twenty', 'fourth' -> 'four'
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    current = result = 0
    for token in textnum.lower().replace('-', ' ').split():
        if token in ordinal_words:
            scale, increment = 1, ordinal_words[token]
        else:
            word = token
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = word[:-len(ending)] + replacement
                    break

            if word not in numwords:
                raise ValueError("Illegal word: " + token)

            scale, increment = numwords[word]

        current = current * scale + increment
        # thousand and above close the current group
        if scale > 100:
            result += current
            current = 0

    return result + current

In [None]:
# examples, one result per line
print(*map(text2int,
           ["twenty", "first", "thirtieth", "twenty eighth", "twelfth"]),
      sep='\n')

In [None]:
# months: lower-cased month name -> month number, taken from the builtin
# calendar module (index 0 of calendar.month_name is the empty string,
# so the table also contains '' -> 0)
import calendar
month2int = {name.lower(): number
             for number, name in enumerate(calendar.month_name)}
month2int

In [None]:
# days: lower-cased day name -> weekday number (monday is 0)
day2int = {name.lower(): number
           for number, name in enumerate(calendar.day_name)}
day2int

Now, when coming across a day name such as Monday, Tuesday etc. we'll simply use the date corresponding to the next Monday/Tuesday. The following code returns the date of the next day_name.

In [None]:
# return the date of the next day_name from today
import datetime
def next_weekday(d, weekday):
    """Return the first date strictly after ``d`` that falls on ``weekday``.

    ``weekday`` uses the ``date.weekday()`` numbering (0 = Monday).  When
    ``d`` itself is that weekday, the date one week later is returned.
    """
    offset = weekday - d.weekday()
    # zero or negative: the target day is today or earlier this week
    step = offset if offset > 0 else offset + 7
    return d + datetime.timedelta(days=step)

# demo: date of the next Tuesday counted from today, and its components
today = datetime.date.today()
next_day = next_weekday(today, 1) # 0 = Monday, 1=Tuesday, 2=Wednesday...
print("date", next_day)
print("year", next_day.year)
print("month", next_day.month)
print("day", next_day.day)


Now we can extract entities in numeric format using these functions/dicts. Given a query (tree), we'll extract source and destination cities, day (int), month (int) etc. If the day is specified as name e.g. Monday, we'll assume it is the next Monday and query that date. 

For time and time modifiers such as before, after etc., we'll filter the 'arr/dep time' attributes after we get the data from the API.

In [None]:
# extract integer dates etc from query tree
def extract_entities_from_tree(tree):
    """Turn the chunk tree of one query into entities for the API call.

    Parameters
    ----------
    tree : chunk tree of a query, as accepted by ``tree_to_dict``.

    Returns
    -------
    (query_dict, entities)
        ``query_dict`` is the label -> text dict from ``tree_to_dict``; a
        plural day name such as 'tuesdays' is stored in its singular form.
        ``entities`` maps 'fromloc.city_name' / 'toloc.city_name' (when
        present in the query) to lists of airport codes and always contains
        the integers 'day', 'month' and 'year'.

    The departure date is resolved as follows:
      * a day name selects the next such weekday after today;
      * otherwise, if a day number or a month name is given, today's date is
        the starting point;
      * otherwise (no date information at all) tomorrow is used;
      * an explicit day number and/or month name then replaces the
        corresponding component of that date.

    Raises
    ------
    KeyError if the day or month name is not in ``day2int`` / ``month2int``;
    whatever ``text2int`` raises for an unknown day-number word.
    """
    query_dict = tree_to_dict(tree)
    entities = {}
    today = datetime.date.today()

    # get airport codes from city names as a list
    for key in ("fromloc.city_name", "toloc.city_name"):
        if key in query_dict:
            entities[key] = city_to_airport_code(query_dict[key])

    day_name = query_dict.get("depart_date.day_name")
    day_number = query_dict.get("depart_date.day_number")
    month_name = query_dict.get("depart_date.month_name")

    # The date is resolved once, after all entities are known.  Resolving it
    # entity by entity made the result depend on their order: a month name
    # reset an explicit day number to today's day (and vice versa), and a
    # query without any entity got no date at all.
    if day_name is not None:
        # strip the last 's' e.g. tuesdays from day of week
        if day_name.endswith("s"):
            day_name = day_name[:-1]
        query_dict["depart_date.day_name"] = day_name
        # 0 = Monday, 1 = Tuesday, 2 = Wednesday...
        base_date = next_weekday(today, day2int[day_name])
    elif day_number is not None or month_name is not None:
        base_date = today
    else:
        # no date information at all: show tomorrow's flights
        base_date = today + datetime.timedelta(days=1)

    entities['day'] = base_date.day if day_number is None else text2int(day_number)
    entities['month'] = base_date.month if month_name is None else month2int[month_name]
    entities['year'] = base_date.year

    return query_dict, entities
    
    

In [None]:
# pick a training query, print it as text, then show its tree, the extracted
# label -> text dict and the resolved entities
# NOTE: the random index is overridden by the fixed index on the next line;
# remove that assignment to look at a random query instead
i = random.randrange(len(train_trees))
i=2308
print(' '.join([id_to_words[t] for t in train_x[i]]), '\n')
q, d = extract_entities_from_tree(train_trees[i])
print(train_trees[i])
print(q)
print(d)

In [None]:
# examples from train set (indices worth looking at)
# i=1285, 2438 , 3442, 3120, 1956, 2084
# display the index currently in use
i

 ### Ignore - Processing a Sample User Generated Query

In [None]:
# sample: predicting a new user generated query
# (the hour is written with the placeholder DIGIT)
s = 'Can you please show me flights from new york to los angeles departing on monday after DIGIT pm'

# tokenize and tag user query and create features for each token
def process_user_query(sent_string):
    """Prepare a raw query string for prediction.

    The string is tokenized with ``nltk.word_tokenize`` and POS-tagged with
    ``nltk.pos_tag``; one ``word_features`` dict is then built per token.
    Returns the tuple (tagged tokens, feature dicts).
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent_string))
    features = []
    for position in range(len(tagged_tokens)):
        features.append(word_features(tagged_tokens, position))
    return (tagged_tokens, features)

# generate query features for sentence s
query_pos_tags, query_features = process_user_query(s)

# predict tags of query (predict() takes a list of sentences, hence the
# one-element list and the [0])
# NOTE(review): `crf` is not defined in the visible part of this notebook;
# presumably the CRF model trained elsewhere - confirm it is loaded first
predicted_labels = crf.predict([query_features])[0]
predicted_labels

In [None]:
# pair every tagged token with its predicted label: (token, pos, label)
query_tag_list = [
    (tagged[0], tagged[1], label)
    for tagged, label in zip(query_pos_tags, predicted_labels)
]

# build the chunk tree of the query
query_tree = conlltags2tree(query_tag_list)

# print every chunk found among the direct children as "label : words"
for n in query_tree:
    if not isinstance(n, nltk.tree.Tree):
        continue
    label = n.label()
    leaves = ' '.join(leaf[0] for leaf in n.leaves())
    print(label,':', leaves)