In [1]:
import numpy as np
from __future__ import print_function
import gzip
import cPickle as pickle
from urllib import urlretrieve
import os
import random
from os.path import isfile


In [2]:
PREFIX = '../SampleDatasets/ATIS/'


def download_dropbox():
    ''' 
    download from drop box in the meantime
    '''
    print('Downloading data from https://www.dropbox.com/s/3lxl9jsbw0j7h8a/atis.pkl?dl=0')
    os.system('wget -O atis.pkl https://www.dropbox.com/s/3lxl9jsbw0j7h8a/atis.pkl?dl=0')
    
def load_dropbox(filename):
    if not isfile(filename):
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/'+filename)
        download_dropbox()
    #f = gzip.open(filename,'rb')
    f = open(filename,'rb')
    return f

def atisfull():
    f = load_dropbox(PREFIX + 'atis.pkl')
    
    try:
        train_set, test_set, dicts = pickle.load(f)
    except UnicodeDecodeError:
        train_set, test_set, dicts = pickle.load(f, encoding='latin1')
    return train_set, test_set, dicts

In [3]:
train_set, valid_set, dicts = atisfull()
w2idx, labels2idx = dicts['words2idx'], dicts['labels2idx']

train_x, _, train_label = train_set
val_x, _, val_label = valid_set

In [4]:
# Create index to word/label dicts
idx2w  = {w2idx[k]:k for k in w2idx}
idx2la = {labels2idx[k]:k for k in labels2idx}

In [5]:
# For conlleval script
words_train = [ list(map(lambda x: idx2w[x], w)) for w in train_x]
labels_train = [ list(map(lambda x: idx2la[x], y)) for y in train_label]
words_val = [ list(map(lambda x: idx2w[x], w)) for w in val_x]
labels_val = [ list(map(lambda x: idx2la[x], y)) for y in val_label]

n_classes = len(idx2la)
n_vocab = len(idx2w)

In [6]:
print("Example sentence : {}".format(words_train[0]))
print("Encoded form: {}".format(train_x[0]))
print()
print("It's label : {}".format(labels_train[0]))
print("Encoded form: {}".format(train_label[0]))

Example sentence : ['i', 'want', 'to', 'fly', 'from', 'boston', 'at', 'DIGITDIGITDIGIT', 'am', 'and', 'arrive', 'in', 'denver', 'at', 'DIGITDIGITDIGITDIGIT', 'in', 'the', 'morning']
Encoded form: [232 542 502 196 208  77  62  10  35  40  58 234 137  62  11 234 481 321]

It's label : ['O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time', 'I-depart_time.time', 'O', 'O', 'O', 'B-toloc.city_name', 'O', 'B-arrive_time.time', 'O', 'O', 'B-arrive_time.period_of_day']
Encoded form: [126 126 126 126 126  48 126  35  99 126 126 126  78 126  14 126 126  12]


## Model

In [10]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D

model = Sequential()
model.add(Embedding(n_vocab,100))
model.add(Dropout(0.25))
model.add(SimpleRNN(100,return_sequences=True))
model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
model.compile('rmsprop', 'categorical_crossentropy')

# model = Sequential()
# model.add(Embedding(n_vocab,100))
# model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
# model.add(Dropout(0.25))
# model.add(GRU(100,return_sequences=True))
# model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
# model.compile('rmsprop', 'categorical_crossentropy')

## Training

In [11]:
n_epochs = 30

for i in range(n_epochs):
    print("Training epoch {}".format(i))
    
    for n_batch, sent in enumerate(train_x):
        label = train_label[n_batch]
        # Make labels one hot
        label = np.eye(n_classes)[label][np.newaxis,:] 
        # View each sentence as a batch
        sent = sent[np.newaxis,:]
        
        if sent.shape[1] > 1: #ignore 1 word sentences
            model.train_on_batch(sent, label)

Training epoch 0
Training epoch 1
Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9
Training epoch 10
Training epoch 11
Training epoch 12
Training epoch 13
Training epoch 14
Training epoch 15
Training epoch 16
Training epoch 17
Training epoch 18
Training epoch 19
Training epoch 20
Training epoch 21
Training epoch 22
Training epoch 23
Training epoch 24
Training epoch 25
Training epoch 26
Training epoch 27
Training epoch 28
Training epoch 29


## Evaluation

In [29]:
# import numpy
# import random
# import os
# import stat
import subprocess
# from os.path import isfile, join
# from os import chmod

def conlleval(p, g, w, filename):
    '''
    INPUT:
    p :: predictions
    g :: groundtruth
    w :: corresponding words

    OUTPUT:
    filename :: name of the file where the predictions
    are written. it will be the input of conlleval.pl script
    for computing the performance in terms of precision
    recall and f1 score
    '''
    out = ''
    for sl, sp, sw in zip(g, p, w):
        out += 'BOS O O\n'
        for wl, wp, w in zip(sl, sp, sw):
            out += w + ' ' + wl + ' ' + wp + '\n'
        out += 'EOS O O\n\n'

    f = open(filename,'w')
    f.writelines(out)
    f.close()
    
    return get_perf(filename)

def get_perf(filename):
    ''' run conlleval.pl perl script to obtain
    precision/recall and F1 score '''
    _conlleval = 'conlleval.pl'
#     if not isfile(_conlleval):
#         #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl') 
#         os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
#         chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions

    proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(open(filename,'rb').read())
    for line in stdout.decode("utf-8").split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break
    
    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
    
    precision = float(out[3][:-2])
    recall    = float(out[5][:-2])
    f1score   = float(out[7])

    return {'p':precision, 'r':recall, 'f1':f1score}

def get_perfo(filename):
    ''' 
    work around for using a PERL script in python
    dirty but still works.
    '''
    tempfile = str(random.randint(1,numpy.iinfo('i').max)) + '.txt'
    #if not isfile(PREFIX + 'conlleval.pl'):
        #os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl') 
        #chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions
    #if len(PREFIX) > 0:
        #chmod(PREFIX + 'conlleval.pl', stat.S_IRWXU) # give the execute permissions
        #cmd = PREFIX + 'conlleval.pl < %s | grep accuracy > %s'%(filename,tempfile)
    #else:
        #cmd = './conlleval.pl < %s | grep accuracy > %s'%(filename,tempfile)
    
    cmd = './conlleval.pl < %s | grep accuracy > %s'%(filename,tempfile)

    print(cmd)
    out = os.system(cmd)
    out = open(tempfile).readlines()[0].split()
    os.system('rm %s'%tempfile)
    precision = float(out[6][:-2])
    recall    = float(out[8][:-2])
    f1score   = float(out[10])
    return {'p':precision, 'r':recall, 'f1':f1score}


In [30]:
labels_pred_val = []

for n_batch, sent in enumerate(val_x):
    label = val_label[n_batch]
    label = np.eye(n_classes)[label][np.newaxis,:]
    sent = sent[np.newaxis,:]

    pred = model.predict_on_batch(sent)
    pred = np.argmax(pred,-1)[0]
    labels_pred_val.append(pred)

labels_pred_val = [ list(map(lambda x: idx2la[x], y)) \
                                    for y in labels_pred_val]
con_dict = conlleval(labels_pred_val, labels_val, 
                            words_val, 'measure.txt')

print('Precision = {}, Recall = {}, F1 = {}'.format(
            con_dict['r'], con_dict['p'], con_dict['f1']))

Precision = 91.79, Recall = 92.64, F1 = 92.21
