In [0]:
#Htet Phyo Wai
#Progress Report 3/4 Implementation
#4/30/19

#(Note :: The program is adapted from Microsoft Online Learning on NLP using RNN models and uses ATIS(Air Travel Information Services) Test Data 
#https://github.com/Microsoft/CNTK/tree/v2.0/Examples/LanguageUnderstanding/ATIS/Data

##**Slot Tagging using Neural Networks**

In this methods, we select individual words and tags to the respective classes. 

Classes are provided as labels in the training data set. 

The following techniques and network models are used in the process of slots tagging:

- LSTM: It is composed of the following stages :

    1. New memory generation: A new memory (h(t)) is the consolidation of a new input word x(t) with the past hidden state h(t−1)

    2. Input Gate: The input gate uses the input word and the past hidden state to determine whether or not the input is worth preserving and thus is used to gate the new memory. It thus produces it as an indicator of this information.
    
    3. Forget Gate: This gate is similar to the input gate except that it does not make a determination of usefulness of the input word – instead it makes an assessment on whether the past memory cell is useful for the computation of the current memory cell. Thus, the forget gate looks at the input word and the past hidden state and produces f(t).

    4. Final memory generation: This stage first takes the advice of the forget gate f(t) and accordingly forgets the past memory c(t−1). Similarly, it takes the advice of the input gate it and accordingly gates the new memory c ̃(t). It then sums these two results to produce the final memory c(t).

    5. Output/Exposure Gate: to separate the final memory from the hidden state. The final memory c(t) contains a lot of information that is not necessarily required to be saved in the hidden state. Hidden states are used in every single gate of an LSTM and thus, this gate makes the assessment regarding what parts of the memory c(t) needs to be exposed/present in the hidden state h(t). The signal it produces to indicate this is ot and this is used to gate the point-wise tanh of the memory.

<br>
<br>



In [0]:
!apt-get install --no-install-recommends openmpi-bin libopenmpi-dev libopencv-dev python3-opencv python-opencv && ln -sf /usr/lib/x86_64-linux-gnu/libmpi_cxx.so /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1 && ln -sf /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so /usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so.12 && ln -sf /usr/lib/x86_64-linux-gnu/libmpi.so /usr/lib/x86_64-linux-gnu/libmpi.so.12 && pip install cntk


In [0]:
import math
import numpy as np
import cntk as C

In [0]:
from __future__ import print_function # Use a function definition from future version (say 3.x from 2.7 interpreter)
import requests
import os

def download(url, filename):
    """ utility function to download a file """
    response = requests.get(url, stream=True)
    with open(filename, "wb") as handle:
        for data in response.iter_content():
            handle.write(data)

locations = ['Tutorials/SLUHandsOn', 'Examples/LanguageUnderstanding/ATIS/BrainScript']

data = {
  'train': { 'file': 'atis.train.ctf', 'location': 0 },
  'test': { 'file': 'atis.test.ctf', 'location': 0 },
  'query': { 'file': 'query.wl', 'location': 1 },
  'slots': { 'file': 'slots.wl', 'location': 1 }
}

for item in data.values():
    location = locations[item['location']]
    path = os.path.join('..', location, item['file'])
    if os.path.exists(path):
        print("Reusing locally cached:", item['file'])
        # Update path
        item['file'] = path
    elif os.path.exists(item['file']):
        print("Reusing locally cached:", item['file'])
    else:
        print("Starting download:", item['file'])
        url = "https://github.com/Microsoft/CNTK/blob/v2.0/%s/%s?raw=true"%(location, item['file'])
        download(url, item['file'])
        print("Download completed")

Starting download: atis.train.ctf
Download completed
Starting download: atis.test.ctf
Download completed
Starting download: query.wl
Download completed
Starting download: slots.wl
Download completed


In [0]:
# number of words in vocab, slot labels, and intent labels
vocab_size = 943 ; num_labels = 129 ; num_intents = 26    

# model dimensions
input_dim  = vocab_size
label_dim  = num_labels
emb_dim    = 150
hidden_dim = 300

# Create the containers for input feature (x) and the label (y)
x = C.sequence.input_variable(vocab_size)
y = C.sequence.input_variable(num_labels)

def create_model():
    with C.layers.default_options(initial_state=0.1):
        return C.layers.Sequential([
            C.layers.Embedding(emb_dim, name='embed'),
            C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),
            C.layers.Dense(num_labels, name='classify')
        ])

In [0]:
# peek
z = create_model()
print(z.embed.E.shape)
print(z.classify.b.value)

(-1, 150)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [0]:
# Pass an input and check the dimension
z = create_model()
print(z(x).embed.E.shape)

(943, 150)


In [0]:
def create_reader(path, is_training):
    return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(
         query         = C.io.StreamDef(field='S0', shape=vocab_size,  is_sparse=True),
         intent_unused = C.io.StreamDef(field='S1', shape=num_intents, is_sparse=True),  
         slot_labels   = C.io.StreamDef(field='S2', shape=num_labels,  is_sparse=True)
     )), randomize=is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)

In [0]:
# peek
reader = create_reader(data['train']['file'], is_training=True)
reader.streams.keys()

dict_keys(['slot_labels', 'query', 'intent_unused'])

In [0]:
def create_criterion_function(model):
    labels = C.placeholder(name='labels')
    ce   = C.cross_entropy_with_softmax(model, labels)
    errs = C.classification_error      (model, labels)
    return C.combine ([ce, errs]) # (features, labels) -> (loss, metric)

criterion = create_criterion_function(create_model())
criterion.replace_placeholders({criterion.placeholders[0]: C.sequence.input_variable(num_labels)})

Composite(Combine): Input('Input2300', [#, *], [129]), Placeholder('labels', [???], [???]) -> Output('Block2270_Output_0', [#, *], [???]), Output('Block2290_Output_0', [#, *], [???])

In [0]:
def create_criterion_function_preferred(model, labels):
    ce   = C.cross_entropy_with_softmax(model, labels)
    errs = C.classification_error      (model, labels)
    return ce, errs # (model, labels) -> (loss, error metric)

In [0]:
def train(reader, model_func, max_epochs=10):
    
    # Instantiate the model function; x is the input (feature) variable 
    model = model_func(x)
    
    # Instantiate the loss and error function
    loss, label_error = create_criterion_function_preferred(model, y)

    # training conf
    epoch_size = 18000        # half the dataset size 
    minibatch_size = 70
    
    # LR schedule over epochs 
    lr_per_sample = [0.003]*4+[0.0015]*24+[0.0003]
    lr_per_minibatch = [lr * minibatch_size for lr in lr_per_sample]
    lr_schedule = C.learning_rate_schedule(lr_per_minibatch, C.UnitType.minibatch, epoch_size)
    
    # Momentum schedule
    momentum_as_time_constant = C.momentum_as_time_constant_schedule(700)
    


    learner = C.adam(parameters=model.parameters,
                     lr=lr_schedule,
                     momentum=momentum_as_time_constant,
                     gradient_clipping_threshold_per_sample=15, 
                     gradient_clipping_with_truncation=True)

    # Setup the progress updater
    progress_printer = C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    
       # Instantiate the trainer
    trainer = C.Trainer(model, (loss, label_error), learner, progress_printer)

    # process minibatches and perform model training
    C.logging.log_number_of_parameters(model)

    t = 0
    for epoch in range(max_epochs):         # loop over epochs
        epoch_end = (epoch+1) * epoch_size
        while t < epoch_end:                # loop over minibatches on the epoch
            data = reader.next_minibatch(minibatch_size, input_map={  # fetch minibatch
                x: reader.streams.query,
                y: reader.streams.slot_labels
            })
            trainer.train_minibatch(data)               # update model with it
            t += data[y].num_samples                    # samples 
        trainer.summarize_training_progress()

In [0]:
def do_train():
    global z
    z = create_model()
    reader = create_reader(data['train']['file'], is_training=True)
    train(reader, z)
do_train()

Training 721479 parameters in 6 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.692392 * 18010, metric = 14.14% * 18010 7.114s (2531.6 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.196765 * 18051, metric = 4.42% * 18051 7.240s (2493.2 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.127771 * 17941, metric = 2.88% * 17941 7.039s (2548.8 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.088850 * 18059, metric = 2.11% * 18059 7.159s (2522.6 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.056562 * 17957, metric = 1.36% * 17957 7.402s (2426.0 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.052673 * 18021, metric = 1.27% * 18021 7.404s (2434.0 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.047527 * 17980, metric = 1.23% * 17980 7.207s (2494.8 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.043071 * 18025, metric = 1.12% * 18025 7.398s (2436.5 sam

In [0]:
def evaluate(reader, model_func):
    
    # Instantiate the model function; x is the input (feature) variable 
    model = model_func(x)
    
    # Create the loss and error functions
    loss, label_error = create_criterion_function_preferred(model, y)

    # process minibatches and perform evaluation
    progress_printer = C.logging.ProgressPrinter(tag='Evaluation', num_epochs=0)

    while True:
        minibatch_size = 500
        data = reader.next_minibatch(minibatch_size, input_map={  # fetch minibatch
            x: reader.streams.query,
            y: reader.streams.slot_labels
        })
        if not data:                                 # until we hit the end
            break

        evaluator = C.eval.Evaluator(loss, progress_printer)
        evaluator.test_minibatch(data)
     
    evaluator.summarize_test_progress()

In [0]:
def do_test():
    reader = create_reader(data['test']['file'], is_training=False)
    evaluate(reader, z)
do_test()
z.classify.b.value

Finished Evaluation [1]: Minibatch[1-23]: metric = 0.27% * 10984;


array([-0.03935637, -0.09902153, -0.03894174, -0.03529919, -0.01363602,
       -0.05434954, -0.04320217, -0.10497631, -0.02455624, -0.05959513,
       -0.03621629, -0.04268409, -0.05343723, -0.06117778, -0.08973555,
       -0.07753321, -0.12392411, -0.04092873,  0.04135334, -0.13623148,
       -0.08145326, -0.05778902,  0.01750098, -0.01237598, -0.04901759,
       -0.00611704, -0.03405998,  0.00117075, -0.01631831, -0.04836401,
       -0.03489719, -0.02382404, -0.08001239, -0.00699768, -0.05259896,
        0.0855659 ,  0.06091429, -0.0171707 , -0.01988967, -0.05222796,
       -0.14214335, -0.0557886 ,  0.0151703 , -0.04353099,  0.01414575,
       -0.08254308,  0.01819646,  0.03762412,  0.0261627 , -0.0337337 ,
       -0.05364189, -0.08558379,  0.03114999, -0.06602737,  0.05271252,
        0.00680154, -0.04858708, -0.05499237, -0.05703288, -0.05267509,
       -0.01114069, -0.05643699,  0.02559835, -0.02465305, -0.03254724,
       -0.04454485, -0.10612184, -0.05292601, -0.05991152, -0.12

In [0]:
# load dictionaries
query_wl = [line.rstrip('\n') for line in open(data['query']['file'])]
slots_wl = [line.rstrip('\n') for line in open(data['slots']['file'])]
query_dict = {query_wl[i]:i for i in range(len(query_wl))}
slots_dict = {slots_wl[i]:i for i in range(len(slots_wl))}

# let's run a sequence through
seq = 'BOS flights from new york to seattle EOS'
w = [query_dict[w] for w in seq.split()] # convert to word indices
print(w)
onehot = np.zeros([len(w),len(query_dict)], np.float32)
for t in range(len(w)):
    onehot[t,w[t]] = 1

#x = C.sequence.input_variable(vocab_size)
pred = z(x).eval({x:[onehot]})[0]
print(pred.shape)
best = np.argmax(pred,axis=1)
print(best)
list(zip(seq.split(),[slots_wl[s] for s in best]))

[178, 429, 444, 619, 937, 851, 752, 179]
(8, 129)
[128 128 128  48 110 128  78 128]


[('BOS', 'O'),
 ('flights', 'O'),
 ('from', 'O'),
 ('new', 'B-fromloc.city_name'),
 ('york', 'I-fromloc.city_name'),
 ('to', 'O'),
 ('seattle', 'B-toloc.city_name'),
 ('EOS', 'O')]

In [0]:
def create_model():
    with C.layers.default_options(initial_state=0.1):
        return C.layers.Sequential([
            C.layers.Embedding(emb_dim),
            C.layers.BatchNormalization(),
            C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False),
            C.layers.BatchNormalization(),
            C.layers.Dense(num_labels)
        ])

do_train()
do_test()

Training 722379 parameters in 10 parameter tensors.
Learning rate per minibatch: 0.21
Finished Epoch[1 of 10]: [Training] loss = 0.382122 * 18010, metric = 7.10% * 18010 7.644s (2356.1 samples/s);
Finished Epoch[2 of 10]: [Training] loss = 0.152723 * 18051, metric = 3.12% * 18051 7.574s (2383.3 samples/s);
Finished Epoch[3 of 10]: [Training] loss = 0.102781 * 17941, metric = 2.21% * 17941 7.677s (2337.0 samples/s);
Finished Epoch[4 of 10]: [Training] loss = 0.083122 * 18059, metric = 1.88% * 18059 7.925s (2278.7 samples/s);
Learning rate per minibatch: 0.105
Finished Epoch[5 of 10]: [Training] loss = 0.040701 * 17957, metric = 0.96% * 17957 8.091s (2219.4 samples/s);
Finished Epoch[6 of 10]: [Training] loss = 0.041964 * 18021, metric = 1.00% * 18021 8.027s (2245.0 samples/s);
Finished Epoch[7 of 10]: [Training] loss = 0.037129 * 17980, metric = 0.98% * 17980 7.938s (2265.1 samples/s);
Finished Epoch[8 of 10]: [Training] loss = 0.028060 * 18025, metric = 0.73% * 18025 7.931s (2272.7 sam