In [1]:
import cntk as C
import numpy as np

In [2]:
hidden_dim = 64
attention_dim = 64
use_attention = True
use_embedding = True
embedding_dim = 100
length_increase = 1

In [3]:
def get_vocab(path):
    vocab = [w.strip().split('\t') for w in open(path,encoding='utf-8').readlines()]
    i2w = { int(i):w for w,i in vocab }
    w2i = { w:int(i) for w,i in vocab }
    
    return (vocab, i2w, w2i)

vocab, i2w, w2i = get_vocab('data/voc_50k.txt')
vocab = [x[0] for x in vocab]
vocab_dim = len(vocab)
max_extended_vocab_dim = vocab_dim+400

def create_reader(path, is_training):
    return C.io.MinibatchSource(C.io.CTFDeserializer(path, C.io.StreamDefs(
        en_in = C.io.StreamDef(field='S0', shape=vocab_dim, is_sparse=True),
        en_in_extended = C.io.StreamDef(field='S1', shape=max_extended_vocab_dim, is_sparse=True),
        target = C.io.StreamDef(field='S2',shape=max_extended_vocab_dim, is_sparse=True)#,
        #target = C.io.StreamDef(field='S3', shape=max_extended_vocab_dim, is_sparse=True)
    )), randomize = is_training, max_sweeps = C.io.INFINITELY_REPEAT if is_training else 1)

train_reader = create_reader('data/stories_test.ctf', True)

#valid_reader = create_reader('data/stories_validation.ctf', True)

In [4]:
sentence_start =C.Constant(np.array([i==w2i['<s>'] for i in range(max_extended_vocab_dim)], dtype=np.float32))
sentence_end_index = vocab.index('</s>')

In [5]:
enAxis = C.Axis('enAxis')
deAxis = C.Axis('deAxis')

EncoderSequence = C.layers.SequenceOver[enAxis]
DecoderSequence = C.layers.SequenceOver[deAxis]

In [6]:
def create_model_train(s2smodel):
    # model used in training (history is known from labels)
    # note: the labels must NOT contain the initial <s>
    @C.Function
    def model_train(input, extended_input, de): # (input*, labels*) --> (word_logp*)

        # The input to the decoder always starts with the special label sequence start token.
        # Then, use the previous value of the label sequence (for training) or the output (for execution).
        past_labels = C.layers.Delay(initial_state=sentence_start)(de)
        return s2smodel(past_labels, input, extended_input)
    return model_train

In [7]:
def create_criterion_function(model):
    @C.Function
    @C.layers.Signature(input=EncoderSequence[C.layers.Tensor[vocab_dim]],
                        extended_input=EncoderSequence[C.layers.Tensor[max_extended_vocab_dim]],
                        #de_input=DecoderSequence[C.layers.Tensor[vocab_dim]],
                        target=DecoderSequence[C.layers.Tensor[max_extended_vocab_dim]])
    def criterion(input, extended_input, target):#, de_input, target):
        # criterion function must drop the <s> from the labels
        postprocessed_target = C.sequence.slice(target,1 ,0)
        z = model(input, extended_input, postprocessed_target)
        print(repr(z))
        ce = C.negate(C.reduce_sum(C.element_times(postprocessed_target,C.log(z[0]),name='loss'), axis=-1))
        #ce = C.sequence.reduce_sum(C.sequence.gather(z[0], postprocessed_labels))
        #ce = C.cross_entropy_with_softmax(z[0], postprocessed_target)
        #errs = C.classification_error(z[0], postprocessed_target)
        print(repr(ce))
        return ce

    return criterion

In [8]:
def train(train_reader, valid_reader, vocab, i2w, s2smodel, max_epochs, epoch_size):

    # create the training wrapper for the s2smodel, as well as the criterion function
    model_train = create_model_train(s2smodel)
    criterion = create_criterion_function(model_train)

    # also wire in a greedy decoder so that we can properly log progress on a validation example
    # This is not used for the actual training process.
    #model_greedy = create_model_greedy(s2smodel)

    # Instantiate the trainer object to drive the model training
    minibatch_size = 1
    lr = 0.001 if use_attention else 0.005
    learner = C.fsadagrad(model_train.parameters,
                          #apply the learning rate as if it is a minibatch of size 1
                          lr = C.learning_parameter_schedule_per_sample([lr]*2+[lr/2]*3+[lr/4], epoch_size),
                          momentum = C.momentum_schedule(0.9366416204111472, minibatch_size=minibatch_size),
                          gradient_clipping_threshold_per_sample=2.3,
                          gradient_clipping_with_truncation=True)
    trainer = C.Trainer(None, criterion, learner)

    # Get minibatches of sequences to train with and perform model training
    total_samples = 0
    mbs = 0
    eval_freq = 100

    # print out some useful training information
    C.logging.log_number_of_parameters(model_train) ; print()
    progress_printer = C.logging.ProgressPrinter(freq=30, tag='Training')    

    # a hack to allow us to print sparse vectors
    #sparse_to_dense = create_sparse_to_dense(input_vocab_dim)

    for epoch in range(max_epochs):
        while total_samples < (epoch+1) * epoch_size:
            # get next minibatch of training data
            mb_train = train_reader.next_minibatch(minibatch_size)
            print(mb_train[train_reader.streams.en_in].shape)
            
            # do the training
            trainer.train_minibatch({criterion.arguments[0]: mb_train[train_reader.streams.en_in], 
                                     criterion.arguments[1]: mb_train[train_reader.streams.en_in_extended],
                                     #criterion.arguments[2]: mb_train[train_reader.streams.de_in],
                                     criterion.arguments[2]: mb_train[train_reader.streams.target]})

            progress_printer.update_with_trainer(trainer, with_metric=False) # log progress

            # every N MBs evaluate on a test sequence to visually show how we're doing
            #if mbs % eval_freq == 0: 
                #mb_valid = valid_reader.next_minibatch(1)

                # run an eval on the decoder output model (i.e. don't use the groundtruth)
                #e = model_greedy(mb_valid[valid_reader.streams.features])
                #print(format_sequences(sparse_to_dense(mb_valid[valid_reader.streams.features]), i2w))
                #print("->")
                #print(format_sequences(e, i2w))

                # visualizing attention window
                #if use_attention:
                    #debug_attention(model_greedy, mb_valid[valid_reader.streams.features])

            total_samples += mb_train[train_reader.streams.en_in].num_samples
            #mbs += 1

        # log a summary of the stats for the epoch
        progress_printer.epoch_summary(with_metric=False)

    # done: save the final model
    model_path = "model_%d.cmf" % epoch
    print("Saving final model to '%s'" % model_path)
    s2smodel.save(model_path)
    print("%d epochs complete." % max_epochs)

In [9]:
def create_model():
    embed1 = C.layers.Embedding(embedding_dim, name='embed1')
    embed2 = C.layers.Embedding(embedding_dim, name='embed2')
    
    with C.layers.default_options(enable_self_stabilization=True):
        encode = C.layers.Sequential([
            embed1,
            C.layers.Stabilizer(),
            (C.layers.Recurrence(C.layers.LSTM(hidden_dim),return_full_state=True),C.layers.Recurrence(C.layers.LSTM(hidden_dim),return_full_state=True)),
        ])

    with C.layers.default_options(enable_self_stabilization=True):
        # sub-layers
        stab_in = C.layers.Stabilizer()
        stab_out = C.layers.Stabilizer()
        proj_out = C.layers.Dense(vocab_dim, name='out_proj')
        h_dense = C.layers.Dense(hidden_dim,activation=C.relu, name='en2de_h')
        c_dense = C.layers.Dense(hidden_dim,activation=C.relu, name='en2de_c')
        rec_block = C.layers.LSTM(hidden_dim)
        attention_model = C.layers.AttentionModel(attention_dim, name='attention_model')
        pgen_h_att = C.layers.Dense(1,activation=None)
        pgen_h = C.layers.Dense(1,activation=None)
        pgen_x = C.layers.Dense(1,activation=None)
        
        @C.Function
        def decode(history, input, extended_input):
            encoded_input = encode(input)
            encoded_h = C.splice(encoded_input[0],encoded_input[2])
            encoded_h = h_dense(encoded_h)
            encoded_c = C.splice(encoded_input[1][-1],encoded_input[3][-1])
            encoded_c = c_dense(encoded_c)
            x = embed2(history)
            x = stab_in(x)
            r = C.layers.RecurrenceFrom(rec_block,return_full_state=True)(encoded_h,encoded_c,x)
            h_att = attention_model(encoded_input.outputs[0], r[0])
            pgen = C.sigmoid(pgen_h_att(h_att)+pgen_h(r[0])+pgen_x(x))
            att_w = h_att.attention_weights
            tmp = C.sequence.broadcast_as(C.sequence.unpack(extended_input,0,no_mask_output=True),att_w)
            att_dist = C.reduce_sum(C.element_times(att_w,tmp,name='get_att_dist'),axis=0,name='att_dist')
            att_dist =  C.reshape(att_dist, (), 0, 1)
            voc_dist = stab_out(C.splice(r[0],h_att))
            voc_dist = proj_out(voc_dist)
            voc_dist = C.layers.Label('voc_proj_out')(voc_dist)
            voc_dist = C.pad(voc_dist, pattern=[(0,400)],mode=C.ops.CONSTANT_PAD, constant_value=0)
            extend_dist = C.element_times(pgen,voc_dist,name='p_voc_dist') + C.element_times((1-pgen),att_dist,name='p_att_dist')
            return (extend_dist,att_dist,att_w,tmp)

    return decode

In [10]:
model = create_model()
train(train_reader, train_reader, vocab, i2w, model, max_epochs=1, epoch_size=160000)

Composite(Sequence::Slice): Input('target', [#, deAxis], [50404]) -> Output('Block26298_Output_0', [#, deAxis_times_1_minus_1], [50404])
Composite(Negate): Input('target', [#, deAxis], [50404]), Input('input', [#, enAxis], [50004]), Input('extended_input', [#, enAxis], [50404]) -> Output('Negate28674_Output_0', [#, deAxis_times_1_minus_1], [1])
Composite(Sequence::Slice): Input('input', [#, enAxis], [50004]), Input('extended_input', [#, enAxis], [50404]), Input('target', [#, deAxis], [50404]) -> Output('Block28712_Output_0', [#, deAxis_times_1_minus_1], [50404])
Composite(Negate): Input('input', [#, enAxis], [50004]), Input('extended_input', [#, enAxis], [50404]), Input('target', [#, deAxis], [50404]) -> Output('Negate31094_Output_0', [#, deAxis_times_1_minus_1], [1])
Training 16634981 parameters in 36 parameter tensors.

(1, 554, 50004)


RuntimeError: CUDA failure 2: out of memory ; GPU=0 ; hostname=GCRGDW132 ; expr=cudaMalloc((void**) &deviceBufferPtr, sizeof(AllocatedElemType) * AsMultipleOf(numElements, 2))

[CALL STACK]
    > Microsoft::MSR::CNTK::CudaTimer::  Stop
    - Microsoft::MSR::CNTK::CudaTimer::  Stop (x2)
    - Microsoft::MSR::CNTK::GPUMatrix<float>::  Resize
    - Microsoft::MSR::CNTK::Matrix<float>::  Resize
    - Microsoft::MSR::CNTK::TracingGPUMemoryAllocator::  operator= (x4)
    - CNTK::Internal::  UseSparseGradientAggregationInDataParallelSGD
    - CNTK::  CreateTrainer
    - CNTK::Trainer::  TotalNumberOfUnitsSeen
    - CNTK::Trainer::  TrainMinibatch (x2)
    - PyInit__cntk_py (x2)

