In [None]:
# char-rnn.py
# Neural Character Language Model in CNTK2
# wdarling@microsoft.com

In [1]:
import numpy as np
import os
from cntk import Trainer, Axis
from cntk.learner import adam_sgd, momentum_sgd, momentum_as_time_constant_schedule, learning_rate_schedule, UnitType
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.persist import load_model, save_model
from cntk.blocks import LSTM, Stabilizer
from cntk.layers import Recurrence, Dense, Dropout, BatchNormalization
from cntk.utils import get_train_eval_criterion, get_train_loss
from cntk.device import set_default_device, gpu

# Uncomment the next line to run on the first GPU (only if a GPU is available)
#set_default_device(gpu(0))

# Model hyperparameters (module-level; read by name inside the training functions)
hidden_dim = 512  # size passed to every LSTM(...) layer
num_layers = 2  # number of stacked Recurrence(LSTM) + Dropout layers
minibatch_size = 100 # characters per minibatch; also how many time steps the RNN is unrolled for

# Get data
def get_data(p, minibatch_size, data, char_to_ix, vocab_dim):
    """Build one minibatch of one-hot encoded characters and their successors.

    The features are the ``minibatch_size`` characters of ``data`` starting at
    offset ``p``; the labels are the same window shifted one character to the
    right, so every label is the character that follows its feature.

    Returns:
        ``([X], [Y])`` -- two single-element lists holding float32 arrays with
        one one-hot row of width ``vocab_dim`` per character.
    """
    stop = p + minibatch_size
    feature_ids = [char_to_ix[symbol] for symbol in data[p:stop]]
    label_ids = [char_to_ix[symbol] for symbol in data[p + 1:stop + 1]]

    # Row i of the identity matrix is the one-hot vector for index i; indexing
    # with a list of ids copies the matching rows into a fresh array.
    identity = np.eye(vocab_dim, dtype=np.float32)
    return [identity[feature_ids]], [identity[label_ids]]

# Sample from the network
def sample(root, ix_to_char, vocab_dim, char_to_ix, prime_text='', use_hardmax=False, length=300, temperature=1.2):
    """Generate text from the network one character at a time.

    The first character fed to ``root`` is ``prime_text[0]`` or, when
    ``prime_text`` is empty, an index drawn uniformly at random. The rest of
    ``prime_text`` is then fed through verbatim; after that every new
    character is chosen from the network output and fed back as the next
    input.

    Args:
        root: object whose ``eval(arguments)`` is called with a
            ``([one_hot_input], [is_new_sequence])`` tuple and returns the
            output scores for that step.
        ix_to_char: mapping from character index to character.
        vocab_dim: number of distinct characters (width of the one-hot input).
        char_to_ix: mapping from character to character index.
        prime_text: optional text that seeds the generated sequence.
        use_hardmax: if true, always pick the highest-scoring character
            (argmax over axis 2 of the scores) instead of sampling.
        length: number of network evaluations, unless ``prime_text`` is
            longer, in which case one evaluation per prime character is made.
        temperature: exponent applied to the softmax probabilities before
            sampling. T > 1 makes the distribution more peaked, T < 1 makes
            it smoother, T = 1.0 leaves it unchanged.

    Returns:
        The seed and generated characters joined into one string. Because the
        first input character is included, it holds
        ``max(length, len(prime_text), 1) + 1`` characters.

    Raises:
        KeyError: if ``prime_text`` contains a character missing from
            ``char_to_ix``.
    """

    def one_hot(index):
        # Single-step network input: a (1, vocab_dim) row with a 1 at ``index``.
        encoded = np.zeros((1, vocab_dim), dtype=np.float32)
        encoded[0, index] = 1
        return encoded

    def sample_word(scores):
        if use_hardmax:
            return np.argmax(scores, axis=2)[0, 0]
        # Raising softmax(scores) to the power T and renormalizing is the same
        # distribution as softmax(T * scores). Computing it in that form, with
        # the maximum subtracted first, keeps np.exp from overflowing to
        # inf/NaN on large scores (which would make np.random.choice raise).
        scaled = temperature * np.asarray(scores, dtype=np.float64).ravel()
        scaled -= np.max(scaled)
        probs = np.exp(scaled)
        probs /= np.sum(probs)
        return np.random.choice(vocab_dim, p=probs)

    # start sequence with first input
    if prime_text != '':
        plen = len(prime_text)
        prime = char_to_ix[prime_text[0]]
    else:
        plen = 1
        prime = np.random.choice(vocab_dim)

    output = [prime]
    # Only the very first evaluation is flagged as the start of a new sequence.
    arguments = ([one_hot(prime)], [True])

    # loop through prime text; the step after its last character is sampled
    for i in range(plen):
        p = root.eval(arguments)
        if i < plen - 1:
            idx = char_to_ix[prime_text[i + 1]]
        else:
            idx = sample_word(p)
        output.append(idx)
        arguments = ([one_hot(idx)], [False])

    # loop through the remaining length, sampling along the way
    for _ in range(length - plen):
        idx = sample_word(root.eval(arguments))
        output.append(idx)
        arguments = ([one_hot(idx)], [False])

    return ''.join(ix_to_char[c] for c in output)

def load_data_and_vocab(training_file, convert_to_lower=True):
    """Read a training text, normalize it and derive its character vocabulary.

    The text is optionally lower-cased, then curly quotes and backticks are
    replaced by plain quotes and square brackets by parentheses. The sorted
    set of remaining characters is the vocabulary.

    Side effects:
        Prints the character and vocabulary counts, and (over)writes
        ``training_file + ".vocab"`` with one vocabulary character per line.
        The newline character is stored as an empty line.

    Args:
        training_file: path of the UTF-8 text file to read.
        convert_to_lower: lower-case the text before building the vocabulary
            when true.

    Returns:
        ``(data, char_to_ix, ix_to_char, data_size, vocab_size)`` -- the
        normalized text, the two index mappings, and the two counts.
    """
    path = training_file

    # load data; the context manager closes the handle even if reading fails
    with open(path, "r", encoding='utf8') as source:
        data = source.read()

    # Do some simple text prep
    if convert_to_lower:
        data = data.lower()
    replacements = (("’", "'"),
                    ('“', '"'),
                    ('”', '"'),
                    ("`", "'"),
                    ('[', '('),
                    (']', ')'))
    for old, new in replacements:
        data = data.replace(old, new)

    chars = sorted(set(data))
    data_size, vocab_size = len(data), len(chars)
    print('data has %d characters, %d unique.' % (data_size, vocab_size))
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # write vocab: one character per line, newline itself as an empty line
    with open(path + ".vocab", "w", encoding='utf8') as vocab_file:
        for c in chars:
            vocab_file.write("\n" if c == '\n' else "%s\n" % c)

    return data, char_to_ix, ix_to_char, data_size, vocab_size

# Creates and trains a character-level language model
def train_lm(training_file, model_path, nb_epochs=1):
    """Build a stacked-LSTM character model and train it on one text file.

    ``model_path`` is a ``%``-format template that receives the epoch number;
    the model is written there each time the read position wraps back to the
    start of the text. Progress is printed every 100 minibatches and a text
    sample every 1000.
    """
    # stabilizer applied to the raw one-hot input
    input_stabilizer = Stabilizer()

    # text plus the character <-> index mappings
    text, char_to_ix, ix_to_char, text_len, vocab_dim = load_data_and_vocab(training_file)

    # features and labels share the same batch and sequence axes
    dynamic_axes = [Axis.default_batch_axis(), Axis('inputAxis')]
    raw_input = input_variable(shape=(vocab_dim), dynamic_axes=dynamic_axes)
    raw_labels = input_variable(shape=(vocab_dim), dynamic_axes=dynamic_axes)

    # num_layers recurrent LSTM layers, each followed by dropout
    layer = input_stabilizer(raw_input)
    for _ in range(num_layers):
        layer = Recurrence(LSTM(hidden_dim, enable_self_stabilization=True))(layer.output)
        layer = Dropout(0.5)(layer.output)

    # dense projection of the top layer onto the vocabulary
    z = Dense(vocab_dim)(layer.output)
    print(z)

    loss = cross_entropy_with_softmax(z, raw_labels)
    error_rate = classification_error(z, raw_labels)

    # learner and trainer that drive the optimization
    learning_rate = learning_rate_schedule(0.001, UnitType.sample)
    momentum = momentum_as_time_constant_schedule(1100)
    learner = adam_sgd(z.parameters, learning_rate, momentum,
                       gradient_clipping_threshold_per_sample=5.0,
                       gradient_clipping_with_truncation=True)
    trainer = Trainer(z, loss, error_rate, learner)

    report_every = 100
    sample_every = 1000
    minibatches_per_epoch = int(text_len / minibatch_size)
    total_minibatches = nb_epochs * minibatches_per_epoch

    epoch = 0
    offset = 0
    for step in range(total_minibatches):

        # Too little text left for a full window plus its shifted labels:
        # wrap around, count an epoch and checkpoint the model.
        if offset + minibatch_size + 1 >= text_len:
            offset = 0
            epoch += 1
            checkpoint = model_path % epoch
            save_model(z, checkpoint)
            print("Saved model to '%s'" % checkpoint)

        features, labels = get_data(offset, minibatch_size, text, char_to_ix, vocab_dim)

        # The flag list is [True] only at offset 0, i.e. when a new pass over
        # the text (a new sequence) begins.
        arguments = ({raw_input: features, raw_labels: labels}, [offset == 0])
        trainer.train_minibatch(arguments)

        if step % report_every == 0:
            print("Minibatch: {}, Train Loss: {}, Train Evaluation Criterion: {}".format(
                step, get_train_loss(trainer), get_train_eval_criterion(trainer)))
            fraction_done = float(step) / float(minibatches_per_epoch) - epoch
            print("Epoch %d, %f %% done" % (epoch, fraction_done * 100.0))

        if step % sample_every == 0:
            print(sample(z, ix_to_char, vocab_dim, char_to_ix))

        offset += minibatch_size

# Creates one character-level language model and trains it on two text files (multitask)
def train_multitask_lm(training_file, training_file_second, model_path, model_path_second, 
                       nb_epochs=1, nb_epochs_second=1, alternate=True):
    """Train one character-level LSTM language model on two text files.

    The network (input stabilizer, ``num_layers`` LSTM + dropout layers and a
    dense output layer) is sized from the vocabulary of ``training_file``.

    Two schedules are available:

    * ``alternate`` false: ``nb_epochs`` passes over ``training_file``,
      followed by ``nb_epochs_second`` passes over ``training_file_second``.
      A checkpoint is written every time the read position wraps around.
    * ``alternate`` true: ``nb_epochs + nb_epochs_second`` single passes.
      Pass ``k`` uses ``training_file`` when ``k % 4 == 0`` and
      ``training_file_second`` otherwise; a checkpoint is written after
      every pass.

    Args:
        training_file: path of the first text file; its vocabulary defines
            the model.
        training_file_second: path of the second text file.
        model_path: ``%``-format template (one integer) for checkpoints
            written while training on the first file.
        model_path_second: same, for the second file.
        nb_epochs: number of passes for the first file (see above).
        nb_epochs_second: number of passes for the second file (see above).
        alternate: selects the schedule.

    NOTE(review): the second file is always encoded with the first file's
    ``char_to_ix``, so a character that occurs only in the second file raises
    ``KeyError`` in ``get_data``. ``load_data_and_vocab`` also writes
    ``<training_file_second>.vocab`` from the second file's own characters,
    which matches the model's indices only if both files contain the same
    character set -- confirm this holds for the data in use.

    NOTE(review): in the alternating schedule the 1-in-4 split is hard-coded;
    only the sum of ``nb_epochs`` and ``nb_epochs_second`` is used.
    """
    # create the stabilizer function from blocks
    stabilize = Stabilizer()

    # load the data and vocab
    data, char_to_ix, ix_to_char, data_size, vocab_dim = load_data_and_vocab(training_file)

    # Source and target inputs to the model
    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(shape=(vocab_dim), dynamic_axes=input_dynamic_axes)
    raw_labels = input_variable(shape=(vocab_dim), dynamic_axes=input_dynamic_axes)

    # LSTM stack
    encoder_output = stabilize(raw_input)
    for _ in range(num_layers):
        encoder_output = Recurrence(LSTM(hidden_dim, enable_self_stabilization=True)) (encoder_output.output)
        encoder_output = Dropout(0.5) (encoder_output.output)

    # dense layer on top of the LSTM output
    z = Dense(vocab_dim) (encoder_output.output)

    ce = cross_entropy_with_softmax(z, raw_labels)
    errs = classification_error(z, raw_labels)

    # Instantiate the trainer object to drive the model training
    lr_per_sample = learning_rate_schedule(0.001, UnitType.sample)
    momentum_time_constant = momentum_as_time_constant_schedule(1100)
    learner = adam_sgd(z.parameters, lr_per_sample, momentum_time_constant, 
                       gradient_clipping_threshold_per_sample=5.0,
                       gradient_clipping_with_truncation=True)
    trainer = Trainer(z, ce, errs, learner)

    def run_sequential_pass(text, text_size, epochs, path_template, prime):
        # Train `epochs` passes over `text`, checkpointing to `path_template`
        # whenever the read position wraps back to the start.
        training_progress_output_freq = 100
        sample_freq = 1000
        minibatches_per_epoch = int(text_size / minibatch_size)
        minibatches = epochs * minibatches_per_epoch

        e = 0
        p = 0
        for i in range(minibatches):

            if p + minibatch_size + 1 >= text_size:
                p = 0
                e += 1
                model_filename = path_template % e
                save_model(z, model_filename)
                print("Saved model to '%s'" % model_filename)

            # get the data
            features, labels = get_data(p, minibatch_size, text, char_to_ix, vocab_dim)

            # The flag is True only at the start of the data, marking a new sequence.
            arguments = ({raw_input : features, raw_labels : labels}, [p == 0])
            trainer.train_minibatch(arguments)

            if i % training_progress_output_freq == 0:
                print("Minibatch: {}, Train Loss: {}, Train Evaluation Criterion: {}".format(i,
                          get_train_loss(trainer), get_train_eval_criterion(trainer)))
                print("Epoch %d, %f %% done" % (e, ((float(i) / float(minibatches_per_epoch)) - e) * 100.0))

            if i % sample_freq == 0:
                print(sample(z, ix_to_char, vocab_dim, char_to_ix, prime_text=prime))

            p += minibatch_size

    if not alternate:

        run_sequential_pass(data, data_size, nb_epochs, model_path, '§')

        # Only the text and its size are taken from the second file; the
        # model keeps the vocabulary of the first file.
        data, char_to_ix_, ix_to_char_, data_size, vocab_dim_ = load_data_and_vocab(training_file_second)

        run_sequential_pass(data, data_size, nb_epochs_second, model_path_second, '|')

    else:

        epochs = nb_epochs + nb_epochs_second

        for k in range(epochs):

            if k % 4 == 0:
                print('Training on ' + training_file + ' ...')
                data, char_to_ix, ix_to_char, data_size, vocab_dim = load_data_and_vocab(training_file)
                modelpath = model_path
                primetext = '§'
                minibatches_per_epoch = int((data_size / minibatch_size))
                # Clamp to 1: a corpus with fewer than 20 (or 5) minibatches
                # would otherwise give 0 and make `i % freq` divide by zero.
                training_progress_output_freq = max(1, int((minibatches_per_epoch / 20)))
                sample_freq = max(1, int((minibatches_per_epoch / 5)))
                # NOTE(review): the new-sequence flag is raised only for
                # passes over the first file; passes over the second file
                # continue with whatever state is carried over -- confirm
                # that this is intended.
                new_seq = True
            else:
                print('Training on ' + training_file_second + ' ...')
                data, char_to_ix_, ix_to_char_, data_size, vocab_dim_ = load_data_and_vocab(training_file_second)
                modelpath = model_path_second
                primetext = '|'
                minibatches_per_epoch = int((data_size / minibatch_size))
                # Same clamp as above, for corpora under 10 (or 3) minibatches.
                training_progress_output_freq = max(1, int((minibatches_per_epoch / 10)))
                sample_freq = max(1, int((minibatches_per_epoch / 3)))

            e = 0
            p = 0
            for i in range(minibatches_per_epoch):

                # Wrap around when too little text is left; the checkpoint for
                # this schedule is written once per pass, after the loop.
                if p + minibatch_size + 1 >= data_size:
                    p = 0
                    e += 1

                # get the data
                features, labels = get_data(p, minibatch_size, data, char_to_ix, vocab_dim)

                # Flag the first minibatch after new_seq was raised as a new sequence.
                mask = [new_seq]
                new_seq = False
                arguments = ({raw_input : features, raw_labels : labels}, mask)
                trainer.train_minibatch(arguments)

                if i % training_progress_output_freq == 0:
                    print("Minibatch: {}, Train Loss: {}, Train Evaluation Criterion: {}".format(i,
                              get_train_loss(trainer), get_train_eval_criterion(trainer)))
                    print("Epoch %d, %f %% done" % (k, ((float(i) / float(minibatches_per_epoch)) - e) * 100.0))

                if i % sample_freq == 0:
                    print(sample(z, ix_to_char, vocab_dim, char_to_ix, prime_text=primetext, temperature=1.2))

                p += minibatch_size

            model_filename = modelpath % k
            save_model(z, model_filename)
            print("Saved model to '%s'" % model_filename)
        
def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=False, length=1000, temperature=1.2):
    """Load a saved model and a vocabulary file, then print a generated sample.

    Args:
        model_filename: path handed to ``load_model``.
        vocab_filename: UTF-8 text file with one vocabulary character per
            line, in index order.
        prime_text, use_hardmax, length, temperature: forwarded unchanged to
            ``sample``.

    Returns:
        None; the generated text is printed.
    """
    # load the model
    model = load_model(model_filename)

    # load the vocab: the first character of every line is the entry, so a
    # line that holds only a newline stands for the newline character itself.
    # The context manager closes the file as soon as it has been read.
    with open(vocab_filename, encoding='utf8') as vocab_file:
        chars = [line[0] for line in vocab_file]
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    output = sample(model, ix_to_char, len(chars), char_to_ix, prime_text=prime_text,
                    use_hardmax=use_hardmax, length=length, temperature=temperature)

    print(output)

In [2]:
# Train one shared model on the stories and songs corpora using the alternating
# schedule (alternate=True): 20 + 80 = 100 passes in total, with a checkpoint
# saved after every pass using the matching path template.
train_multitask_lm("data/stories.txt", "data/songs.txt", 
                   "models/deepjingling-storyteller5_epoch%d.dnn", 
                   "models/deepjingling-songwriter5_epoch%d.dnn", 
                   nb_epochs=20, nb_epochs_second=80, alternate=True)

data has 338901 characters, 54 unique.
Training on data/stories.txt ...
data has 338901 characters, 54 unique.
Minibatch: 0, Train Loss: 3.989766845703125, Train Evaluation Criterion: 0.99
Epoch 0, 0.000000 % done
§$,0:r7?49311 4k?'h54§8oeyse(,s§§öt|tpq|j'!v‐vhoygu|45
$wn'?ol-x.‐z
k
sx1e7"$gz"(5b'3kcusx,g56§9ihb9v.me83lmy,u0,2ko,s319- cga1"yx2wzn)zo0fkhu?g'|2gd?23y4cy1opv93pö ,3dg!14|2‐noecmy('epm?. g?h$hs§r8t4k6gzpy0l‐64ö8,öng-?u7!cy(k:‐öudu8c:w§qq"6;w|hd1m67?uwzdsc8x"2jg
djöw6,m8 r8!1z.hxvhmhf7kp,g$5y.it5||f
Minibatch: 169, Train Loss: 2.997497863769531, Train Evaluation Criterion: 0.81
Epoch 0, 4.986722 % done
Minibatch: 338, Train Loss: 2.740640869140625, Train Evaluation Criterion: 0.75
Epoch 0, 9.973443 % done
Minibatch: 507, Train Loss: 2.3457086181640623, Train Evaluation Criterion: 0.66
Epoch 0, 14.960165 % done
Minibatch: 676, Train Loss: 2.1005097961425783, Train Evaluation Criterion: 0.65
Epoch 0, 19.946887 % done
§y and mer. and soyi gom, the bome wind, conggend one houl g

KeyboardInterrupt: 

In [None]:
# Prime the sampler with a repeated word and print text generated by a saved
# songwriter checkpoint, using the vocabulary file written during training.
# NOTE(review): the epoch-97 checkpoint is only written by a training run that
# reaches pass 97; the run recorded above was interrupted -- confirm the file
# exists before executing this cell.
text = "scrooge scrooge scrooge scrooge scrooge".lower()
load_and_sample("models/deepjingling-songwriter5_epoch97.dnn", "data/songs.txt.vocab", 
                prime_text=text, use_hardmax=False, length=300, temperature=1.0)