In [None]:
# Import required modules
import os
import math
import numpy as np
import random

from cntk.blocks import default_options, LSTM, Placeholder, Input
from cntk.layers import Embedding, Recurrence, Dense, BatchNormalization
from cntk.models import Sequential
from cntk.utils import ProgressPrinter, log_number_of_parameters
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk import *
from cntk.learner import sgd, adam_sgd, learning_rate_schedule
from cntk.device import set_default_device, gpu

In [None]:
# Make GPU 0 the default CNTK device - only run this cell if a GPU is available
default_gpu = gpu(0)
set_default_device(default_gpu)

In [None]:
# Seed Python's built-in `random` module.
# NOTE(review): it is unverified whether this is enough to make training reproducible.
random.seed(1)

# Locations of the raw lyrics and of the files generated from them
raw_data_path = 'data/songs.txt'
data_path = 'data/songs_processed.ctf'
dict_path = 'data/dict.ctf'

# Load the whole text file as lower-case text
with open(raw_data_path, encoding='utf8') as f:
    source_text = f.read().lower()

# Normalise typographic quotes to their ASCII forms and encode line breaks as '$'
replacements = [["’", "'"],
                ['“', '"'],
                ['”', '"'],
                ['\n', '$']]
for old_char, new_char in replacements:
    source_text = source_text.replace(old_char, new_char)

In [None]:
# Display the number of characters in the preprocessed source text (the cell output) -
# it is quite small for an RNN!
len(source_text)

In [None]:
# Build the vocabulary: every distinct character gets its rank in sorted order as its index
vocabulary = sorted(set(source_text))
chars = [[symbol, index] for index, symbol in enumerate(vocabulary)]
char_dict = dict(chars)

# Count the '|' markers (the beginning and end of songs are marked with '|')
nb_songs = source_text.count('|')

In [None]:
# Set max length of sequences - 10 should be enough for the network to learn how to spell words
seq_max_length = 10

# Convert the source text to CNTK text format (CTF). Every character after the first
# becomes the label of one sequence with id n; the sequence's inputs are the (at most
# seq_max_length) characters that precede it. Each input character is one line, and the
# label ('oc') is attached to the first line of its sequence only.
ctf_lines = []
for n, char in enumerate(source_text[1:]):
    prev_chars = source_text[max(0, n + 1 - seq_max_length):n + 1]
    # If the window contains a song marker, start the context at the first '|' in it
    if '|' in prev_chars:
        prev_chars = prev_chars[prev_chars.index('|'):]
    for k, prev_char in enumerate(prev_chars):
        line = str(n) + '\t|ic ' + str(char_dict[prev_char]) + ':1'
        if k == 0:
            line += '\t|oc ' + str(char_dict[char]) + ':1'
        ctf_lines.append(line + '\n')

# Join once instead of growing a string inside the loop (which is quadratic)
new_text = ''.join(ctf_lines)

# Write string to file (explicit UTF-8 so the result does not depend on the platform default)
with open(data_path, "w", encoding='utf8') as text_file:
    text_file.write(new_text)

# Create dictionary string: one character per line, ordered by its index
dict_text = ''.join(l + '\n' for l in sorted(char_dict, key=char_dict.get))

# Write dictionary to file. The vocabulary may contain non-ASCII characters, so the
# encoding must match the UTF-8 encoding used to read the source text.
with open(dict_path, "w", encoding='utf8') as dict_file:
    dict_file.write(dict_text)

# Get number of sequences: one per character after the first (0 for an empty text)
nb_sequences = max(len(source_text) - 1, 0)

# Number of chars in vocabulary
vocab_size = num_labels = len(char_dict)

In [None]:
# Model dimensions: input and label sizes follow the vocabulary, the LSTM width is fixed
hidden_dim = 256
input_dim, label_dim = vocab_size, num_labels

# Function to create model
def create_model():
    """Build the network: a forward LSTM recurrence followed by a dense output layer.

    The LSTM has ``hidden_dim`` units and the dense layer has ``num_labels`` outputs;
    both are created under ``default_options(initial_state=0.1)``.
    """
    with default_options(initial_state=0.1):
        recurrent_layer = Recurrence(LSTM(hidden_dim), go_backwards=False)
        # Batch normalization seems to help stabilize the initial learning, but doesn't
        # work on CPU at the moment, so no BatchNormalization() layer is inserted here.
        output_layer = Dense(num_labels)
        return Sequential([recurrent_layer, output_layer])

def create_reader(path, is_training):
    """Create a minibatch source over the CTF file at ``path``.

    The file is read as two sparse streams: 'ic' with ``vocab_size`` dimensions and
    'oc' with ``num_labels`` dimensions. When ``is_training`` is true the source is
    randomized and repeats indefinitely; otherwise it is not randomized and covers
    one full data sweep.
    """
    streams = StreamDefs(
        ic=StreamDef(field='ic', shape=vocab_size, is_sparse=True),
        oc=StreamDef(field='oc', shape=num_labels, is_sparse=True),
    )
    deserializer = CTFDeserializer(path, streams)
    if is_training:
        sweep = INFINITELY_REPEAT
    else:
        sweep = FULL_DATA_SWEEP
    return MinibatchSource(deserializer, randomize=is_training, epoch_size=sweep)

def create_criterion_function(model):
    """Return the combined training criterion for ``model``.

    The result combines two functions of ``model`` and a fresh label placeholder,
    in this order: cross entropy with softmax, then classification error.
    """
    target = Placeholder()
    criteria = [
        cross_entropy_with_softmax(model, target),
        classification_error(model, target),
    ]
    return combine(criteria)

def train(reader, model, max_epochs=1000, epoch_size=100000, minibatch_size=100,
          learning_rate=0.001, momentum=0.95):
    """Train ``model`` with Adam on minibatches drawn from ``reader``.

    Args:
        reader: minibatch source providing ``streams.ic``, ``streams.oc`` and
            ``next_minibatch`` (see ``create_reader``).
        model: the network to optimise.
        max_epochs: number of epochs to run.
        epoch_size: number of label samples after which an epoch summary is printed.
            Usually one pass over the data set, but CNTK does not depend on that.
        minibatch_size: size requested from ``reader.next_minibatch`` (CNTK's API
            names this argument ``minibatch_size_in_samples``).
        learning_rate: flat per-sample learning rate. 0.001 usually works well for
            Adam, which adapts the rate per parameter.
        momentum: momentum value passed to ``momentum_schedule``.

    Returns:
        ``(loss, metric)`` from the last epoch summary, or ``(None, None)`` if no
        epoch was run (``max_epochs <= 0``).
    """
    criterion = create_criterion_function(model)
    # NOTE(review): assumes placeholders[0] is the model input and placeholders[1]
    # the labels - confirm the ordering CNTK reports.
    criterion.replace_placeholders({criterion.placeholders[0]: Input(vocab_size),
                                    criterion.placeholders[1]: Input(num_labels)})

    # Flat learning rate. A stepwise alternative would be e.g.
    # learning_rate_schedule([(15, 0.1), (15, 0.01), (15, 0.001), (1, 0.0001)],
    #                        UnitType.sample, epoch_size)
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)

    # Momentum schedule (momentum_as_time_constant_schedule(700) is an alternative)
    m_schedule = momentum_schedule(momentum)

    # Define optimizer (plain SGD would be: sgd(criterion.parameters, lr=lr_schedule))
    learner = adam_sgd(criterion.parameters, lr=lr_schedule, momentum=m_schedule)

    # Define trainer: outputs[0] is the loss, outputs[1] the evaluation metric
    trainer = Trainer(model, criterion.outputs[0], criterion.outputs[1], learner)

    log_number_of_parameters(model)
    progress_printer = ProgressPrinter(freq=1000, tag='Training')

    # The stream-to-argument mapping never changes, so build it once
    label_argument = criterion.arguments[1]
    input_map = {
        criterion.arguments[0]: reader.streams.ic,
        label_argument: reader.streams.oc,
    }

    # Defined up front so the return below is valid even when no epoch runs
    loss = metric = None

    # t counts the label samples processed so far, across all epochs
    t = 0
    for epoch in range(max_epochs):
        epoch_end = (epoch + 1) * epoch_size
        while t < epoch_end:
            data = reader.next_minibatch(minibatch_size, input_map=input_map)
            trainer.train_minibatch(data)
            t += data[label_argument].num_samples
            progress_printer.update_with_trainer(trainer, with_metric=True)
        loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)

    return loss, metric

In [None]:
def do_train():
    """Create the model and a training reader over ``data_path``, then train.

    The model is stored in the module-level ``model`` variable so that other cells
    can use it after training.
    """
    global model
    model = create_model()
    train(create_reader(data_path, is_training=True), model)


do_train()