In [1]:
import os
import math
import numpy as np
import random

from cntk.blocks import default_options, LSTM, Placeholder, Input
from cntk.layers import Embedding, Recurrence, Dense, BatchNormalization
from cntk.models import Sequential
from cntk.utils import ProgressPrinter, log_number_of_parameters
from cntk.io import MinibatchSource, CTFDeserializer
from cntk.io import StreamDef, StreamDefs
from cntk.io import INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk import *
from cntk.learner import sgd, adam_sgd, learning_rate_schedule
from cntk.device import set_default_device, gpu

In [2]:
# Select GPU 0 as the default device for everything that follows.
# NOTE(review): presumably this fails on a machine without a GPU — confirm and
# consider a CPU fallback before sharing the notebook.
set_default_device(gpu(0))

In [3]:
# Seed Python's `random` module
random.seed(1)

# Raw corpus and the two files generated from it further down
raw_data_path = 'data/songs.txt'
data_path = 'data/songs_processed.ctf'
dict_path = 'data/dict.ctf'

# Load the whole corpus as one lower-cased string
with open(raw_data_path, encoding='utf8') as f:
    source_text = f.read().lower()

# [old, new] pairs: typographic quotes become plain ASCII quotes and every
# newline becomes '$'
replacements = [["’", "'"],
                ['“', '"'],
                ['”', '"'],
                ['\n', '$']]

# Apply the substitutions one after another
for old, new in replacements:
    source_text = source_text.replace(old, new)

In [4]:
# Every distinct character of the corpus, in sorted order
vocabulary = sorted(set(source_text))

# [character, index] pairs and the matching character -> index lookup
chars = [[symbol, index] for index, symbol in enumerate(vocabulary)]
char_dict = dict(chars)

# Number of '|' characters in the corpus, used as the song count
nb_songs = source_text.count('|')

# Maximum number of context characters per training sequence
seq_max_length = 2

In [5]:
# Build the CTF training text, one sequence per predicted character.
# Sequence n carries up to seq_max_length context characters as 'ic' samples
# (one per line); the character to predict is attached as an 'oc' sample on the
# first line of the sequence only.
ctf_lines = []
nb_sequences = 0
for n, char in enumerate(source_text[1:]):
    # `char` is source_text[n + 1]; the context is the window ending just before it
    prev_chars = source_text[max(0, n + 1 - seq_max_length):n + 1]
    # If the window contains '|', drop everything before its first occurrence
    if '|' in prev_chars:
        prev_chars = prev_chars[prev_chars.index('|'):]
    for k, prev_char in enumerate(prev_chars):
        line = str(n) + '\t|ic ' + str(char_dict[prev_char]) + ':1'
        if k == 0:
            line += '\t|oc ' + str(char_dict[char]) + ':1'
        ctf_lines.append(line)
    # Count once per sequence id, not once per emitted line
    nb_sequences += 1

# Join once at the end instead of growing a string inside the loop
new_text = ''.join(line + '\n' for line in ctf_lines)

# Dictionary file: one character per line, ordered by its index in char_dict
dict_text = ''.join(l + '\n' for l in sorted(char_dict, key=char_dict.get))

In [6]:
# Write the generated CTF training text. The encoding is stated explicitly so
# the file does not depend on the platform default and matches the UTF-8 read
# of the raw corpus.
with open(data_path, "w", encoding='utf8') as text_file:
    text_file.write(new_text)

In [7]:
# Write the character dictionary. It holds every distinct character of the
# UTF-8 corpus, so it may contain non-ASCII characters that the platform
# default encoding cannot represent; write it as UTF-8 explicitly.
with open(dict_path, "w", encoding='utf8') as dict_file:
    dict_file.write(dict_text)

In [8]:
# One sequence per character of the corpus except the very first one
nb_sequences = len(source_text) - 1
nb_sequences

30646

In [9]:
# Number of chars in vocabulary
vocab_size = num_labels = len(char_dict)

# Model dimensions
input_dim = vocab_size
label_dim = num_labels
hidden_dim = 256

In [10]:
# Function to create model
def create_model():
    """Build the network: a forward LSTM recurrence of width hidden_dim,
    batch normalization, and a dense layer with num_labels outputs.

    All layers are constructed under default_options(initial_state=0.1).
    """
    with default_options(initial_state=0.1):
        recurrent_layer = Recurrence(LSTM(hidden_dim), go_backwards=False)
        normalization_layer = BatchNormalization()
        output_layer = Dense(num_labels)
        network = Sequential([recurrent_layer, normalization_layer, output_layer])
    return network

def create_reader(path, is_training):
    """Create a minibatch source over the CTF file at `path`.

    The file is expected to provide two sparse streams: 'ic' (vocab_size wide)
    and 'oc' (num_labels wide). When `is_training` is true the source is
    randomized and repeats indefinitely; otherwise it is read in order for a
    single full sweep.
    """
    streams = StreamDefs(
        ic=StreamDef(field='ic', shape=vocab_size, is_sparse=True),
        oc=StreamDef(field='oc', shape=num_labels, is_sparse=True))
    deserializer = CTFDeserializer(path, streams)

    if is_training:
        sweep = INFINITELY_REPEAT
    else:
        sweep = FULL_DATA_SWEEP

    return MinibatchSource(deserializer, randomize=is_training, epoch_size=sweep)

def create_criterion_function(model):
    """Return a combined function whose outputs are the softmax cross-entropy
    loss and the classification error of `model` against a label placeholder.
    """
    target = Placeholder()
    loss = cross_entropy_with_softmax(model, target)
    error_rate = classification_error(model, target)
    return combine([loss, error_rate])

def train(reader, model, max_epochs=1000, epoch_size=100000, minibatch_size=200):
    """Train `model` on minibatches drawn from `reader`.

    Args:
        reader: minibatch source exposing the `ic` and `oc` streams.
        model: the network to train.
        max_epochs: number of epochs to run.
        epoch_size: number of label samples that make up one epoch.
        minibatch_size: size requested from `reader.next_minibatch`.

    Returns:
        The (loss, metric) pair reported by the progress printer for the last
        epoch, or (None, None) when `max_epochs` is 0.
    """
    criterion = create_criterion_function(model)
    # NOTE(review): this relies on placeholders[0] being the model input and
    # placeholders[1] the labels — confirm the ordering is guaranteed.
    criterion.replace_placeholders({criterion.placeholders[0]: Input(vocab_size),
                                    criterion.placeholders[1]: Input(num_labels)})

    # Constant per-sample learning rate and constant momentum
    lr_schedule = learning_rate_schedule(0.001, UnitType.sample)
    m_schedule = momentum_schedule(0.95)

    # Optimizer and trainer (outputs[0] is the loss, outputs[1] the error metric)
    learner = adam_sgd(criterion.parameters, lr=lr_schedule, momentum=m_schedule)
    trainer = Trainer(model, criterion.outputs[0], criterion.outputs[1], learner)

    log_number_of_parameters(model)
    progress_printer = ProgressPrinter(freq=1000, tag='Training')

    # The stream binding does not change between minibatches, so build it once
    features_var = criterion.arguments[0]
    labels_var = criterion.arguments[1]
    input_map = {features_var: reader.streams.ic,
                 labels_var: reader.streams.oc}

    # Defined up front so the return below is valid even when no epoch runs
    loss, metric = None, None

    # `t` counts label samples seen so far; an epoch ends once it reaches the
    # next multiple of epoch_size
    t = 0
    for epoch in range(max_epochs):
        epoch_end = (epoch + 1) * epoch_size
        while t < epoch_end:
            data = reader.next_minibatch(minibatch_size, input_map=input_map)
            trainer.train_minibatch(data)
            t += data[labels_var].num_samples
            progress_printer.update_with_trainer(trainer, with_metric=True)
        loss, metric, actual_samples = progress_printer.epoch_summary(with_metric=True)

    return loss, metric

In [11]:
def do_train():
    """Create a fresh model, publish it as the global `model`, and train it
    on the CTF file at data_path."""
    global model
    model = create_model()
    train(create_reader(data_path, is_training=True), model)

do_train()

Training 325168 parameters in 7 parameter tensors.
Finished Epoch [1]: [Training] loss = 2.589912 * 100138, metric = 75.6% * 100138 1.216s (82320.1 samples per second)
Finished Epoch [2]: [Training] loss = 2.468011 * 99938, metric = 74.4% * 99938 1.163s (85950.1 samples per second)
Finished Epoch [3]: [Training] loss = 2.460211 * 99938, metric = 74.4% * 99938 1.155s (86545.7 samples per second)
Finished Epoch [4]: [Training] loss = 2.457896 * 100184, metric = 74.2% * 100184 1.167s (85823.7 samples per second)
Finished Epoch [5]: [Training] loss = 2.456179 * 99938, metric = 74.2% * 99938 1.150s (86879.7 samples per second)
Finished Epoch [6]: [Training] loss = 2.451255 * 99938, metric = 74.1% * 99938 1.155s (86535.2 samples per second)
Finished Epoch [7]: [Training] loss = 2.454772 * 99938, metric = 74.3% * 99938 1.163s (85962.3 samples per second)
Finished Epoch [8]: [Training] loss = 2.450090 * 100184, metric = 74.1% * 100184 1.155s (86768.2 samples per second)
Finished Epoch [9]: [Tr

KeyError: <cntk.cntk_py.StreamInformation; proxy of <Swig Object of type 'CNTK::StreamInformation *' at 0x0000000007466DB0> >

In [None]:
# Rebuild the model and a training reader at notebook scope, so the cells
# below can use them outside of train()
model = create_model()
reader = create_reader(data_path, is_training=True)

In [None]:
# Replace source_text with a different corpus, lower-cased
# (read mode is the default for open())
with open('data/sophie_elise_text.txt', encoding='utf8') as text:
    source_text = text.read().lower()

In [None]:
# Replace source_text with a synthetic corpus: the lowercase alphabet repeated
# 1000 times.
# NOTE(review): this overwrites the corpus loaded earlier; the cells that build
# char_dict and the CTF files must be re-run for it to take effect.
source_text = 'abcdefghijklmnopqrstuvwxyz' * 1000

In [None]:
# Stepwise per-sample learning rates; the last value is the final entry of
# the schedule
lr_per_sample = [0.01]*30 + [0.008]*30 + [0.006]*30 + [0.002]*30 + [0.0008]

minibatch_size = 14
epoch_size = nb_sequences

# Scale to per-minibatch rates
lr_per_minibatch = [rate * minibatch_size for rate in lr_per_sample]

# Argument order is (schedule, unit, epoch_size), the same convention as the
# learning_rate_schedule call inside train(); the unit and epoch size were
# previously passed the other way round.
lr_schedule = learning_rate_schedule(lr_per_minibatch, UnitType.minibatch, epoch_size)

In [None]:
# Notebook-scope copy of the setup done inside train(), kept so the trainer
# objects can be inspected interactively in the cells below.
criterion = create_criterion_function(model)
# NOTE(review): this relies on placeholders[0] being the model input and
# placeholders[1] the labels — confirm the ordering is guaranteed.
criterion.replace_placeholders({criterion.placeholders[0]: Input(vocab_size),
                                criterion.placeholders[1]: Input(num_labels)})

epoch_size = nb_sequences
minibatch_size = 1

# Define learning rate schedule
lr_per_sample = [0.1]*50 + [0.001]*50 + [0.0001]
lr_per_minibatch = [rate * minibatch_size for rate in lr_per_sample]
# Argument order is (schedule, unit, epoch_size), matching the call in train()
lr_schedule = learning_rate_schedule(lr_per_minibatch, UnitType.minibatch, epoch_size)

# Define momentum (the time-constant variant is kept for experimentation; the
# learner below does not use it)
momentum_as_time_constant = momentum_as_time_constant_schedule(700)

# Define optimizer; the momentum is wrapped in a schedule, as train() does,
# instead of being passed as a bare float
learner = adam_sgd(criterion.parameters, lr=lr_schedule, momentum=momentum_schedule(0.9))

# Define trainer (outputs[0] is the loss, outputs[1] the error metric)
trainer = Trainer(model, criterion.outputs[0], criterion.outputs[1], learner)

# Log the model size and set up progress reporting
log_number_of_parameters(model)
progress_printer = ProgressPrinter(freq=10000, first=10, tag='Training')

In [None]:
# Bind the criterion's two arguments to the reader's 'ic' and 'oc' streams,
# then pull a single minibatch for inspection
stream_bindings = {
    criterion.arguments[0]: reader.streams.ic,
    criterion.arguments[1]: reader.streams.oc,
}
data = reader.next_minibatch(minibatch_size, input_map=stream_bindings)

In [None]:
# Debugging aid: show the documentation of next_minibatch
help(reader.next_minibatch)

In [None]:
# Debugging aid: list the (key, value) pairs of the fetched minibatch
data.items()

In [None]:
# Debugging aid: show the documentation for the type of `data`
help(data)

In [None]:
# NOTE(review): no key is passed here; if `data` is a plain dict this call
# raises TypeError (dict.get needs at least one argument) — confirm what was
# meant to be looked up.
data.get()

In [None]:
# Debugging aid: list the attributes available on `data`
dir(data)

In [None]:
# Debugging aid: display the criterion's second argument (the one bound to
# the 'oc' stream when fetching a minibatch)
criterion.arguments[1]

In [None]:
# Debugging aid: list the attributes of the criterion's first argument (the
# one bound to the 'ic' stream when fetching a minibatch)
dir(criterion.arguments[0])