In [1]:
import os
import re
import sys
import json
import random
import itertools
import numpy as np
import pandas as pd
from nltk import ngrams
from pandas.io.json import json_normalize
import keras
from keras.layers import LSTM, Dense, Bidirectional
from keras.models import Sequential, load_model
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
def load_json(json_path, artists=[]):
    if (os.path.isfile(json_path)):
        print("json")
        with open(json_path) as f:
            song_data = json.load(f)
            return song_data['songs']
        
    elif (os.path.isdir(json_path)):
        data = []
        json_files = []
        if (len(artists) > 0):
            for artist in artists:
                json_files = json_files + [json_file for json_file in os.listdir(json_path) if ((json_file.endswith('.json')) & (artist in json_file))]
        else:
            json_files = [json_file for json_file in os.listdir(json_path) if json_file.endswith('.json')]

        for json_file in json_files:
            path_to_json = os.path.join(json_path, json_file)
            with open(path_to_json) as f:
                song_data = json.load(f)
                data = data + song_data['songs']
        
        return data
    
    
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    
    return distribution / np.sum(distribution)


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)

In [3]:
json_path = '../../data/deutsch'
artists = ['Bushido']

In [4]:
data = load_json(json_path, artists)
df = json_normalize(data)
print(len(df))
lyrics = df.lyrics.values

100


In [5]:
lyrics_in_words = []
for lyric in lyrics:
    lyric = lyric.replace('\n', ' \n ').lower()
    words = lyric.split(' ')
    lyrics_in_words.append(words)
    
print('Corpus length in words:', len(list(itertools.chain(*lyrics_in_words))))

Corpus length in words: 60292


In [6]:
words = set(list(itertools.chain(*lyrics_in_words)))
print('Unique words:', len(words))
word_index = dict((c, i) for i, c in enumerate(words))
index_word = dict((i, c) for i, c in enumerate(words))

Unique words: 9108


In [7]:
maxlen = 10  # extraxt sequences of n characters
step = 1

sentences = []
next_words = []
for lyric in lyrics_in_words:
    for i in range(0, len(lyric) - maxlen, step): # iterates by step size
        sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
        next_words.append(lyric[i + maxlen])

In [8]:
sentences_train, sentences_test, next_words_train, next_words_test = train_test_split(sentences, next_words)

In [9]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, sentences, next_words, maxlen, word_index, batch_size=32, shuffle=True):
        self.batch_size = batch_size
        self.next_words = next_words
        self.sentences = sentences
        self.shuffle = shuffle
        self.maxlen = maxlen
        self.word_index = word_index
        self.on_epoch_end()
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.sentences) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        sentences_temp = [self.sentences[k] for k in indexes]
        next_words_temp = [self.next_words[l] for l in indexes]

        # Generate data
        X, y = self.__data_generation(sentences_temp, next_words_temp)

        return X, y
        
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.sentences))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, sentences, next_words):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.zeros((len(sentences), self.maxlen, len(word_index))) # (sentences)
        y = np.zeros((len(sentences), len(word_index)), dtype=np.bool)

        # Generate data
        for i, sentence in enumerate(sentences):
            for t, word in enumerate(sentence):
                X[i, t, word_index[word]] = 1    # one hot encoding
                y[i, word_index[next_words[i]]] = 1

        return X, y

In [16]:
BATCH_SIZE = 64
EPOCHS = 1
DIR = './wordbased/WordBased_{}_E{}'.format(artists[0], EPOCHS)

if not os.path.exists(DIR):
    os.makedirs(DIR)

In [11]:
training_generator = DataGenerator(sentences_train, next_words_train, maxlen, word_index, batch_size=BATCH_SIZE)
test_generator = DataGenerator(sentences_test, next_words_test, maxlen, word_index, batch_size=BATCH_SIZE)

In [12]:
tensorboard = TensorBoard(log_dir=os.path.join(DIR, 'logs'))
modelCheckpoint = ModelCheckpoint(filepath=os.path.join(DIR, "model.h5"))
#earlyStopping = EarlyStopping(patience=8)

In [13]:
model = Sequential()
model.add(Bidirectional(LSTM(128), input_shape=(maxlen, len(words))))
model.add(Dense(len(words), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [15]:
model.fit_generator(training_generator,
    epochs=EPOCHS,
    validation_data=test_generator,
    callbacks=[tensorboard, modelCheckpoint])

Epoch 1/1


OSError: Unable to create file (unable to open file: name = './wordbased/WordBased_Bushido_E1/model.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 602)