In [1]:
import os
import re
import sys
import json
import random
import itertools
from copy import copy
import numpy as np
import pandas as pd
from nltk import ngrams
from pandas.io.json import json_normalize
import keras
from keras.layers import LSTM, Dense, Bidirectional, CuDNNLSTM, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Utils

In [2]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, sentences, next_words, maxlen, word_index, batch_size=32, shuffle=True):
        self.batch_size = batch_size
        self.next_words = next_words
        self.sentences = sentences
        self.shuffle = shuffle
        self.maxlen = maxlen
        self.word_index = word_index
        self.on_epoch_end()
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.sentences) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        sentences_temp = [self.sentences[k] for k in indexes]
        next_words_temp = [self.next_words[l] for l in indexes]

        # Generate data
        X, y = self.__data_generation(sentences_temp, next_words_temp)

        return X, y
        
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.sentences))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, sentences, next_words):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.zeros((len(sentences), self.maxlen, len(word_index))) # (sentences)
        y = np.zeros((len(sentences), len(word_index)), dtype=np.bool)

        # Generate data
        for i, sentence in enumerate(sentences):
            for t, word in enumerate(sentence):
                X[i, t, word_index[word]] = 1    # one hot encoding
                y[i, word_index[next_words[i]]] = 1

        return X, y

def load_json(json_path, artists=[]):
    if (os.path.isfile(json_path)):
        print("json")
        with open(json_path) as f:
            song_data = json.load(f)
            return song_data['songs']
        
    elif (os.path.isdir(json_path)):
        data = []
        json_files = []
        if (len(artists) > 0):
            for artist in artists:
                json_files = json_files + [json_file for json_file in os.listdir(json_path) if ((json_file.endswith('.json')) & (artist in json_file))]
        else:
            json_files = [json_file for json_file in os.listdir(json_path) if json_file.endswith('.json')]

        for json_file in json_files:
            path_to_json = os.path.join(json_path, json_file)
            with open(path_to_json) as f:
                song_data = json.load(f)
                data = data + song_data['songs']
        
        return data
    
    
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    
    return distribution / np.sum(distribution)


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)

def normalize_lyric(text, lower=True):
    if lower:
        text = text.lower()
    text = re.sub('\[.+\](\\n)|\[.+\](\(.*\))', '', text)
    return text 

## Load Data

In [3]:
json_path = '../data/deutsch'
artists = ['Bushido']

data = load_json(json_path, artists)
df = json_normalize(data)
lyrics = df.lyrics.map(lambda lyric: normalize_lyric(lyric))

## Preprocess Data

In [4]:
lyrics_in_words = []
for lyric in lyrics:
    lyric = lyric.replace('\n', ' \n ').lower()
    words = lyric.split(' ')
    lyrics_in_words.append(words)
    
print('Corpus length in words:', len(list(itertools.chain(*lyrics_in_words))))

Corpus length in words: 58768


In [5]:
words = set(list(itertools.chain(*lyrics_in_words)))
print('Unique words:', len(words))
word_index = dict((c, i) for i, c in enumerate(words))
index_word = dict((i, c) for i, c in enumerate(words))

Unique words: 9013


In [6]:
maxlen = 7
step = 1

sentences = []
next_words = []
for lyric in lyrics_in_words:
    for i in range(0, len(lyric) - maxlen, step): # iterates by step size
        sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
        next_words.append(lyric[i + maxlen])

## Create Datagenerators

In [7]:
sentences_train, sentences_test, next_words_train, next_words_test = train_test_split(sentences, next_words)

In [8]:
EPOCHS = 1
BATCH_SIZE = 64
DIR = '../outputs/wordbased/LSTM_Simple_WordBased_{}_E{}_BS{}_ML{}_SS{}'.format(artists[0], EPOCHS, BATCH_SIZE, maxlen, step)

if not os.path.exists(DIR):
    os.makedirs(DIR)

In [9]:
training_generator = DataGenerator(sentences_train, next_words_train, maxlen, word_index, batch_size=BATCH_SIZE)
test_generator = DataGenerator(sentences_test, next_words_test, maxlen, word_index, batch_size=BATCH_SIZE)

## Callbacks

In [10]:
tensorboard = TensorBoard(log_dir=os.path.join(DIR, 'logs'), write_images=True, write_grads=True)
modelCheckpoint_best = ModelCheckpoint(filepath=os.path.join(DIR, "model_best.h5"), save_best_only=True)
modelCheckpoint = ModelCheckpoint(filepath=os.path.join(DIR, "model.h5"), save_best_only=False)
#earlyStopping = EarlyStopping(patience=8)

## Build Model

In [11]:
model = Sequential()
model.add(CuDNNLSTM(128, input_shape=(maxlen, len(words)), return_sequences=True))
model.add(CuDNNLSTM(128))
model.add(Dropout(0.5))
model.add(Dense(len(words), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 7, 128)            4681216   
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 128)               132096    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 9013)              1162677   
Total params: 5,975,989
Trainable params: 5,975,989
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [14]:
model.fit_generator(training_generator,
    epochs=EPOCHS,
    validation_data=test_generator,
    callbacks=[tensorboard, modelCheckpoint, modelCheckpoint_best])

Epoch 1/1


<keras.callbacks.History at 0x129cca438>

In [13]:
#AUTO TRAIN
BATCH_SIZE = 64
RANGE = 15
GEN_WORD_LEN = 453
training_generator = DataGenerator(sentences_train, next_words_train, maxlen, word_index, batch_size=BATCH_SIZE)
test_generator = DataGenerator(sentences_test, next_words_test, maxlen, word_index, batch_size=BATCH_SIZE)
for it in range(1, RANGE):
    EPOCHS = 5
    DIR = '../outputs/wordbased/2layer_dropout/LSTM_2layer_dropout_WordBased_{}_E{}_BS{}_ML{}_SS{}'.format(artists[0], EPOCHS*it, BATCH_SIZE, maxlen, step)
    if not os.path.exists(DIR):
        os.makedirs(DIR)
        
    tensorboard = TensorBoard(log_dir=os.path.join(DIR, 'logs'), write_images=True, write_grads=True)
    modelCheckpoint_best = ModelCheckpoint(filepath=os.path.join(DIR, "model_best.h5"), save_best_only=True)
    modelCheckpoint = ModelCheckpoint(filepath=os.path.join(DIR, "model.h5"), save_best_only=False)

    model.fit_generator(training_generator,
        epochs=EPOCHS,
        validation_data=test_generator,
        callbacks=[tensorboard, modelCheckpoint, modelCheckpoint_best])
    
    ##########
    temperatures = [0.2, 0.4, 0.5, 0.6, 0.8, 1.]
    for temperature in temperatures:
        generated_text_temp = ["wenn", "der", "benz", "anspringt", "und", "die", "reifen"]
        generated_text = copy(generated_text_temp)
        print(" ".join(generated_text) + '_')
        #print('\n___________________\n')
        for i in range(GEN_WORD_LEN):
            sampled = np.zeros((1, maxlen, len(words)))

            for t, word in enumerate(generated_text_temp):
                sampled[0, t, word_index[word]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_word = index_word[next_index]
            generated_text_temp.append(next_word)
            generated_text.append(next_word)
            generated_text_temp = generated_text_temp[1:]
            #sys.stdout.write(" " + next_word)
            
        with open(os.path.join(DIR, '{}_temp{}_text.txt'.format(artists[0], temperature)), 'w+') as text_file:
            text_file.write(' '.join(generated_text))

ResourceExhaustedError: OOM when allocating tensor with shape[9013,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: cu_dnnlstm_1/kernel/Assign = Assign[T=DT_FLOAT, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](cu_dnnlstm_1/kernel, cu_dnnlstm_1/random_uniform)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'cu_dnnlstm_1/kernel/Assign', defined at:
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 1233, in inner
    self.run()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 370, in dispatch_queue
    yield self.process_one()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 346, in wrapper
    runner = Runner(result, future, yielded)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 1080, in __init__
    self.run()
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tornado/gen.py", line 326, in wrapper
    yielded = next(result)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-6adabb63d5f6>", line 2, in <module>
    model.add(CuDNNLSTM(128, input_shape=(maxlen, len(words)), return_sequences=True))
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/engine/sequential.py", line 165, in add
    layer(x)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/layers/recurrent.py", line 532, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/engine/base_layer.py", line 431, in __call__
    self.build(unpack_singleton(input_shapes))
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/layers/cudnn_recurrent.py", line 436, in build
    constraint=self.kernel_constraint)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/engine/base_layer.py", line 252, in add_weight
    constraint=constraint)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 402, in variable
    v = tf.Variable(value, dtype=tf.as_dtype(dtype), name=name)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 235, in __init__
    constraint=constraint)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 387, in _init_from_args
    validate_shape=validate_shape).op
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/state_ops.py", line 283, in assign
    validate_shape=validate_shape)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/gen_state_ops.py", line 60, in assign
    use_locking=use_locking, name=name)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/gruppe3B/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[9013,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: cu_dnnlstm_1/kernel/Assign = Assign[T=DT_FLOAT, use_locking=true, validate_shape=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](cu_dnnlstm_1/kernel, cu_dnnlstm_1/random_uniform)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [None]:
range(2)

## Generate Text

In [32]:
temperature = 0.5
random.seed(3004)

#lyrics_index = random.randint(0, len(lyrics))
#chosen_lyric = lyrics_in_words[lyrics_index]
#start_index = random.randint(0, len(chosen_lyric) - maxlen - 1)
#generated_text_temp = chosen_lyric[start_index: start_index + maxlen]
generated_text = ["wenn", "der", "benz", "anspringt", "und", "die", "reifen"]
generated_text_temp = copy(generated_text_temp)
print(" ".join(generated_text) + '_')
#print('\n___________________\n')
for i in range(100):
    sampled = np.zeros((1, maxlen, len(words)))
            
    for t, word in enumerate(generated_text_temp):
        sampled[0, t, word_index[word]] = 1.
                      
    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_word = index_word[next_index]
    generated_text_temp.append(next_word)
    generated_text.append(next_word)
    generated_text_temp = generated_text_temp[1:]
    sys.stdout.write(" " + next_word)

wenn der benz anspringt und die reifen_
 tag 
 ich bin ein - und du bist 
 doch es ist noch immer wie ein mann 
  
  
 kannst du die es sehen, kannst du gott haben mein mit hat 
 in bleib mehr wenn der meine nicht ist 
 mein rap ist geld und mein hund 
 die es der nicht mehr einfach aus 
 mein ohne ist mehr mit als weil es zum nicht mehr wie 
 du für mir scheiße nie mehr von der - ich kannst du nicht mit mir auf sein 
 du niemals mich einfach an der vater 


In [29]:
generated_text_temp

['auf', 'dem', '\n', 'ihr', '', '\n', 'black,']

In [30]:
generated_text

['wenn',
 'der',
 'benz',
 'anspringt',
 'und',
 'die',
 'reifen',
 'immer',
 'als',
 'ein',
 'der',
 'in',
 'deinen',
 'kein',
 'im',
 '\n',
 'bist',
 'weil',
 'ein',
 'mir',
 'wenn',
 'die',
 'du',
 'bist',
 'ich',
 'nicht',
 'dich',
 'a',
 'als',
 '\n',
 'am',
 'den',
 'mir',
 'am',
 'was',
 'ihr',
 'dass',
 'ihr',
 'einfach',
 'nicht',
 'mal',
 'und',
 'die',
 'dir',
 'nicht',
 'mehr',
 '\n',
 'ihr',
 'habt',
 'alle',
 "'ner",
 'gegen',
 'auf',
 '\n',
 'mann,',
 'gibt',
 'es',
 'keine',
 'wieder',
 'war',
 '\n',
 'ich',
 'bin',
 'immer',
 'noch',
 'der',
 'nicht',
 'in',
 'meinen',
 'cool',
 '\n',
 'so',
 'dein',
 'ihr',
 'leben',
 'war',
 '\n',
 'ich',
 'hab',
 "'ne",
 'ein',
 'mit',
 'der',
 'nie',
 'mehr',
 'nach',
 'der',
 'sein',
 '\n',
 'du',
 'hast',
 'ein',
 'es',
 'gibt',
 'seine',
 "'ne",
 '\n',
 'wir',
 'sind',
 'ein',
 'auf',
 'dem',
 '\n',
 'ihr',
 '',
 '\n',
 'black,']