In [1]:
import os
import re
import sys
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import ngrams
from pandas.io.json import json_normalize
import keras
from keras.layers import LSTM, Dense, CuDNNLSTM, Dropout
from keras.models import Sequential, load_model
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split



Using TensorFlow backend.


## Utils

In [2]:
def load_json(json_path, artists=[]):
    if (os.path.isfile(json_path)):
        print("json")
        with open(json_path) as f:
            song_data = json.load(f)
            return song_data['songs']
        
    elif (os.path.isdir(json_path)):
        data = []
        json_files = []
        if (len(artists) > 0):
            for artist in artists:
                json_files = json_files + [json_file for json_file in os.listdir(json_path) if ((json_file.endswith('.json')) & (artist in json_file))]
        else:
            json_files = [json_file for json_file in os.listdir(json_path) if json_file.endswith('.json')]

        for json_file in json_files:
            path_to_json = os.path.join(json_path, json_file)
            with open(path_to_json) as f:
                song_data = json.load(f)
                data = data + song_data['songs']
        
        return data
    
    
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    
    return distribution / np.sum(distribution)


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)

def normalize_lyric(text, lower=True):
    if lower:
        text = text.lower()
    text = re.sub('\[.+\](\\n)|\[.+\](\(.*\))', '', text)
    return text 


def plotTraining(model, epochs):
        """Plot graphs"""
        # Load the training statistics (model.history)
        # Plot training loss and accuracy
        
        
        # YOUR CODE HERE
        epochs = range(1, epochs + 1)
        loss_values = model.history.history['loss']
        val_loss_values = model.history.history['val_loss']        
        
        plt.plot(epochs, loss_values, 'bo', label='Training loss')
        plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()
        plt.clf()

        # Plot validation loss and accuracy
        # YOUR CODE HERE
        acc_values = model.history.history['acc']
        val_acc_values = model.history.history['val_acc']
        
        plt.plot(epochs, acc_values, 'bo', label='Training acc')
        plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.show()

In [3]:
# Parameterts
maxlen = 45  # extraxt sequences of n characters
step = 3    # sample new seq every n characters
n_grams_len = 0
json_path = '../data/deutsch'
artists = ['Bushido']

## Datapreprocessing

In [4]:
# load data
data = load_json(json_path, artists)
df = json_normalize(data)
lyrics = df.lyrics.map(lambda lyric: normalize_lyric(lyric))

print('Number of Songs: {}'.format(len(df)))
print('Corpus length: {}'.format(len("".join(lyrics))))

Number of Songs: 100
Corpus length: 297389


In [5]:
sentences = []
next_chars = []
chars = []
for lyric in lyrics:
    lyric = lyric.lower()
    if n_grams_len > 1:
        for i in range(0, len(lyric) - maxlen - n_grams_len): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen: i + maxlen + n_grams_len])
        
        ngrams_iter = ngrams(lyric, n_grams_len)
        for gram in ngrams_iter:
            chars.append(''.join(list(gram)))
        chars = sorted(list(set(chars)))
    else:
        for i in range(0, len(lyric) - maxlen, step): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen])
        
print('Number of sequences:', len(sentences))

if n_grams_len < 1:
    chars = sorted(list(set(''.join(lyrics)))) # list of unique characters

print('Unique characters:', len(chars))

char_indices = dict((char, chars.index(char)) for char in chars) # maps char with index

Number of sequences: 97666
Unique characters: 76


In [6]:
print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars))) # (sentences)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1    # one hot encoding
    y[i, char_indices[next_chars[i]]] = 1  # one hot encoding
    
#for i, sentence in enumerate(sentences):
#    for t in range(0, len(sentence) - n_grams_len):
#        char = sentence[t:t+n_grams_len]
#        x[i, t, char_indices[char]] = 1    # one hot encoding
#    y[i, char_indices[next_chars[i]]] = 1  # one hot encodin


Vectorization...


## Split Dataset

In [7]:
sentences_train, sentences_test, next_chars_train, next_chars_test = train_test_split(x, y)

## Build Model

In [9]:
model = Sequential()
model.add(CuDNNLSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
#model.add(Dropout(0.5))
model.add(CuDNNLSTM(128))
model.add(Dropout(0.5))
model.add(Dense(len(chars), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_3 (CuDNNLSTM)     (None, 45, 128)           105472    
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 128)               132096    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 76)                9804      
Total params: 247,372
Trainable params: 247,372
Non-trainable params: 0
_________________________________________________________________


In [10]:
EPOCHS = 1
BATCH_SIZE = 64

DIR = '../outputs/charbased/2layer/LSTM_2Layer_CharBased_{}_E{}_BS{}_ML{}_SS{}'.format(artists[0], EPOCHS, BATCH_SIZE, maxlen, step)

if not os.path.exists(DIR):
    os.makedirs(DIR)

## Callbacks

In [38]:
tensorboard = TensorBoard(log_dir=os.path.join(DIR, 'logs'), write_images=True, write_grads=True)
modelCheckpoint_best = ModelCheckpoint(filepath=os.path.join(DIR, "model_best.h5"), save_best_only=True)
modelCheckpoint = ModelCheckpoint(filepath=os.path.join(DIR, "model.h5"), save_best_only=False)

## Train Model

In [39]:
model.fit(sentences_train, next_chars_train, 
          batch_size=BATCH_SIZE, 
          epochs=EPOCHS,
          validation_data=(sentences_test, next_chars_test),
          callbacks=[tensorboard, modelCheckpoint, modelCheckpoint_best])

Train on 219666 samples, validate on 73223 samples
Epoch 1/1


<keras.callbacks.History at 0x12a415860>

In [None]:
##############

In [None]:
for it in range(1, 10):
    EPOCHS = 5
    BATCH_SIZE = 64
    GEN_CHAR_LEN = 2973

    DIR = '../outputs/charbased/2layer_dropout_lr0.001/LSTM_2Layer_lr0.001_CharBased_{}_E{}_BS{}_ML{}_SS{}'.format(artists[0], EPOCHS*it, BATCH_SIZE, maxlen, step)
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    
    tensorboard = TensorBoard(log_dir=os.path.join(DIR, 'logs'), write_images=True, write_grads=True)
    modelCheckpoint_best = ModelCheckpoint(filepath=os.path.join(DIR, "model_best.h5"), save_best_only=True)
    modelCheckpoint = ModelCheckpoint(filepath=os.path.join(DIR, "model.h5"), save_best_only=False)
        
    model.fit(sentences_train, next_chars_train, 
          batch_size=BATCH_SIZE, 
          epochs=EPOCHS,
          validation_data=(sentences_test, next_chars_test),
          callbacks=[tensorboard, modelCheckpoint, modelCheckpoint_best])
    
    temperatures = [0.2, 0.4, 0.5, 0.6, 0.8, 1.]
    for temperature in temperatures:
        generated_text_temp = "wenn der benz anspringt und die reifen wieder"
        generated_text = generated_text_temp
        print(generated_text + '_')
        #print('\n___________________\n')
        for i in range(GEN_CHAR_LEN):
            sampled = np.zeros((1, maxlen, len(chars)))

            for t, char in enumerate(generated_text_temp):
                sampled[0, t, char_indices[char]] = 1.

            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]
            generated_text_temp += next_char
            generated_text += next_char
            generated_text_temp = generated_text_temp[1:]
            #sys.stdout.write(next_char)
        
        with open(os.path.join(DIR, '{}_temp{}_text.txt'.format(artists[0], temperature)), 'w+') as text_file:
            text_file.write(generated_text)
            
            

Train on 73249 samples, validate on 24417 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
wenn der benz anspringt und die reifen wieder_
wenn der benz anspringt und die reifen wieder_


## Generate Text

In [40]:
temperature = 0.5

#start_index = random.randint(0, len(lyrics) - maxlen - 1)
#generated_text = lyrics[start_index: start_index + maxlen]
#random.seed(3004)
#lyrics_index = random.randint(0, len(lyrics))
#chosen_lyric = lyrics[lyrics_index]
#start_index = random.randint(0, len(chosen_lyric) - maxlen - 1)
#generated_text_temp = chosen_lyric[start_index: start_index + maxlen]
generated_text_temp = "wenn der benz anspringt und die reifen wieder"
generated_text = generated_text_temp
print(generated_text + '_')
#print('\n___________________\n')
for i in range(1500):
    sampled = np.zeros((1, maxlen, len(chars)))
            
    for t, char in enumerate(generated_text_temp):
        sampled[0, t, char_indices[char]] = 1.
                      
    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = chars[next_index]
    generated_text_temp += next_char
    generated_text += next_char
    generated_text_temp = generated_text_temp[1:]
    sys.stdout.write(next_char)

wenn der benz anspringt und die reifen wieder_
 herz
und der ich hab' mich schein freunde wie deinen teil die mama sieht
ich bin der himmel mir auf deine mutter peine musste
hängt man kuter keine streit, dass die schickt
scheiß fallende resten treiber
ich bin ein platz, der bruder die ganze scheine

das ist mein hunde hat der aller, der für mich
ich bin euer die ganze redienste reinen
dass die schlasse berliner hat ihr will mich aus
die scheine welter sich wie ein arsch
das ist die schlicht auf die schwarz es ein erdesenter
habe meine scheine ersticht haben, dass ich will
wir die schickte mich sagt mal hier jetzt bist
war die kingt schlieft und das ist schwarzgelden und im schwanz sein
ich bin deine tränen scheine nicht mehr eine erzähle
der verschwinne ist meine straße und das bist
du warst nur bin es leiden hat die schralen zu lebt
ich bin der leben, war die wesst du deine schweine sohne ein sohne schein, dass ich sein in dem fallen
fick mich nur in der haufen bist du verliehst
habe 

In [44]:
generated_text

"wenn der benz anspringt und die reifen wieder sein\nich kann den kommen in den schaffen für dich\nich bin der schwuchel dieser andere schleiße\ndenn wenn der heiman, ich weiß du wieder scheiße\ndenn wenn der heiman, ich war die schreie dich\ndenn wir kommen, wie du den schwule schlafen ist schleiß auf den schutz\nes ist so wie gern kann ich weiß sein ich wie ein grauen schlafe\ndenn ich keine feinde in den schatz, ich schaff\nich weiß du keine feinde auf dem schwanzen geschicht\ndenn wenn der hart nicht schon mit den kommt schleinte\nich schaffe dich den schwule sein schleiß stadt\ndenn ich weiß du keine schleinen hart sehen\ndenn wir kommen, wie das gesehen, dass du ein freunde\ndenn ich weiß sein erwegen kommt der stadt\nich hab' die schwanz, wie du den schwanzen schickt\ndas ist der kommen geld macht den schlechten an die schatz, was ich den schwarzen auf dem komm\nich hab die schwanzen auf der wollen schule\nes ist euch scheiß den aller schickt\nich bin der stadt an die schwuchte 

In [41]:
#file_name = "Bushido_60epochs_30maxlen_0ngrams"
with open(os.path.join(DIR, 'temp{}_text.txt'.format(temperature))) as text_file:
    text_file.write(generated_text)

FileNotFoundError: [Errno 2] No such file or directory: '../outputs/charbased/LSTM_Simple_CharBased_Bushido_E1_BS64_ML45_SS1/temp0.5_text.txt'