In [1]:
import os
import sys
import json
import random
import numpy as np
import pandas as pd
from nltk import ngrams
from pandas.io.json import json_normalize
import keras
from keras.layers import LSTM, Dense
from keras.models import Sequential, load_model

Using TensorFlow backend.


In [2]:
def load_json(json_path, artists=[]):
    if (os.path.isfile(json_path)):
        print("json")
        with open(json_path) as f:
            song_data = json.load(f)
            return song_data['songs']
        
    elif (os.path.isdir(json_path)):
        data = []
        json_files = []
        if (len(artists) > 0):
            for artist in artists:
                json_files = json_files + [json_file for json_file in os.listdir(json_path) if ((json_file.endswith('.json')) & (artist in json_file))]
        else:
            json_files = [json_file for json_file in os.listdir(json_path) if json_file.endswith('.json')]

        for json_file in json_files:
            path_to_json = os.path.join(json_path, json_file)
            with open(path_to_json) as f:
                song_data = json.load(f)
                data = data + song_data['songs']
        
        return data
    
    
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    
    return distribution / np.sum(distribution)


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)


In [4]:
# Parameterts
maxlen = 60  # extraxt sequences of n characters
step = 3     # sample new seq every n characters
n_grams_len = 3
json_path = '../data/deutsch'
artists = ['Bushido']

## Datapreprocessing

In [5]:
# load data
data = load_json(json_path, artists)
df = json_normalize(data)
lyrics = df.lyrics.map(lambda lyric: lyric.lower())

print('Number of Songs: {}'.format(len(df)))
print('Corpus length: {}'.format(len("".join(lyrics))))

Number of Songs: 100
Corpus length: 303654


In [6]:
sentences = []
next_chars = []
chars = []
for lyric in lyrics:
    lyric = lyric.lower()
    if n_grams_len > 1:
        for i in range(0, len(lyric) - maxlen - n_grams_len): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen: i + maxlen + n_grams_len])
        
        ngrams_iter = ngrams(lyric, n_grams_len)
        for gram in ngrams_iter:
            chars.append(''.join(list(gram)))
        chars = sorted(list(set(chars)))
    else:
        for i in range(0, len(lyric) - maxlen, step): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen])
        
print('Number of sequences:', len(sentences))

if n_grams_len < 1:
    chars = sorted(list(set(''.join(lyrics)))) # list of unique characters

print('Unique characters:', len(chars))

char_indices = dict((char, chars.index(char)) for char in chars) # maps char with index

Number of sequences: 297354
Unique characters: 9563


In [8]:
len(chars)

9563

In [9]:
next_chars[2]

'rn\n'

In [10]:
print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars))) # (sentences)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

"""for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1    # one hot encoding
    y[i, char_indices[next_chars[i]]] = 1  # one hot encoding
"""
for i, sentence in enumerate(sentences):
    for t in range(0, len(sentence) - n_grams_len):
        char = sentence[t:t+n_grams_len]
        x[i, t, char_indices[char]] = 1    # one hot encoding
    y[i, char_indices[next_chars[i]]] = 1  # one hot encoding

Vectorization...


## Build Model

In [11]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               4962304   
_________________________________________________________________
dense_1 (Dense)              (None, 9563)              1233627   
Total params: 6,195,931
Trainable params: 6,195,931
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [13]:
epochs = 60
model.fit(x, y, batch_size=128, epochs=epochs)

Epoch 1/60
  1792/297354 [..............................] - ETA: 1:48:37 - loss: 8.5619

KeyboardInterrupt: 

In [11]:
artist = artists[0]
file_name = '{}_{}epochs_{}maxlen_{}ngrams'.format(artist, epochs, maxlen, n_grams_len)

In [12]:
file_name

'Bushido_120epochs_60maxlen_0ngrams'

In [13]:
model.save('./model_{}.h5'.format(file_name))

In [14]:
#model = load_model('./model_Bushido_60epochs_30maxlen_0ngrams.h5')

In [15]:
temperature = 0.5

#start_index = random.randint(0, len(lyrics) - maxlen - 1)
#generated_text = lyrics[start_index: start_index + maxlen]
lyrics_index = random.randint(0, len(lyrics))
chosen_lyric = lyrics[lyrics_index]
start_index = random.randint(0, len(chosen_lyric) - maxlen - 1)
generated_text_temp = chosen_lyric[start_index: start_index + maxlen]
generated_text = generated_text_temp
print(generated_text)
#print('\n___________________\n')
for i in range(1500):
    sampled = np.zeros((1, maxlen, len(chars)))
            
    for t, char in enumerate(generated_text_temp):
        sampled[0, t, char_indices[char]] = 1.
                      
    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = chars[next_index]
    generated_text_temp += next_char
    generated_text += next_char
    generated_text_temp = generated_text_temp[1:]
    sys.stdout.write(next_char)

lst, oder nicht
glaub mir das hier ist kein film
du kriegst 
am botorschreiben wie ein tage, du bist du mich so viele scheiße hoffnicht
ich bin der endstern, hol'n werden geben vor dem betzutter-sohn
denn ich war bestehe deine mutter, wenn du denkst du



 wie ein boms
ich bin der gatäich mallen geben
und nicht scheißen fotzen, kart drauf wie ein schwanz
ich weiß nicht start, du bist du mit dem schwanz
ich hab den scheid connt den rap selne in den strassche umphier
streit mich mit ein rocht
ich bin der endtagen und die fresse, dass du wirst du wird nur nicht mehr gern
es tut mir jetzt noch mehr an, kannst du mir
ich kann mir mein wast, wer dieser dich aus internacht
ich kauf dir deine jungs ich mache, dass du ein hur scheiße zu tunkt
ich vor dir dealte straßen sich stängt
nein mussonne kack genau jetzt sein
denn ich war ich scheiß du willst du deine wange
viele deutschen, sie wolltest du auf dem straßen
du wirst kein besten verstehen
du bist mein erztehen filgt mich reingen
denn sie weil hab' ich dich bein gesehn
was du so getängt ich scheiß auf hier und eguteltechn dich frei
hat hier da ich ficke deine mutter warte von dem schwanz
denn ich hab man mal auf der nacht jetzt nehmen
ich lieb dich ins tag und ein folte zu sehen
ich bin der e

In [16]:
#file_name = "Bushido_60epochs_30maxlen_0ngrams"
with open(file_name + '.txt', 'w') as text_file:
    text_file.write(generated_text)