In [1]:
import os
import sys
import json
import random
import numpy as np
import pandas as pd
from nltk import ngrams
from pandas.io.json import json_normalize
import keras
from keras.layers import LSTM, Dense
from keras.models import Sequential, load_model

Using TensorFlow backend.


In [2]:
def load_json(json_path, artists=[]):
    if (os.path.isfile(json_path)):
        print("json")
        with open(json_path) as f:
            song_data = json.load(f)
            return song_data['songs']
        
    elif (os.path.isdir(json_path)):
        data = []
        json_files = []
        if (len(artists) > 0):
            for artist in artists:
                json_files = json_files + [json_file for json_file in os.listdir(json_path) if ((json_file.endswith('.json')) & (artist in json_file))]
        else:
            json_files = [json_file for json_file in os.listdir(json_path) if json_file.endswith('.json')]

        for json_file in json_files:
            path_to_json = os.path.join(json_path, json_file)
            with open(path_to_json) as f:
                song_data = json.load(f)
                data = data + song_data['songs']
        
        return data
    
    
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    
    return distribution / np.sum(distribution)


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)


In [4]:
# Parameterts
maxlen = 60  # extraxt sequences of n characters
step = 3     # sample new seq every n characters
n_grams_len = 0
json_path = '../../data/deutsch'
artists = ['Bushido']

## Datapreprocessing

In [5]:
# load data
data = load_json(json_path, artists)
df = json_normalize(data)
lyrics = df.lyrics.map(lambda lyric: lyric.lower())

print('Number of Songs: {}'.format(len(df)))
print('Corpus length: {}'.format(len("".join(lyrics))))

Number of Songs: 100
Corpus length: 303654


In [6]:
sentences = []
next_chars = []
chars = []
for lyric in lyrics:
    lyric = lyric.lower()
    if n_grams_len > 1:
        for i in range(0, len(lyric) - maxlen - n_grams_len): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen: i + maxlen + n_grams_len])
        
        ngrams_iter = ngrams(lyric, n_grams_len)
        for gram in ngrams_iter:
            chars.append(''.join(list(gram)))
        chars = sorted(list(set(chars)))
    else:
        for i in range(0, len(lyric) - maxlen, step): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen])
        
print('Number of sequences:', len(sentences))

if n_grams_len < 1:
    chars = sorted(list(set(''.join(lyrics)))) # list of unique characters

print('Unique characters:', len(chars))

char_indices = dict((char, chars.index(char)) for char in chars) # maps char with index

Number of sequences: 99246
Unique characters: 78


In [12]:
chars = sorted(list(set(chars)))

In [13]:
len(chars)

9563

In [8]:
print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars))) # (sentences)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1    # one hot encoding
    y[i, char_indices[next_chars[i]]] = 1  # one hot encoding
    
"""
for i, sentence in enumerate(sentences):
    for t in range(0, len(sentence) - n_grams_len):
        char = sentence[t:t+n_grams_len]
        x[i, t, char_indices[char]] = 1    # one hot encoding
    y[i, char_indices[next_chars[i]]] = 1  # one hot encodin
    """

Vectorization...


'\nfor i, sentence in enumerate(sentences):\n    for t in range(0, len(sentence) - n_grams_len):\n        char = sentence[t:t+n_grams_len]\n        x[i, t, char_indices[char]] = 1    # one hot encoding\n    y[i, char_indices[next_chars[i]]] = 1  # one hot encoding'

## Build Model

In [9]:
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               105984    
_________________________________________________________________
dense_1 (Dense)              (None, 78)                10062     
Total params: 116,046
Trainable params: 116,046
Non-trainable params: 0
_________________________________________________________________


## Train Model

In [None]:
epochs = 60
model.fit(x, y, batch_size=128, epochs=epochs)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
  896/99246 [..............................] - ETA: 57s - loss: 0.8245

In [None]:
artist = artists[0]
epochs = 180
file_name = '{}_{}epochs_{}maxlen'.format(artist, epochs, maxlen, n_grams_len)

In [35]:
file_name

'Bushido_120epochs_60maxlen'

In [36]:
pwd

'/Users/hao/workspace/hpi-de/dl4textmining/dl4tm-project/exploration/RNN'

In [37]:
model.save('./increase_epochs/models/model_{}.h5'.format(file_name))

In [38]:
#model = load_model('./model_Bushido_60epochs_30maxlen_0ngrams.h5')

In [39]:
temperature = 0.5

#start_index = random.randint(0, len(lyrics) - maxlen - 1)
#generated_text = lyrics[start_index: start_index + maxlen]
random.seed(3004)
lyrics_index = random.randint(0, len(lyrics))
chosen_lyric = lyrics[lyrics_index]
start_index = random.randint(0, len(chosen_lyric) - maxlen - 1)
generated_text_temp = chosen_lyric[start_index: start_index + maxlen]
generated_text = generated_text_temp
print(generated_text)
#print('\n___________________\n')
for i in range(1500):
    sampled = np.zeros((1, maxlen, len(chars)))
            
    for t, char in enumerate(generated_text_temp):
        sampled[0, t, char_indices[char]] = 1.
                      
    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = chars[next_index]
    generated_text_temp += next_char
    generated_text += next_char
    generated_text_temp = generated_text_temp[1:]
    sys.stdout.write(next_char)

mach den fernseher an, kleine bushidos
ich kam aus dem ghett
o seh wie ich mehr die greeben
dein erziehten frieden schwein in den schönel freshhlingt
passinaut gestatt, yeah

[part 2]
das ich mache, wer 



ich bin ich musste dich gebust
die gedamper sind wie ich dich meine stadt, was renze weiter
ich bin deine mutter machst du die geschichten und dann berlin
ich hab mach ich will ich mir alles keiner so vaner
ihr könnte von deinem eisen
du bist um benz am kurschau mein leben
und ich schaffen grauerapart
ich seh mich glaubt und alles sehen werden wie in, bapal, weil du kein'n choruser und den stell in der schlechten tag bst
ich schaff es nich jetzt mein mir an

[part 2]
deine freunde würde dich?, frenn
sag misss den staatsaus
wir für die ganzen leinkie hängen ich machs' die scheißen wenn der ferren
meine schreißt mir ob ich keine skalt meine schrechten zieht
wer ein allo und schon so das du bege einer leben
ich war die schlechten zeiten und auch die gesucht war nur bei nicht
die shindy mir alles fehlen
man sich wenn ich machen, dass man auf der stell in der juice du siehst

[hook]
ich hab gewächte alter gebleit verlier?
denn sie woll' nur erst du alles der bollys, wenn du sie wirst du for

In [40]:
#file_name = "Bushido_60epochs_30maxlen_0ngrams"
with open('./increase_epochs/texts/' + file_name + '.txt', 'w') as text_file:
    text_file.write(generated_text)