In [27]:
import os
import re
import sys
import json
import random
import numpy as np
import pandas as pd
from nltk import ngrams
from pandas.io.json import json_normalize
import keras
from keras.layers import LSTM, Dense, Conv1D, MaxPool1D, AveragePooling1D, Flatten
from keras.models import Sequential, load_model
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

## Utils

In [28]:
def load_json(json_path, artists=[]):
    if (os.path.isfile(json_path)):
        print("json")
        with open(json_path) as f:
            song_data = json.load(f)
            return song_data['songs']
        
    elif (os.path.isdir(json_path)):
        data = []
        json_files = []
        if (len(artists) > 0):
            for artist in artists:
                json_files = json_files + [json_file for json_file in os.listdir(json_path) if ((json_file.endswith('.json')) & (artist in json_file))]
        else:
            json_files = [json_file for json_file in os.listdir(json_path) if json_file.endswith('.json')]

        for json_file in json_files:
            path_to_json = os.path.join(json_path, json_file)
            with open(path_to_json) as f:
                song_data = json.load(f)
                data = data + song_data['songs']
        
        return data
    
    
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    
    return distribution / np.sum(distribution)


def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    
    return np.argmax(probas)

def normalize_lyric(text, lower=True):
    if lower:
        text = text.lower()
    text = re.sub('\[.+\](\\n)|\[.+\](\(.*\))', '', text)
    return text 

In [29]:
# Parameterts
maxlen = 45  # extraxt sequences of n characters
step = 1    # sample new seq every n characters
n_grams_len = 0
json_path = '../data/deutsch'
artists = ['Bushido']

## Datapreprocessing

In [30]:
# load data
data = load_json(json_path, artists)
df = json_normalize(data)
lyrics = df.lyrics.map(lambda lyric: normalize_lyric(lyric))

print('Number of Songs: {}'.format(len(df)))
print('Corpus length: {}'.format(len("".join(lyrics))))

Number of Songs: 100
Corpus length: 297389


In [31]:
sentences = []
next_chars = []
chars = []
for lyric in lyrics:
    lyric = lyric.lower()
    if n_grams_len > 1:
        for i in range(0, len(lyric) - maxlen - n_grams_len): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen: i + maxlen + n_grams_len])
        
        ngrams_iter = ngrams(lyric, n_grams_len)
        for gram in ngrams_iter:
            chars.append(''.join(list(gram)))
        chars = sorted(list(set(chars)))
    else:
        for i in range(0, len(lyric) - maxlen, step): # iterates by step size
            sentences.append(lyric[i: i + maxlen]) # get maxlen amount of characters
            next_chars.append(lyric[i + maxlen])
        
print('Number of sequences:', len(sentences))

if n_grams_len < 1:
    chars = sorted(list(set(''.join(lyrics)))) # list of unique characters

print('Unique characters:', len(chars))

char_indices = dict((char, chars.index(char)) for char in chars) # maps char with index

Number of sequences: 292889
Unique characters: 76


In [32]:
print('Vectorization...')

x = np.zeros((len(sentences), maxlen, len(chars))) # (sentences)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1    # one hot encoding
    y[i, char_indices[next_chars[i]]] = 1  # one hot encoding
    
#for i, sentence in enumerate(sentences):
#    for t in range(0, len(sentence) - n_grams_len):
#        char = sentence[t:t+n_grams_len]
#        x[i, t, char_indices[char]] = 1    # one hot encoding
#    y[i, char_indices[next_chars[i]]] = 1  # one hot encodin


Vectorization...


## Split Dataset

In [33]:
sentences_train, sentences_test, next_chars_train, next_chars_test = train_test_split(x, y)

## Build Model

In [34]:
model = Sequential()
model.add(Conv1D(input_shape=(maxlen, len(chars)),
                filters=32,
                kernel_size=7,
                padding='same',
                activation='relu',
                strides=1))
model.add(MaxPool1D(pool_size=2))

model.add(Conv1D(filters=64,
                kernel_size=3,
                padding='same',
                activation='relu',
                strides=1))
#model.add(MaxPool1D(pool_size=2))

"""model.add(Conv1D(filters=64,
                kernel_size=3,
                padding='same',
                activation='relu',
                strides=1))
model.add(MaxPool1D(pool_size=2))

model.add(Conv1D(filters=32,
                kernel_size=3,
                padding='same',
                activation='relu',
                strides=1))"""
model.add(AveragePooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(len(chars), activation='softmax'))

optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [35]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 45, 32)            17056     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 22, 32)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 22, 64)            6208      
_________________________________________________________________
average_pooling1d_2 (Average (None, 11, 64)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 704)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 76)                53580     
Total params: 76,844
Trainable params: 76,844
Non-trainable params: 0
_________________________________________________________________


In [36]:
EPOCHS = 1
BATCH_SIZE = 64

DIR = '../outputs/charbased/CNN_Simple_CharBased_{}_E{}_BS{}_ML{}_SS{}'.format(artists[0], EPOCHS, BATCH_SIZE, maxlen, step)

if not os.path.exists(DIR):
    os.makedirs(DIR)

## Callbacks

In [37]:
tensorboard = TensorBoard(log_dir=os.path.join(DIR, 'logs'), write_images=True, write_grads=True)
modelCheckpoint_best = ModelCheckpoint(filepath=os.path.join(DIR, "model_best.h5"), save_best_only=True)
modelCheckpoint = ModelCheckpoint(filepath=os.path.join(DIR, "model.h5"), save_best_only=False)

## Train Model

In [38]:
model.fit(sentences_train, next_chars_train, 
          batch_size=BATCH_SIZE, 
          epochs=EPOCHS,
          validation_data=(sentences_test, next_chars_test),
          callbacks=[tensorboard, modelCheckpoint, modelCheckpoint_best])

Train on 219666 samples, validate on 73223 samples
Epoch 1/1


<keras.callbacks.History at 0x1082501d0>

## Generate Text

In [40]:
temperature = 0.5

#start_index = random.randint(0, len(lyrics) - maxlen - 1)
#generated_text = lyrics[start_index: start_index + maxlen]
random.seed(3004)
lyrics_index = random.randint(0, len(lyrics))
chosen_lyric = lyrics[lyrics_index]
start_index = random.randint(0, len(chosen_lyric) - maxlen - 1)
generated_text_temp = chosen_lyric[start_index: start_index + maxlen]
generated_text = generated_text_temp
print(generated_text)
#print('\n___________________\n')
for i in range(1500):
    sampled = np.zeros((1, maxlen, len(chars)))
            
    for t, char in enumerate(generated_text_temp):
        sampled[0, t, char_indices[char]] = 1.
                      
    preds = model.predict(sampled, verbose=0)[0]
    next_index = sample(preds, temperature)
    next_char = chars[next_index]
    generated_text_temp += next_char
    generated_text += next_char
    generated_text_temp = generated_text_temp[1:]
    sys.stdout.write(next_char)

shidos
ich kam aus dem ghetto auf die leinwan
n ich wie kan und die denn in die kicken  zum ferdun mich hericht junge sein 
dick senn ist denn sos, ich haben die mir was wir dich bin hier rinzast wein ich deine seine nin die fickt fass ip, sein tlog perbe mein so auf
die schah ainter kinden mein rib ein clip gesten es die mein und rasen
wie mergenicht ich beiwe und der dich auf die keine deinen fannst du die denn die e



ins jeden mir in die dich ist und der verim sind mir in vorser anden nicht
der du ich das yaft
ich wir dich dich sein nicht die big wie bar 
ich bin manne geinmene bann manne tifst du jeld blare einen blann dein ein zu wenn ihr schann wir wie big in und ich ich mir schund ich woll die dich die des cizzum habt wit wes nicht denn fann auf die für dich sind heur astnicht wir eine inder mir deinen eint mir richt mir die auf der beifen jeld die bin so pein, gannst du nenndas nicht die zu mich ist erafwec gingzwirne anden was ist dich nicht die gonnk f win dar ticht sobe will kann ich die ist du willst ern in schaln soingann die denn denn jeld sind den die woin die verwicht einen  dise gein boir
ich bin sond ich bin mich enden rauf den einginnter -wein der wie ind und schusste kalod ein mir auf der dich die lann war was war genden glar ist du dannen weine zu siehs ainst du schace weine sind wird mitser zinds wir mir wer ich sich sein are songst du nicht die dich will du nicht die fim-tre sch

In [40]:
#file_name = "Bushido_60epochs_30maxlen_0ngrams"
with open('./increase_epochs/texts/' + file_name + '.txt', 'w') as text_file:
    text_file.write(generated_text)