# Funk Generator - TensorFlow

Documentation: https://www.tensorflow.org/tutorials/sequences/text_generation

In [10]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time
import pandas as pd

# Read Data

In [3]:
df = pd.read_json('funks.json')

In [4]:
df2 = df.letter

df2.to_csv('funkemtxt.txt',index=None, header=None)

  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
text = open('funkemtxt.txt', 'rb').read().decode(encoding='utf-8')
print ('Length of text: {} characters'.format(len(text)))

Length of text: 1282732 characters


In [8]:
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

122 unique characters


# Process Text

In [9]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

In [10]:
print('{')
for char,_ in zip(char2idx, range(20)):
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '"' :   3,
  '%' :   4,
  '&' :   5,
  "'" :   6,
  '(' :   7,
  ')' :   8,
  '*' :   9,
  ',' :  10,
  '-' :  11,
  '.' :  12,
  '/' :  13,
  '0' :  14,
  '1' :  15,
  '2' :  16,
  '3' :  17,
  '4' :  18,
  '5' :  19,
  ...
}


In [11]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13])) 

'"Sem neurose ' ---- characters mapped to int ---- > [ 3 45 59 67  1 68 59 75 72 69 73 59  1]


# Create training examples and targets

In [12]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
examples_per_epoch = len(text)//seq_length

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

"
S
e
m
 


In [13]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))



'"Sem neurose eu começo esse funk\nCom a maior satisfação\nPois eu tenho com sentimento\nDo primeiro a fa'
'cção\nFoi 5 dias de terror, oh qe a Zona Sul tremeu\nQuem abalou a Zona Sul foi o Bonde do Zebedeu\nNão '
'se espante com minhas palavras,\nVou citar sem emoção,\nMas pra fechar com o primeiro\nTem que ser de co'
'ração,\nPaz, Justiça e Liberdade\nPara quem se encontra trancado,\nEssa luta não é em vão\nE os nosso irm'
'ãos estão bolado.\nVou mandando um forte abraço\nPara o irmão Sidney,\nRonaldinho fecha com a Sul,\nGordã'


For each sequence, duplicate and shift it to form the input and target text by using the map method to apply a simple function to each batch:

In [14]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

# Create training batches

In [17]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Build The Model


In [18]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [19]:
vocab

['\n',
 ' ',
 '!',
 '"',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '}',
 '~',
 '¡',
 '°',
 '²',
 '¹',
 'À',
 'Á',
 'Ã',
 'Ç',
 'È',
 'É',
 'Ê',
 'Í',
 'Ó',
 'Ô',
 'Ú',
 'à',
 'á',
 'â',
 'ã',
 'ç',
 'è',
 'é',
 'ê',
 'ë',
 'í',
 'î',
 'ò',
 'ó',
 'ô',
 'õ',
 'ö',
 'ù',
 'ú',
 'ü',
 '–',
 '’',
 '…',
 '−']

In [20]:
if tf.test.is_gpu_available():
  rnn = tf.keras.layers.CuDNNGRU
else:
  import functools
  rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [21]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.LSTM(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [22]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [23]:
vocab_size

122

# Try the model

In [24]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 122) # (batch_size, sequence_length, vocab_size)


In [25]:
dataset.take(1)


<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           31232     
_________________________________________________________________
lstm (LSTM)                  (64, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (64, None, 122)           125050    
Total params: 5,403,258
Trainable params: 5,403,258
Non-trainable params: 0
_________________________________________________________________


In [27]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [28]:
sampled_indices

array([ 59,  18,  29,  72, 112, 107,  34,  73,  21,  40,  91,  45,   4,
         7,  52,   7, 111,  10,  36,  78,  91,  99,  35,  98,  49,  87,
       106,  86,  88,  37,  94, 113,  76,  69,   1,  78,  85,  73,  82,
        88,  17,  59,  73,  41,  67,  28,  50, 100, 119,  87,  64,  65,
        33,  42,  17, 118,  10,  42,  63,  11, 120,  35,  64,  18,  91,
        72, 101,  64,  50, 116,  35,  61,  51,  83,  93,  22,  68,  57,
        61, 110,  57,  85,  93,  14,  77,  59,  81,  76,  50,  68,   7,
        73,  22,  74,   2,  55, 114,  78,  16,   2])

In [29]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'on pra nós\nFaz o que você quiser quando nós tamo a sós, paz\nPro nosso rolê que a nossa vida é louca\n'

Next Char Predictions: 
 'e4CrôëHs7NÇS%(Z(ó,JxÇàIÚW¹ê²ÀKÊõvo x°s}À3esOmBXá’¹jkGP3–,Pi-…Ij4ÇrâjXúIgY~É8ncgòc°É0we{vXn(s8t!aöx2!'


# Trainning

In [30]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())


Prediction shape:  (64, 100, 122)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.8049493


In [31]:
model.compile(optimizer='adam', loss=loss)


In [32]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [None]:
#save my checkpoints

In [33]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [34]:
EPOCHS=50

In [35]:
#history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Epoch 1/50


W0714 14:50:56.332778 140006398977856 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1251: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0714 14:50:57.223992 140006398977856 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/training/optimizer.py:166: BaseResourceVariable.constraint (from tensorflow.python.ops.resource_variable_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Apply a constraint manually following the optimizer update step.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
