<a href="https://colab.research.google.com/github/Nathan-Mekuria-Solomon/ML-practice/blob/main/natural_language_processing_edited/char_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import tensorflow as tf
import numpy as np

In [26]:
# create dataset
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
file_path = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)

with open(file_path) as f:
  shakespeare_text = f.read()

In [27]:
# tokenization
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level= True) # default: word-level encoding
tokenizer.fit_on_texts([shakespeare_text])

In [28]:
tokenizer.texts_to_sequences(["First"])

[[20, 6, 9, 8, 3]]

In [29]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [30]:
max_id = len(tokenizer.word_index)
max_id

39

In [31]:
dataset_size = sum(len(text) for text in shakespeare_text)
dataset_size

1115394

In [32]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [33]:
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
first_few = []
for item in dataset.take(10):
  first_few.append(int(item.numpy()))

tokenizer.sequences_to_texts([first_few])

['c a s h e , a e a']

In [34]:
# breakdown the sequence into chunks
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(window_length, shift= 1, drop_remainder= True)

In [37]:
# creating a flat dataset for training
dataset = dataset.flat_map(lambda window: window.batch(window_length))

# split input and target
dataset = dataset.map(lambda window: (window[:-1], window[1:])) # (input, target)

# batch and shuffle
dataset = dataset.shuffle(10000).batch(32)

In [39]:
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth= max_id), Y_batch))

In [40]:
dataset = dataset.prefetch(1)