In [None]:
# Download the Nietzsche corpus into the working directory (-O keeps the remote file name, nietzsche.txt).
!curl -O https://s3.amazonaws.com/text-datasets/nietzsche.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  586k  100  586k    0     0  1995k      0 --:--:-- --:--:-- --:--:-- 1995k


In [None]:
import numpy as np
import tensorflow as tf
# Mini-batch size used when the training dataset is batched further down.
batch_size = 64 
# Read the corpus file line by line; every dataset element is one line of text.
raw_data_ds = tf.data.TextLineDataset(["nietzsche.txt"])

In [None]:
# Peek at the first ten lines. String elements come back as raw bytes, so decode them before printing.
for raw_line in raw_data_ds.take(10).as_numpy_iterator():
  print(raw_line.decode("utf-8"))
# The text is clearly there; the next step is to merge all of these lines into one single string.

PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to Truth, have been unskilled and unseemly methods for
winning a woman? Certainly she has never allowed herself to be won; and
at present every kind of dogma stands with sad and discouraged mien--IF,


In [None]:
# The dataset yields each line without its trailing newline, so the lines must be
# re-joined with an explicit separator: plain concatenation fuses the last word of one
# line with the first word of the next (e.g. "PREFACESUPPOSING", "groundfor").
# A single join also avoids rebuilding the ever-growing string on every iteration.
# Blank lines in the file simply show up as consecutive spaces.
text = " ".join(line.numpy().decode("utf-8") for line in raw_data_ds)
splitted = tf.strings.bytes_split(text)
splitted[:100] #Note that here we are doing character-level tokenization.
#General advice is to have at least 100,000 characters in the training corpus (1M is even better), so let's look at the characters we have

<tf.Tensor: shape=(100,), dtype=string, numpy=
array([b'P', b'R', b'E', b'F', b'A', b'C', b'E', b'S', b'U', b'P', b'P',
       b'O', b'S', b'I', b'N', b'G', b' ', b't', b'h', b'a', b't', b' ',
       b'T', b'r', b'u', b't', b'h', b' ', b'i', b's', b' ', b'a', b' ',
       b'w', b'o', b'm', b'a', b'n', b'-', b'-', b'w', b'h', b'a', b't',
       b' ', b't', b'h', b'e', b'n', b'?', b' ', b'I', b's', b' ', b't',
       b'h', b'e', b'r', b'e', b' ', b'n', b'o', b't', b' ', b'g', b'r',
       b'o', b'u', b'n', b'd', b'f', b'o', b'r', b' ', b's', b'u', b's',
       b'p', b'e', b'c', b't', b'i', b'n', b'g', b' ', b't', b'h', b'a',
       b't', b' ', b'a', b'l', b'l', b' ', b'p', b'h', b'i', b'l', b'o',
       b's'], dtype=object)>

In [None]:
#Count how many different characters occur in the corpus
vals = [*set(text)]
print(len(vals)) #83 distinct characters in this corpus

83


In [None]:
#Split the corpus into (data, label) pairs: the data is a fixed-length window of characters,
#the label is the single character that follows that window.
#input_chars -> model -> next_char (every entry of input_chars is exactly maxlen characters long)
maxlen = 20 #Length of every input window the model predicts from
step = 3 #Distance between the starts of two consecutive windows, so neighbours overlap in maxlen - step characters
#The range stops before len(text) - maxlen so that text[start + maxlen], the label, is always a valid index.
window_starts = range(0, len(text) - maxlen, step)
input_chars = [text[start:start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
#The next cell prints one pair to check that the windows and labels line up.

In [None]:
# Spot-check one training pair: the input window at index 7 followed by its target character.
print(input_chars[7], next_chars[7])

 Truth is a woman--w h


In [None]:
#Now we can make this into datasets! Since we are loading from memory we use .from_tensor_slices
X_raw_train_ds = tf.data.Dataset.from_tensor_slices(input_chars)
Y_raw_train_ds = tf.data.Dataset.from_tensor_slices(next_chars)
#Now we can still view this using a for loop and zip() and numpy() and decode('utf-8')

In [None]:
#Now we have to do some preprocessing:
#1- Standardize (lowercase and remove punctuation)
#2- Split each sample into tokens (single characters here, rather than words)
#3- Recombine substrings into tokens (1-grams here, since we work at character level)
#4- Index the tokens
#5- Transform each token index into a vector representation (Embedding): a vector of ints or a dense float vector

In [None]:
import re 
import string 
def custom_standardization(input_data):
  """Lowercase the text and blank out line-break tags, digits, hyphens and punctuation.

  Args:
    input_data: string tensor (anything accepted by tf.strings.lower).

  Returns:
    The lowercased string tensor in which every "<br />", every digit or "-",
    and every character of string.punctuation has been replaced by a space.
  """
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
  # Raw string: "\d" inside a normal literal is an invalid escape sequence that
  # recent Python versions warn about; the pattern itself is unchanged.
  stripped_num = tf.strings.regex_replace(stripped_html, r"[\d-]", " ")
  # re.escape keeps characters such as "]" or "\" from breaking the character class.
  stripped_punc = tf.strings.regex_replace(stripped_num, "[%s]" % re.escape(string.punctuation), " ")
  return stripped_punc
def char_split(input_data):
  """Split every UTF-8 string in `input_data` into its individual characters."""
  characters = tf.strings.unicode_split(input_data, 'UTF-8')
  return characters

In [None]:
#TextVectorization lets us cap the number of distinct tokens and fix the sequence length, which the model requires
max_features = 83 #vocabulary cap handed to TextVectorization as max_tokens (83 = distinct characters counted in the raw text)
embedding_dim = 16 #embedding layer output dimension
seq_length = maxlen #number of token ids per input sequence fed to the model (the vectorizer's output_sequence_length)
#Whether we split by character or by word is chosen through the `split` argument of TextVectorization.
#tf.squeeze removes every dimension of size 1: an input of shape (1,2,3,1) becomes (2,3)
#The vectorization step will standardize, split, and then output integers that represent the characters

In [None]:
#Text -> integer ids: standardize with custom_standardization, split into characters with char_split,
#keep at most max_features vocabulary entries and pad/truncate every sample to seq_length ids.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    split=char_split,
    output_mode='int',
    output_sequence_length=seq_length,
)
#Whatever the length of the input string, the layer outputs seq_length integer ids.
#For the y dataset only the first id of that output is kept, since each label is a single character.
#adapt() builds the vocabulary from the training inputs. Keras documents dataset input to adapt() as
#batched, so batch here; the batch size only affects speed, not the resulting vocabulary.
vectorize_layer.adapt(X_raw_train_ds.batch(1024))

In [None]:
def text_vectorize(text):
  """Map one string element to its vector of token ids.

  The string gets a trailing axis so the vectorization layer receives an array
  rather than a bare string; every size-1 axis is then squeezed out of the result.
  """
  batched = tf.expand_dims(text, axis=-1)
  token_ids = vectorize_layer(batched)
  return tf.squeeze(token_ids)
# Turn every raw input window into its sequence of token ids.
X_train_ds = X_raw_train_ds.map(text_vectorize)
# Labels are single characters: vectorize them the same way, then keep only the first id.
Y_train_ds = Y_raw_train_ds.map(text_vectorize)
Y_train_ds = Y_train_ds.map(lambda x: x[0])
# Order matters: cache the vectorized pairs BEFORE shuffling. With cache() placed after
# shuffle()/batch(), the batches of the first epoch are stored and replayed in exactly
# the same order on every later epoch, so the data is effectively shuffled only once.
train_ds = (
    tf.data.Dataset.zip((X_train_ds, Y_train_ds))
    .cache()
    .shuffle(buffer_size=512)
    .batch(batch_size, drop_remainder=True)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
#Now we are going to work on the sampling methods 
def softmax(z):
  """Numerically stable softmax along the first axis.

  Args:
    z: array-like of scores. A 1-D input is normalized as a whole; for a 2-D
      input every column is normalized (the sum runs over axis 0).

  Returns:
    Array of the same shape as `z` whose entries along axis 0 sum to 1.
  """
  z = np.asarray(z)
  if z.size == 0:
    return np.exp(z)  # nothing to normalize
  # Subtracting the maximum leaves the result unchanged mathematically but keeps
  # np.exp from overflowing to inf (which would turn the ratio into nan).
  shifted = z - np.max(z, axis=0)
  exps = np.exp(shifted)
  return exps / np.sum(exps, axis=0)
def greedy_search(conditional_probability):
  """Return the index of the largest entry, i.e. always pick the most likely token."""
  best_index = np.argmax(conditional_probability)
  return best_index
def temperature_sampling(conditional_probability, temperature=1.0):
  """Sample an index after reweighting the distribution with a temperature.

  Args:
    conditional_probability: array-like with one probability per class; a
      leading batch axis of size 1 is flattened away.
    temperature: positive float. Values below 1 sharpen the distribution
      (closer to greedy), values above 1 flatten it.

  Returns:
    The sampled class index.

  Raises:
    ValueError: if `temperature` is not positive or no probability is positive.
  """
  if temperature <= 0:
    raise ValueError("temperature must be positive")
  probabilities = np.asarray(conditional_probability).astype("float64").ravel()
  if probabilities.size == 0 or not np.any(probabilities > 0):
    raise ValueError("conditional_probability needs at least one positive entry")
  # log(0) = -inf is fine here: such classes simply end up with weight 0.
  with np.errstate(divide="ignore"):
    scaled = np.log(probabilities) / temperature
  # Shift by the maximum before exponentiating; otherwise a small temperature makes
  # every exp() underflow to 0 and the normalization becomes 0/0 = nan.
  weights = np.exp(scaled - np.max(scaled))
  reweighted_conditional_probability = weights / np.sum(weights)
  probas = np.random.multinomial(1, reweighted_conditional_probability, 1)
  return np.argmax(probas)
def top_k_sampling(conditional_probability, k):
  """Sample an index among the k most probable classes.

  The k largest probabilities are renormalized so that they sum to 1 and one of
  their indices is drawn accordingly. (Applying a softmax to values that are
  already probabilities would flatten them towards a uniform distribution.)

  Args:
    conditional_probability: array-like with one probability per class; axes of
      size 1 (e.g. a leading batch axis) are squeezed away.
    k: number of candidates to keep, between 1 and the number of classes.

  Returns:
    The sampled class index (int32).

  Raises:
    ValueError: if the input is not a single distribution, `k` is out of range,
      or the k largest probabilities sum to zero.
  """
  probabilities = np.atleast_1d(np.squeeze(np.asarray(conditional_probability).astype("float64")))
  if probabilities.ndim != 1:
    raise ValueError("conditional_probability must hold a single distribution")
  if not 1 <= k <= probabilities.size:
    raise ValueError("k must be between 1 and the number of classes")
  # Stable sort on the negated values: descending order, ties resolved in favour of the lower index.
  top_k_indices = np.argsort(-probabilities, kind="stable")[:k].astype("int32")
  top_k_probabilities = probabilities[top_k_indices]
  total = top_k_probabilities.sum()
  if total <= 0:
    raise ValueError("the k largest probabilities sum to zero")
  top_k_redistributed_probabilities = top_k_probabilities / total
  return np.random.choice(top_k_indices, p=top_k_redistributed_probabilities)
#Each of these functions simply returns the index of the chosen token, given the conditional probabilities and any extra arguments

In [None]:
#Build the language model. Every input is a sequence of seq_length token ids (character level).
#The shape must be a tuple: (20) is just the integer 20, which newer Keras releases may reject,
#and reusing seq_length keeps the model in sync with the vectorization layer.
inputs = tf.keras.Input(shape=(seq_length,), dtype='int64')
#The embedding layer needs the number of distinct tokens (max_features) and its output size (embedding_dim);
#it turns every token id into a dense vector.
x = tf.keras.layers.Embedding(max_features, embedding_dim)(inputs)
x = tf.keras.layers.Dropout(0.5)(x)
#return_sequences=True keeps the LSTM output of every time step; Flatten then concatenates them for the classifier.
x = tf.keras.layers.LSTM(128, return_sequences=True)(x)
x = tf.keras.layers.Flatten()(x)
#One softmax probability per vocabulary entry: the distribution over the next character.
predictions = tf.keras.layers.Dense(max_features, activation='softmax')(x)
model = tf.keras.Model(inputs=inputs, outputs=predictions)
#Labels are integer token ids, hence the sparse categorical cross-entropy.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts) as a sanity check.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 20, 16)            1328      
                                                                 
 dropout_1 (Dropout)         (None, 20, 16)            0         
                                                                 
 lstm_1 (LSTM)               (None, 20, 128)           74240     
                                                                 
 flatten_1 (Flatten)         (None, 2560)              0         
                                                                 
 dense_1 (Dense)             (None, 83)                212563    
                                                                 
Total params: 288,131
Trainable params: 288,131
Non-trainab

In [None]:
# Train on the batched (input ids, next-character id) dataset for 11 passes over the data.
model.fit(train_ds, epochs = 11)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x7f9a1dfcb090>

In [None]:
#Now it is pretty simple to do the sampling: pass the model's prediction into one of the functions we made above, then take the returned index
#and look it up with .get_vocabulary()[returned index] to get the predicted character back. For some reason my code fails to execute when I try to do it,
#as it says there is something wrong with my JavaScript. Either way, here is the really basic but pretty trash model. We could add more LSTM layers and distribute the units
#across them. However, this is a pretty weak model and I am going to see how I can improve it drastically. Note also that this corpus is not good enough to produce
