In [6]:
#run it in google colab
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
from tensorflow.keras.models import Sequential

from google.colab import drive
import numpy as np
import pandas as pd

In this example, we're going to train a [CharRNN](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) on a body of Shakespearian text. Ultimtely, this is an unsuperived learning task. But similar to our previous explorations in unsupervised DL, we will use an unlabeled dataset and create many samples of labeled data that we can use with our familiar supervised loss functions. The result will be a model that has learned the statistical properties of the input text, and can then be considered a "generative" model of language because we can use it to generate synthetic passages of Shakespeare.  

In [15]:
drive.mount('/content/gdrive/') #connect with google drive

Mounted at /content/gdrive/


In [16]:
file_path = "/content/gdrive/My Drive/Colab Notebooks/shakespeare.txt"

with open(file_path,"r") as f:
  text = f.read()

We've loaded our Shakespeare text, let's take a look at a random snippet.

In [17]:
print(text[31600:32000])

 lies i' the second chamber?
  LADY MACBETH. Donalbain.
  MACBETH. This is a sorry sight.           [Looks on his hands.
  LADY MACBETH. A foolish thought, to say a sorry sight.
  MACBETH. There's one did laugh in 's sleep, and one cried,
      "Murther!"
    That they did wake each other. I stood and heard them,
    But they did say their prayers and address'd them
    Again to sleep.
  LADY MACB


We need to convert our text into numeric arrays, the next several blocks accomplish this.

First, we'll create a mapping between characters and their numeric index. We'll also create the reverse mapping, which is useful.

In [5]:
chars = sorted(list(set(text))) #vocabulary set
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 75


Next, we'll create a training set of sub-sequences. Remember, we're trying to train a model to be able to predict the next chracter if it is given several characters of a subsequence. So we will create training pairs where each X is a fixed-length subsequences and each Y is the corresponding next letter in the text.

In [6]:
maxlen = 40
step = 3
sub_sequences = []
next_chars = []
for i in range(0, len(text) - maxlen, step): #i is the 1st word, maxlen is the input length;
    sub_sequences.append(text[i: i + maxlen]) #sub_sequences is the input characters
    next_chars.append(text[i + maxlen]) #next_chars is the output character
print('nb sequences:', len(sub_sequences))

nb sequences: 38700


In [7]:
k=300
print("(Sequence):\n" + sub_sequences[k])
print("\n(Target Character): \n" + next_chars[k])

(Sequence):
 and other Apparitions
  Lords, Gentleme

(Target Character): 
n


Next we'll create one-hot vectors for our sub-sequences. The tensor we create here will be shaped as (num_sequences x sequence_length x alphabet_size).

In [8]:
X = np.zeros((len(sub_sequences), maxlen, len(chars)), dtype=np.uint8 ) 
# X: input dimension * dim of every input * character set (vocabulary set);
# 可以理解为有个matrix，每一行是一个输入，列数代表每个输入有多少字符，而第三维度即每个字符又以one-hot encoder表示
Y = np.zeros((len(sub_sequences), len(chars)), dtype=np.uint8)
# Y可以理解为：每一行是一个输出，由于每个输出是一个字符，所以可以直接以one-hot encoder来表示
for i, seq in enumerate(sub_sequences):
    for t, char in enumerate(seq): #t代表40个字符中已经遍历到哪个字符了
        X[i, t, char_indices[char]] = 1
        Y[i, char_indices[next_chars[i]]] = 1

In [11]:
X[0,0,:] 
"""
0th input 
-> 0th character (40 characters form an input) 
-> all dimensions for one-hot vector (75 characters in total)
"""

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=uint8)

In [10]:
Y[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

Our RNN model will be quite simple.

In [12]:
char_rnn = Sequential()
char_rnn.add(tfkl.LSTM(128, input_shape=(maxlen, len(chars))))
char_rnn.add(tfkl.Dense(len(chars),activation="softmax"))

In [13]:
char_rnn.compile(loss='categorical_crossentropy', optimizer=tfk.optimizers.RMSprop(lr=0.01))

In [14]:
char_rnn.fit(X,Y, epochs=20, batch_size=1024)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fa8373e3e10>

Once we have a trained model, we can simulate new text by making predictions about the next character and then drawing characters in proportion to the predicted probabilities. And then simple repeat that process over and over, each time drawing the next character.

In [15]:
def draw_char(probs):
    probs = np.asarray(probs).astype('float64')
    if sum(probs) != 1.0:
      probs = probs / np.sum(probs)
    draw = np.random.choice(range(len(probs)) , p=probs) 
    #instead of picking the highest nunber, we take a random draw
    return draw

def sample_text(model, sample_length=100):
    start = np.random.randint(0, len(text) - maxlen - 1)
    sequence = text[start: start + maxlen]
  
    x_preds = np.zeros((sample_length, maxlen, len(chars)))
    for i in range(sample_length):
        for t, char in enumerate(sequence[-maxlen:]):
            x_preds[i, t, char_indices[char]] = 1.

        preds = model.predict(np.expand_dims(x_preds[i,:,:], axis=0), verbose=0)[0]
        next_index = draw_char(preds) #append the randomly drawn output as next_index
        next_char = indices_char[next_index]

        sequence += next_char
    return sequence

In [16]:
sim = sample_text(char_rnn,sample_length=500) 

In [17]:
print(sim)

he devil.
  LADY MACBETH. O proper stuff boust my had, silley. Come,
    Her will and Fit.
  MACDUFF. Als me. 
  LORSS. And me what agains.
  DOCTOR. Their freaces thene?
  MACBETH. [Wes her arm-
  SIWAND dEITCHPFISSTON   BE HOP DoWoss him newfill to ar. I bey, my lord,
    say all of and scrat's roudies: thee stath,
    Which candroms withhen may,
    Of 'conquo'll hairs thee thuse about, the
    biveous undee to be not tomantent; me venest hatrine shead,
    Our 'tis ous see of is sence,
    Why does his wood. Bom comy, shy most the


Notice that we can do pretty well to learn the typical statistical patterns of this text and then simulate new text that appears to be very similar to legitimate Shakespeare. 

But just a caution - we can also do pretty well with a much simpler method (Markov model): http://nbviewer.jupyter.org/gist/yoavg/d76121dfde2618422139

So the lesson is to try something simple before jumping right in to deep learning.

## Exercise

In this example, we're going to use an RNN for sequence classification. The task we'll set up is to generate a training set of randomized strings, and train our model to detect whether a string contains any vowels.

First, we'll create a training dataset of short randomized character sequences and the corresponding label of whether or not they contain at least one vowel.

In [1]:
import string

In [2]:
def contains_vowels(sequence):
  vowels = ["a", "e", "i", "o", "u"]
  return any([vowel in list(sequence) for vowel in vowels])

In [4]:
contains_vowels("gradient")

True

In [7]:
sequences = []
labels = []
for i in range(1000):
 char_list = np.random.choice( list(string.ascii_lowercase), size = 5, replace=True)
 seq = "".join(char_list)
 sequences.append(seq)
 labels.append(int(contains_vowels(seq)))

In [11]:
sequences[0:5]

['qcdyy', 'hpbbu', 'vylff', 'sfjbp', 'wvtzo']

In [12]:
df = pd.DataFrame({"sequence": sequences, "label":labels})

In [13]:
df.head()

Unnamed: 0,sequence,label
0,qcdyy,0
1,hpbbu,1
2,vylff,0
3,sfjbp,0
4,wvtzo,1


Next, set up and train an RNN (of any type) to solve this task. What preprocessing will you need to do first on the raw data in order to prepare it for the network?

In [22]:
# Data Preprocessing
# your code here

In [76]:
txt_chars = sorted(list(set(text))) #vocabulary set
print('total chars:', len(txt_chars))
txt_char_indices = dict((c, i) for i, c in enumerate(txt_chars))

total chars: 75


In [47]:
txt_maxlen = 5
step = 1
txt_sub_sequences = []
txt_label = []
for i in range(0, len(text) - txt_maxlen, step): #i is the 1st word, maxlen is the input length;
    #transform to lowercase
    seq = text[i: i + txt_maxlen].lower()
    txt_sub_sequences.append(seq) #txt_sub_sequences is the input characters;
    txt_label.append(int(contains_vowels(seq))) #label is using function: contains_vowels;
print('nb txt_sub_sequences:', len(txt_sub_sequences))
print('nb txt_label:', len(txt_label))

nb txt_sub_sequences: 116134
nb txt_label: 116134


In [68]:
#split training set and test set
training_ix = np.random.choice( range(0,len(txt_label)), size=int(len(txt_label)*0.6), replace=False)
test_ix = [i for i in range(0,len(txt_label)) if i not in training_ix]
####
training_txt_sub_sequences = np.array(txt_sub_sequences)[training_ix]
test_txt_sub_sequences = np.array(txt_sub_sequences)[test_ix]
training_txt_label = np.array(txt_label)[training_ix]
test_txt_label = np.array(txt_label)[test_ix]

In [74]:
print("training_txt_label:",training_txt_label[0:5])
print("training_txt_sub_sequences",training_txt_sub_sequences[0:5])

training_txt_label: [1 0 0 1 1]
training_txt_sub_sequences ['deed,' '.\n   ' 'n!\n  ' '   i ' 'pon t']


In [73]:
# Model setup and training
# your code here

In [78]:
X_txt = np.zeros((len(training_txt_sub_sequences), txt_maxlen, len(txt_chars)), dtype=np.uint8 ) 
# X: input dimension * dim of every input * character set (vocabulary set);
# 可以理解为有个matrix，每一行是一个输入，列数代表每个输入有多少字符，而第三维度即每个字符又以one-hot encoder表示
for i, seq in enumerate(training_txt_sub_sequences):
    for t, char in enumerate(seq): #t代表40个字符中已经遍历到哪个字符了
        X_txt[i, t, txt_char_indices[char]] = 1

In [80]:
Y_txt = training_txt_label

In [101]:
vowel_rnn = Sequential()
vowel_rnn.add(tfkl.LSTM(128, input_shape=(txt_maxlen, len(txt_chars))))
#vowel_rnn.add(tfkl.Dense(len(txt_chars),activation="softmax"))
vowel_rnn.add(tfkl.Dense(1,activation="sigmoid"))

In [102]:
vowel_rnn.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 128)               104448    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 104,577
Trainable params: 104,577
Non-trainable params: 0
_________________________________________________________________


In [105]:
vowel_rnn.compile(loss='binary_crossentropy', optimizer=tfk.optimizers.RMSprop(lr=0.01), metrics=['acc'])
# if you set the wrong loss function, for example, loss="categorical_crossentropy", then the loss will not move any little bit
# To use "categorical_crossentropy" and "softmax", you need to transform label={0,1} or {1,0} instead of label=0 or 1
### because "softmax" will finally give you a y_predicted={0.25,0.25,0.4,0.1} which add up to 1 

In [106]:
results =  vowel_rnn.fit(X_txt,Y_txt, epochs=10, batch_size=1024)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
