In [51]:
##First we'll import all our tools

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [52]:
##Let's get our scripts ready to go, we'll read in all the data
def getData(csvName= 'transcripts.csv'):
    """Read the transcripts CSV and return every transcript lower-cased.

    Args:
        csvName: Path to a CSV file that contains a 'transcript' column.

    Returns:
        A list with one lower-cased string per row of the 'transcript'
        column, in file order.
    """
    loweredScripts = []
    for rawScript in pd.read_csv(csvName)['transcript'].tolist():
        loweredScripts.append(rawScript.lower())
    return loweredScripts


useAll = False #True trains on every transcript, False uses only the first transcript

allScripts = getData()

#Either join every transcript (separated by new lines) or keep just the first one
transcript = "\n".join(allScripts) if useAll else allScripts[0]

print(len(transcript))

17409


In [53]:
##Let's look at a list of all unique characters in our scripts,
##we'll eventually need to one hot encode them to make training easier:
uniqueChars = sorted(set(transcript))  #sorted so each character gets the same position on every run
numUniqueChars = len(uniqueChars)      #vocabulary size; used later to scale the encoded inputs
print(uniqueChars)

[' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '5', '6', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—']


In [54]:
##Let's map each character to a specific number; training needs numerical data,
##so every character is assigned its position in the sorted uniqueChars list:
charsToInt = {char: position for position, char in enumerate(uniqueChars)}
print(charsToInt)

{'t': 38, '"': 2, '?': 18, 'g': 25, "'": 3, '-': 7, 'q': 35, '9': 15, '.': 8, 'p': 34, 'd': 22, 'x': 42, 'o': 33, 's': 37, 'w': 41, 'k': 29, 'n': 32, 'i': 27, 'v': 40, ';': 17, 'm': 31, 'h': 26, '2': 11, '1': 10, ',': 6, 'e': 23, ')': 5, 'r': 36, 'z': 44, ' ': 0, '!': 1, '3': 12, '—': 45, 'b': 20, ':': 16, '(': 4, 'j': 28, '5': 13, '0': 9, 'u': 39, '6': 14, 'c': 21, 'l': 30, 'y': 43, 'f': 24, 'a': 19}


In [55]:
##For our network we are going to train it by feeding it strings of characters and have it predict what the next
##character in the sequence will be. So to generate these sequences we will copy 100 characters from our transcript
##into a sequence, take the next character as the target answer, and then shift our window by one character and do that
##over and over again. 

lengthOfSequence = 100 #number of characters in each training window passed to prepSequences below
def prepSequences(rawText, encoding, sequenceLength = 100):
    """Slice text into overlapping encoded windows and their next-character targets.

    A window of `sequenceLength` characters is taken at every start offset
    (stepping by one character); the character right after the window is its
    target. Both are translated through `encoding`.

    Args:
        rawText: The text to slice.
        encoding: Mapping from character to integer code.
        sequenceLength: Number of characters per window.

    Returns:
        (data, targets): `data` is a list of lists of integer codes, one inner
        list per window; `targets` is a list with the integer code of the
        character following each window. Both are empty when the text is not
        longer than `sequenceLength`.
    """
    windowStarts = range(len(rawText) - sequenceLength)
    data = [
        [encoding[char] for char in rawText[start:start + sequenceLength]]
        for start in windowStarts
    ]
    targets = [encoding[rawText[start + sequenceLength]] for start in windowStarts]
    return data, targets

#Encode the whole transcript into training windows and their next-character answers
data, targets = prepSequences(rawText=transcript, encoding=charsToInt, sequenceLength=lengthOfSequence)
print(data[0]) #first encoded window, as a sanity check

[25, 33, 33, 22, 0, 31, 33, 36, 32, 27, 32, 25, 8, 0, 26, 33, 41, 0, 19, 36, 23, 0, 43, 33, 39, 18, 4, 30, 19, 39, 25, 26, 38, 23, 36, 5, 27, 38, 3, 37, 0, 20, 23, 23, 32, 0, 25, 36, 23, 19, 38, 6, 0, 26, 19, 37, 32, 3, 38, 0, 27, 38, 18, 0, 27, 3, 40, 23, 0, 20, 23, 23, 32, 0, 20, 30, 33, 41, 32, 0, 19, 41, 19, 43, 0, 20, 43, 0, 38, 26, 23, 0, 41, 26, 33, 30, 23, 0, 38, 26]


In [56]:
##To finish off prepping our data, we need to convert x to be [samples, time steps, features]
##and we need to convert our training answers to a one hot encoding
def prepX(data, lengthOfSequence, numUniqueChars):
    """Turn the encoded windows into a scaled 3-D array for the network.

    Args:
        data: Encoded windows, one row per sample, each with
            `lengthOfSequence` integer codes.
        lengthOfSequence: Number of codes in every window.
        numUniqueChars: Divisor applied to every code.

    Returns:
        A float array of shape (len(data), lengthOfSequence, 1) holding each
        code divided by `numUniqueChars`.
    """
    windows = np.asarray(data).reshape(len(data), lengthOfSequence, 1)
    return windows / float(numUniqueChars)

def prepY(targets, numClasses=None):
    """One-hot encode the integer target codes.

    Args:
        targets: Sequence of integer character codes.
        numClasses: Optional width of the one-hot vectors. When None (the
            default, matching the previous behaviour) Keras infers it as
            max(targets) + 1, which is narrower than the vocabulary if the
            highest-numbered character never occurs as a target. Pass the
            vocabulary size to pin the width.

    Returns:
        A 2-D array with one one-hot row per target.
    """
    return np_utils.to_categorical(targets, num_classes=numClasses)

#Network inputs: windows reshaped to [samples, time steps, features] and scaled
preppedX = prepX(data=data, lengthOfSequence=lengthOfSequence, numUniqueChars=numUniqueChars)
#Network answers: one hot encoded next characters
preppedY = prepY(targets=targets)


##The last thing we can do before we train is get our model set up
def generateModel(X, y):
    """Build and compile the two-layer LSTM next-character classifier.

    Args:
        X: Input array; X.shape[1] and X.shape[2] give the LSTM input shape.
        y: One-hot target array; y.shape[1] gives the size of the softmax
            output layer.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, adam
        optimizer).
    """
    timeSteps, featuresPerStep = X.shape[1], X.shape[2]
    model = Sequential([
        LSTM(256, input_shape=(timeSteps, featuresPerStep), return_sequences=True),
        Dropout(0.2),
        LSTM(256),
        Dropout(0.2),
        Dense(y.shape[1], activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = generateModel(X=preppedX, y=preppedY) #layer sizes are taken from the shapes of the prepared data

In [58]:
##Training time!

def trainModel(model, X, y, numEpochs= 20, batchSize= 128):
    """Fit the model, writing a checkpoint file whenever the training loss improves.

    Args:
        model: Compiled Keras model to train.
        X: Training inputs.
        y: Training targets.
        numEpochs: Number of epochs passed to fit.
        batchSize: Batch size passed to fit.

    Returns:
        The same model object after fitting.
    """
    #File name template for checkpoints; the epoch number and loss are filled in per save
    checkpointTemplate = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
    saveOnImprovement = ModelCheckpoint(
        checkpointTemplate,
        monitor='loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )
    model.fit(X, y, epochs=numEpochs, batch_size=batchSize, callbacks=[saveOnImprovement])
    return model

model = trainModel(model=model, X=preppedX, y=preppedY) #uses the default numEpochs and batchSize

Epoch 1/5

Epoch 00001: loss improved from inf to 2.97876, saving model to weights-improvement-01-2.9788.hdf5
Epoch 2/5

Epoch 00002: loss improved from 2.97876 to 2.86389, saving model to weights-improvement-02-2.8639.hdf5
Epoch 3/5

Epoch 00003: loss improved from 2.86389 to 2.79827, saving model to weights-improvement-03-2.7983.hdf5
Epoch 4/5

Epoch 00004: loss improved from 2.79827 to 2.73744, saving model to weights-improvement-04-2.7374.hdf5
Epoch 5/5

Epoch 00005: loss improved from 2.73744 to 2.67949, saving model to weights-improvement-05-2.6795.hdf5


In [68]:
##The code below loads back in the best weights we had
# NOTE(review): the training log shown above reports epoch 05 (loss 2.6795) as the lowest loss,
# while the file below is the epoch 04 checkpoint — confirm this is the intended file.
filename = "weights-improvement-04-2.7374.hdf5" #replace with best weights file for your training
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam') #same loss and optimizer as in generateModel

In [69]:
##Now for text generation
def generateSeedFromData(data, decoding=None):
    """Pick a random encoded window from `data`, print it as text, and return it.

    Args:
        data: Non-empty list of encoded windows (lists of integer codes).
        decoding: Optional mapping from integer code to character, used only
            for printing the seed. Defaults to the notebook-level `intToChar`.

    Returns:
        A new list holding a copy of the chosen window, so callers may modify
        it without altering `data`.
    """
    if decoding is None:
        decoding = intToChar  #fall back to the notebook-level mapping
    #randint's upper bound is exclusive, so len(data) makes every window eligible
    #(and keeps a single-window data set from raising)
    start = np.random.randint(0, len(data))
    pattern = list(data[start])
    print("Starting Seed: ", ''.join([decoding[value] for value in pattern]), end= '\n\n\n')
    return pattern

def generateText(model, pattern, decoding, length= 1000, vocabSize= 47):
    """Extend a seed window by repeatedly predicting the next character.

    Args:
        model: Object whose predict(x, verbose=0) returns one score per
            character code for the given window.
        pattern: Sequence of integer character codes used as the seed window.
            It is copied, so the caller's sequence is never modified.
        decoding: Mapping from integer code to character.
        length: Number of characters to generate after the seed.
        vocabSize: Divisor used to scale the codes before prediction; it
            should equal the divisor used when the training inputs were
            prepared.

    Returns:
        The decoded seed followed by the `length` generated characters.
    """
    window = list(pattern)  #work on a copy so the caller's seed is left untouched
    pieces = [decoding[value] for value in window]
    for _ in range(length):
        preppedPattern = prepPattern(window, vocabSize)
        prediction = model.predict(preppedPattern, verbose= 0)
        index = int(np.argmax(prediction))  #greedy choice: the highest scoring character
        pieces.append(decoding[index])
        window = window[1:] + [index]  #slide the window forward by one character
    return ''.join(pieces)


def prepPattern(pattern, vocabSize):
    """Shape one window of codes as a (1, len(pattern), 1) array scaled by 1/vocabSize."""
    pattern = np.reshape(pattern, (1, len(pattern), 1))
    return pattern / float(vocabSize)

intToChar = {i: char for i, char in enumerate(uniqueChars)}  #inverse of charsToInt: integer code -> character
seed = generateSeedFromData(data) #get a random starting point from our transcript and let the network continue the writing
numCharacters= 100   #number of characters the network generates after the seed
text = generateText(model, seed, intToChar, length= numCharacters, vocabSize= numUniqueChars)
print(text)

Starting Seed:  e is wonderfully interactive. the brain isn't divided into compartments. in fact, creativity — which


e is wonderfully interactive. the brain isn't divided into compartments. in fact, creativity — which  and the the the the the the the the the the the the the the the the the the the the the the the th
