In [1]:
##First we'll import all our tools

import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
##Let's get our scripts ready to go, we'll read in all the 
def getData(csvName= 'clean_transcripts.csv'):
    dataFrame = pd.read_csv(csvName)
    allScripts = dataFrame['transcript'].tolist()
    allScripts = [script.lower() for script in allScripts]
    return allScripts


useAll = False #Set to true to use all transcripts in your training data, false uses the first transcript

allScripts = getData()
transcript = ''

if not useAll:
    transcript = allScripts[0]
else:
    transcript = "\n".join(allScripts) #Joining our transcripts seperated by new lines

print(len(transcript))

17394


In [3]:
##Let's look at a list of all unique characters in our scripts, 
##we'll eventually need to one hot encode them to make training easier:
uniqueChars = sorted(list(set(transcript)))
numUniqueChars = len(uniqueChars)
print(uniqueChars)

[' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '5', '6', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
##Lets make a mapping of each character to a specific number, this will help our training since we need numerical data:
charsToInt = dict((char, i) for i, char in enumerate(uniqueChars))
print(charsToInt)

{'?': 18, '2': 11, 'c': 21, 'h': 26, 'y': 43, 'w': 41, 'r': 36, '"': 2, 's': 37, 'u': 39, 'a': 19, 'f': 24, 'p': 34, 'g': 25, 'd': 22, 'b': 20, ',': 6, 't': 38, '.': 8, 'o': 33, 'k': 29, 'i': 27, 'm': 31, '(': 4, ' ': 0, '0': 9, 'j': 28, ':': 16, '-': 7, 'e': 23, '5': 13, '1': 10, '3': 12, 'z': 44, 'q': 35, '6': 14, 'v': 40, '!': 1, 'l': 30, 'n': 32, ';': 17, "'": 3, ')': 5, 'x': 42, '9': 15}


In [5]:
##For our network we are going to train it by feeding it strings of characters and have it predict what the next
##character in the sequence will be. So to generate these sequences we will copy 100 characters from our transcript
##into a sequence, take the next character as the target answer, and then shift our window by one character and do that
##over and over again. 

lengthOfSequence = 100
def prepSequences(rawText, encoding, sequenceLength = 100): 
    data = []
    targets = []
    for i in range(0, len(rawText) - sequenceLength, 1):
        sequence = rawText[i: i+sequenceLength]
        target = rawText[i + sequenceLength]
        data.append([encoding[char] for char in sequence]) #Here we are encoding the characters to their previous assigned values
        targets.append(encoding[target])                   #Same with the target answer

    return data, targets

data, targets = prepSequences(transcript, charsToInt, lengthOfSequence)
print(data[0])

[25, 33, 33, 22, 0, 31, 33, 36, 32, 27, 32, 25, 8, 0, 26, 33, 41, 0, 19, 36, 23, 0, 43, 33, 39, 18, 4, 30, 19, 39, 25, 26, 38, 23, 36, 5, 27, 38, 3, 37, 0, 20, 23, 23, 32, 0, 25, 36, 23, 19, 38, 6, 0, 26, 19, 37, 32, 3, 38, 0, 27, 38, 18, 0, 27, 3, 40, 23, 0, 20, 23, 23, 32, 0, 20, 30, 33, 41, 32, 0, 19, 41, 19, 43, 0, 20, 43, 0, 38, 26, 23, 0, 41, 26, 33, 30, 23, 0, 38, 26]


In [15]:
##To finish off prepping our data, we need to convert x to be [samples, time steps, features]
##and we need to convert our training answers to a one hot encoding
def prepX(data, lengthOfSequence, numUniqueChars):
    data = np.reshape(data, (len(data), lengthOfSequence, 1))
    data = data / float(numUniqueChars)
    return data

def prepY(targets):
    targets = np_utils.to_categorical(targets)
    return targets

preppedX = prepX(data, lengthOfSequence, numUniqueChars)
preppedY = prepY(targets)


##The last thing we can do before we train is get our model set up
def generateModel(X, y, size= 256):
    model = Sequential()
    model.add(LSTM(size, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(size))
    model.add(Dropout(0.2))
    model.add(Dense(y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = generateModel(preppedX, preppedY)

In [16]:
##Training time!

def trainModel(model, X, y, numEpochs= 20, batchSize= 128):
    filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5" #replace with lowest loss file
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]

    model.fit(X, y, epochs = numEpochs, batch_size= batchSize, callbacks=callbacks_list)
    return model

model = trainModel(model, preppedX, preppedY, 1)

Epoch 1/1

Epoch 00001: loss improved from inf to 3.05627, saving model to weights-improvement-01-3.0563.hdf5


In [17]:
##The code below loads back in the best weights we add
def loadModel(model, filename):  #replace with best weights file for your training
    model.load_weights(filename)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

model = loadModel(model, "weights-improvement-01-3.0563.hdf5")

In [18]:
##Now for text generation
def generateSeedFromData(data):
    start = np.random.randint(0, len(data)-1)
    pattern = data[start]
    return pattern



def generateText(model, pattern, decoding, length= 1000, vocabSize= 47, delimeter= ''):
    text= delimeter.join([decoding[value] for value in pattern])
    for i in range(length):
        preppedPattern = prepPattern(pattern, vocabSize)
        prediction = model.predict(preppedPattern, verbose= 0)
        index = np.argmax(prediction)
        result = decoding[index]
        text += delimeter + result
        pattern.append(index)
        pattern = pattern[1:]
        
    return text

def prepPattern(pattern, vocabSize):
    pattern = np.reshape(pattern, (1, len(pattern), 1))
    pattern = pattern / float(vocabSize)
    return pattern

intToChar = dict((i, char) for i, char in enumerate(uniqueChars))  #creating a demapping of our original encoding
seed = generateSeedFromData(data) #get a random starting point from our paper and let the network continue the writing
print("Starting Seed: ", ''.join([intToChar[value] for value in seed]), end= '\n\n\n')
numCharacters= 200   #length of each window the network will use to predict the output
text = generateText(model, seed, intToChar, length= numCharacters, vocabSize= numUniqueChars)
print(text)

Starting Seed:  oo, there's a hierarchy within the arts. art and music are normally given a higher status in schools


oo, there's a hierarchy within the arts. art and music are normally given a higher status in schools                                                                                                                                                                                                        


In [19]:
##Let's look at a list of all unique characters in our scripts, 
##we'll eventually need to one hot encode them to make training easier:
uniqueWords = sorted(transcript.split(' '))
numUniqueWords = len(uniqueWords)

##Lets make a mapping of each character to a specific number, this will help our training since we need numerical data:
stringToInt = dict((string, i) for i, string in enumerate(uniqueWords))
print(numUniqueWords)

3066


In [21]:
lengthOfSequence = 25
words = transcript.split(" ")
data, targets = prepSequences(words, stringToInt, lengthOfSequence)
preppedX = prepX(data, lengthOfSequence, numUniqueWords)
preppedY = prepY(targets)
#print([intToString[point] for point in data[10]], intToString[targets[10]])

In [22]:
model = generateModel(preppedX, preppedY, size= 512)
model = trainModel(model, preppedX, preppedY, 5, 5)

In [23]:

model = loadModel(model, "weights-improvement-05-6.2815.hdf5")
intToString = dict((i, word) for i, word in enumerate(uniqueWords))  #creating a demapping of our original encoding
seed = generateSeedFromData(data) #get a random starting point from our paper and let the network continue the writing
numWords= 100   #length of each window the network will use to predict the output
text = generateText(model, seed, intToString, length= numWords, vocabSize= numUniqueWords, delimeter= ' ')
print(text)

OSError: Unable to open file (unable to open file: name = 'weights-improvement-05-6.2815.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)