In [22]:
import pickle
import pandas as pd
import numpy as np
import string, os 
import warnings

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku 

from numpy.random import seed
import tensorflow
# Seed NumPy's global RNG and TensorFlow's global RNG so repeated runs start
# from the same random state.
np.random.seed(20)
tensorflow.random.set_seed(4)

# Every file used below ('quotes.pkl', '*_sequences.txt', tokenizers, models)
# is addressed by a bare relative name, so work from the data directory.
os.chdir("../quoteAIsData")

def writeToFile(filename, data):
    """Write ``data`` to ``filename`` as UTF-8 text, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        data: The text to write.
    """
    # The context manager closes the handle even when write() raises,
    # which a manual open()/close() pair does not.
    with open(filename, 'w', encoding="utf-8") as file:
        file.write(data)
    
def loadFromFile(filename):
    """Return the full contents of ``filename`` decoded as UTF-8 text.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's contents as a single string.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error).
    with open(filename, 'r', encoding="utf-8") as file:
        return file.read()

In [12]:
# Load the pickled quotes. Each key's value is joined with ' ' below, so it
# must be an iterable of strings.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load 'quotes.pkl' from a source you trust.
with open('quotes.pkl', 'rb') as quotes_file:
    # The `with` block closes the file; the previous bare open() never did.
    quote_dict = pickle.load(quotes_file)

# Collapse each key's quotes into one space-separated corpus string and report
# its length in characters.
corpus_dict = {}
for k in quote_dict:
    corpus = ' '.join(quote_dict[k])
    corpus_dict[k] = corpus
    print(k, len(corpus))

DAVID 108324
DIEGO 106373
ERIC 1249063
GWYN 1312330
JETT 181826
MARS 695638
MILES 791200
MILO 886187
PARSA 339260


In [8]:
# Translation table that deletes every ASCII punctuation character. It is the
# same for every corpus, so it is built once up front.
table = str.maketrans('', '', string.punctuation)

# Turn each corpus in corpus_dict into a list of clean word tokens.
token_dict = {}
for key, corpus in corpus_dict.items():
    # Pipeline: split on whitespace -> strip punctuation -> keep only tokens
    # that are purely alphabetic -> lower-case.
    stripped = (raw.translate(table) for raw in corpus.split())
    tokens = [word.lower() for word in stripped if word.isalpha()]
    token_dict[key] = tokens
    print(key)
    print('Total Tokens: %d' % len(tokens))
    print('Unique Tokens: %d' % len(set(tokens)))

# Window width in tokens. trainModel later splits every line into all-but-last
# tokens (model input) and the last token (prediction target).
length = 51

# Slide a `length`-token window over each token list, one token at a time, and
# save the windows as one space-separated line each in '<key>_sequences.txt'.
for key, tokens in token_dict.items():
    # Start offsets run from 0 to len(tokens) - length - 1; the range is empty
    # when there are not enough tokens.
    sequences = [
        ' '.join(tokens[start:start + length])
        for start in range(len(tokens) - length)
    ]
    print(f"{key} - {len(sequences)} sequences")
    writeToFile(key+"_sequences.txt", '\n'.join(sequences))

DAVID
Total Tokens: 19020
Unique Tokens: 3960
DIEGO
Total Tokens: 19084
Unique Tokens: 3077
ERIC
Total Tokens: 231264
Unique Tokens: 15618
GWYN
Total Tokens: 149662
Unique Tokens: 10075
JETT
Total Tokens: 33283
Unique Tokens: 5291
MARS
Total Tokens: 128621
Unique Tokens: 10492
MILES
Total Tokens: 121145
Unique Tokens: 10124
MILO
Total Tokens: 163472
Unique Tokens: 11676
PARSA
Total Tokens: 63888
Unique Tokens: 7952
DAVID - 18969 sequences
DIEGO - 19033 sequences
ERIC - 231213 sequences
GWYN - 149611 sequences
JETT - 33232 sequences
MARS - 128570 sequences
MILES - 121094 sequences
MILO - 163421 sequences
PARSA - 63837 sequences


In [25]:
def trainModel(name, epochs=100, batch_size=128):
    """Train and save a next-word LSTM model for one sequence file.

    Reads ``<name>_sequences.txt`` (one space-separated sequence per line),
    fits a Keras ``Tokenizer`` on it, and trains a stacked-LSTM network to
    predict the last word of every sequence from the words before it.
    Two files are written to the current directory:

    * ``<name>_tokenizer.pkl`` - the fitted tokenizer (pickled)
    * ``<name>_char_word_model.h5`` - the trained model

    Args:
        name: Prefix identifying the sequence file and the output files.
        epochs: Number of training epochs (defaults to the original 100).
        batch_size: Mini-batch size (defaults to the original 128).

    Raises:
        ValueError: If the file yields no sequences, or the encoded sequences
            are not all the same length of at least two tokens.
    """
    in_filename = name + "_sequences.txt"
    # Drop blank lines so an empty file or a trailing newline cannot produce a
    # zero-length sequence that would make the array ragged.
    lines = [line for line in loadFromFile(in_filename).split('\n') if line.strip()]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    sequences = tokenizer.texts_to_sequences(lines)
    # Tokenizer word indices start at 1 (0 is reserved), hence the +1.
    vocab_size = len(tokenizer.word_index) + 1
    print(f"{name}: {vocab_size} vocabulary size")
    # Persist the tokenizer; the `with` block flushes and closes the file
    # deterministically instead of leaving that to garbage collection.
    with open(name + '_tokenizer.pkl', 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    sequences = np.array(sequences)
    if sequences.ndim != 2 or sequences.shape[1] < 2:
        raise ValueError(
            f"{in_filename}: expected equal-length sequences of at least two "
            f"tokens, got array of shape {sequences.shape}"
        )
    # The last token of each sequence is the target; everything before it is
    # the input. y stays as integer class ids: with the sparse loss below this
    # is mathematically the same objective as one-hot + categorical
    # cross-entropy, but avoids building a (num_sequences, vocab_size) one-hot
    # matrix, which runs to billions of entries for the larger corpora.
    X, y = sequences[:, :-1], sequences[:, -1]
    seq_length = X.shape[1]

    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length))
    model.add(LSTM(200, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(vocab_size, activation='softmax'))
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that emitted a stray "None" line).
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # fit model
    model.fit(X, y, batch_size=batch_size, epochs=epochs)
    model.save(name + '_char_word_model.h5')

In [None]:
# Names to train, processed one after another in this order.
# NOTE(review): DAVID and DIEGO have sequence files generated above but are
# not listed here - confirm that is intentional.
trainOrder = [
    "PARSA",
    "JETT",
    "MILO",
    "MARS",
    "MILES",
    "GWYN",
    "ERIC",
]

for bot_name in trainOrder:
    print(f"Training {bot_name} bot")
    trainModel(bot_name)

Training PARSA bot
PARSA: 7953 vocabulary size
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 50, 50)            397650    
_________________________________________________________________
lstm_8 (LSTM)                (None, 50, 200)           200800    
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 7953)              803253    
Total params: 1,532,203
Trainable params: 1,532,203
Non-trainable params: