In [1]:
import re
import string
from unicodedata import normalize
import numpy as np
import keras
import keras.utils as ku 
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
import os
os.getcwd()

'/Users/joey/Desktop/C/CS584/assignment4'

In [3]:
import os

os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
def load_doc(filename):
    # open and read file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

def to_sentence(doc):
    # @para: doc is the whole context
    # return: a list of sentences.
    lines = doc.strip().split('\n')
    sentences = [line.split('\t') for line in lines]
    return sentences

def clean_data(lines):
    # @para: lines is a list of lists, aka a list of all sentences
    # return sentences with only words, seperated by spaceß. 
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha() and word != 'unk']
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return cleaned

In [5]:
def to_token(clean_lines):
    # @paras: input is clean_lines
    # returns: vocab-a dictionary matches word to index. word as key, index as value
    # sequences: list of sentences, represented by number
    # mmax: length of the Slongest sentence 
    
    # create a word_set contains all words
    word_set = set()
    for line in clean_lines:
        cur_set = set(line[0].split())
        word_set = word_set.union(cur_set)
    word_list = sorted(word_set) # a sorted list contains all words
    vocab = dict() # word as key, index as value
    for i in range(len(word_list)):
        vocab[word_list[i]] = i+1
    # text into tokens. 
    mmax = 0
    sequences = []
    for line in clean_lines:
        words = line[0].split()
        tokens = []
        for word in words:
            idx = vocab[word]
            tokens.append(idx)
        sequences.append(tokens)
        mmax = len(words) if len(words) > mmax else mmax
    return vocab, sequences, mmax

In [6]:
f1 = load_doc('a3-data/train.txt')

In [7]:
lines = to_sentence(f1) # convert how passage to sentences
clean_lines = clean_data(lines) # 2d list, each sentence for one list

In [8]:
# 2d list to string list. each sentence as a string
# because the input of tokenizer should be a string list
corpus = []
for c in clean_lines:
    corpus.append(c[0])

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus) # feed tokenizer with only TRAINING set!!!

In [10]:
total_words = len(tokenizer.word_index) + 1 # +1 for zero padding, word_index starts from 1

In [11]:
total_words

9885

In [12]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[: i + 1]  
        input_sequences.append(n_gram_sequence)
max_sequence_length = max([len(x) for x in input_sequences])
padded_seq = np.array(pad_sequences(input_sequences, maxlen=20, padding='pre')) # window 20

In [13]:
len(padded_seq)

790431

In [14]:
from keras.callbacks import ModelCheckpoint
import pickle
import glob

In [15]:
# input of training
predictors, labels = padded_seq[:, :-1], padded_seq[:, -1] # use 19 to train, predict the 20th word
labels = ku.to_categorical(labels, num_classes=total_words) # onehot encoding labels

In [16]:
predictors

array([[   0,    0,    0, ...,    0,    0, 9855],
       [   0,    0,    0, ...,    0, 9855, 9856],
       [   0,    0,    0, ..., 9855, 9856, 9857],
       ...,
       [ 635,  743,   10, ...,   56,  241, 4076],
       [ 743,   10,  465, ...,  241, 4076,  210],
       [  10,  465,  871, ..., 4076,  210,    4]], dtype=int32)

In [17]:
labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [46]:
filepath = "saved-best-model.hdf5"
model = Sequential()
model.add(Embedding(total_words, 15, input_length=19)) # input_length: predictor size
model.add(LSTM(256))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax')) # outputlayer with vocab size
adam = keras.optimizers.Adam(lr=0.001) # set learning rate to 0.001(which is default)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# create checkpoint to save weights for each epoch
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='auto', period=1)
model.fit(predictors[:500], labels[:500], epochs=10, batch_size=50, verbose=1, callbacks=[checkpoint])
model.compile(loss='categorical_crossentropy', optimizer='adam')


Epoch 1/10

Epoch 00001: loss improved from inf to 9.19194, saving model to saved-best-model.hdf5
Epoch 2/10

Epoch 00002: loss improved from 9.19194 to 8.40960, saving model to saved-best-model.hdf5
Epoch 3/10

Epoch 00003: loss improved from 8.40960 to 5.92698, saving model to saved-best-model.hdf5
Epoch 4/10

Epoch 00004: loss improved from 5.92698 to 5.44112, saving model to saved-best-model.hdf5
Epoch 5/10

Epoch 00005: loss improved from 5.44112 to 5.28271, saving model to saved-best-model.hdf5
Epoch 6/10

Epoch 00006: loss improved from 5.28271 to 5.22102, saving model to saved-best-model.hdf5
Epoch 7/10

Epoch 00007: loss improved from 5.22102 to 5.20869, saving model to saved-best-model.hdf5
Epoch 8/10

Epoch 00008: loss improved from 5.20869 to 5.18689, saving model to saved-best-model.hdf5
Epoch 9/10

Epoch 00009: loss improved from 5.18689 to 5.18502, saving model to saved-best-model.hdf5
Epoch 10/10

Epoch 00010: loss improved from 5.18502 to 5.16979, saving model to saved

In [47]:
from keras.models import load_model
model = load_model("saved-best-model.hdf5")
model.fit(predictors[:500], labels[:500], epochs=10, batch_size=50, verbose=1, callbacks=[checkpoint])

Epoch 1/10

Epoch 00001: loss did not improve from 5.16979
Epoch 2/10

Epoch 00002: loss did not improve from 5.16979
Epoch 3/10

Epoch 00003: loss improved from 5.16979 to 5.16926, saving model to saved-best-model.hdf5
Epoch 4/10

Epoch 00004: loss did not improve from 5.16926
Epoch 5/10

Epoch 00005: loss did not improve from 5.16926
Epoch 6/10

Epoch 00006: loss improved from 5.16926 to 5.14974, saving model to saved-best-model.hdf5
Epoch 7/10

Epoch 00007: loss did not improve from 5.14974
Epoch 8/10

Epoch 00008: loss did not improve from 5.14974
Epoch 9/10

Epoch 00009: loss did not improve from 5.14974
Epoch 10/10

Epoch 00010: loss did not improve from 5.14974


<keras.callbacks.History at 0x23f10292e8>

In [41]:
filepath = "saved-model-{epoch:02d}-{loss:.2f}.hdf5"
model = Sequential()
model.add(Embedding(total_words, 15, input_length=19)) # input_length: predictor size
model.add(LSTM(256))
model.add(Dropout(0.3))
model.add(Dense(total_words, activation='softmax')) # outputlayer with vocab size
adam = keras.optimizers.Adam(lr=0.001) # set learning rate to 0.001(which is default)
model.compile(loss='categorical_crossentropy', optimizer='adam')
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=False, mode='auto', period=1)
model.fit(predictors[:500], labels[:500], epochs=10, batch_size=50, verbose=1, callbacks=[checkpoint])

Epoch 1/10

Epoch 00001: saving model to saved-model-01-9.19.hdf5
Epoch 2/10

Epoch 00002: saving model to saved-model-02-8.35.hdf5
Epoch 3/10

Epoch 00003: saving model to saved-model-03-5.93.hdf5
Epoch 4/10

Epoch 00004: saving model to saved-model-04-5.41.hdf5
Epoch 5/10

Epoch 00005: saving model to saved-model-05-5.29.hdf5
Epoch 6/10

Epoch 00006: saving model to saved-model-06-5.20.hdf5
Epoch 7/10

Epoch 00007: saving model to saved-model-07-5.20.hdf5
Epoch 8/10

Epoch 00008: saving model to saved-model-08-5.19.hdf5
Epoch 9/10

Epoch 00009: saving model to saved-model-09-5.18.hdf5
Epoch 10/10

Epoch 00010: saving model to saved-model-10-5.19.hdf5


<keras.callbacks.History at 0x23e7f4e3c8>

In [27]:
model.save('./final_model.h5')

In [28]:
callbacks = list()

In [33]:
from keras.callbacks import TensorBoard
tensorboard = TensorBoard()

In [34]:
callbacks.append(tensorboard)



Epoch 1/10

Epoch 00001: saving model to saved-model-01-9.19.hdf5
Epoch 2/10

Epoch 00002: saving model to saved-model-02-8.39.hdf5
Epoch 3/10

Epoch 00003: saving model to saved-model-03-5.99.hdf5
Epoch 4/10

Epoch 00004: saving model to saved-model-04-5.45.hdf5
Epoch 5/10

Epoch 00005: saving model to saved-model-05-5.29.hdf5
Epoch 6/10

Epoch 00006: saving model to saved-model-06-5.20.hdf5
Epoch 7/10

Epoch 00007: saving model to saved-model-07-5.20.hdf5
Epoch 8/10

Epoch 00008: saving model to saved-model-08-5.19.hdf5
Epoch 9/10

Epoch 00009: saving model to saved-model-09-5.18.hdf5
Epoch 10/10

Epoch 00010: saving model to saved-model-10-5.19.hdf5


<keras.callbacks.History at 0x23e904bb38>

In [22]:
import h5py

In [23]:
json_string=model.to_json()

In [24]:
open('model_test.json', 'w').write(json_string)

2007

In [25]:
model.save_weights('test_weights.h5')

In [110]:
len(input_seq[1])

70

In [111]:
def create_model(max_length, vocab_size):

    input_len = max_length - 1
    model = Sequential()
    
    # add embedding layer
    # max_length is the biggest sequence length
    # 10 is the size for each word vector after embedding layer
    # max_length is N in the ngram 
    model.add(Embedding(vocab_size, 10, input_length=input_len))
    
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # the last layer implement softmax on the whole vocabulary
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model



In [112]:
max_length

70

In [113]:
vocab_size

7110

In [114]:
len(predictors[0])

69

In [115]:
model = create_model(max_length, vocab_size)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 69, 10)            71100     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 7110)              718110    
Total params: 833,610
Trainable params: 833,610
Non-trainable params: 0
_________________________________________________________________


In [135]:
len(predictors[0])

69

In [147]:
predictors[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  138,  554,  656,  900, 1005, 1164, 2645, 2740, 2846,
       3063, 3332, 3453, 3909, 4021, 4115, 5024, 5077, 5216, 5494, 5809,
       5884, 6013, 6277], dtype=int32)

In [116]:
model.fit(predictors, labels, epochs=100, verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x6598c4f90>

In [None]:
def generate_text(input_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list =

In [80]:
f2 = load_doc('a3-data/valid.txt')
f2_lines = to_sentence(f2)
f2_clean_lines = clean_data(f2_lines)

In [133]:
val_in

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  138,  554,  656,  900, 1005, 1164, 2645, 2740, 2846,
       3063, 3332, 3453, 3909, 4021, 4115, 5024, 5077, 5216, 5494, 5809,
       5884, 6013, 6277], dtype=int32)

In [154]:
val_pred = model.predict_classes(val_in, verbose=0)

ValueError: Error when checking input: expected embedding_4_input to have shape (69,) but got array with shape (1,)

In [81]:
f2_tokens=[]

for line in f2_clean_lines:
    # ues the training vocabulary, match text in validation dataset to numbers
    # if the word doesn't exist in the dictionary, it's zero.
    # the length of each input has to match the model, in this case max_length
    words = line[0].split()
    token = np.zeros(len(words))
    for i in range(len(words)):
        if words[i] in vocab:
            token[i] = vocab[words[i]]
    f2_tokens.append(token)

In [None]:
lines = to_sentence(f1)
clean_lines = clean_data(lines)
vocab, sequences, max_length = to_token(clean_lines)
vocab_size = len(vocab) + 1
# max_length = 10
input_seq = np.array(pad_sequences(sequences, maxlen=max_length, padding='pre'))
    
predictors, labels = input_seq[:,:-1],input_seq[:,-1]
labels = ku.to_categorical(labels, num_classes=vocab_size)

In [13]:
# def preprocess(text):  # text is a string list, for one paragraph 
#     # remove line breaks 
#     text = text.strip()
#     text = text.replace('\r', ' ').replace('\n', ' ')
#     text = ' '.join(text.split())
    
#     # remove punctuations from paragraphs
#     text = ''.join(c for c in text if c not in punctuation)
#     # change 's 
#     text = re.sub('\t', ' ', text)
#     text = re.sub(r"it\'s","it is",text)
#     text = re.sub(r"i\'d","i would",text)
#     text = re.sub(r"I\'d","I would",text)
#     text = re.sub(r"don\'t","do not",text)
#     text = re.sub(r"he\'s","he is",text)
#     text = re.sub(r"there\'s","there is",text)
#     text = re.sub(r"that\'s","that is",text)
#     text = re.sub(r"can\'t", "can not", text)
#     text = re.sub(r"cannot", "can not ", text)
#     text = re.sub(r"what\'s", "what is", text)
#     text = re.sub(r"What\'s", "what is", text)
#     text = re.sub(r"\'ve ", " have ", text)
#     text = re.sub(r"n\'t", " not ", text)
#     text = re.sub(r"i\'m", "i am ", text)
#     text = re.sub(r"I\'m", "I am ", text)
#     text = re.sub(r"\'re", " are ", text)
#     text = re.sub(r"\'d", " would ", text)
#     text = re.sub(r"\'ll", " will ", text)
#     text = re.sub(r"\'s"," is",text)
    
#     text = re.sub('[^a-zA-Z]', ' ', text)  # only keep characters
#     p = text.lower()  # to lowercase
    
#     # remove line breaks 
#     text = text.strip()
#     text = text.replace('\r', ' ').replace('\n', ' ')
#     text = ' '.join(text.split())
#     text = re.sub('[^a-zA-Z]', ' ', text)
#     text = text.lower()
#     tokens = word_tokenize(text)  
#     tokens = [w for w in tokens if w != 'unk']
#     return ' '.join(tokens) # return filtered

In [14]:
train_5k = preprocess(txt_5k)

In [22]:
words = set(train_5k.split())
words = sorted(words)

In [23]:
len(words)

7113

In [26]:
np.where(np.array(words) == words[0])[0][0]

0

In [29]:
onehotvec.shape

(7113,)

In [44]:
input_array = np.random.randint(1000, size=(32, 10))

In [46]:
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)









In [56]:
word_idx['calloway']

900

In [55]:
train_5k[3]

'calloway'

In [None]:
word_matrix = []
word_idx = {} # a dictionary to match word with volcabulary index
# one hot encoding
for i in range(len(words)):
    word_idx[words[i]] = i
    onehotvec = np.zeros(len(words))
    onehotvec[i] = 1
    word_matrix.append(onehotvec)

In [57]:
# represent word by index of volcabulary
train = []
for i in train_5k:
    cur_idx = word_idx[i]
    train.append(cur_idx)

In [None]:
model = Sequential()
# word embedding, volcabulary size: 7113, output word vector dim:64
# 7114: volab size+1, 64: output vector size, 50: batch size for training
model.complile('rmsprop', 'mse')
model.add(Embedding(7114, 64, input_length=50)) 
output_array = model.predict(train)