In [11]:
import pathlib
import os
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
from keras.utils import pad_sequences
import tensorflow as tf
from numpy.random import seed
import pandas as pd
import numpy as np
# simplified chinese tokenizer
import jieba
import time
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import re
import nltk

# Fix the NumPy and TensorFlow random seeds.
seed(1)
tf.random.set_seed(2)

# Punctuation set; only referenced by a commented-out regex in the visible code.
# Note: the trailing `""''/="` is parsed as a second, adjacent string literal
# (`''/=`), so the resulting string contains no ASCII double quote.
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.《》（）+-=()""''/="

# Names skipped directly under the corpus root: the 26 lowercase ASCII letters.
# Alternatives that were tried:
# SKIP = ["0", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
# SKIP = []
SKIP = [chr(code) for code in range(ord("a"), ord("z") + 1)]

# Start of the wall-clock timer reported at the end of the loading cell.
t1 = time.time()

def get_all_items(root: pathlib.Path, exclude):
    """Collect the entries under ``root``, one nested list per sub-directory.

    Args:
        root: Directory to scan.
        exclude: Container of entry names to skip directly inside ``root``.
            The filter is not propagated to sub-directories (the recursive
            call passes an empty list).

    Returns:
        A list with one element per non-excluded child of ``root``, ordered
        by path: the ``pathlib.Path`` itself for a non-directory child, or a
        nested list with the same layout for a directory child.
    """
    itemList = []
    # iterdir() yields children in arbitrary order; sorting keeps the corpus
    # order (and everything derived from it) identical across runs/machines.
    for item in sorted(root.iterdir()):
        if item.name in exclude:
            continue
        if item.is_dir():
            itemList.append(get_all_items(item, []))
        else:
            itemList.append(item)
    return itemList


# begin preprocessing
largeDir = pathlib.Path("./Books")


def _flatten_paths(items):
    """Yield every non-list entry of an arbitrarily nested list, depth-first."""
    for item in items:
        if isinstance(item, list):
            yield from _flatten_paths(item)
        else:
            yield item


# get_all_items returns one nested list per directory. Flatten recursively so
# that files sitting directly in largeDir, or in directories nested more than
# one level deep, do not break the loading step below.
BookList = list(_flatten_paths(get_all_items(largeDir, SKIP)))


# clean the dataset (disabled): deletes every file that cannot be decoded
# for path in BookList:
#     print(path)
#     file = open(path, 'r')
#     try:
#         fileStr = file.read()
#     except UnicodeDecodeError as error:
#         file.close()
#         os.remove(path)
#     continue

# Read every book and concatenate once; join avoids repeatedly copying the
# growing corpus string. Books are joined without a separator.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the corpus encoding and pass it explicitly if it is known.
bigString = "".join(path.read_text() for path in BookList)

# methods to strip punctuation and symbols
# bigString = re.sub(r"[%s]+" %punc, "", bigString)
# Drop every character that is neither a word character nor whitespace.
bigString = re.sub(r'[^\w\s]', '', bigString)

# list of the words in their original order
allTokens = jieba.lcut(bigString, cut_all=False)
t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)
print("Corpus length in words: ", len(allTokens))

Runtime for this cell in seconds:  41.138999938964844
Corpus length in words:  6407510


In [12]:
# Vocabulary construction: tokens seen fewer than minFreq times are dropped.
minFreq = 5

# Occurrence count for every distinct token, in first-seen order.
wordFreq = {}
for token in allTokens:
    if token in wordFreq:
        wordFreq[token] += 1
    else:
        wordFreq[token] = 1

# Tokens whose count falls below the threshold.
rareWords = {word for word, count in wordFreq.items() if count < minFreq}

words = set(allTokens)
print("Unique words before filter: ", len(words))
print("To reduce vocab size, neglect words with appearances < ", minFreq)
# Final vocabulary: sorted list of the tokens that survive the filter.
words = sorted(words - rareWords)
print("Unique words after filter: ", len(words))

# Lookup tables between a vocabulary word and its index in `words`.
word_to_int = {word: index for index, word in enumerate(words)}
int_to_word = dict(enumerate(words))

Unique words before filter:  166485
To reduce vocab size, neglect words with appearances <  5
Unique words after filter:  52982


In [15]:
t1 = time.time()

# Window length (input tokens per sample) and stride between window starts.
seqLen = 50
step = 1

sequences = []   # input windows of seqLen tokens
nextWords = []   # token that follows each kept window
seqIgnored = 0   # windows rejected because they touch a rare word

for start in range(0, len(allTokens) - seqLen, step):
    end = start + seqLen
    # Inspect the seqLen inputs plus the target token; keep the sample only
    # when none of them is a rare word.
    if rareWords.isdisjoint(allTokens[start:end + 1]):
        sequences.append(allTokens[start:end])
        nextWords.append(allTokens[end])
    else:
        seqIgnored += 1

print("Number of sequences ignored: ", seqIgnored)
print("Number of remaining sequences: ", len(sequences))

t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)


Number of sequences ignored:  4443219
Number of remaining sequences:  1964241
Runtime for this cell in seconds:  19.494029760360718


In [17]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    """Shuffle samples and targets with one shared permutation, then split them.

    Args:
        sentences_original: Indexable collection of input samples.
        next_original: Indexable collection of targets, aligned by index with
            ``sentences_original``.
        percentage_test: Percentage (0-100) of the samples placed in the test
            split; the remainder forms the training split.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` as lists.
    """
    print('Shuffling sentences')

    # A single permutation drawn from NumPy's global RNG keeps inputs and
    # targets paired after shuffling.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[idx] for idx in order]
    shuffled_next = [next_original[idx] for idx in order]

    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_next[:cut_index]
    y_test = shuffled_next[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)


# This statement rebinds `nextWords` to the training targets only, so running
# the cell a second time would pair the full `sequences` list with a shorter
# target list. Fail early with a clear message instead of an IndexError deep
# inside the shuffle.
assert len(sequences) == len(nextWords), (
    "sequences and nextWords are misaligned; "
    "re-run the sequence-building cell before splitting again"
)
(sentences, nextWords), (testSentences, testNextWords) = shuffle_and_split_training_set(sequences, nextWords)

Shuffling sentences
Size of training set = 1924956
Size of test set = 39285


In [18]:
# model
def get_model(dropout=0.2, seq_len=None, vocab_size=None, lstm_units=128):
    """Build (without compiling) the bidirectional-LSTM next-word classifier.

    Layer stack: Bidirectional(LSTM) -> optional Dropout -> Dense(vocab_size)
    -> softmax.

    Args:
        dropout: Dropout rate applied after the recurrent layer; the Dropout
            layer is omitted when the value is not positive.
        seq_len: Number of time steps per input sample. Defaults to the
            notebook-level ``seqLen``.
        vocab_size: Size of the last input dimension and of the output layer.
            Defaults to ``len(words)``.
        lstm_units: Number of units of the wrapped LSTM layer.

    Returns:
        The uncompiled ``Sequential`` model expecting inputs of shape
        ``(seq_len, vocab_size)`` per sample (presumably one-hot encoded
        tokens; confirm against the batch generator).
    """
    # Resolve the defaults at call time so that existing `get_model()` calls
    # keep reading the notebook globals exactly as before.
    if seq_len is None:
        seq_len = seqLen
    if vocab_size is None:
        vocab_size = len(words)

    print('Build model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(lstm_units), input_shape=(seq_len, vocab_size)))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    return model


# Instantiate the network with its default settings, then configure training:
# Adam optimizer, categorical cross-entropy loss, accuracy as the metric.
model = get_model()
model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



Build model...


In [None]:
# Empty placeholder; not referenced anywhere else in the visible code.
# NOTE(review): presumably meant to hold the ModelCheckpoint file path — confirm.
checkPath = ""

In [None]:
# The code below is too slow.
# freqDist = nltk.FreqDist(allTokens)
# words = freqDist.most_common(50000)
#
# afterRareTokens = [word for word in allTokens if word in words]
#
# charSet = set(afterRareTokens)
# chars = sorted(list(charSet))
# char_to_int = dict((c, i) for i, c in enumerate(chars))
# int_to_char = dict((i, c) for i, c in enumerate(chars))
#
# numWords = len(afterRareTokens)
# numVocab = len(charSet)
#
# print(afterRareTokens)
#
# print("Number of words: ", numWords)
# print("Vocab size: ", numVocab)

# seqLen = 100
# dataX = []
# dataY = []
# for i in range(0, numWords - seqLen, 1):
#     seqIn = allTokens[i:i + seqLen]
#     seqOut = allTokens[i + seqLen]
#     dataX.append([char_to_int[char] for char in seqIn])
#     dataY.append(char_to_int[seqOut])
# nPatterns = len(dataX)
# print("Total Patterns: ", nPatterns)
#
# # reshape X to be [samples, time steps, features]
# X = np.reshape(dataX, (nPatterns, seqLen, 1))
# # normalize
# X = X / float(numVocab)
# # one hot encode the output variable
# y = to_categorical(dataY)
# print(y)
# print(y.shape)