In [46]:
import pathlib
import os
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
from keras.utils import pad_sequences
import tensorflow as tf
from numpy.random import seed
import pandas as pd
import numpy as np
# simplified chinese tokenizer
import jieba
import time
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import re
import nltk


# Seed NumPy's global RNG and TensorFlow's RNG.
seed(1)
tf.random.set_seed(2)

# Full-width / CJK punctuation characters (plus a few ASCII ones). In the
# visible notebook this is only referenced by a commented-out regex.
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.《》（）+-=()""''/="

# Entry names to skip directly under the corpus root: "0" and "a".."z".
# Use an empty list to scan everything.
SKIP = list("0abcdefghijklmnopqrstuvwxyz")

# Start of the wall-clock measurement for the preprocessing cell.
t1 = time.time()

def get_all_items(root: pathlib.Path, exclude):
    """Collect the entries found under ``root``.

    Entries directly under ``root`` whose name is in ``exclude`` are skipped.
    Each remaining sub-directory contributes ONE nested list (built by a
    recursive call with an empty exclude list, so exclusions apply only at
    this level); every other entry contributes its ``pathlib.Path``.

    Returns a list mixing ``Path`` objects and nested lists, in the order
    produced by ``root.iterdir()``.
    """
    collected = []
    for entry in root.iterdir():
        if entry.name not in exclude:
            collected.append(get_all_items(entry, []) if entry.is_dir() else entry)
    return collected


# begin preprocessing
def _flatten_paths(items):
    """Yield every non-list entry of an arbitrarily nested list, depth-first, in order."""
    for entry in items:
        if isinstance(entry, list):
            yield from _flatten_paths(entry)
        else:
            yield entry


largeDir = pathlib.Path("./Books")
# get_all_items returns a nested list per sub-directory. Flatten recursively so
# that files sitting directly under largeDir and directories nested more than
# one level deep are both handled (a single-level flatten fails on either).
BookList = list(_flatten_paths(get_all_items(largeDir, SKIP)))


# clean the dataset
# for path in BookList:
#     print(path)
#     file = open(path, 'r')
#     try:
#         fileStr = file.read()
#     except UnicodeDecodeError as error:
#         file.close()
#         os.remove(path)
#     continue

# Read every book as GBK text and concatenate in BookList order. Collecting the
# pieces and joining once avoids repeated string re-allocation.
bookTexts = []
for path in BookList:
    with open(path, 'r', encoding='gbk') as fiction:
        bookTexts.append(fiction.read())
bigString = "".join(bookTexts)
del bookTexts

# methods to strip punctuation and symbols
# bigString = re.sub(r"[%s]+" %punc, "", bigString)
# Drop every character that is neither a word character nor whitespace.
bigString = re.sub(r'[^\w\s]', '', bigString)

# list of the words in their original order
allTokens = jieba.lcut(bigString, cut_all=False)
t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)
print("Corpus length in words: ", len(allTokens))

Runtime for this cell in seconds:  164.67968273162842
Corpus length in words:  24253160


In [47]:
# Vocabulary frequency window: tokens seen fewer than minFreq or more than
# maxFreq times are excluded from the vocabulary.
minFreq = 300
# got rid of the 4 most common words
# NOTE(review): the claim above is the original author's; it cannot be
# confirmed from this cell alone.
maxFreq = 5000000

# Count how often each token occurs in the corpus.
wordFreq = {}
for token in allTokens:
    wordFreq[token] = wordFreq.get(token, 0) + 1

# Tokens to exclude: outside the frequency window, or made only of ASCII
# characters (str.isascii).
skipWords = {
    token
    for token, count in wordFreq.items()
    if count < minFreq or count > maxFreq or token.isascii()
}

# skipWords.remove("\n")
# Keep the single-space token in the vocabulary. discard() is used instead of
# remove() so a corpus without a " " token does not raise KeyError.
skipWords.discard(" ")

# The keys of wordFreq are exactly the distinct tokens, so there is no need to
# scan allTokens a second time.
words = set(wordFreq)
print("Unique words before filter: ", len(words))
print("To reduce vocab size, neglect words with appearances < ", minFreq)
print("To reduce vocab size, neglect words with appearances > ", maxFreq)
words = sorted(words - skipWords)
print("Unique words after filter: ", len(words))

# Bidirectional lookup tables between a word and its position in `words`.
word_indices = {word: index for index, word in enumerate(words)}
indices_word = dict(enumerate(words))

Unique words before filter:  235561
To reduce vocab size, neglect words with appearances <  300
To reduce vocab size, neglect words with appearances >  5000000
Unique words after filter:  6535


In [48]:
import codecs

words_file_path = "vocab.txt"

# Write one vocabulary word per line, GBK-encoded. The `with` block guarantees
# the file is closed even if a write raises (for example an encoding error).
with codecs.open(words_file_path, 'w', encoding='gbk') as words_file:
    for w in words:
        # A bare newline token is written as-is (it already ends the line);
        # every other word gets a trailing newline appended.
        words_file.write(w if w == "\n" else w + "\n")

In [49]:
# vocabFile = "vocab.txt"
#
# with open(vocabFile, 'r') as vocabulary:
#     vocab = []
#     for line in vocabulary:
#         tmp_line = line.rstrip("\n")
#         vocab.append(tmp_line)
#
# word_indices = dict((c, i) for i, c in enumerate(vocab))
# indices_word = dict((i, c) for i, c in enumerate(vocab))
# print(word_indices)

In [50]:
# Dump the complete word -> index mapping for inspection.
# NOTE(review): this prints every vocabulary entry; consider printing a slice.
print(word_indices)

{' ': 0, '\u3000': 1, '一': 2, '一一': 3, '一丁点': 4, '一万': 5, '一下': 6, '一下子': 7, '一世': 8, '一丝': 9, '一丝丝': 10, '一两个': 11, '一个': 12, '一个个': 13, '一个多月': 14, '一个月': 15, '一举': 16, '一事': 17, '一些': 18, '一人': 19, '一代': 20, '一件': 21, '一份': 22, '一众': 23, '一会': 24, '一会儿': 25, '一位': 26, '一倍': 27, '一共': 28, '一具': 29, '一出': 30, '一击': 31, '一刀': 32, '一分': 33, '一切': 34, '一切都是': 35, '一刻': 36, '一刻钟': 37, '一剑': 38, '一副': 39, '一动': 40, '一动不动': 41, '一千': 42, '一半': 43, '一双': 44, '一双眼': 45, '一变': 46, '一口': 47, '一口气': 48, '一句': 49, '一只': 50, '一号': 51, '一叹': 52, '一同': 53, '一名': 54, '一后': 55, '一向': 56, '一命': 57, '一品': 58, '一喜': 59, '一回': 60, '一团': 61, '一圈': 62, '一地': 63, '一场': 64, '一块': 65, '一堆': 66, '一声': 67, '一声声': 68, '一处': 69, '一夜': 70, '一大': 71, '一大群': 72, '一天': 73, '一头': 74, '一套': 75, '一如': 76, '一定': 77, '一家': 78, '一对': 79, '一尊': 80, '一小': 81, '一层': 82, '一届': 83, '一巴掌': 84, '一带': 85, '一幕': 86, '一年': 87, '一座': 88, '一张': 89, '一息': 90, '一惊': 91, '一想': 92, '一战': 93, '一手': 94, '一批': 95, '一把': 96, '一抖': 97, '一抹': 98

In [51]:
t1 = time.time()
seqLen = 10   # number of input tokens per training sequence
step = 1      # stride between consecutive window starts
sequences = []
nextWords = []
seqIgnored = 0

# Slide a window of seqLen + 1 tokens over the corpus. A window is kept only
# when none of its tokens (inputs or target) is in skipWords.
for start in range(0, len(allTokens) - seqLen, step):
    window = allTokens[start:start + seqLen + 1]
    if skipWords.isdisjoint(window):
        sequences.append(window[:seqLen])
        nextWords.append(window[seqLen])
    else:
        seqIgnored += 1

print("Number of sequences ignored: ", seqIgnored)
print("Number of remaining sequences: ", len(sequences))

t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)


Number of sequences ignored:  20497103
Number of remaining sequences:  3756047
Runtime for this cell in seconds:  39.271031618118286


In [52]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=5):
    """Shuffle two parallel lists together and split them into train/test parts.

    Args:
        sentences_original: list of input sequences.
        next_original: list of targets, index-aligned with ``sentences_original``.
        percentage_test: percentage (0-100) of the items placed in the test split.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` as plain lists. The order
        comes from one ``np.random.permutation`` call, so it follows NumPy's
        global random state.
    """
    print('Shuffling sentences')

    # One permutation drives both lists so every pair stays aligned.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[idx] for idx in order]
    shuffled_next = [next_original[idx] for idx in order]

    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_next[:cut_index]
    y_test = shuffled_next[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)


# Shuffle the (sequence, next word) pairs together and split them with the
# function's default percentage_test (5), giving train and test lists.
(sentences, nextWordsTrain), (testSentences, testNextWords) = shuffle_and_split_training_set(sequences, nextWords)

Shuffling sentences
Size of training set = 3568244
Size of test set = 187803


In [53]:
print(testSentences[:10])
print(testNextWords[:10])

# Output files for the train/test splits (read back in the following cell).
sentences_train = "sentences_train.txt"
nextWords_train = "nextWords_train.txt"
sentences_tests = "sentences_tests.txt"
nextWords_tests = "nextWords_tests.txt"


def _dump_lines(target_path, rows):
    """Write each item of ``rows`` to ``target_path`` followed by a newline."""
    with open(target_path, "w") as handle:
        handle.writelines(f"{row}\n" for row in rows)


# Sequences are stored as comma-joined tokens, one sequence per line;
# next-word targets are stored one word per line.
_dump_lines(sentences_train, (",".join(sentence) for sentence in sentences))
_dump_lines(sentences_tests, (",".join(sentence) for sentence in testSentences))
_dump_lines(nextWords_train, nextWordsTrain)
_dump_lines(nextWords_tests, testNextWords)

[['找', '我', '谈', '生意', '至少', '要', '尊重', '一下', '我', '难道'], ['没有', '时空', '之门', '的', '印记', '快', '过来', '和', '我们', '一起'], ['天尊', '都', '要', '强大', '很多', '怎么', '会', '关注', '自己', '这种'], ['一团', '模糊', '的', '影子', '撞', '向', '了', '另', '一位', '九品'], ['\u3000', '\u3000', '沉重', '的', '气氛', '被', '徐素', '的话', '给', '破坏'], ['间', '就', '过去', '了', '沈翔', '悄悄', '离开', '了', '丹香', '桃源'], ['的', '魔物', '只不过', '是', '最', '弱小', '的', '一些', '而已', '越'], ['压根', '没有', '察觉', '泛', '东流', '的', '异样', '但', '陈雪', '和'], ['你', '是从', '哪里', '弄', '来', '的', '沈翔', '看', '了', '看'], ['面对', '这', '家伙', '东皇', '陛下', '居然', '是', '一脸', '的', '无奈']]
['就', '夏', '等级', '武尊', '了', '他', '深入', '陈昊', '那', '对']


In [54]:
def _load_lines(source_path):
    """Return the lines of ``source_path`` with trailing newlines stripped."""
    with open(source_path, "r") as handle:
        return [raw.rstrip("\n") for raw in handle]


# Sequence files hold comma-joined tokens, so split each line back into a list;
# next-word files hold one word per line.
lines = [row.split(",") for row in _load_lines(sentences_train)]
answers = _load_lines(nextWords_train)
lines_tests = [row.split(",") for row in _load_lines(sentences_tests)]
answers_tests = _load_lines(nextWords_tests)


print(lines_tests[:10])
print(answers_tests[:10])

[['找', '我', '谈', '生意', '至少', '要', '尊重', '一下', '我', '难道'], ['没有', '时空', '之门', '的', '印记', '快', '过来', '和', '我们', '一起'], ['天尊', '都', '要', '强大', '很多', '怎么', '会', '关注', '自己', '这种'], ['一团', '模糊', '的', '影子', '撞', '向', '了', '另', '一位', '九品'], ['\u3000', '\u3000', '沉重', '的', '气氛', '被', '徐素', '的话', '给', '破坏'], ['间', '就', '过去', '了', '沈翔', '悄悄', '离开', '了', '丹香', '桃源'], ['的', '魔物', '只不过', '是', '最', '弱小', '的', '一些', '而已', '越'], ['压根', '没有', '察觉', '泛', '东流', '的', '异样', '但', '陈雪', '和'], ['你', '是从', '哪里', '弄', '来', '的', '沈翔', '看', '了', '看'], ['面对', '这', '家伙', '东皇', '陛下', '居然', '是', '一脸', '的', '无奈']]
['就', '夏', '等级', '武尊', '了', '他', '深入', '陈昊', '那', '对']


In [55]:
# Spot-check one arbitrary index: the row re-read from disk (`lines`) should
# match the in-memory training sequence (`sentences`) at the same position.
print(lines[14948])
print(sentences[14948])

['嘛', '创道', '门', '掌教', '也', '走上', '来', '笑', '道', '里面']
['嘛', '创道', '门', '掌教', '也', '走上', '来', '笑', '道', '里面']


In [56]:
# model
def get_model(dropout=0.2):
    """Build the (uncompiled) next-word prediction network.

    Layers, in order: a bidirectional LSTM with 128 units per direction over
    inputs of shape ``(seqLen, len(words))``, an optional Dropout layer (only
    when ``dropout > 0``), a Dense layer with one unit per vocabulary word,
    and a softmax activation.

    Reads the module-level ``seqLen`` and ``words``.
    # NOTE(review): the input shape suggests one-hot encoded words — confirm
    # against the code that feeds the model.
    """
    print('Build model...')
    vocab_size = len(words)

    layers = [Bidirectional(LSTM(128), input_shape=(seqLen, vocab_size))]
    if dropout > 0:
        layers.append(Dropout(dropout))
    layers.extend([Dense(vocab_size), Activation('softmax')])

    return Sequential(layers)


# Build the network with the default dropout and compile it for multi-class
# next-word prediction (categorical cross-entropy, Adam, accuracy metric).
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()


Build model...
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_3 (Bidirectio  (None, 256)              6823936   
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 6535)              1679495   
                                                                 
 activation_3 (Activation)   (None, 6535)              0         
                                                                 
Total params: 8,503,431
Trainable params: 8,503,431
Non-trainable params: 0
_________________________________________________________________
None


In [57]:
# from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
# from keras.models import load_model
# import os, psutil
#
#
# seed(1)
# tf.random.set_seed(2)
#
# process = psutil.Process()
# print(process.memory_info().rss/1024/1024)
# print(psutil.virtual_memory())
#
#
# def sample(preds, temperature=1.0):
#     # helper function to sample an index from a probability array
#     preds = np.asarray(preds).astype('float64')
#     preds = np.log(preds) / temperature
#     exp_preds = np.exp(preds)
#     preds = exp_preds / np.sum(exp_preds)
#     probas = np.random.multinomial(1, preds, 1)
#     return np.argmax(probas)
#
# examples = "/kaggle/working/examples.txt"
#
# # os.remove("/kaggle/working/LSTM_Fic_model.h5")
# # os.remove("/kaggle/working/examples.txt")
# # os.remove("/kaggle/working/state.db")
#
#
# def on_epoch_end(epoch, logs):
#     # Function invoked at end of each epoch. Prints generated text.
#     examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)
#
#     # Randomly pick a seed sequence
#     seed_index = np.random.randint(len(sentences+testSentences))
#     seed = (sentences+testSentences)[seed_index]
#
#     for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
#         sentence = seed
#         examples_file.write('----- Diversity:' + str(diversity) + '\n')
#         examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
#         examples_file.write(' '.join(sentence))
#
#         for i in range(50):
#             x_pred = np.zeros((1, seqLen, len(words)))
#             for t, word in enumerate(sentence):
#                 x_pred[0, t, wordAsKey[word]] = 1.
#
#             preds = model.predict(x_pred, verbose=0)[0]
#             next_index = sample(preds, diversity)
#             next_word = intAsKey[next_index]
#
#             sentence = sentence[1:]
#             sentence.append(next_word)
#
#             examples_file.write(" "+next_word)
#         examples_file.write('\n')
#     examples_file.write('='*80 + '\n')
#     examples_file.flush()
#
#
# def generator(sentence_list, next_word_list, batch_size):
#     index = 0
#     while True:
#         x = np.zeros((batch_size, seqLen, len(words)), dtype=bool)
#         y = np.zeros((batch_size, len(words)), dtype=bool)
#         for i in range(batch_size):
#             for t, w in enumerate(sentence_list[index % len(sentence_list)]):
#                 x[i, t, wordAsKey[w]] = 1
#             y[i, wordAsKey[next_word_list[index % len(sentence_list)]]] = 1
#             index = index + 1
#         yield x, y
#
#
# checkPath = "/kaggle/working/LSTM_Fic_model.h5"
# BATCH_SIZE = 64
#
# checkpoint = ModelCheckpoint(checkPath, monitor='val_accuracy', save_best_only=True)
# print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)
# callbacks_list = [checkpoint, print_callback, reduce_lr]
#
#
# examples_file = open(examples, "a")
#
# # comment out below block if picking up training
# # model.fit(generator(sentences, nextWordsTrain, BATCH_SIZE),
# #                         steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
# #                         epochs=3,
# #                         callbacks=callbacks_list,
# #                         validation_data=generator(testSentences, testNextWords, BATCH_SIZE),
# #                         validation_steps=int(len(testSentences)/BATCH_SIZE) + 1)
#
#
# model_path = "/kaggle/input/lstm-10e/LSTM_Fic_model.h5"
#
# # already have 10 epochs
# # load the model after saving
# new_model = load_model(model_path)
# checkpoint = ModelCheckpoint(checkPath, monitor='val_accuracy', save_best_only=True)
# print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=0.00001)
# callbacks_list = [checkpoint, print_callback, reduce_lr]
#
#
# examples_file = open(examples, "a")
# new_model.fit(generator(sentences, nextWordsTrain, BATCH_SIZE),
#                         steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
#                         epochs=2,
#                         callbacks=callbacks_list,
#                         validation_data=generator(testSentences, testNextWords, BATCH_SIZE),
#                         validation_steps=int(len(testSentences)/BATCH_SIZE) + 1)
