In [6]:
import pathlib
import os
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
from keras.utils import pad_sequences
import tensorflow as tf
from numpy.random import seed
import pandas as pd
import numpy as np
# simplified chinese tokenizer
import jieba
import time
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import re
import nltk


seed(1)
tf.random.set_seed(2)
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.《》（）+-=()""''/="

# skipped directories
# SKIP = ["0", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
SKIP = ["0", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
# SKIP = []

t1 = time.time()

def get_all_items(root: pathlib.Path, exclude):
    itemList = []
    for item in root.iterdir():
        if item.name in exclude:
            continue
        if item.is_dir():
            itemList.append(get_all_items(item, []))
            continue
        itemList.append(item)
    return itemList


# begin preprocessing
largeDir = pathlib.Path("./Books")
# largeDir = pathlib.Path("./Books")
BookList = get_all_items(largeDir, SKIP)
BookList = [item for sublist in BookList for item in sublist]


# clean the dataset
# for path in BookList:
#     print(path)
#     file = open(path, 'r')
#     try:
#         fileStr = file.read()
#     except UnicodeDecodeError as error:
#         file.close()
#         os.remove(path)
#     continue

bigString = ""

for path in BookList:
    with open(path, 'r', encoding='gbk') as fiction:
        bigString += fiction.read()

# methods to strip punctuation and symbols
# bigString = re.sub(r"[%s]+" %punc, "", bigString)
bigString = re.sub(r'[^\w\s]', '', bigString)

# list of the words in their original order
allTokens = jieba.lcut(bigString, cut_all=False)
t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)
print("Corpus length in words: ", len(allTokens))

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\PC\AppData\Local\Temp\jieba.cache
Loading model cost 1.114 seconds.
Prefix dict has been built successfully.


Runtime for this cell in seconds:  111.84600043296814
Corpus length in words:  14712711


In [38]:
minFreq = 400
# got rid of the 4 most common words
maxFreq = 5000000
wordFreq = {}
for token in allTokens:
    wordFreq[token] = wordFreq.get(token, 0) + 1

# le -> 223736 de -> 578590
skipWords = set()
for k, v in wordFreq.items():
    if wordFreq[k] < minFreq or wordFreq[k] > maxFreq:    
        skipWords.add(k)
    elif k.isascii():
        skipWords.add(k)

# skipWords.remove("\n")
skipWords.remove(" ")
words = set(allTokens)
print("Unique words before filter: ", len(words))
print("To reduce vocab size, neglect words with appearances < ", minFreq)
print("To reduce vocab size, neglect words with appearances > ", maxFreq)
words = sorted(set(words) - skipWords)
print("Unique words after filter: ", len(words))

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))



Unique words before filter:  181227
To reduce vocab size, neglect words with appearances <  400
To reduce vocab size, neglect words with appearances >  5000000
Unique words after filter:  3515


In [70]:
print(word_indices)

{' ': 0, '\u3000': 1, '一': 2, '一下': 3, '一下子': 4, '一丝': 5, '一个': 6, '一个个': 7, '一个月': 8, '一些': 9, '一人': 10, '一代': 11, '一件': 12, '一份': 13, '一会': 14, '一位': 15, '一出': 16, '一击': 17, '一刀': 18, '一切': 19, '一刻': 20, '一剑': 21, '一副': 22, '一动': 23, '一千': 24, '一半': 25, '一双': 26, '一变': 27, '一口': 28, '一口气': 29, '一句': 30, '一只': 31, '一叹': 32, '一名': 33, '一向': 34, '一品': 35, '一团': 36, '一圈': 37, '一场': 38, '一块': 39, '一声': 40, '一处': 41, '一夜': 42, '一大': 43, '一天': 44, '一头': 45, '一套': 46, '一定': 47, '一家': 48, '一对': 49, '一层': 50, '一巴掌': 51, '一幕': 52, '一年': 53, '一座': 54, '一张': 55, '一惊': 56, '一战': 57, '一手': 58, '一批': 59, '一把': 60, '一抹': 61, '一招': 62, '一拳': 63, '一挥': 64, '一掌': 65, '一支': 66, '一方': 67, '一旁': 68, '一族': 69, '一旦': 70, '一时': 71, '一时间': 72, '一条': 73, '一枚': 74, '一枪': 75, '一株': 76, '一样': 77, '一根': 78, '一次': 79, '一次次': 80, '一步': 81, '一段': 82, '一段时间': 83, '一滴': 84, '一炉': 85, '一点': 86, '一点点': 87, '一片': 88, '一生': 89, '一番': 90, '一百': 91, '一直': 92, '一看': 93, '一眼': 94, '一瞬间': 95, '一种': 96, '一笑': 97, '一笔': 98, '一粒': 

In [72]:
vocabFile = "vocab.txt"

with open(vocabFile, 'r') as vocabulary:
    vocab = []
    for line in vocabulary:
        tmp_line = line.rstrip("\n")
        vocab.append(tmp_line)

word_indices = dict((c, i) for i, c in enumerate(vocab))
indices_word = dict((i, c) for i, c in enumerate(vocab))
print(word_indices)

{' ': 0, '\u3000': 1, '一': 2, '一下': 3, '一下子': 4, '一丝': 5, '一个': 6, '一个个': 7, '一个月': 8, '一些': 9, '一人': 10, '一代': 11, '一件': 12, '一份': 13, '一会': 14, '一位': 15, '一出': 16, '一击': 17, '一刀': 18, '一切': 19, '一刻': 20, '一剑': 21, '一副': 22, '一动': 23, '一千': 24, '一半': 25, '一双': 26, '一变': 27, '一口': 28, '一口气': 29, '一句': 30, '一只': 31, '一叹': 32, '一名': 33, '一向': 34, '一品': 35, '一团': 36, '一圈': 37, '一场': 38, '一块': 39, '一声': 40, '一处': 41, '一夜': 42, '一大': 43, '一天': 44, '一头': 45, '一套': 46, '一定': 47, '一家': 48, '一对': 49, '一层': 50, '一巴掌': 51, '一幕': 52, '一年': 53, '一座': 54, '一张': 55, '一惊': 56, '一战': 57, '一手': 58, '一批': 59, '一把': 60, '一抹': 61, '一招': 62, '一拳': 63, '一挥': 64, '一掌': 65, '一支': 66, '一方': 67, '一旁': 68, '一族': 69, '一旦': 70, '一时': 71, '一时间': 72, '一条': 73, '一枚': 74, '一枪': 75, '一株': 76, '一样': 77, '一根': 78, '一次': 79, '一次次': 80, '一步': 81, '一段': 82, '一段时间': 83, '一滴': 84, '一炉': 85, '一点': 86, '一点点': 87, '一片': 88, '一生': 89, '一番': 90, '一百': 91, '一直': 92, '一看': 93, '一眼': 94, '一瞬间': 95, '一种': 96, '一笑': 97, '一笔': 98, '一粒': 

In [39]:
t1 = time.time()
seqLen = 15
step = 1
sequences = []
nextWords = []
seqIgnored = 0
for i in range(0, len(allTokens) - seqLen, step):
    if len(set(allTokens[i:i+seqLen+1]).intersection(skipWords)) == 0:
        sequences.append(allTokens[i:i + seqLen])
        nextWords.append(allTokens[i + seqLen])
    else:
        seqIgnored += 1

print("Number of sequences ignored: ", seqIgnored)
print("Number of remaining sequences: ", len(sequences))

t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)


Number of sequences ignored:  14304996
Number of remaining sequences:  407700
Runtime for this cell in seconds:  31.199272632598877


In [40]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=5):
    # shuffle at unison
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []

    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)


(sentences, nextWordsTrain), (testSentences, testNextWords) = shuffle_and_split_training_set(sequences, nextWords)

Shuffling sentences
Size of training set = 387315
Size of test set = 20385


In [68]:
print(testSentences[:10])
print(testNextWords[:10])

sentences_train = "sentences_train.txt"
nextWords_train = "nextWords_train.txt"
sentences_tests = "sentences_tests.txt"
nextWords_tests = "nextWords_tests.txt"

with open(sentences_train, "w") as sentences_file:
    for sentence in sentences:
        tmp_sentence = " ".join(sentence)
        sentences_file.write(f"{tmp_sentence}\n")

with open(sentences_tests, "w") as sentences_tests_file:
    for sentence in testSentences:
        tmp_sentence = " ".join(sentence)
        sentences_tests_file.write(f"{tmp_sentence}\n")

with open(nextWords_train, "w") as nextWords_testFile:
    for nextword in nextWordsTrain:
        nextWords_testFile.write(f"{nextword}\n")

with open(nextWords_tests, "w") as nextWords_testFile:
    for nextword in testNextWords:
        nextWords_testFile.write(f"{nextword}\n")

[['答应', '过', '武轩', '要', '帮', '武轩', '炼制', '法则', '珠', '的', '所以', '他会', '先', '炼制', '武轩'], ['什么', '似的', '先前', '还', '在', '好奇', '究竟', '是', '什么', '逆天', '的', '资质', '才能', '被', '这'], ['我', '若', '消失', '你', '会', '不会', '想', '我', '反正', '以后', '要是', '看不到', '你', '我会', '非常'], ['的', '力量', '确实', '令人', '感到', '震撼', '特别', '是', '那些', '布置', '结界', '的', '掌教', '巨头', '他们'], ['\u3000', '\u3000', '可是', '我', '能够', '感觉', '到', '那', '都', '是', '真的', '真的', '是', '真的', '啊'], ['\u3000', '\u3000', '修为', '到', '了', '九品', '至尊', '这种', '地步', '轻易', '不会', '受伤', '但', '一旦', '受伤'], ['是', '感受', '到', '这股', '气息', '才', '立即', '跑', '开', '的', '原本', '他', '可是', '打算', '把'], ['中', '那么', '弱', '更何况', '只有', '你', '彻底', '恢复', '了', '才能', '在', '今后', '帮', '我', '更'], ['她', '从小', '也', '见', '过', '不少', '她', '爷爷', '布置', '的', '阵法', '要', '怎么', '开启', '阵法'], ['\u3000', '楚阳', '与', '乌倩倩', '严肃', '地', '点头', '不用', '她', '说', '两人', '也', '知道', '这次', '机会']]
['的', '两大', '的', '能', '谈', '就', '下面', '多', '她', '是']


In [69]:
with open(sentences_train, "r") as sentences_file:
    lines = []
    for line in sentences_file:
        tmp_line = line.rstrip("\n").split(" ")
        lines.append(tmp_line)

with open(nextWords_train, "r") as answers_file:
    answers = []
    for line in answers_file:
        tmp_line = line.rstrip("\n")
        answers.append(tmp_line)

with open(sentences_tests, "r") as sentences_tests_file:
    lines_tests = []
    for line in sentences_tests_file:
        tmp_line = line.rstrip("\n").split(" ")
        lines_tests.append(tmp_line)

with open(nextWords_tests, "r") as answers_tests_file:
    answers_tests = []
    for line in answers_tests_file:
        tmp_line = line.rstrip("\n")
        answers_tests.append(tmp_line)


print(lines_tests[:10])
print(answers_tests[:10])

[['答应', '过', '武轩', '要', '帮', '武轩', '炼制', '法则', '珠', '的', '所以', '他会', '先', '炼制', '武轩'], ['什么', '似的', '先前', '还', '在', '好奇', '究竟', '是', '什么', '逆天', '的', '资质', '才能', '被', '这'], ['我', '若', '消失', '你', '会', '不会', '想', '我', '反正', '以后', '要是', '看不到', '你', '我会', '非常'], ['的', '力量', '确实', '令人', '感到', '震撼', '特别', '是', '那些', '布置', '结界', '的', '掌教', '巨头', '他们'], ['\u3000', '\u3000', '可是', '我', '能够', '感觉', '到', '那', '都', '是', '真的', '真的', '是', '真的', '啊'], ['\u3000', '\u3000', '修为', '到', '了', '九品', '至尊', '这种', '地步', '轻易', '不会', '受伤', '但', '一旦', '受伤'], ['是', '感受', '到', '这股', '气息', '才', '立即', '跑', '开', '的', '原本', '他', '可是', '打算', '把'], ['中', '那么', '弱', '更何况', '只有', '你', '彻底', '恢复', '了', '才能', '在', '今后', '帮', '我', '更'], ['她', '从小', '也', '见', '过', '不少', '她', '爷爷', '布置', '的', '阵法', '要', '怎么', '开启', '阵法'], ['\u3000', '楚阳', '与', '乌倩倩', '严肃', '地', '点头', '不用', '她', '说', '两人', '也', '知道', '这次', '机会']]
['的', '两大', '的', '能', '谈', '就', '下面', '多', '她', '是']


In [None]:
# model
def get_model(dropout=0.2):
    print('Build model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(seqLen, len(words))))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    return model


model = get_model()
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

print(model.summary())


In [None]:
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.models import load_model
import os, psutil


seed(1)
tf.random.set_seed(2)

process = psutil.Process()
print(process.memory_info().rss/1024/1024)
print(psutil.virtual_memory())


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

examples = "/kaggle/working/examples.txt"

# os.remove("/kaggle/working/LSTM_Fic_model.h5")
# os.remove("/kaggle/working/examples.txt")
# os.remove("/kaggle/working/state.db")


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+testSentences))
    seed = (sentences+testSentences)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, seqLen, len(words)))
            for t, word in enumerate(sentence):
                x_pred[0, t, wordAsKey[word]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = intAsKey[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()


def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, seqLen, len(words)), dtype=bool)
        y = np.zeros((batch_size, len(words)), dtype=bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t, wordAsKey[w]] = 1
            y[i, wordAsKey[next_word_list[index % len(sentence_list)]]] = 1
            index = index + 1
        yield x, y


checkPath = "/kaggle/working/LSTM_Fic_model.h5"
BATCH_SIZE = 64

checkpoint = ModelCheckpoint(checkPath, monitor='val_accuracy', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)
callbacks_list = [checkpoint, print_callback, reduce_lr]


examples_file = open(examples, "a")

# comment out below block if picking up training
# model.fit(generator(sentences, nextWordsTrain, BATCH_SIZE),
#                         steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
#                         epochs=3,
#                         callbacks=callbacks_list,
#                         validation_data=generator(testSentences, testNextWords, BATCH_SIZE),
#                         validation_steps=int(len(testSentences)/BATCH_SIZE) + 1)


model_path = "/kaggle/input/lstm-10e/LSTM_Fic_model.h5"

# already have 10 epochs
# load the model after saving
new_model = load_model(model_path)
checkpoint = ModelCheckpoint(checkPath, monitor='val_accuracy', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=0.00001)
callbacks_list = [checkpoint, print_callback, reduce_lr]


examples_file = open(examples, "a")
new_model.fit(generator(sentences, nextWordsTrain, BATCH_SIZE),
                        steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                        epochs=2,
                        callbacks=callbacks_list,
                        validation_data=generator(testSentences, testNextWords, BATCH_SIZE),
                        validation_steps=int(len(testSentences)/BATCH_SIZE) + 1)
