In [3]:
import pathlib
import os
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import tensorflow as tf
from numpy.random import seed
import numpy as np
import jieba
import time
from keras.callbacks import ModelCheckpoint
import re



# Seed NumPy's global RNG so the shuffle used for the train/test split is repeatable.
seed(1)
# tf.random.set_seed(2)

# Full-width and ASCII punctuation characters (only referenced by a commented-out
# cleaning step further down). The two ASCII double quotes are escaped: left
# unescaped they close the literal early, and the quote character silently
# drops out of the set.
punc = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.《》（）+-=()\"\"''/="

# Names of top-level entries under the corpus directory that are skipped.
# Use SKIP = [] to include everything.
SKIP = ["0", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]

# Start of the wall-clock timer reported at the end of this cell.
t1 = time.time()

def get_all_items(root: pathlib.Path, exclude):
    """Collect the contents of ``root``, recursing into sub-directories.

    Args:
        root: Directory to scan.
        exclude: Entry names to skip directly under ``root``. The exclusion is
            not propagated: sub-directories are scanned with an empty list.

    Returns:
        A list with one element per non-excluded entry of ``root``: the
        ``Path`` itself for a non-directory entry, or a nested list (built by
        the same rule) for a directory. Entries are in sorted path order.
    """
    itemList = []
    # iterdir() yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the resulting corpus order reproducible across runs and machines.
    for item in sorted(root.iterdir()):
        if item.name in exclude:
            continue
        if item.is_dir():
            itemList.append(get_all_items(item, []))
        else:
            itemList.append(item)
    return itemList


# begin preprocessing
largeDir = pathlib.Path("./Books")


def _flatten_items(items):
    """Return the non-list leaves of an arbitrarily nested list, in order."""
    flat = []
    for entry in items:
        if isinstance(entry, list):
            flat.extend(_flatten_items(entry))
        else:
            flat.append(entry)
    return flat


# get_all_items() returns a nested list for every directory and a bare Path for
# every file. A single-level flatten raises TypeError on a file that sits
# directly under largeDir and leaves nested lists for deeper trees, so flatten
# recursively instead.
BookList = _flatten_items(get_all_items(largeDir, SKIP))


# clean the dataset
# for path in BookList:
#     print(path)
#     file = open(path, 'r')
#     try:
#         fileStr = file.read()
#     except UnicodeDecodeError as error:
#         file.close()
#         os.remove(path)
#     continue

# Read every book (GBK-encoded) and concatenate the texts in BookList order.
book_texts = []
for book_path in BookList:
    with open(book_path, 'r', encoding='gbk') as book:
        book_texts.append(book.read())
bigString = "".join(book_texts)

# Strip everything that is neither a word character nor whitespace, then
# underscores (which count as word characters), then all newlines.
non_word_pattern = re.compile(r'[^\w\s]')
underscore_pattern = re.compile(r'_')
newline_pattern = re.compile(r'(\n+)')

bigString = non_word_pattern.sub('', bigString)
cleaned = underscore_pattern.sub('', bigString)
cleaned1 = newline_pattern.sub('', cleaned)

# Segment the cleaned text with jieba; tokens stay in their original order.
allTokens = jieba.lcut(cleaned1, cut_all=False)

t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)
print("Corpus length in words: ", len(allTokens))

Runtime for this cell in seconds:  113.72538256645203
Corpus length in words:  14335894


In [4]:
# Frequency bounds: tokens seen fewer than minFreq or more than maxFreq times
# are left out of the vocabulary.
minFreq = 1000
maxFreq = 50000000

# Occurrence count of every token in the corpus.
wordFreq = {}
for token in allTokens:
    wordFreq[token] = wordFreq.get(token, 0) + 1

# Tokens excluded from the vocabulary: frequency out of bounds, or pure ASCII.
skipWords = set()
for k, v in wordFreq.items():
    if v < minFreq or v > maxFreq or k.isascii():
        skipWords.add(k)

# Keep the plain space in the vocabulary even though it is ASCII. discard()
# rather than remove(): a corpus without any space token must not raise
# KeyError here.
skipWords.discard(" ")

# The keys of wordFreq are exactly the distinct tokens of the corpus.
words = set(wordFreq)
print("Unique words before filter: ", len(words))
print("To reduce vocab size, neglect words with appearances < ", minFreq)
print("To reduce vocab size, neglect words with appearances > ", maxFreq)
words = sorted(words - skipWords)
print("Unique words after filter: ", len(words))

# Two-way lookup between vocabulary words and their integer indices.
word_indices = {word: index for index, word in enumerate(words)}
indices_word = {index: word for index, word in enumerate(words)}

Unique words before filter:  194525
To reduce vocab size, neglect words with appearances <  1000
To reduce vocab size, neglect words with appearances >  50000000
Unique words after filter:  1490


In [5]:
import codecs

words_file_path = "vocab.txt"

# Write the vocabulary one word per line, GBK-encoded. The context manager
# closes the file even if a write fails (for example on a character that GBK
# cannot encode); a manual close() would be skipped in that case.
with codecs.open(words_file_path, 'w', encoding='gbk') as words_file:
    for w in words:
        # A bare newline token is written once as itself instead of as an
        # empty entry followed by a line terminator.
        words_file.write(w if w == "\n" else w + "\n")

In [6]:
# vocabFile = "vocab.txt"
#
# with open(vocabFile, 'r') as vocabulary:
#     vocab = []
#     for line in vocabulary:
#         tmp_line = line.rstrip("\n")
#         vocab.append(tmp_line)
#
# word_indices = dict((c, i) for i, c in enumerate(vocab))
# indices_word = dict((i, c) for i, c in enumerate(vocab))
# print(word_indices)

In [7]:
# print(word_indices)

In [8]:
t1 = time.time()

# Slide a window of seqLen context tokens plus one target token over the corpus.
seqLen = 10
step = 1
sequences = []
nextWords = []
seqIgnored = 0

for start in range(0, len(allTokens) - seqLen, step):
    window = allTokens[start:start + seqLen + 1]
    # Keep the window only if none of its tokens (context or target) is filtered out.
    if skipWords.isdisjoint(window):
        sequences.append(window[:seqLen])
        nextWords.append(window[seqLen])
    else:
        seqIgnored += 1

print("Number of sequences ignored: ", seqIgnored)
print("Number of remaining sequences: ", len(sequences))

t2 = time.time()
print("Runtime for this cell in seconds: ", t2 - t1)


Number of sequences ignored:  13751964
Number of remaining sequences:  583920
Runtime for this cell in seconds:  23.012579202651978


In [9]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=30):
    """Shuffle sequences and their targets together, then split train/test.

    Args:
        sentences_original: List of input sequences.
        next_original: List of targets; element ``i`` belongs to sequence ``i``.
        percentage_test: Percentage of the data placed in the test split.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` as plain lists.
    """
    print('Shuffling sentences')

    # One permutation drives both lists so each sequence keeps its target.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[idx] for idx in order]
    shuffled_next = [next_original[idx] for idx in order]

    train_fraction = 1.-(percentage_test/100.)
    cut_index = int(len(sentences_original) * train_fraction)

    x_train = shuffled_sentences[:cut_index]
    x_test = shuffled_sentences[cut_index:]
    y_train = shuffled_next[:cut_index]
    y_test = shuffled_next[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)


# Split the (sequence, next word) pairs with the default 30% test share.
train_split, test_split = shuffle_and_split_training_set(sequences, nextWords)
sentences, nextWordsTrain = train_split
testSentences, testNextWords = test_split

Shuffling sentences
Size of training set = 408744
Size of test set = 175176


In [11]:
print(testSentences[:10])
print(testNextWords[:10])

# Output files for the shuffled train/test split (all GBK-encoded).
sentences_train = "sentences_train.txt"
nextWords_train = "nextWords_train.txt"
sentences_tests = "sentences_tests.txt"
nextWords_tests = "nextWords_tests.txt"

# Input sequences: one sequence per line, tokens joined by commas.
for out_path, rows in ((sentences_train, sentences), (sentences_tests, testSentences)):
    with open(out_path, "w", encoding='gbk') as out_file:
        out_file.writelines(",".join(row) + "\n" for row in rows)

# Targets: one next-word per line, in the same order as the sequences.
for out_path, targets in ((nextWords_train, nextWordsTrain), (nextWords_tests, testNextWords)):
    with open(out_path, "w", encoding='gbk') as out_file:
        out_file.writelines(f"{target}\n" for target in targets)

[['多', '但是', '对', '现在', '的', '杨清', '来说', '并', '不能', '提升'], ['我', '放弃', '了', '所谓', '的', '真', '爱', '我', '甚至', '离开'], ['不是', '没有', '一点', '作用', '了', '先', '不', '说', '你', '能'], ['修炼', '的', '状态', '中', '可是', '他', '感受', '到', '了', '在'], ['看着', '她', '\u3000', '\u3000', '至少', '苏辰', '做', '不到', '\u3000', '\u3000'], ['跟', '石林', '说话', '的', '时间', '还是', '有', '的', '特别', '是'], ['跑', '来', '指着', '王河', '大', '骂', '\u3000', '\u3000', '我', '不'], ['去', '看', '\u3000', '\u3000', '此时', '就算', '有', '再', '多', '的'], ['是', '不', '对', '有', '什么', '东西', '正在', '失去', '掌控', '\u3000'], ['哈哈', '王河', '有', '本事', '过来', '啊', '难道', '你', '怕', '了']]
['多少', '了', '不能', '外面', '只不过', '两人', '认识', '人', '\u3000', '么']


In [None]:
# Read the split back from disk. The files were written GBK-encoded, so the
# encoding is given explicitly: the platform default (often UTF-8) would raise
# UnicodeDecodeError or silently produce garbage.
with open(sentences_train, "r", encoding='gbk') as sentences_file:
    # One comma-separated token sequence per line.
    lines = [line.rstrip("\n").split(",") for line in sentences_file]

with open(nextWords_train, "r", encoding='gbk') as answers_file:
    # One target word per line.
    answers = [line.rstrip("\n") for line in answers_file]

with open(sentences_tests, "r", encoding='gbk') as sentences_tests_file:
    lines_tests = [line.rstrip("\n").split(",") for line in sentences_tests_file]

with open(nextWords_tests, "r", encoding='gbk') as answers_tests_file:
    answers_tests = [line.rstrip("\n") for line in answers_tests_file]


print(lines_tests[:10])
print(answers_tests[:10])

In [None]:
# Spot check: print the same training sequence as read back from disk and as
# held in memory, for comparison.
sample_index = 14948
for source in (lines, sentences):
    print(source[sample_index])

In [None]:
# q = 0
# trainDir = "./Books/Train/"
# for filename in os.listdir(trainDir):
#     fn = trainDir + str(q) + ".txt"
#     originalName = trainDir + filename
#     os.rename(originalName,fn)
#     q += 1

In [None]:
# model
def get_model(dropout=0.2):
    """Build the (uncompiled) next-word prediction network.

    Layers: a bidirectional LSTM with 128 units over inputs of shape
    ``(seqLen, len(words))``, an optional Dropout layer, and a Dense layer of
    size ``len(words)`` followed by a softmax activation. Reads the globals
    ``seqLen`` and ``words``.

    Args:
        dropout: Dropout rate; the Dropout layer is omitted when this is not
            greater than 0.

    Returns:
        The assembled ``Sequential`` model.
    """
    print('Build model...')
    vocab_size = len(words)

    layers = [Bidirectional(LSTM(128), input_shape=(seqLen, vocab_size))]
    if dropout > 0:
        layers.append(Dropout(dropout))
    layers.append(Dense(vocab_size))
    layers.append(Activation('softmax'))

    return Sequential(layers)


# Build the network with the default dropout and compile it for
# multi-class next-word classification.
model = get_model()
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()


In [None]:
# from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
# from keras.models import load_model
# import os, psutil
#
#
# seed(1)
# tf.random.set_seed(2)
#
# process = psutil.Process()
# print(process.memory_info().rss/1024/1024)
# print(psutil.virtual_memory())
#
#
# def sample(preds, temperature=1.0):
#     # helper function to sample an index from a probability array
#     preds = np.asarray(preds).astype('float64')
#     preds = np.log(preds) / temperature
#     exp_preds = np.exp(preds)
#     preds = exp_preds / np.sum(exp_preds)
#     probas = np.random.multinomial(1, preds, 1)
#     return np.argmax(probas)
#
# examples = "/kaggle/working/examples.txt"
#
# # os.remove("/kaggle/working/LSTM_Fic_model.h5")
# # os.remove("/kaggle/working/examples.txt")
# # os.remove("/kaggle/working/state.db")
#
#
# def on_epoch_end(epoch, logs):
#     # Function invoked at end of each epoch. Prints generated text.
#     examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)
#
#     # Randomly pick a seed sequence
#     seed_index = np.random.randint(len(sentences+testSentences))
#     seed = (sentences+testSentences)[seed_index]
#
#     for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
#         sentence = seed
#         examples_file.write('----- Diversity:' + str(diversity) + '\n')
#         examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
#         examples_file.write(' '.join(sentence))
#
#         for i in range(50):
#             x_pred = np.zeros((1, seqLen, len(words)))
#             for t, word in enumerate(sentence):
#                 x_pred[0, t, wordAsKey[word]] = 1.
#
#             preds = model.predict(x_pred, verbose=0)[0]
#             next_index = sample(preds, diversity)
#             next_word = intAsKey[next_index]
#
#             sentence = sentence[1:]
#             sentence.append(next_word)
#
#             examples_file.write(" "+next_word)
#         examples_file.write('\n')
#     examples_file.write('='*80 + '\n')
#     examples_file.flush()
#
#
# def generator(sentence_list, next_word_list, batch_size):
#     index = 0
#     while True:
#         x = np.zeros((batch_size, seqLen, len(words)), dtype=bool)
#         y = np.zeros((batch_size, len(words)), dtype=bool)
#         for i in range(batch_size):
#             for t, w in enumerate(sentence_list[index % len(sentence_list)]):
#                 x[i, t, wordAsKey[w]] = 1
#             y[i, wordAsKey[next_word_list[index % len(sentence_list)]]] = 1
#             index = index + 1
#         yield x, y
#
#
# checkPath = "/kaggle/working/LSTM_Fic_model.h5"
# BATCH_SIZE = 64
#
# checkpoint = ModelCheckpoint(checkPath, monitor='val_accuracy', save_best_only=True)
# print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=2, min_lr=0.0001)
# callbacks_list = [checkpoint, print_callback, reduce_lr]
#
#
# examples_file = open(examples, "a")
#
# # comment out below block if picking up training
# # model.fit(generator(sentences, nextWordsTrain, BATCH_SIZE),
# #                         steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
# #                         epochs=3,
# #                         callbacks=callbacks_list,
# #                         validation_data=generator(testSentences, testNextWords, BATCH_SIZE),
# #                         validation_steps=int(len(testSentences)/BATCH_SIZE) + 1)
#
#
# model_path = "/kaggle/input/lstm-10e/LSTM_Fic_model.h5"
#
# # already have 10 epochs
# # load the model after saving
# new_model = load_model(model_path)
# checkpoint = ModelCheckpoint(checkPath, monitor='val_accuracy', save_best_only=True)
# print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=0.00001)
# callbacks_list = [checkpoint, print_callback, reduce_lr]
#
#
# examples_file = open(examples, "a")
# new_model.fit(generator(sentences, nextWordsTrain, BATCH_SIZE),
#                         steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
#                         epochs=2,
#                         callbacks=callbacks_list,
#                         validation_data=generator(testSentences, testNextWords, BATCH_SIZE),
#                         validation_steps=int(len(testSentences)/BATCH_SIZE) + 1)
