In [None]:
import tensorflow as tf
import numpy as np
import os

# 语料预处理

In [None]:
def loaddicts(wordsweight_path,tokToInt_path,intToTok_path):
  """Load the word2vec weight matrix and the two vocabulary mappings.

  Args:
    wordsweight_path: Path of the ``.npy`` file holding the embedding weights.
    tokToInt_path: Path of the ``.npy`` file holding the pickled
      token -> index mapping.
    intToTok_path: Path of the ``.npy`` file holding the pickled
      index -> token mapping.

  Returns:
    Tuple ``(wordEmbedding, dict_tokToInt, dict_intToTok)``.
  """
  def _read_pickled_object(npy_path):
    # ``.item()`` unwraps the single Python object stored in the array.
    # allow_pickle=True can execute arbitrary code: only load trusted files.
    return np.load(npy_path, allow_pickle=True).item()

  embedding_matrix = np.load(wordsweight_path)
  token_to_index = _read_pickled_object(tokToInt_path)
  index_to_token = _read_pickled_object(intToTok_path)
  return embedding_matrix, token_to_index, index_to_token

In [None]:
def tokenized(cutpath,dict_tokToInt):
  """Convert a pre-segmented text file into a list of vocabulary indices.

  The file is read as UTF-8 and split on single spaces. Every piece found in
  ``dict_tokToInt`` is replaced by its index; pieces that are missing from the
  dictionary are reported on stdout and skipped.

  Args:
    cutpath: Path of the space-separated, already segmented text file.
    dict_tokToInt: Mapping from token to integer index.

  Returns:
    List with the indices of all in-vocabulary tokens, in file order.
  """
  with open(cutpath,'r',encoding='UTF-8') as f:
    words = f.read().split(' ')

  text_to_int = []
  for word in words:
    try:
      text_to_int.append(dict_tokToInt[word])
    except KeyError:
      # Only a genuinely missing word is tolerated. Anything else
      # (KeyboardInterrupt, a broken mapping, ...) must surface instead of
      # being silently reported as "out of vocabulary".
      print(word+"不在词典内")
  print("小说词数共",len(text_to_int))
  return text_to_int

In [None]:
# Build the tf.data pipeline of (input, target) sequence pairs.
def get_pairs(text_to_int,seq_length,batch_size):
  """Turn the index list into shuffled batches of (input, target) pairs.

  The indices are cut into chunks of ``seq_length + 1`` items; each chunk
  yields an input (all but the last item) and a target (all but the first
  item), i.e. the target is the input shifted by one step.

  Args:
    text_to_int: Sequence of token indices for the whole corpus.
    seq_length: Number of time steps in each input/target sequence.
    batch_size: Number of pairs per batch; incomplete batches are dropped.

  Returns:
    A ``tf.data.Dataset`` of ``(input, target)`` batches.
  """
  ids_dataset = tf.data.Dataset.from_tensor_slices(text_to_int)
  # Incomplete trailing chunks are dropped so every sequence has equal length.
  sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)
  pairs_num = len(text_to_int) // (seq_length+1)

  def split_input_target(sequence):
    # Input and target are the same chunk offset by one position.
    return sequence[:-1], sequence[1:]

  pairs = sequences.map(split_input_target)
  # reshuffle_each_iteration=False keeps one fixed order for every pass over
  # the dataset. With the default (True) the take()/skip() split applied in
  # split_datasets would pick different batches on each epoch, leaking
  # validation/test data into training. The buffer size must be positive,
  # hence the max() guard for very short corpora.
  shuffled = pairs.shuffle(max(pairs_num, 1), reshuffle_each_iteration=False)
  return shuffled.batch(batch_size, drop_remainder=True)

In [None]:
# Split the batched dataset into training, validation and test subsets.
def split_datasets(datasets):
  """Partition ``datasets`` in an 8:1:1 ratio.

  Args:
    datasets: Object supporting ``len()``, ``take(n)`` and ``skip(n)``.

  Returns:
    Tuple ``(train_ds, valid_ds, test_ds)``. The training part is the first
    ``int(0.8 * len)`` elements, validation the next ``int(0.1 * len)``, and
    the test part everything that remains.
  """
  total = len(datasets)
  n_train = int(0.8 * total)
  n_valid = int(0.1 * total)

  train_part = datasets.take(n_train)
  # Everything after the training portion is shared by validation and test.
  remainder = datasets.skip(n_train)
  return train_part, remainder.take(n_valid), remainder.skip(n_valid)

# 模型创建

In [None]:
# Word-level language model: Embedding -> LSTM -> Dense.
class MyModel_gui(tf.keras.Model):
  """Next-token prediction model built from an embedding, an LSTM and a Dense layer.

  Args:
    vocab_size: Number of tokens; size of the embedding input and of the
      output logits.
    embedding_dim: Width of the embedding vectors.
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch size declared on the embedding layer.
    wordEmbedding: Initial embedding weights passed to the embedding layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units, batch_size, wordEmbedding):
    # The base initializer must not receive the instance as an extra
    # positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[wordEmbedding],
        batch_input_shape=[batch_size,None])
    # return_state=True makes the layer also return its final states so that
    # call(..., return_state=True) can hand them back to the caller.
    self.LSTM = tf.keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        return_state=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Compute logits for every time step.

    Args:
      inputs: Batch of token index sequences.
      states: Optional initial LSTM states; when ``None`` the layer's
        ``get_initial_state`` result is used.
      return_state: When true, also return the LSTM states after the last
        time step.
      training: Forwarded to the layers.

    Returns:
      The logits, or ``(logits, final_states)`` when ``return_state`` is true.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.LSTM.get_initial_state(x)
    # With return_state=True the layer returns [sequence_output, *states].
    x, *final_states = self.LSTM(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      # Return the states produced by this call, not the ones fed in.
      return x, final_states
    return x

# 模型训练

In [None]:
# Loss: difference between the model's predictions and the true token ids.
def loss(y_true, y_pred):
  """Sparse categorical cross-entropy computed on raw logits.

  Args:
    y_true: Target token indices.
    y_pred: Unnormalised scores (logits) predicted by the model.

  Returns:
    The value of ``tf.keras.losses.sparse_categorical_crossentropy``.
  """
  # from_logits=True: y_pred is treated as logits, not as probabilities.
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
      y_true=y_true,
      y_pred=y_pred,
      from_logits=True)
  return crossentropy

In [None]:
# Build, compile and fit the model.
def train(vocab_size,embedding_dim,rnn_units,batch_size,train_ds,valid_ds,epochs,learning_rate,wordEmbedding):
  """Create a ``MyModel_gui`` and train it with Adam.

  Args:
    vocab_size, embedding_dim, rnn_units, batch_size, wordEmbedding:
      Forwarded unchanged to ``MyModel_gui``.
    train_ds: Dataset used for fitting.
    valid_ds: Dataset evaluated after every epoch.
    epochs: Number of training epochs.
    learning_rate: Learning rate of the Adam optimizer.

  Returns:
    Tuple ``(model, history)`` where ``history`` is the object returned by
    ``model.fit``.
  """
  model = MyModel_gui(
      vocab_size=vocab_size,
      embedding_dim=embedding_dim,
      rnn_units=rnn_units,
      batch_size=batch_size,
      wordEmbedding=wordEmbedding)

  adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
  model.compile(optimizer=adam, loss=loss)

  # validation_freq=1: run validation after every epoch.
  fit_options = dict(epochs=epochs, validation_data=valid_ds, validation_freq=1)
  history = model.fit(train_ds, **fit_options)
  return model, history

# 执行训练

In [None]:
def preprocessed(cutpath,wordsweight_path,tokToInt_path,intToTok_path,text_to_int_path,seq_length,batch_size):
  """Run the whole data preparation stage.

  Loads the embedding matrix and vocabulary mappings, converts the segmented
  corpus to indices, stores that index list at ``text_to_int_path`` and builds
  the train/validation/test datasets.

  Args:
    cutpath: Path of the segmented corpus, forwarded to ``tokenized``.
    wordsweight_path, tokToInt_path, intToTok_path: Forwarded to ``loaddicts``.
    text_to_int_path: Destination passed to ``np.save`` for the index list.
    seq_length, batch_size: Forwarded to ``get_pairs``.

  Returns:
    Tuple ``(wordEmbedding, dict_tokToInt, dict_intToTok, train_ds, valid_ds,
    test_ds)``.
  """
  loaded = loaddicts(wordsweight_path, tokToInt_path, intToTok_path)
  wordEmbedding, dict_tokToInt, dict_intToTok = loaded

  text_to_int = tokenized(cutpath, dict_tokToInt)
  # Persist the index sequence before building the datasets.
  np.save(text_to_int_path, text_to_int)

  all_batches = get_pairs(text_to_int, seq_length, batch_size)
  train_ds, valid_ds, test_ds = split_datasets(all_batches)
  return wordEmbedding, dict_tokToInt, dict_intToTok, train_ds, valid_ds, test_ds

In [None]:
# Run the full training pipeline.
def main(cutpath,wordsweight_path,tokToInt_path,intToTok_path,text_to_int_path,seq_length,batch_size,embedding_dim,rnn_units,learning_rate,epochs,weights_path='gui_best_weights.h5'):
  """Prepare the data, train the model, save its weights and evaluate it.

  Args:
    cutpath, wordsweight_path, tokToInt_path, intToTok_path, text_to_int_path,
      seq_length, batch_size: Forwarded to ``preprocessed``.
    embedding_dim, rnn_units, learning_rate, epochs: Forwarded to ``train``
      (together with ``batch_size``).
    weights_path: File the trained weights are written to. Defaults to the
      previously hard-coded ``'gui_best_weights.h5'``.

  Returns:
    Tuple ``(history, evaluation)``: the object returned by ``model.fit`` and
    the value returned by ``model.evaluate`` on the test split.
  """
  wordEmbedding,dict_tokToInt,_,train_ds,valid_ds,test_ds = preprocessed(
      cutpath,wordsweight_path,tokToInt_path,intToTok_path,text_to_int_path,seq_length,batch_size)
  vocab_size = len(dict_tokToInt)
  model,history = train(vocab_size,embedding_dim,rnn_units,batch_size,train_ds,valid_ds,epochs,learning_rate,wordEmbedding)
  model.save_weights(weights_path)
  # Named `evaluation` so the builtin eval() is not shadowed inside this function.
  evaluation = model.evaluate(test_ds)
  return history,evaluation

In [None]:
# NOTE(review): none of the arguments below is assigned in a visible cell of
# this notebook; they must be defined before this cell can run.
# NOTE(review): the result name `eval` shadows the builtin eval() for the rest
# of the notebook session.
history,eval = main(
    cutpath=cutpath,
    wordsweight_path=wordsweight_path,
    tokToInt_path=tokToInt_path,
    intToTok_path=intToTok_path,
    text_to_int_path=text_to_int_path,
    seq_length=seq_length,
    batch_size=batch_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    learning_rate=learning_rate,
    epochs=epochs,
)

# 小说生成

In [None]:
import jieba

In [None]:
# Map indices back to tokens and join them into one string.
def text_from_index(novel_ids,dicts_intToTok):
  """Decode a sequence of token indices into text.

  Args:
    novel_ids: Iterable of token indices.
    dicts_intToTok: Mapping from index to token string.

  Returns:
    The tokens concatenated in order, without any separator.

  Raises:
    KeyError: If an index is not present in ``dicts_intToTok``.
  """
  # str.join builds the result in one pass instead of re-allocating the
  # string on every "+=", and avoids shadowing the builtin id().
  return "".join(dicts_intToTok[token_id] for token_id in novel_ids)

In [None]:
# Convert a raw sentence (str) into a list of vocabulary indices.
def index_form_text(text,dicts_tokToInt):
  """Segment ``text`` with jieba and map each word to its index.

  Words that are not in ``dicts_tokToInt`` are reported on stdout and skipped,
  mirroring how ``tokenized`` treats the training corpus, instead of aborting
  with a ``KeyError``.

  Args:
    text: Sentence to convert.
    dicts_tokToInt: Mapping from token to integer index.

  Returns:
    List of indices of the in-vocabulary words, in sentence order.
  """
  index = []
  for word in jieba.lcut(text):
    if word in dicts_tokToInt:
      index.append(dicts_tokToInt[word])
    else:
      print(word+"不在词典内")
  return index

In [None]:
# Inference-only model used for text generation.
def generate_Model(embedding_dim,vocab_size,rnn_units,batch_size,ckpt_path):
  """Build an Embedding -> LSTM -> Dense model and load saved weights into it.

  Args:
    embedding_dim: Width of the embedding vectors.
    vocab_size: Number of tokens (embedding input size and output logits).
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch size of the stateful model.
    ckpt_path: Path of the weights file passed to ``load_weights``.

  Returns:
    The ``tf.keras`` Sequential model with the weights loaded.
  """
  gen_model = tf.keras.models.Sequential([
    # Word embedding layer
    tf.keras.layers.Embedding(input_dim=vocab_size,output_dim=embedding_dim,
                 batch_input_shape=[batch_size, None]),
    # LSTM layer; stateful so the hidden state carries over between calls
    tf.keras.layers.LSTM(units=rnn_units,return_sequences=True,stateful=True),
    # Fully connected layer producing one logit per vocabulary entry
    tf.keras.layers.Dense(vocab_size)
  ])
  # Build with the requested batch size (the original hard-coded 1, which
  # disagreed with batch_input_shape whenever batch_size != 1) and do so
  # before loading, so the variables exist when the weights are restored.
  gen_model.build(tf.TensorShape([batch_size, None]))
  gen_model.load_weights(ckpt_path)

  return gen_model

In [None]:
def load_para(path1,path2,path3):
  """Load the vocabulary mappings needed for generation.

  Args:
    path1: ``.npy`` file holding the pickled index -> token mapping.
    path2: ``.npy`` file holding the pickled token -> index mapping.
    path3: Path of the weights file; returned unchanged.

  Returns:
    Tuple ``(dicts_intToTok, dicts_tokToInt, vocab_size, ckpt_path)`` where
    ``vocab_size`` is the number of entries in the index -> token mapping.
  """
  # allow_pickle=True can execute arbitrary code: only load trusted files.
  index_to_token, token_to_index = (
      np.load(npy_path, allow_pickle=True).item() for npy_path in (path1, path2)
  )
  return index_to_token, token_to_index, len(index_to_token), path3

In [None]:
def gen_novel(model,start_text,words_num,dicts_ci,temperature):
  """Generate a token index sequence that starts with ``start_text``.

  The prompt is fed to the model one token per step; once the last prompt
  token has been fed, every step samples the next token from the model's
  logits and feeds it back in on the following step.

  Args:
    model: Callable model returning logits for a ``[1, 1]`` batch of indices.
    start_text: Prompt string, converted with ``index_form_text``.
    words_num: Number of model steps to run. When it is at least the prompt
      length, the result holds ``words_num + 1`` indices (prompt included).
    dicts_ci: Token -> index mapping used to encode the prompt.
    temperature: Positive divisor applied to the logits before sampling.

  Returns:
    List of token indices: the prompt followed by the sampled tokens.

  Raises:
    ValueError: If ``temperature`` is not positive, or if the prompt yields no
      token while ``words_num`` is positive.
  """
  if temperature <= 0:
    raise ValueError("temperature must be positive")
  start_index = index_form_text(start_text,dicts_ci)
  if words_num > 0 and not start_index:
    # Without at least one prompt token there is nothing to feed at step 0.
    raise ValueError("start_text produced no tokens")

  # A stateful RNN keeps its hidden state between calls; clear it so this
  # generation does not continue from whatever the model processed before.
  if hasattr(model, "reset_states"):
    model.reset_states()

  prompt_len = len(start_index)
  generateText = []
  for i in range(words_num):
    if i < prompt_len:
      generateText.append(start_index[i])
    # Shape [1, 1]: a batch with one sequence of one token.
    input_ids = tf.expand_dims([generateText[i]], axis=0)
    predictions = model(input_ids)

    # Drop the batch axis so the logits are [1, vocab_size] for sampling.
    predictions = tf.squeeze(predictions, 0)
    predictions /= temperature

    # Draw one sample from the categorical distribution defined by the logits.
    sampled_indices = tf.random.categorical(predictions, num_samples=1)

    # While the prompt is still being fed the prediction is discarded;
    # sampling starts with the prediction made from the last prompt token.
    if i >= prompt_len-1:
      generateText.append(int(sampled_indices.numpy()[0][0]))
  return generateText

In [None]:
def generate(start_words,temperature,gen_num,
             embedding_dim=128,rnn_units=1024,batch_size=1,
             intToTok_path='/content/drive/MyDrive/LSTM/word-LSTM/word2vec/npy_etc/gui_dict_intToTok.npy',
             tokToInt_path='/content/drive/MyDrive/LSTM/word-LSTM/word2vec/npy_etc/gui_dict_tokToInt.npy',
             weights_path='/content/drive/MyDrive/LSTM/word-LSTM/word2vec/model_save/gui_best_weights.h5'):
  """Load the saved model and print a text continuation of ``start_words``.

  Args:
    start_words: Prompt string.
    temperature: Sampling temperature forwarded to ``gen_novel``.
    gen_num: Number of generation steps forwarded to ``gen_novel``.
    embedding_dim: Embedding width of the generation model.
    rnn_units: Number of LSTM units of the generation model.
      NOTE(review): embedding_dim and rnn_units presumably must match the
      values used when the weights were trained - confirm against the
      training configuration.
    batch_size: Batch size of the generation model.
    intToTok_path: ``.npy`` file with the index -> token mapping.
    tokToInt_path: ``.npy`` file with the token -> index mapping.
    weights_path: Weights file loaded into the generation model.

  Returns:
    The generated text (it is also printed).
  """
  dicts_ic,dicts_ci,vocab_size,ckpt_path = load_para(intToTok_path,tokToInt_path,weights_path)
  model_gen = generate_Model(embedding_dim,vocab_size,rnn_units,batch_size,ckpt_path)
  generateText = gen_novel(model_gen,start_words,gen_num,dicts_ci,temperature)
  finalText = text_from_index(generateText,dicts_ic)
  print(finalText)
  return finalText

In [None]:
# Generation settings used by the sampling loop in the next cell.
start_words="我的话刚说了一半，便听一声巨响，顶门的木椅突然被撞成了数断，"  # prompt text passed to generate()
temperature = 0.8  # logits are divided by this value before sampling (see gen_novel)
gen_num = 800  # forwarded to gen_novel as words_num

In [None]:
# Print five sampled continuations of the same prompt, separated by a line of
# asterisks. NOTE: generate() reloads the dictionaries and rebuilds the model
# on every call.
for i in range(5):
  print('*'*30)
  generate(start_words,temperature,gen_num)