<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/keras/WordLstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word-Level LSTM实现文本生成

## 准备工作

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, AlphaDropout
from keras import optimizers
from keras.callbacks import TensorBoard
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import time
import pickle

In [4]:
# Dataset to work on; must be one of the keys of max_lengths.
data_name = 'rockyou'
# Per-dataset cap on the encoded sequence length; longer sequences are
# dropped when the training sequences are built.
max_lengths = dict(myspace=35, phpbb=21, rockyou=41)
# random_state used for the train/validation/test split.
SEED = 7

In [5]:
# Paths of the grammar word list and the pickled tokenizer for the selected dataset.
word_list_file = '/content/' + data_name + '.txt'
tokenizer_file = '/content/' + data_name + '.pkl'
print(word_list_file)
print(tokenizer_file)

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load tokenizer files that come from a trusted source.
with open(tokenizer_file, 'rb') as tokenizer_fh:
    tokenizer = pickle.load(tokenizer_fh)

# One more than the number of tokenizer entries; presumably this leaves room
# for id 0, which is used as the padding value later on -- TODO confirm that
# tokenizer ids start at 1.
vocab_size = 1 + len(tokenizer.word_index)
max_length = max_lengths[data_name]

/content/rockyou.txt
/content/rockyou.pkl


## 加载数据

In [6]:
# Read the 'grammar' column of the word list file into a plain Python list.
grammar_column = pd.read_csv(word_list_file)['grammar']
word_list = grammar_column.values.tolist()

In [7]:
# Convert every encoded password into integer id sequences.
print("将编码后的密码转换为（整数）序列")
sequences = []
for grammar in word_list:  # e.g. 'L8 D1 '
    # Append the end marker, then map the text to integer ids.
    token_ids = tokenizer.texts_to_sequences([grammar + '<END>'])[0]
    # Drop passwords whose id sequence is longer than max_length.
    if len(token_ids) > max_length:
        continue
    # Every prefix becomes one sample: its last id is the target that is
    # split off below, the ids before it are the context.
    sequences.extend(token_ids[:end] for end in range(1, len(token_ids) + 1))

print('Total Sequences: %d' % len(sequences))

# Left-pad with zeros up to the longest sequence so all rows share one length.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# Build the model inputs and outputs.
print("创建LSTM模型的输入输出")
sequences = np.array(sequences)
x = sequences[:, :-1]  # context: everything except the last id
y = sequences[:, -1]   # target: the last id
print("X Shape: %s, y Shape: %s" % (x.shape, y.shape))
y = to_categorical(y, num_classes=vocab_size)  # one-hot encode the targets

将编码后的密码转换为（整数）序列
Total Sequences: 528028
Max Sequence Length: 41
创建LSTM模型的输入输出
X Shape: (528028, 40), y Shape: (528028,)


### 划分训练集、验证集和测试集

In [8]:
def split_xy(x, y):
    """Split samples and targets into train, validation and test subsets.

    The training share is 0.9 when there are more than 100000 samples and
    0.6 otherwise. The held-out remainder is divided evenly between the
    validation and the test subset. Both splits use ``random_state=SEED``.

    Args:
        x: Input samples; must support ``len()`` and be accepted by
            ``train_test_split``.
        y: Targets aligned with ``x``.

    Returns:
        The tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    train_ratio = 0.9 if len(x) > 100000 else 0.6  # training share

    print("划分训练集、验证集和测试集")
    # First cut: training data versus everything that is held out.
    x_train, x_rest, y_train, y_rest = train_test_split(
        x, y, test_size=1 - train_ratio, random_state=SEED)
    # Second cut: halve the held-out data into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(
        x_rest, y_rest, test_size=0.5, random_state=SEED)

    print("x_train Shape: %s, y_train Shape: %s" % (x_train.shape, y_train.shape))
    print("x_val Shape: %s, y_val Shape: %s" % (x_val.shape, y_val.shape))
    print("x_test Shape: %s, y_test Shape: %s" % (x_test.shape, y_test.shape))

    return x_train, x_val, x_test, y_train, y_val, y_test

# Split the full dataset once; the resulting arrays feed the training and evaluation cell below.
x_train, x_val, x_test, y_train, y_val, y_test = split_xy(x, y)

划分训练集、验证集和测试集
x_train Shape: (475225, 40), y_train Shape: (475225, 251)
x_val Shape: (26401, 40), y_val Shape: (26401, 251)
x_test Shape: (26402, 40), y_test Shape: (26402, 251)


## 创建和训练模型

In [9]:
# Network architecture.
lstm_layers = [[32, True], [32, True], [32, False]]  # [units, return_sequences] per LSTM layer
fully_layers = [32]  # units of each dense layer placed before the softmax output
dropout_p = 0.5  # only referenced by the commented-out AlphaDropout layer

# Training schedule.
epochs = 30
batch_size = 128

### 创建模型

In [13]:
print('{:*^106}'.format('创建LSTM模型'))
# Architecture: token embedding -> stacked LSTMs -> dense head -> softmax over the vocabulary.
model = Sequential()
# Each sample holds max_length - 1 context ids; the last id of every padded
# sequence was split off as the target.
model.add(Embedding(input_dim=vocab_size, output_dim=10, input_length=max_length - 1))

# lstm_layers holds [units, return_sequences] pairs. Only the last LSTM
# returns a single vector, so the dense head sees one vector per sample.
for hidden_size, rs in lstm_layers:
    model.add(LSTM(hidden_size, return_sequences=rs))

for hidden_size in fully_layers:
    model.add(Dense(units=hidden_size, activation='relu'))
    # Dropout is currently disabled; dropout_p is only used by the line below.
    # model.add(AlphaDropout(dropout_p))
# One output unit per vocabulary id, matching the one-hot encoded targets.
model.add(Dense(vocab_size, activation='softmax'))

model.summary()

# `learning_rate` is the supported argument name; the former `lr` alias is
# deprecated and rejected by newer Keras releases.
adam = optimizers.Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

*************************************************创建LSTM模型*************************************************
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 10)            2510      
_________________________________________________________________
lstm_3 (LSTM)                (None, 40, 32)            5504      
_________________________________________________________________
lstm_4 (LSTM)                (None, 40, 32)            8320      
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 251)               8283      
Total params:

### 训练模型

In [14]:
# A local-time stamp (yymmddHHMMSS) gives every run its own TensorBoard log directory.
cur_time = time.strftime("%y%m%d%H%M%S")
log_name = './logs/' + data_name + cur_time
# The model file name depends only on the dataset, so each run overwrites the previous model.
model_file = './model/' + data_name + '.h5'
log_name, model_file

('./logs/rockyou210303013255', './model/rockyou.h5')

In [15]:
# Write the training and validation metrics of this run to its own log directory.
tensorboard = TensorBoard(log_dir=log_name)

print('{:*^106}'.format('开始训练LSTM模型'))
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[tensorboard],
    validation_data=(x_val, y_val),
)
# The model was compiled with a single 'accuracy' metric, so evaluate()
# yields the loss followed by the accuracy on the held-out test set.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print("Model Accuracy on test: %.2f%%, Loss: %.2f" % (accuracy * 100, loss))
print('{:*^106}'.format('完成训练LSTM模型'))

************************************************开始训练LSTM模型************************************************
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Model Accuracy on test: 38.10%, Loss: 2.01
************************************************完成训练LSTM模型************************************************


### 保存模型

In [18]:
# Persist the trained model to the HDF5 file chosen when the run paths were built.
print("保存模型：%s" % model_file)
model.save(filepath=model_file)

保存模型：./model/rockyou.h5


# 其他命令（解压数据与 tokenizer 的命令需在“准备工作”之前执行）

In [1]:
!unzip '/content/wordlist.zip'

Archive:  /content/wordlist.zip
  inflating: myspace.txt             
  inflating: phpbb.txt               
  inflating: rockyou.txt             


In [2]:
!unzip '/content/tokenizer.zip'

Archive:  /content/tokenizer.zip
  inflating: phpbb.pkl               
  inflating: rockyou.pkl             
  inflating: myspace.pkl             


In [16]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic used in the next cell.
%load_ext tensorboard

In [None]:
%tensorboard --logdir '/content/logs/rockyou210303013255'