<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/keras/WordLstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word-Level LSTM实现文本生成

## 准备工作

In [None]:
import os
import pickle
import time

import numpy as np
import pandas as pd
from keras import optimizers
from keras.callbacks import TensorBoard
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, AlphaDropout
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Dataset to work on; it selects the word-list / tokenizer files and the length limit below.
data_name = 'myspace'

# Per-dataset upper bound on the encoded sequence length, keyed by dataset name.
max_lengths = dict(myspace=35, phpbb=21, rockyou=41)

# Seed handed to the train/validation/test split as its random_state.
SEED = 7

In [None]:
# The word list (.txt) and the pickled tokenizer (.pkl) share a folder and the dataset name.
content_dir = '/content/'
word_list_file = '{}{}.txt'.format(content_dir, data_name)
tokenizer_file = '{}{}.pkl'.format(content_dir, data_name)
for path in (word_list_file, tokenizer_file):
    print(path)

# NOTE(review): pickle.load can execute arbitrary code -- only load tokenizer files from a trusted source.
with open(tokenizer_file, 'rb') as tokenizer_fh:
    tokenizer = pickle.load(tokenizer_fh)

# One more than the number of known tokens; presumably index 0 is kept free for padding -- TODO confirm.
vocab_size = len(tokenizer.word_index) + 1
# Length limit configured for the selected dataset.
max_length = max_lengths[data_name]

/content/myspace.txt
/content/myspace.pkl


## 加载数据

In [None]:
# Read the CSV and keep only its 'grammar' column, converted to a plain Python list.
word_list = pd.read_csv(word_list_file)['grammar'].values.tolist()

In [None]:
# Turn every encoded password into integer sequences and expand them into training prefixes.
print("将编码后的密码转换为（整数）序列")
sequences = []
for grammar in word_list:  # e.g. 'L8 D1 '
    # Append the end marker, then map the text to its integer token ids.
    token_ids = tokenizer.texts_to_sequences([grammar + '<END>'])[0]
    # Keep only encodings that are at most max_length tokens long.
    if len(token_ids) <= max_length:
        # Every non-empty prefix becomes one sample; its last token is the prediction target.
        sequences.extend(token_ids[:end] for end in range(1, len(token_ids) + 1))

print('Total Sequences: %d' % len(sequences))

# Pad all prefixes on the left with zeros up to the longest prefix actually kept.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

# Split each padded row into model input (all but the last token) and target (the last token).
print("创建LSTM模型的输入输出")
sequences = np.array(sequences)
x = sequences[:, :-1]
y = sequences[:, -1]
print("X Shape: %s, y Shape: %s" % (x.shape, y.shape))
# One-hot encode the targets over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)

将编码后的密码转换为（整数）序列
Total Sequences: 8976
Max Sequence Length: 35
创建LSTM模型的输入输出
X Shape: (8976, 34), y Shape: (8976,)


### 划分训练集、验证集和测试集

In [None]:
def split_xy(x, y):
    """Split the samples into training, validation and test subsets.

    The training share is 0.6, or 0.9 when there are more than 100000
    samples. The held-out remainder is divided evenly between validation
    and test. Both splits use ``SEED`` as ``random_state``, and the
    resulting shapes are printed.

    Args:
        x: Model inputs, indexable along the first axis.
        y: Targets aligned with ``x``.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.
    """
    # Larger data sets keep a bigger share for training.
    ratio = 0.9 if len(x) > 100000 else 0.6

    print("划分训练集、验证集和测试集")
    # First cut: training set versus everything that is held out.
    x_train, x_rest, y_train, y_rest = train_test_split(x, y, test_size=1 - ratio, random_state=SEED)
    # Second cut: halve the held-out part into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(x_rest, y_rest, test_size=0.5, random_state=SEED)

    print("x_train Shape: %s, y_train Shape: %s" % (x_train.shape, y_train.shape))
    print("x_val Shape: %s, y_val Shape: %s" % (x_val.shape, y_val.shape))
    print("x_test Shape: %s, y_test Shape: %s" % (x_test.shape, y_test.shape))

    return x_train, x_val, x_test, y_train, y_val, y_test


x_train, x_val, x_test, y_train, y_val, y_test = split_xy(x, y)

划分训练集、验证集和测试集
x_train Shape: (5385, 34), y_train Shape: (5385, 73)
x_val Shape: (1795, 34), y_val Shape: (1795, 73)
x_test Shape: (1796, 34), y_test Shape: (1796, 73)


## 创建和训练模型

In [None]:
# Training schedule.
epochs, batch_size = 500, 128

# Stacked LSTM layers as [units, return_sequences] pairs, in order.
lstm_layers = [
    [32, True],
    [32, True],
    [32, False],
]
# Unit counts of the fully connected layers placed before the softmax output.
fully_layers = [32]

# Optimizer, passed to model.compile by name.
opt = 'RMSprop'
# NOTE(review): in the cells visible here, lr and dropout_p are only referenced by
# commented-out code (an Adam optimizer and an AlphaDropout layer) -- confirm whether
# they are meant to take effect.
lr = 0.001
dropout_p = 0.0

### 创建模型

In [None]:
print('{:*^106}'.format('创建LSTM模型'))

# Assemble the layer stack in order: embedding -> LSTM stack -> dense layers -> softmax.
# The input length is max_length - 1 because the last token of each sequence is the target.
layer_stack = [Embedding(input_dim=vocab_size, output_dim=10, input_length=max_length - 1)]
layer_stack += [LSTM(units, return_sequences=keep_seq) for units, keep_seq in lstm_layers]
layer_stack += [Dense(units=units, activation='relu') for units in fully_layers]
# Output layer: one probability per vocabulary entry.
layer_stack.append(Dense(vocab_size, activation='softmax'))

model = Sequential()
for layer in layer_stack:
    model.add(layer)

model.summary()

# Targets are one-hot encoded, hence categorical cross-entropy.
# NOTE(review): the optimizer is given by name only, so `lr` is not applied here.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

*************************************************创建LSTM模型*************************************************
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 34, 10)            730       
_________________________________________________________________
lstm (LSTM)                  (None, 34, 32)            5504      
_________________________________________________________________
lstm_1 (LSTM)                (None, 34, 32)            8320      
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 73)                2409      
Total params: 2

### 训练模型

In [None]:
# Batches per epoch, rounded UP so the final (possibly smaller) batch is included.
# Flooring here would silently drop the trailing samples from every epoch and would
# give 0 batches for a training set smaller than one batch.
num_batch = (x_train.shape[0] + batch_size - 1) // batch_size


def generate_train_data():
    """Yield ``(x, y)`` training mini-batches forever, cycling over the training set.

    Batches are consecutive slices of ``x_train`` / ``y_train`` holding up to
    ``batch_size`` rows each; the last batch of a pass may be shorter. After
    ``num_batch`` batches the generator wraps around to the first slice.

    Yields:
        Tuple ``(x_, y_)`` with the inputs and targets of one batch.

    Raises:
        ValueError: If the training set is empty, so no batch can be produced.
    """
    if num_batch == 0:
        raise ValueError("x_train is empty; no training batches can be generated")
    i = 0
    while True:
        i = i % num_batch  # wrap around at the end of a pass
        start = batch_size * i
        x_ = x_train[start: start + batch_size]  # inputs of this batch
        y_ = y_train[start: start + batch_size]  # matching targets
        i += 1
        yield x_, y_

In [None]:
# Tag this run with the current local time (yymmddHHMMSS) so each run gets its own log folder.
cur_time = time.strftime("%y%m%d%H%M%S")

logs_root = './logs/'
models_root = './model/'
# TensorBoard log directory of this run, and the file the trained model is saved to.
log_name = '{}{}{}'.format(logs_root, data_name, cur_time)
model_file = '{}{}.h5'.format(models_root, data_name)
(log_name, model_file)

('./logs/myspace210304092254', './model/myspace.h5')

In [None]:
# Formats a label as a 106-column banner, centered and padded with '*'.
banner = '{:*^106}'.format

# Write training metrics to this run's TensorBoard log directory.
tensorboard = TensorBoard(log_dir=log_name)

print(banner('开始训练LSTM模型'))
# The batch generator never terminates, so steps_per_epoch marks where an epoch ends.
# (Alternative: pass x_train / y_train directly together with batch_size=batch_size.)
model.fit(
    generate_train_data(),
    steps_per_epoch=num_batch,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard],
    verbose=1,
)

# Final quality check on the held-out test split.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print("Model Accuracy on test: %.2f%%, Loss: %.2f" % (accuracy * 100, loss))
print(banner('完成训练LSTM模型'))

************************************************开始训练LSTM模型************************************************
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch

### 保存模型

In [None]:
# Make sure the target folder exists before saving, so the result of a long training run
# is not lost to a missing directory (do not rely on the Keras version creating it).
model_dir = os.path.dirname(model_file)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

print("保存模型：%s" % model_file)
model.save(model_file)

保存模型：./model/phpbb.h5


# 其他命令

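## 解压文件

注意：前文「准备工作」和「加载数据」读取的 `/content/<data_name>.pkl` 与 `/content/<data_name>.txt` 来自下面两个压缩包；在全新环境中请先运行本节的解压命令，再从头执行笔记本。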

In [None]:
!unzip '/content/wordlist.zip'

Archive:  /content/wordlist.zip
  inflating: myspace.txt             
  inflating: phpbb.txt               
  inflating: rockyou.txt             


In [None]:
!unzip '/content/tokenizer.zip'

Archive:  /content/tokenizer.zip
  inflating: phpbb.pkl               
  inflating: rockyou.pkl             
  inflating: myspace.pkl             


## 使用tensorboard

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic becomes available.
%load_ext tensorboard

In [None]:
%tensorboard --logdir '/content/logs/myspace210304083459'

## 查看GPU、CPU情况
参考资料：https://blog.csdn.net/qq_38410428/article/details/89963503

查看GPU是否在colab中，如果结果为空，则不能使用GPU

In [None]:
# TensorFlow is imported here only for this diagnostic cell.
import tensorflow as tf
# Displays the GPU device name as the cell result; an empty result means no GPU can be used.
tf.test.gpu_device_name()

如果结果为/device:GPU:0，使用```!/opt/bin/nvidia-smi```查看显存情况

In [None]:
# Show GPU memory usage via nvidia-smi.
# NOTE(review): the absolute /opt/bin path is environment specific -- confirm it exists on the current runtime.
!/opt/bin/nvidia-smi