<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/keras/WordLstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word-Level LSTM实现文本生成

## 准备工作

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, AlphaDropout
from keras import optimizers
from keras.callbacks import TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import time
import pickle

In [2]:
# Dataset selector; it must be one of the keys of `max_lengths`.
data_name = 'myspace'
# Per-dataset upper bound on the encoded sequence length.
max_lengths = dict(myspace=35, phpbb=21, rockyou=41)
# Random seed handed to the train/validation/test split.
SEED = 7

In [3]:
# Input files live under /content/: the training word list and the pickled tokenizer.
# Alternative (full word list): word_list_file = '/content/{}.txt'.format(data_name)
word_list_file = '/content/{}_train.txt'.format(data_name)
tokenizer_file = '/content/{}.pkl'.format(data_name)
print(word_list_file)
print(tokenizer_file)

# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer files you trust.
with open(tokenizer_file, 'rb') as tokenizer_fh:
    tokenizer = pickle.load(tokenizer_fh)

# One more class than there are word_index entries
# (presumably id 0 is reserved for padding -- confirm against the tokenizer).
vocab_size = 1 + len(tokenizer.word_index)
# Length cap configured for the selected dataset.
max_length = max_lengths[data_name]

/content/phpbb_train.txt
/content/phpbb.pkl


## 加载数据

In [4]:
# Read the word-list CSV and keep its 'grammar' column as a plain Python list.
word_list = pd.read_csv(word_list_file)['grammar'].values.tolist()

In [5]:
# Turn every encoded password into integer sequences (one per non-empty prefix).
print("将编码后的密码转换为（整数）序列")
sequences = []
for grammar in word_list:  # e.g. 'L8 D1 '
    # Append the end marker, then map the text to a list of token ids.
    token_ids = tokenizer.texts_to_sequences([grammar + '<END>'])[0]
    # Only keep samples whose encoded length does not exceed max_length.
    if len(token_ids) <= max_length:
        # Each prefix is one sample; its last id later becomes the target.
        sequences.extend(token_ids[:end] for end in range(1, len(token_ids) + 1))

print('Total Sequences: %d' % len(sequences))

# Pad all sequences to the longest one that survived the filter.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')  # zeros are added on the left
print('Max Sequence Length: %d' % max_length)

# Build model inputs and targets.
print("创建LSTM模型的输入输出")
sequences = np.array(sequences)
x = sequences[:, :-1]  # every column except the last: the context
y = sequences[:, -1]   # last column: the token to predict
print("X Shape: %s, y Shape: %s" % (x.shape, y.shape))
y = to_categorical(y, num_classes=vocab_size)  # one-hot encode the targets

将编码后的密码转换为（整数）序列
Total Sequences: 12278
Max Sequence Length: 21
创建LSTM模型的输入输出
X Shape: (12278, 20), y Shape: (12278,)


### 划分训练集、验证集和测试集

In [6]:
def split_xy(x, y, train_ratio=None, seed=None):
    """Split samples into training, validation and test subsets.

    The held-out part (``1 - train_ratio``) is divided evenly between the
    validation and the test subset.

    Args:
        x: Model inputs; must support ``len()`` and be accepted by
            ``train_test_split``.
        y: Targets aligned with ``x`` sample by sample.
        train_ratio: Fraction of samples used for training. ``None`` keeps the
            built-in rule: 0.9 when there are more than 100000 samples,
            otherwise 0.6.
        seed: ``random_state`` used for both splits. ``None`` falls back to the
            notebook-level ``SEED``.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If ``train_ratio`` is given but not strictly between 0 and 1.
    """
    if train_ratio is None:
        # Default rule: larger data sets can afford a smaller held-out share.
        train_ratio = 0.9 if len(x) > 100000 else 0.6
    elif not 0 < train_ratio < 1:
        raise ValueError("train_ratio must be strictly between 0 and 1, got %r" % (train_ratio,))
    if seed is None:
        seed = SEED

    print("划分训练集、验证集和测试集")
    # First split: training set versus everything that is held out.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1 - train_ratio, random_state=seed)
    # Second split: halve the held-out samples into validation and test.
    x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=seed)

    print("x_train Shape: %s, y_train Shape: %s" % (x_train.shape, y_train.shape))
    print("x_val Shape: %s, y_val Shape: %s" % (x_val.shape, y_val.shape))
    print("x_test Shape: %s, y_test Shape: %s" % (x_test.shape, y_test.shape))

    return x_train, x_val, x_test, y_train, y_val, y_test

x_train, x_val, x_test, y_train, y_val, y_test = split_xy(x, y)

划分训练集、验证集和测试集
x_train Shape: (7366, 20), y_train Shape: (7366, 56)
x_val Shape: (2456, 20), y_val Shape: (2456, 56)
x_test Shape: (2456, 20), y_test Shape: (2456, 56)


## 创建和训练模型

In [7]:
# Training hyperparameters.
epochs = 200       # upper bound on the number of training epochs
batch_size = 128   # samples per mini-batch
# One [units, return_sequences] pair per stacked LSTM layer.
lstm_layers = [[32, True], [32, True], [32, False]]
# Units of each fully connected layer placed after the LSTM stack.
fully_layers = [32]
lr = 0.001         # optimizer learning rate
dropout_p = 0.0    # dropout rate for the (currently disabled) AlphaDropout layers
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
optimizer = optimizers.Adam(learning_rate=lr)
# Alternative optimizer: optimizers.RMSprop()

### 创建模型

In [8]:
print('{:*^106}'.format('创建LSTM模型'))

# Assemble the layer stack in order: embedding -> LSTMs -> dense ReLU -> softmax.
layer_stack = [
    # Each input row holds max_length - 1 context tokens, embedded into 10 dimensions.
    Embedding(input_dim=vocab_size, output_dim=10, input_length=max_length - 1),
]
# Recurrent layers, configured by the (units, return_sequences) pairs.
layer_stack += [LSTM(units, return_sequences=keep_seq) for units, keep_seq in lstm_layers]
# Fully connected layers (the AlphaDropout between them is currently disabled).
layer_stack += [Dense(units=units, activation='relu') for units in fully_layers]
# Output layer: one softmax probability per vocabulary entry.
layer_stack.append(Dense(vocab_size, activation='softmax'))

model = Sequential()
for layer in layer_stack:
    model.add(layer)

model.summary()

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

*************************************************创建LSTM模型*************************************************
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 10)            560       
_________________________________________________________________
lstm (LSTM)                  (None, 20, 32)            5504      
_________________________________________________________________
lstm_1 (LSTM)                (None, 20, 32)            8320      
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dense_1 (Dense)              (None, 56)                1848      
Total params: 2

### 训练模型

In [9]:
# Number of mini-batches per epoch. Ceiling division keeps the trailing partial
# batch (floor division silently dropped those samples), and the floor of 1
# avoids a modulo-by-zero when there are fewer samples than one batch.
num_batch = max(1, (x_train.shape[0] + batch_size - 1) // batch_size)


def generate_train_data():
    """Yield ``(x, y)`` training mini-batches forever.

    Batches are consecutive slices of ``x_train`` / ``y_train`` of up to
    ``batch_size`` samples, served in a fixed order (no shuffling). After
    ``num_batch`` batches the generator wraps around to the first one, so the
    last batch of a pass may be smaller than ``batch_size``.

    Yields:
        Tuple ``(x_batch, y_batch)`` of aligned slices.
    """
    i = 0
    while True:
        i = i % num_batch  # wrap around at the end of a pass
        start = batch_size * i
        stop = start + batch_size
        x_ = x_train[start:stop]  # inputs of the current batch
        y_ = y_train[start:stop]  # matching targets
        i += 1
        yield x_, y_

In [10]:
# Local-time stamp (yymmddHHMMSS) that gives each run its own log directory.
cur_time = time.strftime("%y%m%d%H%M%S")
log_name = './logs/' + data_name + cur_time
# Alternative file name without the '_part' suffix: './model/' + data_name + '.h5'
model_file = './model/' + data_name + '_part.h5'
# Display both paths as the cell output.
log_name, model_file

('./logs/phpbb210305070002', './model/phpbb_part.h5')

In [12]:
# Write training curves for TensorBoard into this run's log directory.
tensorboard = TensorBoard(log_dir=log_name)
# Stop once val_loss has not improved for 10 epochs and roll the model back to
# its best epoch; without restore_best_weights the evaluation below (and any
# later save) would use the weights from 10 epochs after the best val_loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

print('{:*^106}'.format('开始训练LSTM模型'))
# The generator never ends, so steps_per_epoch defines the length of an epoch.
model.fit(generate_train_data(),
          validation_data=(x_val, y_val),
          steps_per_epoch=num_batch,
          epochs=epochs,
          verbose=1,
          callbacks=[tensorboard, early_stopping])

# Final check on the held-out test split.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print("Model Accuracy on test: %.2f%%, Loss: %.2f" % (accuracy * 100, loss))
print('{:*^106}'.format('完成训练LSTM模型'))

************************************************开始训练LSTM模型************************************************
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Model Accuracy on test: 41.04%, Loss: 1.86
*

### 保存模型

In [13]:
# Announce the target path, then write the trained model to it.
print("保存模型：%s" % model_file)
model.save(filepath=model_file)

保存模型：./model/phpbb_part.h5


# 其他命令

## 解压文件

In [1]:
# Extract the uploaded word-list archive (shell command run through IPython's `!`).
!unzip '/content/wordlist.zip'

Archive:  /content/wordlist.zip
  inflating: phpbb_test.txt          
  inflating: phpbb_train.txt         
  inflating: myspace.txt             
  inflating: myspace_test.txt        
  inflating: myspace_train.txt       
  inflating: phpbb.txt               


In [2]:
# Extract the uploaded archive of pickled tokenizers (shell command run through IPython's `!`).
!unzip '/content/tokenizer.zip'

Archive:  /content/tokenizer.zip
  inflating: phpbb.pkl               
  inflating: rockyou.pkl             
  inflating: myspace.pkl             


## 使用tensorboard

In [12]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [None]:
# Open TensorBoard on one run's log directory.
# NOTE(review): the directory name is a hard-coded timestamp from an earlier run;
# replace it with the log directory of the run you want to inspect.
%tensorboard --logdir '/content/logs/phpbb210305064918'

## 查看GPU、CPU情况
参考资料：https://blog.csdn.net/qq_38410428/article/details/89963503

查看GPU是否在colab中，如果结果为空，则不能使用GPU

In [None]:
# Ask TensorFlow for the name of the GPU device; the result is shown as the cell output.
import tensorflow as tf
tf.test.gpu_device_name()

如果结果为 `/device:GPU:0`，使用 `!/opt/bin/nvidia-smi` 查看显存情况

In [None]:
# Run nvidia-smi to show the GPU status, including memory usage.
!/opt/bin/nvidia-smi