<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/genpass/word_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 准备工作
1. 上传编码后的密码文件压缩包 `wordlist.zip`（内含 `myspace.txt`、`phpbb.txt` 等）到 `/content/`
2. 上传分词器模型压缩包 `tokenizer.zip`（内含 `myspace.pkl`、`phpbb.pkl` 等）到 `/content/`


In [2]:
!unzip '/content/tokenizer.zip' -d './tokenizer'
!unzip '/content/wordlist.zip' -d './wordlist'

Archive:  /content/tokenizer.zip
  inflating: ./tokenizer/myspace.pkl  
  inflating: ./tokenizer/phpbb.pkl   
  inflating: ./tokenizer/rockyou.pkl  
  inflating: ./tokenizer/phpbb_7.pkl  
  inflating: ./tokenizer/myspace_7.pkl  
Archive:  /content/wordlist.zip
  inflating: ./wordlist/phpbb.txt    
  inflating: ./wordlist/myspace.txt  
  inflating: ./wordlist/phpbb_7.txt  
  inflating: ./wordlist/myspace_7.txt  


# WordLSTM

## 定义模型

In [1]:
from collections import ChainMap
import os
import sys
import time
import pickle
import pandas as pd
import numpy as np

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras import optimizers
from keras.callbacks import TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split

# Seed shared by numpy's global RNG and the train/validation/test split.
SEED = 7
np.random.seed(SEED)

# Per-dataset upper bound on the encoded sequence length; WordLSTM.encode_data
# skips any password whose token sequence is longer than this.
max_seq_len = dict(
    myspace=18,
    phpbb=20,
    myspace_7=18,
    phpbb_7=20,
    rockyou=47,
)

class WordLSTM:
    """
    Word-level LSTM model trained on encoded password grammars.

    :param data_name: dataset name. It selects the input file
        ``./wordlist/<data_name>.txt``, the pickled tokenizer
        ``./tokenizer/<data_name>.pkl``, the ``max_seq_len`` entry and the
        output model file ``./model/<data_name>.h5``.
    """
    def __init__(self, data_name):
        self.data_name = data_name  # dataset name
        self.data = self.load_data('{}{}.txt'.format('./wordlist/', data_name))
        # Tokenizer; loaded relative to the working directory, like the word list.
        self.tokenizer = self.load_tokenizer('{}{}.pkl'.format('./tokenizer/', data_name))
        # Vocabulary size; the +1 presumably accounts for index 0, which is used as padding.
        self.vocab_size = len(self.tokenizer.word_index) + 1

        self.max_length = max_seq_len[data_name]  # maximum sequence length (in tokens)
        # LSTM hyper-parameters
        self.epochs = 500
        self.batch_size = 128
        # Each entry is [hidden_size, return_sequences].
        self.lstm_layers = [[32, True], [32, True], [32, False]]
        self.fully_layers = [32]
        self.lr = 0.001
        self.log_dir = './logs/'  # TensorBoard log directory

        self.model_file = '{}{}.h5'.format('./model/', data_name)  # saved model file

    @staticmethod
    def load_data(fname):
        """
        Load the encoded passwords and return them in shuffled order.

        :param fname: path of a CSV file that has a ``grammar`` column
        :return: pandas Series with the shuffled ``grammar`` values (index 0..n-1)
        :raises SystemExit: with exit code 1 when ``fname`` does not exist
        """
        if not os.path.exists(fname):
            print("文件不存在：%s" % fname, file=sys.stderr)
            sys.exit(1)

        print("开始加载编码后的密码数据：%s" % fname)
        data = pd.read_csv(fname)['grammar']
        # Series.sample draws from numpy's global RNG when random_state is not
        # given, so the order still follows np.random.seed; unlike
        # np.random.shuffle it is a supported way to shuffle a Series.
        return data.sample(frac=1).reset_index(drop=True)

    @staticmethod
    def load_tokenizer(fname):
        """
        Load a pickled tokenizer from disk.

        NOTE(review): pickle.load executes arbitrary code from the file; only
        load tokenizer files from a trusted source.

        :param fname: path of the pickle file
        :return: the unpickled tokenizer object
        :raises SystemExit: with exit code 1 when ``fname`` does not exist
        """
        if not os.path.exists(fname):
            print("tokenizer模型文件不存在：%s" % fname, file=sys.stderr)
            sys.exit(1)

        print("开始加载tokenizer模型：%s" % fname)
        with open(fname, 'rb') as file:
            return pickle.load(file)

    def encode_data(self):
        """
        Turn every encoded password into integer prefix sequences and build
        the model inputs/targets from them.

        Also updates ``self.max_length`` to the longest sequence actually kept.

        :return: (x, y) where x holds all but the last token of each padded
            prefix and y is the one-hot encoding of the last token
        :raises ValueError: when no sequence survives the length filter
        """
        print("将编码后的密码转换为（整数）序列")
        sequences = list()

        for line in self.data.values.tolist():  # e.g. 'L8 D1 '
            # Append the end marker, then map the text to integer ids.
            encoded = self.tokenizer.texts_to_sequences([line + '<END>'])[0]
            # Skip passwords longer than the configured maximum.
            if len(encoded) > self.max_length:
                continue
            # Every prefix of length >= 1 becomes one training sequence.
            sequences.extend(encoded[:i] for i in range(1, len(encoded) + 1))

        print('Total Sequences: %d' % len(sequences))
        if not sequences:
            raise ValueError(
                "no sequences left after filtering; check the data file and max_seq_len")

        # Left-pad with 0 up to the longest kept sequence.
        self.max_length = max(len(seq) for seq in sequences)
        sequences = pad_sequences(sequences, self.max_length, padding='pre')
        print('Max Sequence Length: %d' % self.max_length)

        # Inputs are all tokens but the last; the last token is the target.
        print("创建LSTM模型的输入输出")
        sequences = np.asarray(sequences)
        x, y = sequences[:, :-1], sequences[:, -1]
        print("X Shape: %s, y Shape: %s" % (x.shape, y.shape))
        y = to_categorical(y, num_classes=self.vocab_size)  # one-hot targets

        return x, y

    def split_data(self):
        """
        Split the encoded inputs/targets into train, validation and test sets.

        The training share is 0.6, or 0.9 when there are more than 100000
        samples; the remainder is halved into validation and test.

        :return: x_train, x_val, x_test, y_train, y_val, y_test
        """
        x, y = self.encode_data()

        ratio = 0.6  # training share
        if len(x) > 100000:
            ratio = 0.9

        print("划分训练集、验证集和测试集")
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1 - ratio, random_state=SEED)
        x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=SEED)

        print("x_train Shape: %s, y_train Shape: %s" % (x_train.shape, y_train.shape))
        print("x_val Shape: %s, y_val Shape: %s" % (x_val.shape, y_val.shape))
        print("x_test Shape: %s, y_test Shape: %s" % (x_test.shape, y_test.shape))

        return x_train, x_val, x_test, y_train, y_val, y_test

    def create_and_train_model(self):
        """
        Build the LSTM model, train it, report test accuracy/loss and save it
        to ``self.model_file``.

        :return: None
        """
        x_train, x_val, x_test, y_train, y_val, y_test = self.split_data()

        # Create the output directory up front so that a missing ./model/
        # does not make model.save() fail after training has finished.
        model_dir = os.path.dirname(self.model_file)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

        print('{:*^106}'.format('创建LSTM模型'))
        model = Sequential()
        # Inputs are one token shorter than max_length (the last token is the target).
        model.add(Embedding(input_dim=self.vocab_size, output_dim=10, input_length=self.max_length - 1))

        for hidden_size, rs in self.lstm_layers:
            model.add(LSTM(hidden_size, return_sequences=rs))

        for hidden_size in self.fully_layers:
            model.add(Dense(units=hidden_size, activation='relu'))
        model.add(Dense(self.vocab_size, activation='softmax'))

        model.summary()

        # `learning_rate` is the current argument name; `lr` is a deprecated alias.
        adam = optimizers.Adam(learning_rate=self.lr)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

        # One TensorBoard run directory per dataset and start time.
        cur_time = time.strftime("%y%m%d%H%M%S", time.localtime())
        log_name = '{}{}{}'.format(self.log_dir, self.data_name, cur_time)
        tensorboard = TensorBoard(log_dir=log_name)
        early_stopping = EarlyStopping(monitor='val_loss', patience=50)

        print('{:*^106}'.format('开始训练LSTM模型'))
        model.fit(x_train, y_train,
                  validation_data=(x_val, y_val),
                  epochs=self.epochs,
                  batch_size=self.batch_size,
                  verbose=1,
                  callbacks=[tensorboard, early_stopping])
        loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
        print("Model Accuracy on test: %.2f%%, Loss: %.2f" % (accuracy * 100, loss))

        print("保存模型：%s" % self.model_file)
        model.save(self.model_file)
        print('TensorBoard 日志：{}'.format(log_name))
        print('{:*^106}'.format('完成训练LSTM模型'))

## 训练模型

In [2]:
# Train one LSTM per PCFG-encoded password dataset, in the order listed.
data_sets = [
    'myspace',
    'phpbb',
    'myspace_7',
    'phpbb_7',
]
for name in data_sets:
    # Constructing the object loads the data and tokenizer for this dataset.
    word_lstm = WordLSTM(name)
    word_lstm.create_and_train_model()

开始加载编码后的密码数据：./wordlist/myspace.txt
开始加载tokenizer模型：/content/tokenizer/myspace.pkl
将编码后的密码转换为（整数）序列
Total Sequences: 8583
Max Sequence Length: 18
创建LSTM模型的输入输出
X Shape: (8583, 17), y Shape: (8583,)
划分训练集、验证集和测试集
x_train Shape: (5149, 17), y_train Shape: (5149, 70)
x_val Shape: (1717, 17), y_val Shape: (1717, 70)
x_test Shape: (1717, 17), y_test Shape: (1717, 70)
*************************************************创建LSTM模型*************************************************
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 17, 10)            700       
_________________________________________________________________
lstm (LSTM)                  (None, 17, 32)            5504      
_________________________________________________________________
lstm_1 (LSTM)                (None, 17, 32)            8320      
______________________________________________

## acc-loss

In [None]:
%load_ext tensorboard
%tensorboard --logdir './logs/phpbb_7210312062605'