<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/lstm/exp2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

导入需要使用的类的函数

In [85]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# 1 加载数据

## 1.1 从本地读取文件数据

In [15]:
# 1. 上传本地文件
from google.colab import files
uploaded = files.upload()

Saving myspace.txt to myspace.txt


In [10]:
import pandas as pd
import io
raw_data = pd.read_csv(io.StringIO(uploaded['grammar.txt'].decode('utf-8')), 
                 sep='\t', 
                 header=None,
                 names=['grammar', 'count', 'prop'])
print(raw_data.head(3))

  grammar  count      prop
0  A6D1O1   2921  0.078640
1  A7D1O1   2417  0.065071
2  A6D2O1   2329  0.062702


In [39]:
import pandas as pd
import io
# 2. 将密码数据读取到dataframe中
raw_data = pd.read_csv(io.StringIO(uploaded['myspace.txt'].decode('utf-8')),
                 header=None,
                 names=['pwd'])
print(raw_data.shape)
print(raw_data.head(3))

(37144, 1)
         pwd
0  password1
1     abc123
2    fuckyou


In [74]:
# 3. 将数据编码成单元序列。e.g., $Password123 => S1 L8 N3‘\n’
import itertools
LETTER = 'L'
DIGIT = 'D'
OTHER = 'S'


def get_type(ch):
    if ch.isalpha():
        return LETTER
    if ch.isdigit():
        return DIGIT
    return OTHER


def encode_to_unit(data):
  data = str(data)
  tmp = ''
  result = ''
  for ch in data:
      tmp += get_type(ch)

  for k, g in itertools.groupby(tmp):
      result += (k + str(len(list(g))) + ' ')
    
  return result

raw_data['grammar'] = raw_data['pwd'].apply(encode_to_unit)
print(raw_data.shape)
print(raw_data.head(5))

(37144, 2)
         pwd grammar
0  password1  L8 D1 
1     abc123  L3 D3 
2    fuckyou     L7 
3    monkey1  L6 D1 
4  iloveyou1  L8 D1 


In [73]:
# 4. 统计每个grammar出现的概率
new_data = pd.DataFrame()
new_data['grammar'] = raw_data['grammar']
new_data['cnt'] = 1
new_data = new_data.groupby(['grammar'])['cnt'].sum().reset_index() # 计算每个grammar出现的次数
new_data['prop'] = new_data['cnt'] / raw_data.shape[0] # 计算概率
new_data.sort_values('prop', inplace=True, ascending=False) # 按prop降序排列
new_data.reset_index(inplace=True, drop=True) # 重置行索引
print(new_data.shape)
print(new_data.head(5))

(1720, 3)
  grammar   cnt      prop
0  L6 D1   2929  0.078855
1  L7 D1   2418  0.065098
2  L6 D2   2333  0.062810
3  L8 D1   2034  0.054760
4  L5 D2   1735  0.046710


In [37]:
# 4. 处理后的数据保存到文件中
raw_data.to_csv("myspace_encoded.csv")

## 1.2 定义数据集

将原始数据划分为训练集和测试集

In [11]:
import numpy as np
def split_train_test(data,test_ratio):
    #设置随机数种子，保证每次生成的结果都是一样的
    np.random.seed(42)
    #permutation随机生成0-len(data)随机序列
    shuffled_indices = np.random.permutation(len(data))
    #test_ratio为测试集所占的半分比
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    #iloc选择参数序列中所对应的行
    return data.iloc[train_indices],data.iloc[test_indices]
 
#测试
train_set,test_set = split_train_test(raw_data, 0.2)
print(train_set.shape, test_set.shape)

(1423, 3) (355, 3)


## 1.3 数据预处理

对文本数据进行编码，将文本编码成整数。Tokenizer是一个将文本向量化，转换成序列的类。用来文本处理的分词、嵌入。  
参考：
1. [Keras分词器 Tokenizer](http://codewithzhangyi.com/2019/04/23/keras-tokenizer/)  
2. [保存机器学习模型——pickle和joblib](https://www.zhangqibot.com/post/ml-dump-models/)

In [77]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# 1. 创建分词器 Tokenizer 对象
tokenizer = Tokenizer() # 参数可以自己根据实际情况更改

# 2. 整理整体语料，中文需空格分词
text = raw_data['grammar']

# 3. 将Tokenizer拟合语料，生成字典，形成新的tokenizer
tokenizer.fit_on_texts(text)

# 4. 保存tokenizer，避免重复对同一语料进行拟合
# import pickle
# pkl_filename = "myspace_tokenizer.pkl"  
# with open(pkl_filename, 'wb') as file:  
#    pickle.dump(model, file)

# 5. 整合需要做嵌入的文本，中文需要空格分词
new_text = new_data['grammar']

# 6. 将文本向量化
list_tokenized = tokenizer.texts_to_sequences(new_text)
print(list_tokenized[:10])

[[3, 1], [5, 1], [3, 2], [7, 1], [4, 2], [4, 1], [12, 1], [5, 2], [6, 2], [7, 2]]


创建整数序列

In [78]:
sequences = list()
for line in new_text:
  encoded = tokenizer.texts_to_sequences([line])[0] # 将文本向量化
  for i in range(1, len(encoded)):
    sequence = encoded[:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
print(sequences[:5])

Total Sequences: 6248
[[3, 1], [5, 1], [3, 2], [7, 1], [4, 2]]


填充序列

In [79]:
# 7. 生成训练数据的序列
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 600


创建输入输出

In [84]:
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

Vocabulary Size: 72


In [90]:
# define the model
def define_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 10, input_length=max_length-1))
  model.add(LSTM(50))
  model.add(Dense(vocab_size, activation='softmax'))
  # compile network
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  # summarize defined model
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

In [88]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
  in_text = seed_text
  # generate a fixed number of words
  for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # pre-pad sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
    # predict probabilities for each word
    yhat = model.predict_classes(encoded, verbose=0)
    # map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
      break
    # append to input
    in_text += ' ' + out_word
  return in_text

In [92]:
# define model
model = define_model(vocab_size, max_length)
# fit network
model.fit(X, y, epochs=3, verbose=2)
# evaluate model
print(generate_seq(model, tokenizer, max_length-1, '', 2))
print(generate_seq(model, tokenizer, max_length-1, '', 2))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 599, 10)           720       
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 72)                3672      
Total params: 16,592
Trainable params: 16,592
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
196/196 - 64s - loss: 2.7270 - accuracy: 0.2318
Epoch 2/3
196/196 - 62s - loss: 2.3444 - accuracy: 0.2694
Epoch 3/3
196/196 - 64s - loss: 2.0652 - accuracy: 0.3175




  
  
