python 3.6.13

keras 2.6.0

tensorflow 2.6.2

任务：基于 flare 文本数据，建立 LSTM 模型，预测序列文字

1.完成数据预处理，将文字序列数据转化为可用于LSTM输入的数据

2.查看文字数据预处理后的数据结构，并进行数据分离操作

3.针对字符串输入（" flare is a teacher in ai industry. He obtained his phd in Australia."），预测其对应的后续字符

备注：模型结构：单层LSTM，输出有20个神经元：每次使用前20个字符预测第21个字符

In [4]:
# load the data
# Read the corpus inside a context manager so the file handle is closed
# as soon as the text has been read, instead of being left open.
with open('flare') as corpus_file:
  data = corpus_file.read()
data = data.replace('\n','').replace('\r', '') # strip line breaks so the text is one continuous character sequence
print(data)

flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained his phd in Australia. Australia is a country in the Southern Hemisphere.flare is a teacher in ai industry. He obtained h

In [9]:
# Deduplicate the characters to obtain the vocabulary
# NOTE(review): the iteration order of a set of strings is not guaranteed to
# be the same from one interpreter run to the next, so the index given to each
# character below may change between sessions — confirm whether the mapping
# needs to be reproducible (e.g. if a trained model is ever saved and reloaded).
letters = list(set(data))
print(letters)
num_letters = len(letters)
print(num_letters)           # vocabulary size, used as the one-hot encoding width (23 distinct characters in the output shown)

['A', 'o', 'c', 'h', 'l', 'm', 'p', '.', 's', 'S', ' ', 't', 'n', 'H', 'e', 'b', 'd', 'a', 'u', 'f', 'y', 'r', 'i']
23


In [13]:
# Build the lookup dictionaries
# index -> character
int_to_char = dict(enumerate(letters))
print(int_to_char)
# character -> index (the inverse of the mapping above)
char_to_int = {symbol: index for index, symbol in int_to_char.items()}
print(char_to_int)

{0: 'A', 1: 'o', 2: 'c', 3: 'h', 4: 'l', 5: 'm', 6: 'p', 7: '.', 8: 's', 9: 'S', 10: ' ', 11: 't', 12: 'n', 13: 'H', 14: 'e', 15: 'b', 16: 'd', 17: 'a', 18: 'u', 19: 'f', 20: 'y', 21: 'r', 22: 'i'}
{'A': 0, 'o': 1, 'c': 2, 'h': 3, 'l': 4, 'm': 5, 'p': 6, '.': 7, 's': 8, 'S': 9, ' ': 10, 't': 11, 'n': 12, 'H': 13, 'e': 14, 'b': 15, 'd': 16, 'a': 17, 'u': 18, 'f': 19, 'y': 20, 'r': 21, 'i': 22}


In [14]:
# time_step: length of the sliding window, i.e. how many preceding characters
# are used as input when predicting the next one
time_step = 20

In [17]:
# 数据预处理
import numpy as np
from tensorflow.keras.utils import to_categorical # 库发生了迁移

# Sliding-window extraction of (window, next item) pairs
def extract_data(data, slide):
  """Cut `data` into overlapping windows of length `slide`.

  Returns a tuple (x, y) where x[i] is the list of the `slide` items that
  start at position i and y[i] is the item immediately after that window.
  One pair is produced for every start position in range(len(data) - slide),
  so the result is empty when `data` is not longer than `slide`.
  """
  window_count = len(data) - slide
  x = [list(data[start:start + slide]) for start in range(window_count)]
  y = [data[start + slide] for start in range(window_count)]
  return x, y

# Batch conversion of characters to their integer codes
def char_to_int_Data(x, y, chat_to_int):
  """Translate windows and targets from characters to integer codes.

  Args:
    x: sequence of windows, each an iterable of characters.
    y: sequence of target strings, one per window (each character of a
      target is converted, so a one-character target yields a one-item list).
    chat_to_int: mapping from character to integer code. The parameter name
      keeps its original spelling so existing callers are unaffected.

  Returns:
    (x_to_int, y_to_int): two lists of lists of integer codes.

  Raises:
    KeyError: if a character is missing from the mapping.
  """
  x_to_int = []
  y_to_int = []
  # Look characters up in the mapping that was passed in. Previously the
  # argument was ignored and a module-level `char_to_int` was read instead,
  # so the function only worked when that global happened to exist.
  for window, target in zip(x, y):
    x_to_int.append([chat_to_int[char] for char in window])
    y_to_int.append([chat_to_int[char] for char in target])
  return x_to_int, y_to_int

# Batch-process a whole text: takes the full character sequence, the
# sliding-window size, the vocabulary size and the char -> int dictionary
def data_preprocessing(data, slide, num_letters, char_to_int):
  """Turn raw text into one-hot encoded windows plus integer targets.

  Args:
    data: the character sequence to process.
    slide: window length (number of characters per input sample).
    num_letters: width of the one-hot encoding (vocabulary size).
    char_to_int: mapping from character to integer code.

  Returns:
    (new, Output): `new` is an integer array of shape
    (len(data) - slide, slide, num_letters) holding the one-hot encoded
    windows; `Output` is a flat list with the integer code of the character
    that follows each window.
  """
  windows, targets = extract_data(data, slide)
  input_codes, output_codes = char_to_int_Data(windows, targets, char_to_int)
  Output = list(np.array(output_codes).flatten())
  # dtype=int keeps the reshape and the fancy indexing below valid even when
  # there are no windows at all (an empty list would otherwise become float).
  codes = np.array(input_codes, dtype=int).reshape(len(input_codes), slide)
  # Start from zeros instead of random filler, then set the single "hot"
  # position of every (sample, time step) cell in one vectorized assignment
  # rather than calling to_categorical once per cell.
  new = np.zeros((codes.shape[0], codes.shape[1], num_letters), dtype=int)
  sample_idx = np.arange(codes.shape[0])[:, None]
  step_idx = np.arange(codes.shape[1])
  new[sample_idx, step_idx, codes] = 1
  return new, Output

In [18]:
# extract X and y from text data
X, y = data_preprocessing(data, time_step, num_letters, char_to_int) # X is already one-hot encoded; y is encoded later

In [22]:
print(X)       # one-hot encoded windows
print(X.shape) # last axis holds the 23 one-hot classes (one per distinct character)

[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 1 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 1]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 1]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 ...

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 1 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 1 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 1 0]
  [0 0 0 ... 0 0 0]]]
(44962, 20, 23)


In [24]:
# y is still a flat list of integer character codes, one per window
print(y)
print(len(y))

[12, 10, 17, 22, 10, 22, 12, 16, 18, 8, 11, 21, 20, 7, 10, 13, 14, 10, 1, 15, 11, 17, 22, 12, 14, 16, 10, 3, 22, 8, 10, 6, 3, 16, 10, 22, 12, 10, 0, 18, 8, 11, 21, 17, 4, 22, 17, 7, 10, 0, 18, 8, 11, 21, 17, 4, 22, 17, 10, 22, 8, 10, 17, 10, 2, 1, 18, 12, 11, 21, 20, 10, 22, 12, 10, 11, 3, 14, 10, 9, 1, 18, 11, 3, 14, 21, 12, 10, 13, 14, 5, 22, 8, 6, 3, 14, 21, 14, 7, 19, 4, 17, 21, 14, 10, 22, 8, 10, 17, 10, 11, 14, 17, 2, 3, 14, 21, 10, 22, 12, 10, 17, 22, 10, 22, 12, 16, 18, 8, 11, 21, 20, 7, 10, 13, 14, 10, 1, 15, 11, 17, 22, 12, 14, 16, 10, 3, 22, 8, 10, 6, 3, 16, 10, 22, 12, 10, 0, 18, 8, 11, 21, 17, 4, 22, 17, 7, 10, 0, 18, 8, 11, 21, 17, 4, 22, 17, 10, 22, 8, 10, 17, 10, 2, 1, 18, 12, 11, 21, 20, 10, 22, 12, 10, 11, 3, 14, 10, 9, 1, 18, 11, 3, 14, 21, 12, 10, 13, 14, 5, 22, 8, 6, 3, 14, 21, 14, 7, 19, 4, 17, 21, 14, 10, 22, 8, 10, 17, 10, 11, 14, 17, 2, 3, 14, 21, 10, 22, 12, 10, 17, 22, 10, 22, 12, 16, 18, 8, 11, 21, 20, 7, 10, 13, 14, 10, 1, 15, 11, 17, 22, 12, 14, 16, 10, 3,

In [26]:
# split the data
from sklearn.model_selection import train_test_split
# Hold out 10% of the windows for testing; random_state is fixed at 10
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)
print(X_train.shape, len(y_train))  

(40465, 20, 23) 40465


In [27]:
# One-hot encode the integer training targets with num_letters classes,
# matching the width of the model's softmax output
y_train_category = to_categorical(y_train, num_letters)
print(y_train_category)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [47]:
# set up the model
from keras.models import Sequential
from keras.layers import Dense, LSTM

model = Sequential()
# Single LSTM layer with 20 units; the input shape (time steps, one-hot width) is read from X_train
# NOTE(review): activation='relu' replaces the LSTM layer's default activation — confirm this is intended
model.add(LSTM(units=20, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu'))
# Softmax output with one unit per distinct character
model.add(Dense(units=num_letters, activation='softmax'))
# Categorical cross-entropy pairs with the one-hot encoded targets used for training
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 20)                3520      
_________________________________________________________________
dense_2 (Dense)              (None, 23)                483       
Total params: 4,003
Trainable params: 4,003
Non-trainable params: 0
_________________________________________________________________


In [48]:
# train the model
# Fit on the one-hot targets for 10 epochs in batches of 1000 samples
model.fit(X_train, y_train_category, batch_size = 1000, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1caa55cad30>

In [49]:
# make prediction based on the training data
predict_x = model.predict(X_train)         # the author's keras version may differ from the one used in the video, hence this change
# Pick the highest-scoring class along axis 1 to get one integer code per sample
y_train_predict = np.argmax(predict_x, axis=1)  
print(y_train_predict)

[ 3 14 11 ...  4 14 10]


In [50]:
# transform the int to letters
# Map every predicted integer code back to its character via int_to_char
y_train_predict_char = [int_to_char[i] for i in y_train_predict]
print(y_train_predict_char)

['h', 'e', 't', 'h', ' ', '.', 't', 'b', '.', 'i', 'H', 'a', 'a', 'c', ' ', ' ', 'e', 'a', 'c', 's', '.', ' ', 'e', 'i', 't', 'm', ' ', 'i', 'o', ' ', 'h', 'e', ' ', ' ', 's', 'r', 'a', 'i', 'o', 'c', 'n', 'i', 'H', 'c', 'n', 'l', ' ', 'r', 'e', 'e', 'r', ' ', ' ', ' ', ' ', ' ', 'y', ' ', 'r', 'r', 'd', 's', ' ', 'i', 'e', 'r', 'o', 'n', ' ', ' ', 'o', 'h', 'e', ' ', 'n', ' ', 'a', 'a', 'i', 'e', 't', 'd', 's', 'e', 'r', 'i', 'e', 'p', 'i', ' ', 'a', 'a', 'H', 'i', 'e', 'h', 'e', 's', ' ', 'e', ' ', 'h', 'd', 't', 'i', 'a', 'i', 'e', '.', ' ', 'a', 'a', 'e', ' ', 'u', ' ', 'e', 'r', 'n', 'e', 't', 'o', 'r', ' ', 's', 't', 'i', 't', ' ', 'r', 'n', 'y', 't', 'r', 'r', ' ', ' ', 'e', 'u', ' ', 'i', 'u', 'y', 'e', 't', ' ', ' ', 'r', 'n', 't', 'r', 'h', 's', 't', 'u', ' ', 'd', 'h', 't', 'y', ' ', 'n', ' ', 'r', ' ', 'h', 's', 'i', 'i', 'n', 'a', ' ', 'r', 'r', 'a', 'o', 's', 'h', 't', 'a', 'l', 's', 'e', 'a', ' ', 'e', 'u', ' ', 'h', '.', 'n', 'a', 'r', 'a', 'h', 'd', 'i', 'r', ' ', 'o',

In [51]:
from sklearn.metrics import accuracy_score
# Compare the true integer codes with the predicted ones on the training set
accuracy_train = accuracy_score(y_train, y_train_predict)
print(accuracy_train)

1.0


In [52]:
# Repeat the prediction and accuracy check on the held-out test set
predict_x = model.predict(X_test)         # the author's keras version may differ from the one used in the video, hence this change
y_test_predict = np.argmax(predict_x, axis=1)
accuracy_test = accuracy_score(y_test, y_test_predict)  
print(accuracy_test)
print(y_test_predict)
print(y_test)

1.0
[22 12 11 ... 21 17 17]
[22, 12, 11, 10, 14, 14, 10, 22, 20, 6, 10, 14, 17, 12, 4, 22, 1, 3, 16, 18, 10, 17, 6, 3, 10, 12, 17, 2, 10, 8, 21, 7, 3, 14, 18, 4, 17, 8, 16, 5, 5, 0, 10, 10, 18, 21, 5, 10, 17, 1, 10, 7, 11, 17, 10, 10, 22, 10, 1, 22, 17, 12, 22, 18, 6, 10, 22, 8, 22, 18, 10, 12, 21, 18, 17, 10, 14, 19, 11, 5, 5, 8, 10, 17, 16, 10, 12, 22, 16, 12, 3, 14, 9, 10, 8, 14, 21, 21, 17, 10, 22, 10, 22, 11, 10, 12, 21, 17, 14, 21, 22, 18, 16, 12, 1, 3, 14, 3, 10, 14, 17, 18, 19, 10, 22, 10, 21, 12, 21, 10, 11, 14, 10, 20, 10, 10, 21, 21, 18, 7, 12, 14, 11, 15, 18, 8, 18, 10, 17, 11, 17, 10, 14, 17, 20, 21, 16, 20, 10, 2, 17, 10, 17, 22, 12, 18, 10, 10, 10, 10, 4, 4, 10, 17, 21, 17, 16, 3, 17, 14, 11, 12, 22, 3, 5, 17, 21, 8, 10, 21, 12, 10, 11, 3, 12, 6, 12, 8, 9, 3, 8, 10, 14, 8, 21, 17, 11, 12, 7, 6, 22, 11, 12, 0, 19, 22, 17, 19, 10, 10, 11, 15, 22, 14, 17, 11, 11, 8, 10, 21, 14, 3, 20, 19, 21, 17, 1, 17, 22, 12, 5, 14, 17, 3, 9, 11, 12, 16, 22, 21, 2, 7, 11, 12, 10, 18, 14, 

In [53]:
# Prediction example on a new string
new_letters = "flare is a teacher in ai industry. He obtained his phd in Australia."
# Reuse the same preprocessing (window size and vocabulary) that was used for the training data
X_new, y_new = data_preprocessing(new_letters, time_step, num_letters, char_to_int)
predict_x = model.predict(X_new)         # the author's keras version may differ from the one used in the video, hence this change
y_new_predict = np.argmax(predict_x, axis=1)
print(y_new_predict)

[12 10 17 22 10 22 12 16 18  8 11 21 20  7 10 13 14 10  1 15 11 17 22 12
 14 16 10  3 22  8 10  6  3 16 10 22 12 10  0 18  8 11 21 17  4 22 17  7]


In [54]:
# transform the int to letters
# Map every predicted integer code for the new string back to its character
y_new_predict_char = [int_to_char[i] for i in y_new_predict]
print(y_new_predict_char)

['n', ' ', 'a', 'i', ' ', 'i', 'n', 'd', 'u', 's', 't', 'r', 'y', '.', ' ', 'H', 'e', ' ', 'o', 'b', 't', 'a', 'i', 'n', 'e', 'd', ' ', 'h', 'i', 's', ' ', 'p', 'h', 'd', ' ', 'i', 'n', ' ', 'A', 'u', 's', 't', 'r', 'a', 'l', 'i', 'a', '.']


In [55]:
# Show each input window next to the character predicted to follow it.
# X_new already contains exactly one row per window (the preprocessing yields
# len(new_letters) - time_step of them), so every row is printed; subtracting
# a further 20 from the row count skipped the last 20 predictions.
for i in range(X_new.shape[0]):
  # Slice with time_step so the displayed window matches the model input length
  print(new_letters[i:i+time_step], '--predict next letter is--', y_new_predict_char[i])

flare is a teacher i --predict next letter is-- n
lare is a teacher in --predict next letter is--  
are is a teacher in  --predict next letter is-- a
re is a teacher in a --predict next letter is-- i
e is a teacher in ai --predict next letter is--  
 is a teacher in ai  --predict next letter is-- i
is a teacher in ai i --predict next letter is-- n
s a teacher in ai in --predict next letter is-- d
 a teacher in ai ind --predict next letter is-- u
a teacher in ai indu --predict next letter is-- s
 teacher in ai indus --predict next letter is-- t
teacher in ai indust --predict next letter is-- r
eacher in ai industr --predict next letter is-- y
acher in ai industry --predict next letter is-- .
cher in ai industry. --predict next letter is--  
her in ai industry.  --predict next letter is-- H
er in ai industry. H --predict next letter is-- e
r in ai industry. He --predict next letter is--  
 in ai industry. He  --predict next letter is-- o
in ai industry. He o --predict next letter is-- b
