In [20]:
import numpy as np
import pandas as pd
from keras.layers import LSTM
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import Adam
import os
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.utils.vis_utils import plot_model
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
data = pd.read_csv(r'F:\BaiduNetdiskDownload\rb_mails\train.csv', encoding = "utf-8")
data[["Label","Email"]]

Unnamed: 0,Label,Email
0,ham,"I don't have anybody's number, I still haven't..."
1,spam,Congrats! 2 mobile 3G Videophones R yours. cal...
2,ham,She is our sister.. She belongs 2 our family.....
3,ham,Ya very nice. . .be ready on thursday
4,ham,Okie
...,...,...
4453,ham,No..jst change tat only..
4454,ham,Hey darlin.. i can pick u up at college if u t...
4455,ham,Btw regarding that we should really try to see...
4456,ham,Don't fret. I'll buy the ovulation test strips...


In [22]:
# 去除标点符号及两个以上的空格
data['Email'] = data['Email'].apply(lambda x:re.sub('[!@#$:).;,?&]', ' ', x.lower()))
data['Email'] = data['Email'].apply(lambda x:re.sub(' ', ' ', x))
# 单词转换为小写
data['Email'] = data['Email'].apply(lambda x:" ".join(x.lower() for x in x.split()))
# 去除停止词 ，如a、an、the、高频介词、连词、代词等
stop = stopwords.words('english')
data['Email'] = data['Email'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
# 分词处理，希望能够实现还原英文单词原型
st = PorterStemmer()
data['Email'] = data['Email'].apply(lambda x: " ".join([word for word in x.split()]))

In [23]:
#分出训练集和测试集
train=data[:4000]
test=data[4000:]
# 每个序列的最大长度，多了截断，少了补0
max_sequence_length = 50
#只保留频率最高的前20000个词
num_words = 5000
# 嵌入的维度
embedding_dim = 100
# 找出经常出现的单词，分词器
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train.Email)
train_sequences = tokenizer.texts_to_sequences(train.Email)
test_sequences = tokenizer.texts_to_sequences(test.Email)

# dictionary containing words and their index
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))
# get only the top frequent words on train


Found 7371 unique tokens.


In [24]:
train_x = pad_sequences(train_sequences, maxlen=max_sequence_length)
# get only the top frequent words on test
test_x = pad_sequences(test_sequences, maxlen=max_sequence_length)
print(train_x.shape)
print(test_x.shape)


(4000, 50)
(458, 50)


In [25]:
# 标签向量化
# [0,1]: ham;[1,0]:spam
def lable_vectorize(labels):
    label_vec = np.zeros([len(labels), 2])
    for i, label in enumerate(labels):
        if str(label) == 'ham':
            label_vec[i][0] = 1
        else:
            label_vec[i][1] = 1
    return label_vec


train_y = lable_vectorize(train['Label'])
test_y = lable_vectorize(test['Label'])
X_train = np.reshape(train_x , (train_x .shape[0], train_x .shape[1], 1))
X_test = np.reshape(test_x, (test_x .shape[0], test_x .shape[1], 1))
print("加载数据完成")
#=============================================================================================
#=============================================================================================

learning_rate = 0.001
training_iters = 20
batch_size = 128
display_step = 10

n_hidden = 128

model = Sequential()
model.add(LSTM(n_hidden,
               batch_input_shape=(None, max_sequence_length, 1),
               unroll=True))

model.add(Dense(2))
model.add(Activation('softmax'))
# plot_model(model, to_file='lstm.png',show_shapes='True')

adam = Adam(lr=learning_rate)
model.summary()
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, train_y,
          batch_size=batch_size,
          epochs=training_iters,
          verbose=1,
          validation_data=(X_test, test_y))

scores = model.evaluate(X_test, test_y, verbose=0)
print('LSTM test score:', scores[0])
print('LSTM test accuracy:', scores[1])

加载数据完成
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 128)               66560     
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
 activation_2 (Activation)   (None, 2)                 0         
                                                                 
Total params: 66,818
Trainable params: 66,818
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
LSTM test score: 0.2515014410018921
LSTM test accuracy: 0.8908296823501587
