## 套件引入 及 資料前置處理

In [44]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, LSTM, Bidirectional
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from math import log, sqrt
import pandas as pd
import numpy as np
%matplotlib inline

### 讀取資料

In [45]:
mails = pd.read_csv('spam.csv', encoding='latin-1')
mails.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### 將不必要的欄位刪除 

In [46]:
# Remove 'Unnamed:2','Unnamed:3','Unnamed:4'
mails = mails[['v1', 'v2']]
mails

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### 將欄位更名

In [47]:
# Change the title: 'v1' as 'label'; 'v2' as 'message'
message = [i for i in mails.v2]
label = [i for i in mails.v1]
mails['message'] = message
mails['label'] = label
mails = mails[['message', 'label']]
mails.head()

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


### 將 ham 改為 0, spam 改為 1

In [48]:
# Mark the 'ham' as '0', and 'spam' as '1'
mails['label'] = np.where(mails['label']=='spam', 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mails['label'] = np.where(mails['label']=='spam', 1, 0)


In [49]:
mails.head()

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [50]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [51]:
# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(mails['message'], mails['label'], test_size=0.2)

In [52]:
# 將文本轉換為數字序列
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

In [53]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [54]:
# 填充序列,使其長度相同
maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

## 定義模型

In [55]:
#定義模型
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, 1)))
model.add(Dense(1, activation='sigmoid'))

## 編譯模型

In [56]:
#編譯模型
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## 訓練模型

In [57]:
#訓練模型
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2047535a430>

## 評估模型

In [58]:
#評估模型
loss, acc = model.evaluate(X_test_pad, y_test, batch_size=32)
print('Test accuracy:', acc)

Test accuracy: 0.8834080696105957


## 將測試的語句轉為索引後，預測

In [59]:
x = ['To use your credit card, please click the WAP link in the next txt message or click here.']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

model.predict(test_padded)



array([[0.247599]], dtype=float32)

In [60]:
x = ['Do you have time for a meeting next Mon.?']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

model.predict(test_padded)



array([[0.01170174]], dtype=float32)

In [61]:
x = ['Could you provide your credit card number?']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, maxlen=maxlen)

model.predict(test_padded)



array([[0.00402901]], dtype=float32)

In [62]:
# 模型存檔
model.save('spam_model.h5')