In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
df = pd.read_csv('spam.csv')
df['target'] = df['label'].map({'spam':1,"ham":0})

print(df.head())

  label                                                sms  target
0   ham  Go until jurong point, crazy.. Available only ...       0
1   ham                      Ok lar... Joking wif u oni...       0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...       1
3   ham  U dun say so early hor... U c already then say...       0
4   ham  Nah I don't think he goes to usf, he lives aro...       0


In [27]:
X = df['sms'].values
y = df['target'].values
print(X)

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']


In [28]:
from sklearn.model_selection import train_test_split

# 將資料做training testing 8:2分割

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

print(X_train[:5])
print(y_train[:5])
print('==============')
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

["You might want to pull out more just in case and just plan on not spending it if you can, I don't have much confidence in derek and taylor's money management"
 'Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GBP/mnth inc 3hrs 16 stop?txtStop www.gamb.tv'
 'Old Orchard near univ. How about you?' "I'm coming home 4 dinner."
 'Ok k..sry i knw 2 siva..tats y i askd..']
[0 1 0 0 0]
(4457,)
(1115,)
(4457,)


In [29]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_dict = tokenizer.index_word

print(len(word_dict))
print(word_dict)

7956


In [30]:
for key in word_dict.keys():
    print(key, word_dict[key])

1 i
2 to
3 you
4 a
5 the
6 u
7 and
8 in
9 is
10 me
11 my
12 for
13 your
14 it
15 of
16 call
17 have
18 on
19 that
20 are
21 2
22 now
23 so
24 but
25 not
26 can
27 or
28 be
29 at
30 i'm
31 do
32 will
33 get
34 if
35 ur
36 with
37 no
38 just
39 we
40 gt
41 this
42 lt
43 up
44 4
45 ok
46 when
47 go
48 from
49 all
50 out
51 how
52 what
53 know
54 free
55 got
56 then
57 good
58 like
59 come
60 time
61 day
62 am
63 only
64 its
65 was
66 love
67 he
68 want
69 there
70 send
71 text
72 one
73 by
74 txt
75 i'll
76 as
77 going
78 home
79 about
80 r
81 lor
82 need
83 still
84 back
85 sorry
86 n
87 today
88 see
89 stop
90 k
91 da
92 reply
93 mobile
94 don't
95 our
96 hi
97 take
98 tell
99 later
100 they
101 any
102 dont
103 she
104 think
105 phone
106 here
107 been
108 new
109 has
110 her
111 please
112 pls
113 claim
114 dear
115 much
116 who
117 well
118 d
119 some
120 where
121 oh
122 did
123 ì
124 1
125 hope
126 great
127 it's
128 an
129 week
130 hey
131 give
132 make
133 msg
134 wat
135 him
136

In [31]:
# 將自轉換成序列
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

print(X_train_seq[0])
for el in X_train_seq[0]:
    print(word_dict[el], end=' ')

[3, 372, 68, 2, 2569, 50, 140, 38, 8, 802, 7, 38, 353, 18, 25, 1653, 14, 34, 3, 26, 1, 94, 17, 115, 1654, 8, 1655, 7, 3879, 204, 1985]
you might want to pull out more just in case and just plan on not spending it if you can i don't have much confidence in derek and taylor's money management 

In [32]:
# 將每個文本序列補到20個長度
X_train_pad = pad_sequences(X_train_seq, maxlen=20, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=20, padding='post')

print(X_train_pad[0])
print(X_train_pad.shape)

[  38  353   18   25 1653   14   34    3   26    1   94   17  115 1654
    8 1655    7 3879  204 1985]
(4457, 20)


In [33]:
# 建構RNN神經網路
laenge_pads = 20

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=len(word_dict)+1, output_dim=20, input_length=laenge_pads))
lstm_model.add(LSTM(400))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 20)            159140    
                                                                 
 lstm (LSTM)                 (None, 400)               673600    
                                                                 
 dense (Dense)               (None, 1)                 401       
                                                                 
Total params: 833,141
Trainable params: 833,141
Non-trainable params: 0
_________________________________________________________________


In [8]:
history = lstm_model.fit(X_train_pad, y_train, epochs=10, batch_size=64, validation_data=(X_test_pad, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
sms_test = ['Hi Paul, would you come around tonight']
# 將新的email內容輸入預測
sms_seq = tokenizer.texts_to_sequences(sms_test)
sms_pad = pad_sequences(sms_seq, maxlen=20, padding='post')
print(sms_pad)
#lstm_model.predict_classes(sms_pad)
predict_x=lstm_model.predict(sms_pad) 
classes_x=np.argmax(predict_x,axis=1)
print(classes_x)

[[ 96 175   3  59 202 205   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]
[0]


In [10]:
sms_test = ['Free SMS service for anyone']
# 將新的email內容輸入預測

print(sms_pad)
#lstm_model.predict_classes(sms_pad)
np.argmax(lstm_model.predict(sms_pad), axis=-1)

[[ 50 252 219  12 645   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]]


array([[1]])