# 1. Import required libraries

In [1]:
import pandas as pd
import string
import nltk
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 2. Read dataset and do pre-processing

In [2]:
# Load the raw SMS dataset. The path and encoding are named so they are easy
# to find and change; the values are the same as before.
DATASET_PATH = "spam.csv"
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv(DATASET_PATH, encoding=DATASET_ENCODING)
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [3]:
# Column "v2" holds the message text (features); column "v1" holds the
# ham/spam label (target).
x, y = df["v2"], df["v1"]

In [4]:
#Converting texts to lower case
# The vectorized string accessor replaces the per-row lambda: it no longer
# shadows the outer name `x` inside the lambda and it propagates missing
# values instead of raising AttributeError on them.
x = x.str.lower()
x

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                will ì_ b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: v2, Length: 5572, dtype: object

In [5]:
#Removing punctuation
def remove_punctuations(s, replacement=""):
    """Return ``s`` with every ASCII punctuation character substituted.

    Parameters
    ----------
    s : str
        Text to clean.
    replacement : str, optional
        Text put in place of each character found in ``string.punctuation``.
        The default ``""`` deletes the character, which is the original
        behaviour. Note that deletion glues neighbouring words together
        (``"so...any"`` becomes ``"soany"``); pass ``" "`` to keep them apart.

    Returns
    -------
    str
        The cleaned text.
    """
    # A translation table does the substitution in one pass instead of
    # testing every character against the punctuation string and re-joining.
    table = str.maketrans({ch: replacement for ch in string.punctuation})
    return s.translate(table)
# Strip punctuation from every message, then display the cleaned Series.
x = x.apply(remove_punctuations)
x

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i dont think he goes to usf he lives aroun...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will ì b going to esplanade fr home
5569    pity  was in mood for that soany other suggest...
5570    the guy did some bitching but i acted like id ...
5571                            rofl its true to its name
Name: v2, Length: 5572, dtype: object

In [7]:
# Fetch the Punkt tokenizer data; nltk.tokenize.word_tokenize (used in the
# tokenization step) relies on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
# Split each message into a list of word tokens using NLTK's word tokenizer.
x = x.apply(lambda message: nltk.tokenize.word_tokenize(message))
x

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
                              ...                        
5567    [this, is, the, 2nd, time, we, have, tried, 2,...
5568         [will, ì, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
Name: v2, Length: 5572, dtype: object

In [10]:
# Fetch the NLTK stop-word corpus read through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:
#Stop word removal
stopwords = nltk.corpus.stopwords.words('english')
# Membership tests against a list scan the whole list for every token; a
# frozenset gives constant-time lookups. `stopwords` itself stays a list so
# any other code that uses it is unaffected.
_stopword_set = frozenset(stopwords)
def remove_stopwords(ls):
    """Return the tokens of ``ls`` that are not English stop words.

    Parameters
    ----------
    ls : list of str
        Tokens of one message, in order.

    Returns
    -------
    list of str
        The surviving tokens, with order and duplicates preserved.
    """
    # NOTE(review): punctuation is stripped before this step, so contractions
    # reach this point without their apostrophe (e.g. "dont") and are not
    # filtered -- confirm whether that is intended.
    return [i for i in ls if i not in _stopword_set]
x = x.apply(remove_stopwords)
x

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, å£750, po...
5568                   [ì, b, going, esplanade, fr, home]
5569                     [pity, mood, soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: v2, Length: 5572, dtype: object

In [13]:
# Fetch the WordNet corpus used by nltk.stem.WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...


True

In [14]:
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize(ls):
    """Lemmatize every token of one message.

    Parameters
    ----------
    ls : list of str
        Tokens of one message.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # No part-of-speech tag is passed, so the lemmatizer's default applies to
    # every token (the displayed output shows "lives" becoming "life").
    lemmas = []
    for token in ls:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
x = x.apply(lemmatize)
x

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, go, usf, life, around, though]
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, å£750, po...
5568                   [ì, b, going, esplanade, fr, home]
5569                      [pity, mood, soany, suggestion]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: v2, Length: 5572, dtype: object

In [15]:
# Re-join each token list into a single space-separated string (this is not a
# bag-of-words encoding; the Keras Tokenizer later turns the strings into
# integer sequences).
# The isinstance guard keeps the cell idempotent: without it, re-running the
# cell would join the *characters* of the already-joined strings.
x = x.apply(lambda i: i if isinstance(i, str) else " ".join(i))

In [16]:
# Encode labels as integers: "spam" -> 1, anything else -> 0.
# The labels are derived from the raw column rather than from `y` itself, so
# re-running the cell cannot silently turn every label into 0 (the previous
# form compared already-encoded integers against "spam" on a second run).
y = (df["v1"] == "spam").astype("int64")

In [17]:
# A fixed random_state makes the split (and every downstream metric)
# reproducible across kernel restarts. stratify=y keeps the spam/ham ratio
# the same in both partitions, which matters because spam is the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, stratify=y
)

In [18]:
# Sanity check: number of labels that ended up in the training split.
y_train.shape

(4179,)

In [19]:
# Vocabulary cap handed to the Tokenizer and used as the Embedding input size.
max_words = 1000
# Every encoded message is padded/truncated to this many tokens.
max_len = 150

# Fit the word index on the training texts only, so the held-out texts do not
# influence the vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Encode the training texts as integer sequences of uniform length.
sequences = tokenizer.texts_to_sequences(x_train)
sequences_mat = pad_sequences(sequences=sequences, maxlen=max_len)

# 3. Create model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
import tensorflow as tf

# Create an empty Sequential model for the spam classifier.
model = Sequential()

# 4. Add Layers (LSTM, Dense-(Hidden Layers), Output)

In [21]:
# Build the whole layer stack in one constructor call. Appending with
# model.add() is not idempotent: re-running the cell would stack a second
# copy of every layer onto the existing model. Rebinding `model` here makes
# the cell safe to re-run and yields the same architecture.
model = Sequential([
    # Map each word index (< max_words) to a 50-dimensional dense vector.
    Embedding(max_words, 50, input_length=max_len),
    # Summarise the token sequence into a single 256-unit state.
    LSTM(256),
    # Regularisation before the dense head.
    Dropout(0.5),
    Dense(100, activation='relu'),
    Dense(10, activation='relu'),
    # Single sigmoid unit: probability that the message is spam.
    Dense(1, activation=tf.keras.activations.sigmoid),
])

# 5. Compile the model

In [22]:
# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 6. Fit the model

In [23]:
# Train on the padded training sequences. epochs=1 is the Keras default and
# is only spelled out here to make the training budget visible.
model.fit(
    x=sequences_mat,
    y=y_train,
    epochs=1,
)



<keras.callbacks.History at 0x1fd8d0e7580>

# 7. Saving the model

In [24]:
# Persist the trained network.
model.save("lstm_model.h5")
# The network alone cannot score new text: inference needs the exact word
# index fitted during training. Save the tokenizer next to the model so both
# can be restored together in a fresh session.
with open("tokenizer.json", "w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

# 8. Test the model

In [25]:
# Encode the held-out texts with the tokenizer fitted on the training split,
# then pad them to the same length the model was trained on.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = pad_sequences(
    sequences=test_sequences,
    maxlen=max_len,
)
test_sequences_matrix

array([[  0,   0,   0, ..., 585,  19, 786],
       [  0,   0,   0, ..., 219, 338,   2],
       [  0,   0,   0, ...,   0,   0, 483],
       ...,
       [  0,   0,   0, ...,  19,  66, 147],
       [  0,   0,   0, ...,   0,  37, 593],
       [  0,   0,   0, ..., 617, 244, 423]])

In [26]:
# evaluate() returns the loss first, followed by the compiled metrics
# (accuracy only, for this model).
accr = model.evaluate(test_sequences_matrix, y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Accuracy:', test_accuracy)
print('Loss:', test_loss)

Accuracy: 0.979899525642395
Loss: 0.07698313146829605
