# IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import nltk
import re

# Download the NLTK 'stopwords' corpus; stopwords.words('english') is read from it later on.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# LOAD DATASET

In [2]:
# Read the raw dataset from 'spam.csv', decoding the file as ISO-8859-1.
a = pd.read_csv('spam.csv',encoding='ISO-8859-1')
# Preview the first rows of the loaded frame.
a.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the label column ('v1') and the message text column ('v2').
# .copy() makes the result an independent DataFrame: a later cell assigns
# back into a['v1'], and doing that on a plain column selection is what
# pandas reports as SettingWithCopyWarning.
a = a[['v1', 'v2']].copy()
a.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:

# (rows, columns) of the frame after the column selection above.
a.shape

(5572, 2)

# TEXT PROCESSING

In [5]:
# Turn every message in column 'v2' into a space-joined string of stemmed,
# stop-word-free tokens. `message` ends up with one entry per row of `a`,
# in row order.
ps=PorterStemmer()
# Build the stop-word set once. Constructing it inside the comprehension's
# condition would reload and re-hash the whole list for every single token.
stop_words = set(stopwords.words('english'))
message=[]
# Iterate over the column itself so the number of rows is not hard-coded.
for msg in a['v2']:
  msg=re.sub('[^a-zA-Z]',' ',msg)   # every non-letter character becomes a space
  msg=msg.lower()
  # split(' ') rather than split(): runs of spaces yield empty tokens, which
  # survive the filter and keep the spacing of the joined text unchanged.
  msg=msg.split(' ')
  msg = [ps.stem(word) for word in msg if word not in stop_words]
  msg=' '.join(msg)
  message.append(msg)

message[:6]

['go jurong point  crazi   avail bugi n great world la e buffet    cine got amor wat   ',
 'ok lar    joke wif u oni   ',
 'free entri   wkli comp win fa cup final tkt   st may       text fa       receiv entri question std txt rate c appli             ',
 'u dun say earli hor    u c alreadi say   ',
 'nah think goe usf  live around though',
 'freemsg hey darl   week word back  like fun still  tb ok  xxx std chg send         rcv']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from the cleaned messages,
# then materialise the per-message token counts as a dense array.
cv = CountVectorizer()
cv.fit(message)
x = cv.transform(message).toarray()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
# LABEL ENCODING
# Replace the class strings in 'v1' with integer codes and expose them as `y`.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(a['v1'])

# Overwrite the text labels in the frame with their integer codes.
a['v1'] = le.transform(a['v1'])
y = a['v1'].to_numpy()
y

array([0, 0, 1, ..., 0, 0, 0])

# MODEL BUILDING

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Feed-forward binary classifier: two ReLU hidden layers (1550 and 3000
# units) followed by a single sigmoid output unit.
model = Sequential([
    Dense(1550, activation='relu'),
    Dense(3000, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the full feature matrix for 10 epochs; the returned History
# object is the cell's displayed value.
model.fit(x, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5b9c6bda50>

# SAVE THE MODEL

In [9]:
# Write the trained network to 'spam-NLP.h5' in the working directory.
# NOTE(review): only `model` is saved here; the fitted CountVectorizer `cv`
# that the prediction cells call is not persisted — confirm whether it
# should be stored alongside the model.
model.save('spam-NLP.h5')

# TEST THE MODEL

In [10]:
msg='FREE MESSAGE Activate your 500 FREE Text Messages by replying to this message with the word FREE'
print('THE ORIGINAL MESSAGE IS:  ',msg)

# Apply the same cleaning steps used on the training text: letters only,
# lower-case, split on single spaces, drop stop words, stem, re-join.
stop_words = set(stopwords.words('english'))  # built once, not once per token
msg=re.sub('[^a-zA-Z]',' ',msg)
msg=msg.lower()
msg=msg.split(' ')
msg = [ps.stem(word) for word in msg if word not in stop_words]
msg=' '.join(msg)
print('THE STEMMED MESSAGE IS:  ',msg)

# Vectorise with the already-fitted CountVectorizer and densify, matching
# the dense array format the model was trained on.
predict = model.predict(cv.transform([msg]).toarray())
# One message in and one output unit, so read that single score explicitly
# instead of relying on the truth value of a whole array.
pred = 'SPAM' if predict[0][0] > 0.5 else 'NOT SPAM'
print('THE MESSAGE IS PREDICTED AS:  ',pred)

THE ORIGINAL MESSAGE IS:   FREE MESSAGE Activate your 500 FREE Text Messages by replying to this message with the word FREE
THE STEMMED MESSAGE IS:   free messag activ     free text messag repli messag word free
THE MESSAGE IS PREDICTED AS:   SPAM


In [11]:
# The backslash before X is written as '\\' because '\X' is not a valid
# escape sequence; the resulting text is unchanged (a literal backslash
# followed by X and a double quote).
msg='Wishing you and your family Merry \\X" mas and HAPPY NEW Year in advance.."'
print('THE ORIGINAL MESSAGE IS:  ',msg)

# Apply the same cleaning steps used on the training text: letters only,
# lower-case, split on single spaces, drop stop words, stem, re-join.
stop_words = set(stopwords.words('english'))  # built once, not once per token
msg=re.sub('[^a-zA-Z]',' ',msg)
msg=msg.lower()
msg=msg.split(' ')
msg = [ps.stem(word) for word in msg if word not in stop_words]
msg=' '.join(msg)
print('THE STEMMED MESSAGE IS:  ',msg)

# Vectorise with the already-fitted CountVectorizer and densify, matching
# the dense array format the model was trained on.
predict = model.predict(cv.transform([msg]).toarray())
# One message in and one output unit, so read that single score explicitly.
# The positive label is spelled 'SPAM', consistent with the other test cell.
pred = 'SPAM' if predict[0][0] > 0.5 else 'NOT SPAM'
print('THE MESSAGE IS PREDICTED AS:  ',pred)

THE ORIGINAL MESSAGE IS:   Wishing you and your family Merry \X" mas and HAPPY NEW Year in advance.."
THE STEMMED MESSAGE IS:   wish famili merri  x  ma happi new year advanc   
THE MESSAGE IS PREDICTED AS:   NOT SPAM
