**1) Importing Libraries**

In [1]:
import pandas as pd
import nltk
import re

In [2]:
# Download the NLTK stop-word corpus so that stopwords.words('english') can be used during preprocessing
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Stemmer applied to every word that survives stop-word removal
ps = PorterStemmer()
# Filled by the text-preprocessing loop further down, one cleaned string per message
data = [] #list of texts after preprocessing

**2) Reading Dataset and Pre-processing**

In [6]:
# Read the CSV as ISO-8859-1 and keep only its first two columns (displayed below as v1 and v2).
# NOTE(review): the path is absolute; adjust it when running outside the original environment.
df = pd.read_csv('/content/content/spam.csv', encoding = "ISO-8859-1").iloc[:, :2]

In [7]:
from sklearn import preprocessing
# Encoder that turns the values of the label column into integer codes
le = preprocessing.LabelEncoder()

# Overwrite v1 with its integer codes (0/1 in the frame displayed below).
# NOTE(review): presumably the raw labels are 'ham'/'spam', giving ham=0 and spam=1 - confirm against the CSV.
df['v1'] = le.fit_transform(df['v1'])

In [8]:
# Inspect the frame after label encoding: v1 holds the integer label, v2 the message text
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [9]:
#Text Preprocessing
# Build the stop-word set once: calling stopwords.words() for every word is loop-invariant work
stop_words = set(stopwords.words('english'))

# Start from an empty list so re-running this cell does not append every text a second time
data = []

# Iterate over the column itself rather than a hard-coded row count
for message in df['v2']:
  #i) Removing unnecessary characters (anything that is not a letter becomes a space)
  #ii) Change to lower case
  #iii) Split the text into words
  words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

  #iv) Stemming and removal of stop words
  words = [ps.stem(word) for word in words if word not in stop_words]

  #v) Joining the words from list to sentence
  data.append(' '.join(words))

In [10]:
#Vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer whose vocabulary is capped at 2000 terms
cv = CountVectorizer(max_features=2000)

In [11]:
#Splitting the dataset into x and y vectors
# x: fit the vectorizer on the cleaned texts and convert the result to a dense array of term counts
x = cv.fit_transform(data).toarray()
# y: values of the first column of df (the encoded label v1)
y = df.iloc[:, 0].values

In [12]:
# Number of terms the fitted vectorizer kept (equals the number of feature columns in x)
print(len(cv.vocabulary_))

2000


In [13]:
#Train, Test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split is the same on every run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

**3) Creating Model**

In [14]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy

In [16]:
# Empty sequential container; the layers are added in a later cell
model = Sequential()

**4) Adding Layers (Dense Hidden Layers, Output)**

In [17]:
# Training feature matrix dimensions: (number of messages, number of vocabulary terms)
x_train.shape

(4457, 2000)

In [18]:
# Test feature matrix dimensions: (number of messages, number of vocabulary terms)
x_test.shape

(1115, 2000)

In [19]:
# Three fully connected ReLU hidden layers followed by a single sigmoid output unit.
# NOTE(review): the model object is created in a separate cell, so re-running this cell
# appends a second copy of these layers to the same model.
# NOTE(review): the first width (5572) equals the number of rows in the dataset - confirm this is intentional.
model.add(Dense(5572, activation='relu'))
model.add(Dense(6000, activation='relu'))
model.add(Dense(4000, activation='relu'))
# One sigmoid unit: the output is compared against 0.5 in the testing cells to decide Spam vs Ham
model.add(Dense(1, activation='sigmoid'))

**5) Compiling Model**

In [20]:
# Adam optimizer, binary cross-entropy loss for the 0/1 label, accuracy reported as the metric
model.compile(optimizer='adam', loss=BinaryCrossentropy(), metrics=['accuracy'])

**6) Fitting Model**

In [21]:
# Train for 10 epochs on the training split; no validation data is passed to fit
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc89c4964d0>

**7) Saving Model**

In [22]:
# Write the trained model to spam_model.h5 (relative path).
# NOTE(review): the fitted CountVectorizer (cv) is not saved here, yet it is needed to turn new text into model input.
model.save('spam_model.h5')

**8) Testing Model**

In [23]:
text = 'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info'
# Build the stop-word set once instead of once per word
stop_words = set(stopwords.words('english'))
# Same cleaning steps as the training texts: keep letters only, lower-case, split into words
words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
# Stem every word that is not a stop word and join the result back into one string
text = ' '.join(ps.stem(word) for word in words if word not in stop_words)

In [24]:
# Vectorize with the fitted CountVectorizer and convert to a dense array,
# the same input form the model was trained on
features = cv.transform([text]).toarray()
yp = model.predict(features)
# One input row and one sigmoid output: compare that single value against the 0.5 threshold
if yp[0][0] > 0.5:
  print('Spam')
else:
  print('Ham')

Spam


In [25]:
text = 'Even my brother is not like to speak with me. They treat me like aids patent.'
# Build the stop-word set once instead of once per word
stop_words = set(stopwords.words('english'))
# Same cleaning steps as the training texts: keep letters only, lower-case, split into words
words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
# Stem every word that is not a stop word and join the result back into one string
text = ' '.join(ps.stem(word) for word in words if word not in stop_words)

In [26]:
# Vectorize with the fitted CountVectorizer and convert to a dense array,
# the same input form the model was trained on
features = cv.transform([text]).toarray()
yp = model.predict(features)
# One input row and one sigmoid output: compare that single value against the 0.5 threshold
if yp[0][0] > 0.5:
  print('Spam')
else:
  print('Ham')

Ham
