In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
%matplotlib inline

In [2]:
# Read the labelled message corpus and append a numeric label column
# ('lab': ham -> 0, spam -> 1) derived from the Category column.
data = pd.read_csv("spam.csv", encoding='latin1').assign(
    lab=lambda frame: frame["Category"].map({"ham": 0, "spam": 1})
)
data.head()

Unnamed: 0,Category,Message,lab
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
# Features are the raw message texts.
X = data["Message"]
# LabelEncoder converts the categorical Category values to a numerical format;
# the encoded labels are then reshaped into a single column.
le = LabelEncoder()
Y = le.fit_transform(data["Category"]).reshape(-1, 1)

In [4]:
# Hold out 20% of the messages for testing.
# random_state pins the shuffle so the split (and the tokenizer and model fitted on
# it) is reproducible after a kernel restart; stratify keeps the ham/spam ratio the
# same in the training and test partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y.ravel()
)

In [5]:
# Tokenization: split each message into words (tokens) and encode them as integers.
max_len = 150      # length every sequence is padded to
max_words = 1000   # vocabulary size limit handed to the Tokenizer
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)  # the tokenizer is fitted on the training messages only
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [6]:
# The Embedding layer is used for neural networks on text data. It requires the
# input data to be integer-encoded, i.e. each word is represented by a unique integer.
# It can also be used to load a pre-trained word-embedding model (a type of
# transfer learning).
def rnn():
    """Build the (uncompiled) LSTM classifier used for spam detection.

    The network reads a sequence of ``max_len`` integers, embeds them into
    50-dimensional vectors (embedding input size ``max_words``), runs a 64-unit
    LSTM, a 256-unit ReLU layer with 50% dropout, and ends in a single
    sigmoid unit.

    Returns:
        A Keras ``Model``; the caller is responsible for compiling it.
    """
    token_ids = Input(name='inputs', shape=(max_len,))
    hidden = Embedding(max_words, 50)(token_ids)
    hidden = LSTM(64)(hidden)
    hidden = Dense(256, name='FC1')(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    score = Dense(1, name='out_layer')(hidden)
    spam_probability = Activation('sigmoid')(score)
    return Model(inputs=token_ids, outputs=spam_probability)

In [7]:
# Build the network, print its layer summary, then configure it for binary
# classification (binary cross-entropy loss, RMSprop optimizer, accuracy metric).
model = rnn()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [8]:
# A callback is a set of functions applied at given stages of the training procedure;
# callbacks give a view of the model's internal states and statistics during training.
# EarlyStopping halts training once val_loss stops improving by at least min_delta.
# restore_best_weights=True rolls the model back to the epoch with the best val_loss;
# without it the model keeps the weights of the final, non-improving epoch, and those
# are the weights that would subsequently be saved and evaluated.
model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2,
          callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001,
                                   restore_best_weights=True)])

Epoch 1/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 384ms/step - accuracy: 0.7826 - loss: 0.4984 - val_accuracy: 0.9619 - val_loss: 0.2623
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 268ms/step - accuracy: 0.9473 - loss: 0.2112 - val_accuracy: 0.9630 - val_loss: 0.1183
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 297ms/step - accuracy: 0.9728 - loss: 0.1126 - val_accuracy: 0.9832 - val_loss: 0.0675
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 349ms/step - accuracy: 0.9824 - loss: 0.0673 - val_accuracy: 0.9843 - val_loss: 0.0613
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 298ms/step - accuracy: 0.9856 - loss: 0.0513 - val_accuracy: 0.9865 - val_loss: 0.0594
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 289ms/step - accuracy: 0.9893 - loss: 0.0359 - val_accuracy: 0.9877 - val_loss: 0.0557
Epoch 7/10
[1m28/28[0m 

<keras.src.callbacks.history.History at 0x19dabd67bf0>

In [9]:
# Persist the trained network to disk in the .keras format.
# NOTE(review): the fitted tokenizer `tok` is not saved here, so this file alone
# cannot turn raw text into model inputs.
model.save("spam_detection_model.keras")

In [10]:
# Encode the held-out messages with the tokenizer fitted on the training set and
# pad them to the same length as the training sequences.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [11]:
# Score the model on the held-out set. The model was compiled with a loss and an
# accuracy metric, so `acc` holds both values (loss first), not the accuracy alone.
acc = model.evaluate(x=test_sequences_matrix, y=Y_test)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.9792 - loss: 0.0784


In [12]:
# Read one message interactively and wrap it in a list, since the tokenizer is
# given a collection of texts.
test_content = [input()]

# Encode and pad the message exactly like the training data.
textx = sequence.pad_sequences(
    tok.texts_to_sequences(test_content),
    maxlen=max_len,
)

 camera


In [13]:
# Run the padded message through the network and show the raw sigmoid output.
pred = model.predict(x=textx)
print(pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 516ms/step
[[0.03518642]]


In [14]:
# `pred` is a 2-D array with one row per input message and one column (the sigmoid
# output). Pull out the scalar for the single message instead of comparing the whole
# array against a nested list, which only works while the array has one element.
spam_probability = float(pred[0][0])
if spam_probability > 0.5:
    print("This is a Spam mail")
else:
    print("This is not a Spam mail")

This is not a Spam mail


In [None]:
from flask import Flask, render_template, request
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping

app = Flask(__name__)

# Load the data and preprocess.
# 'lab' mirrors Category as 0 (ham) / 1 (spam); the training labels below are
# produced separately by LabelEncoder and reshaped into a single column.
data = pd.read_csv("spam.csv", encoding='latin1')
data['lab'] = data["Category"].map({"ham": 0, "spam": 1})
X = data.Message
Y = data.Category
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)

# Only the training partition is used by the app. random_state pins the shuffle so
# every server start fits the tokenizer and the model on the same messages, and
# stratify keeps the ham/spam ratio of the full data set in the training partition.
X_train, _, Y_train, _ = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y.ravel()
)

# Tokenization and model setup
max_words = 1000  # vocabulary size limit handed to the Tokenizer
max_len = 150     # length every sequence is padded to
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences_matrix = sequence.pad_sequences(tok.texts_to_sequences(X_train), maxlen=max_len)

def rnn():
    """Build and compile the LSTM spam classifier served by the app.

    The network reads a sequence of ``max_len`` integers, embeds them into
    50-dimensional vectors (embedding input size ``max_words``), runs a 64-unit
    LSTM, a 256-unit ReLU layer with 50% dropout, and ends in a single
    sigmoid unit.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the RMSprop
        optimizer and the accuracy metric.
    """
    token_ids = Input(name='inputs', shape=(max_len,))
    hidden = Embedding(max_words, 50)(token_ids)
    hidden = LSTM(64)(hidden)
    hidden = Dense(256, name='FC1')(hidden)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    score = Dense(1, name='out_layer')(hidden)
    spam_probability = Activation('sigmoid')(score)

    classifier = Model(inputs=token_ids, outputs=spam_probability)
    classifier.compile(
        optimizer=RMSprop(),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Train the classifier once at start-up. EarlyStopping halts training when val_loss
# stops improving by at least min_delta; restore_best_weights=True makes the served
# model use the weights of the best epoch rather than those of the final,
# non-improving epoch that triggered the stop.
model = rnn()
model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10, validation_split=0.2,
          callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001,
                                   restore_best_weights=True)])

@app.route('/')
def index():
    """Serve the landing page by rendering the 'index.html' template."""
    page = render_template('index.html')
    return page

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the submitted message as spam or not spam.

    Reads the 'message' form field, encodes and pads it like the training data,
    scores it with the trained model and re-renders 'index.html' with the verdict
    and the original message.

    The route only accepts POST, so no request-method check is needed; the previous
    check left an implicit ``return None`` path in the view.
    """
    message = request.form['message']
    textx = tok.texts_to_sequences([message])
    textx = sequence.pad_sequences(textx, maxlen=max_len)
    # The model output has one row per input and a single column; extract the scalar
    # for the one submitted message instead of relying on the truthiness of a
    # one-element array. verbose=0 keeps the progress bar out of the server log.
    spam_probability = float(model.predict(textx, verbose=0)[0][0])
    result = "This is a Spam mail" if spam_probability > 0.5 else "This is not a Spam mail"
    return render_template('index.html', result=result, message=message)

if __name__ == '__main__':
    import os

    # Debug mode enables the Werkzeug interactive debugger, which lets anyone who can
    # reach the server execute code, so it is opt-in via FLASK_DEBUG=1 rather than
    # always on. The reloader is disabled because it re-executes this module, which
    # would repeat the model training done at start-up.
    debug_enabled = os.environ.get("FLASK_DEBUG", "0") == "1"
    app.run(debug=debug_enabled, use_reloader=False)