<a href="https://colab.research.google.com/github/Harsha2193/CNN-model/blob/main/spaming_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam

In [7]:
df = pd.read_csv('/content/drive/MyDrive/m/spam.csv', encoding='latin-1')

In [8]:

df = df[['v1', 'v2']]
df.columns = ['label', 'text']

In [9]:

le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [10]:

tokenizer = Tokenizer(num_words=5000, lower=True, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, df['label'].values, test_size=0.2, random_state=42)

In [12]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [13]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

In [14]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 183, 128)          640000    
                                                                 
 spatial_dropout1d (Spatial  (None, 183, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 731701 (2.79 MB)
Trainable params: 731701 (2.79 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [15]:
epochs = 5
batch_size = 64

In [16]:
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=2)

Epoch 1/5
70/70 - 50s - loss: 0.2405 - accuracy: 0.9190 - val_loss: 0.0767 - val_accuracy: 0.9758 - 50s/epoch - 716ms/step
Epoch 2/5
70/70 - 39s - loss: 0.0355 - accuracy: 0.9897 - val_loss: 0.0591 - val_accuracy: 0.9839 - 39s/epoch - 551ms/step
Epoch 3/5
70/70 - 40s - loss: 0.0213 - accuracy: 0.9944 - val_loss: 0.0508 - val_accuracy: 0.9857 - 40s/epoch - 577ms/step
Epoch 4/5
70/70 - 44s - loss: 0.0103 - accuracy: 0.9978 - val_loss: 0.0503 - val_accuracy: 0.9848 - 44s/epoch - 622ms/step
Epoch 5/5
70/70 - 41s - loss: 0.0057 - accuracy: 0.9984 - val_loss: 0.0584 - val_accuracy: 0.9821 - 41s/epoch - 586ms/step


In [17]:
score, acc = model.evaluate(X_test, y_test, verbose=2, batch_size=batch_size)
print(f'Test accuracy: {acc}')

18/18 - 2s - loss: 0.0584 - accuracy: 0.9821 - 2s/epoch - 137ms/step
Test accuracy: 0.9820627570152283


In [20]:
def predict_message(message):
    seq = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(seq, maxlen=X.shape[1])
    pred = model.predict(padded)
    return 'Spam' if pred[0][0] > 0.5 else 'Not Spam'

# Input message to check
input_message = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005."
print(f"Message: {input_message}")
print(f"Prediction: {predict_message(input_message)}")

Message: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005.
Prediction: Spam
