In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api

In [4]:
# Load Dataset
# Read the raw CSV and coerce the message column to str in a single chained
# expression, so the text column holds only strings before tokenization.
data = pd.read_csv("dataset.csv").assign(
    Message=lambda frame: frame['Message'].astype(str)
)

In [5]:
# Split the dataset into training and testing sets
# 20% of the rows are held out for testing. Stratifying on the label keeps the
# class ratio aligned across both partitions, and the fixed seed makes the
# split repeatable between runs.
X, y = data['Message'], data['EncodedClass']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Tokenize and pad sequences for deep learning models
tokenizer = Tokenizer()
# The vocabulary is learned from the training split only; the test split is
# merely transformed with it.
tokenizer.fit_on_texts(X_train)

# Text -> lists of integer word ids, one list per message.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Force every sequence to exactly 100 ids, padding at the end.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=100, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

# One extra slot on top of the learned words (Keras word ids start at 1, and
# id 0 is what pad_sequences fills with by default).
vocab_size = 1 + len(tokenizer.word_index)

In [5]:
# Build LSTM+CNN Hybrid Model
print("Building LSTM+CNN Hybrid Model")

# The network is assembled layer by layer, in forward-pass order.
hybrid_model = Sequential()

# Word ids -> 128-dimensional embeddings for sequences of 100 tokens.
hybrid_model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=100))

# Recurrent stage: return_sequences=True keeps one output per time step so the
# convolution that follows still has a sequence to slide over.
hybrid_model.add(LSTM(128, return_sequences=True))
hybrid_model.add(Dropout(0.2))

# Convolutional stage over the LSTM outputs, then pooling down to one vector.
hybrid_model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
hybrid_model.add(MaxPooling1D(pool_size=2))
hybrid_model.add(GlobalMaxPooling1D())

# Classifier head: a single sigmoid unit for the binary label.
hybrid_model.add(Dense(64, activation='relu'))
hybrid_model.add(Dropout(0.5))
hybrid_model.add(Dense(1, activation='sigmoid'))

hybrid_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Building LSTM+CNN Hybrid Model


In [6]:
# Train the hybrid model
# Five passes over the training data in mini-batches of 32; the held-out split
# is scored after every epoch. The call is left as the cell's final expression
# so the returned History object is still displayed.
hybrid_model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x25261606ee0>

In [7]:
# Save the model
# The file name is kept in a variable so the save call and the confirmation
# message cannot drift apart; the prediction cell loads this same path.
model_path = "hybrid_lstm_cnn_model.h5"
hybrid_model.save(filepath=model_path)
print(f"Model saved to {model_path}")

  saving_api.save_model(


Model saved to hybrid_lstm_cnn_model.h5


In [8]:
# Evaluate the model
# evaluate() returns the loss followed by the metrics passed to compile(),
# which here is just 'accuracy'.
test_metrics = hybrid_model.evaluate(x=X_test_pad, y=y_test)
loss, accuracy = test_metrics
print(f"Hybrid Model Test Accuracy: {accuracy}")

Hybrid Model Test Accuracy: 0.9856502413749695


In [8]:
from tensorflow.keras.models import load_model

In [12]:
# Custom input prediction
# Models already deserialized from disk: path -> (file mtime, model).
_model_cache = {}


def _load_cached_model(model_path):
    """Return the model stored at ``model_path``, reloading only when the file changes."""
    import os

    try:
        stamp = os.path.getmtime(model_path)
    except OSError:
        # Missing/unreadable file: fall through so load_model raises its own error.
        stamp = None

    cached = _model_cache.get(model_path)
    if cached is None or cached[0] != stamp:
        cached = (stamp, load_model(model_path))
        _model_cache[model_path] = cached
    return cached[1]


def predict_custom_input(input_text, model=None, threshold=0.5, maxlen=100,
                         model_path="hybrid_lstm_cnn_model.h5"):
    """Classify a single message as "Spam" or "Ham".

    The text is encoded with the notebook-level ``tokenizer`` and padded the
    same way as the training data before being scored by the model.

    Parameters
    ----------
    input_text : str
        The raw message. It is passed through ``str()`` first, mirroring the
        cast applied to the training column.
    model : optional
        An already-loaded Keras model to score with. When omitted, the model
        saved at ``model_path`` is used; it is read from disk on first use and
        reused on later calls until the file's modification time changes.
    threshold : float, default 0.5
        Predicted probabilities strictly above this value are labelled "Spam".
    maxlen : int, default 100
        Padded sequence length; must match the length the model was trained on.
    model_path : str, default "hybrid_lstm_cnn_model.h5"
        Location of the saved model used when ``model`` is not given.

    Returns
    -------
    str
        "Spam" or "Ham".
    """
    if model is None:
        # Avoid re-deserializing the network on every single prediction.
        model = _load_cached_model(model_path)

    input_seq = tokenizer.texts_to_sequences([str(input_text)])
    input_pad = pad_sequences(input_seq, maxlen=maxlen, padding='post')

    # predict() yields one row per input and one column for the sigmoid unit;
    # pull out the scalar instead of relying on the truthiness of an array.
    spam_probability = float(model.predict(input_pad)[0][0])
    return "Spam" if spam_probability > threshold else "Ham"

# Example prediction
# A sample promotional SMS used as a quick smoke test of the prediction helper.
custom_input = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
predicted_label = predict_custom_input(custom_input)
print(f"Prediction for custom input: {predicted_label}")

Prediction for custom input: Spam
