 GRU (Gated Recurrent Unit) – Captures deep context in text.

 MLP (Multi-Layer Perceptron) – Works well with TF-IDF features.

 XGBoost with Autoencoder – Adds robustness and structure learning.

 Majority Voting – Reduces bias from individual models.

In [1]:
# prompt: upload file

from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))



Saving SpamTextCSV.csv to SpamTextCSV.csv
User uploaded file "SpamTextCSV.csv" with length 515223 bytes


In [2]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download NLTK stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

# Load dataset (Ensure 'text' and 'label' columns exist)
df = pd.read_csv("SpamTextCSV.csv")

# Text preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    text = " ".join([word for word in text.split() if word not in stopwords.words("english")])
    return text

df["clean_text"] = df["Message"].apply(clean_text)

# Encode labels (Spam = 1, Ham = 0)
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Category"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df["clean_text"], df["label"], test_size=0.2, random_state=42)

# Vectorization (TF-IDF & CountVectorizer)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
count_vectorizer = CountVectorizer(max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()


import joblib
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")
joblib.dump(count_vectorizer, "count_vectorizer.joblib")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['count_vectorizer.joblib']

###**GRU MODEL**

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=50)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=50)

# Build GRU Model
gru_model = Sequential([
    Embedding(input_dim=10000, output_dim=50, input_length=50),
    GRU(64, return_sequences=True),
    GRU(32),
    Dense(16, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

gru_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train GRU
gru_model.fit(X_train_seq, y_train, epochs=5, batch_size=32, validation_data=(X_test_seq, y_test))
# joblib.dump(tokenizer, "tokenizer.joblib")


Epoch 1/5




[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 133ms/step - accuracy: 0.8654 - loss: 0.3641 - val_accuracy: 0.9773 - val_loss: 0.0805
Epoch 2/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 95ms/step - accuracy: 0.9938 - loss: 0.0315 - val_accuracy: 0.9823 - val_loss: 0.0783
Epoch 3/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 75ms/step - accuracy: 0.9982 - loss: 0.0111 - val_accuracy: 0.9857 - val_loss: 0.0974
Epoch 4/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 78ms/step - accuracy: 0.9993 - loss: 0.0068 - val_accuracy: 0.9848 - val_loss: 0.1071
Epoch 5/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 77ms/step - accuracy: 0.9996 - loss: 0.0046 - val_accuracy: 0.9848 - val_loss: 0.1014


<keras.src.callbacks.history.History at 0x7b3a8a241890>

###MLP

In [None]:
mlp_model = Sequential([
    Dense(512, activation="relu", input_shape=(5000,)),
    Dropout(0.3),
    Dense(256, activation="relu"),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dense(1, activation="sigmoid")
])

mlp_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
mlp_model.fit(X_train_tfidf, y_train, epochs=5, batch_size=32, validation_data=(X_test_tfidf, y_test))
# mlp_model.save("mlp_model.h5")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 60ms/step - accuracy: 0.8611 - loss: 0.3574 - val_accuracy: 0.9621 - val_loss: 0.1182
Epoch 2/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.9902 - loss: 0.0369 - val_accuracy: 0.9823 - val_loss: 0.0896
Epoch 3/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.9978 - loss: 0.0059 - val_accuracy: 0.9806 - val_loss: 0.1030
Epoch 4/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 51ms/step - accuracy: 0.9998 - loss: 0.0025 - val_accuracy: 0.9806 - val_loss: 0.1238
Epoch 5/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 56ms/step - accuracy: 0.9999 - loss: 5.1399e-04 - val_accuracy: 0.9806 - val_loss: 0.1187


<keras.src.callbacks.history.History at 0x7ffab0b52250>

In [None]:
# mlp_model.save("mlp_model.keras")
# gru_model.save("gru_model.keras")


In [None]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import xgboost as xgb

# Autoencoder Model
input_dim = X_train_count.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(256, activation="relu")(input_layer)
encoded = Dense(128, activation="relu")(encoded)
encoded = Dense(64, activation="relu")(encoded)

decoded = Dense(128, activation="relu")(encoded)
decoded = Dense(256, activation="relu")(decoded)
decoded = Dense(input_dim, activation="sigmoid")(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer="adam", loss="binary_crossentropy")
autoencoder.fit(X_train_count, X_train_count, epochs=5, batch_size=32, validation_data=(X_test_count, X_test_count))

# Extract Features
encoder = Model(input_layer, encoded)
X_train_encoded = encoder.predict(X_train_count)
X_test_encoded = encoder.predict(X_test_count)

# Train XGBoost Classifier
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_encoded, y_train)

# Save Models
# autoencoder.save("autoencoder.keras")
# joblib.dump(xgb_model, "xgboost_model.pkl")
# encoder.save("encoder.keras")



Epoch 1/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 103ms/step - loss: 0.2542 - val_loss: 0.0112
Epoch 2/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 118ms/step - loss: 0.0118 - val_loss: 0.0095
Epoch 3/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 62ms/step - loss: 0.0104 - val_loss: 0.0093
Epoch 4/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 64ms/step - loss: 0.0101 - val_loss: 0.0093
Epoch 5/5
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 65ms/step - loss: 0.0104 - val_loss: 0.0092
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [None]:
from tensorflow.keras.models import load_model

# Load Models
gru_model = load_model("gru_model.keras")
mlp_model = load_model("mlp_model.keras")
autoencoder = load_model("autoencoder.keras")
xgb_model = joblib.load("xgboost_model.pkl")

def ensemble_predict(text):
    # Preprocess input
    text_cleaned = clean_text(text)

    # Vectorization
    text_tfidf = tfidf_vectorizer.transform([text_cleaned]).toarray()
    text_count = count_vectorizer.transform([text_cleaned]).toarray()
    text_seq = pad_sequences(tokenizer.texts_to_sequences([text_cleaned]), maxlen=50)

    # Model Predictions
    p1 = gru_model.predict(text_seq)[0][0]
    p2 = mlp_model.predict(text_tfidf)[0][0]

    # Autoencoder + XGBoost
    text_encoded = encoder.predict(text_count)
    p3 = xgb_model.predict(text_encoded)[0]

    # Majority Voting
    final_prediction = round((p1 + p2 + p3) / 3)

    return "Spam" if final_prediction == 1 else "Ham"

# Test Prediction
print(ensemble_predict("you are arrested for cybercrime. Pay 2000 as to get out of this matter"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 637ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Ham


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_ensemble(X_test_texts, y_test):
    y_pred_ensemble = []

    for text in X_test_texts:
        # Preprocess input
        text_cleaned = clean_text(text)

        # Convert text into required formats
        text_tfidf = tfidf_vectorizer.transform([text_cleaned]).toarray()
        text_count = count_vectorizer.transform([text_cleaned]).toarray()
        text_seq = pad_sequences(tokenizer.texts_to_sequences([text_cleaned]), maxlen=50)

        # Get individual model predictions
        p1 = gru_model.predict(text_seq)[0][0]
        p2 = mlp_model.predict(text_tfidf)[0][0]

        # Autoencoder + XGBoost
        text_encoded = encoder.predict(text_count)
        p3 = xgb_model.predict(text_encoded)[0]

        # Majority Voting (Average Predictions)
        final_prediction = round((p1 + p2 + p3) / 3)
        y_pred_ensemble.append(final_prediction)

    # Convert lists to NumPy arrays
    y_test = np.array(y_test)
    y_pred_ensemble = np.array(y_pred_ensemble)

    # Calculate Metrics
    accuracy = accuracy_score(y_test, y_pred_ensemble)
    precision = precision_score(y_test, y_pred_ensemble)
    recall = recall_score(y_test, y_pred_ensemble)
    f1 = f1_score(y_test, y_pred_ensemble)

    return accuracy, precision, recall, f1


In [None]:
accuracy, precision, recall, f1 = evaluate_ensemble(X_test, y_test)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48