In [1]:
import os
import json

import numpy as np
import pandas as pd
from joblib import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

import tensorflow as tf

from features import extract_handcrafted_features
from nn_model import scamshield_model

# Labelled messages CSV; main() reads its 'text' and 'label' columns.
DATA_PATH = "../data/messages.csv"
# Serialized TF-IDF vectorizer; main() loads it with joblib and only calls
# .transform() on it (it is never re-fitted here).
VEC_PATH = "../artifacts/log_reg/vec.joblib"
# Destination directory for the saved model, the test embeddings and metrics.json.
OUTPUT_DIR = "../artifacts/neural_net"
# All three paths are relative, so they resolve against the current working directory.
# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok = True)

In [2]:
def scipy_to_tf_sparse(X):
    """
    Turn a SciPy sparse matrix into a ``tf.sparse.SparseTensor``.

    The matrix is viewed in COO form, its (row, col) coordinates become the
    int64 index list, its stored entries become float32 values, and the
    resulting tensor is passed through ``tf.sparse.reorder`` before being
    returned, so Keras can consume the sparse TF-IDF input.

    Parameters
    ----------
    X : SciPy sparse matrix
        Any object exposing ``.tocoo()``.

    Returns
    -------
    tf.sparse.SparseTensor
        Reordered sparse tensor with the same dense shape as ``X``.
    """
    coo = X.tocoo()

    # One (row, col) pair per stored entry -> array of shape (nnz, 2).
    coords = np.column_stack((coo.row, coo.col)).astype(np.int64)
    entries = coo.data.astype(np.float32)
    dense_shape = np.array(coo.shape, dtype=np.int64)

    return tf.sparse.reorder(
        tf.sparse.SparseTensor(
            indices=coords,
            values=entries,
            dense_shape=dense_shape,
        )
    )

In [3]:
def main():
    """
    Train, evaluate and persist the ScamShield neural network.

    Pipeline:
      1. Read ``DATA_PATH`` and take its ``text`` / ``label`` columns.
      2. Make a stratified 80/20 train/test split.
      3. Load the already-fitted TF-IDF vectorizer from ``VEC_PATH`` and
         transform both splits (the vectorizer is not re-fitted).
      4. Compute handcrafted features for both splits.
      5. Convert the TF-IDF matrices to TensorFlow sparse tensors.
      6-8. Build, compile and fit the two-output model (``prob`` and
         ``embedding``); only ``prob`` contributes to the loss.
      9-10. Predict on the test split, threshold at 0.5 and print the
         confusion matrix, classification report and ROC-AUC.
      11. Save the model, the test embeddings and ``metrics.json`` under
         ``OUTPUT_DIR``.

    Side effects: seeds the global Python/NumPy/TensorFlow RNGs, prints to
    stdout and writes three files into ``OUTPUT_DIR``.

    Returns
    -------
    None
    """
    # One seed for both the data split and the network, so a re-run repeats
    # the same split, weight initialisation and batch shuffling.
    seed = 42
    # Seeds Python's `random`, NumPy and TensorFlow in one call.
    # NOTE: some GPU kernels can still be non-deterministic.
    tf.keras.utils.set_random_seed(seed)

    # 1. loading the dataset
    df = pd.read_csv(DATA_PATH)
    # NOTE(review): astype(str) turns missing texts into the literal "nan";
    # confirm the CSV has no empty messages.
    texts = df['text'].astype(str).tolist()
    y = df['label'].astype(int).to_numpy()

    # 2. splitting (stratified so both splits keep the label ratio)
    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        texts,
        y,
        test_size = 0.2,
        random_state = seed,
        stratify = y,
    )

    # 3. loading the fitted TF-IDF vectorizer
    vec = load(VEC_PATH)
    X_train_tfidf = vec.transform(X_train_txt)
    X_test_tfidf = vec.transform(X_test_txt)

    # 4. handcrafted features
    X_train_hand = extract_handcrafted_features(X_train_txt)
    X_test_hand = extract_handcrafted_features(X_test_txt)

    # 5. Convert TF-IDF scipy sparse -> tf sparse
    X_train_tfidf_tf = scipy_to_tf_sparse(X_train_tfidf)
    X_test_tfidf_tf = scipy_to_tf_sparse(X_test_tfidf)

    tfidf_dim = X_train_tfidf.shape[1]
    hand_dim = X_train_hand.shape[1]
    emb_dim = 64

    # 6. building the model
    model = scamshield_model(tfidf_dim = tfidf_dim,
                             hand_dim = hand_dim,
                             emb_dim = emb_dim)

    # 7. compiling
    # The embedding head gets a loss only because Keras needs one per output;
    # its weight of 0.0 keeps it out of the optimised total loss.
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3),
        loss = {
            "prob": "binary_crossentropy",
            "embedding": "mse"
        },
        loss_weights = {
            "prob": 1.0,
            "embedding": 0.0
        },
        metrics = {
            "prob": [
                tf.keras.metrics.AUC(name = "auc"),
                tf.keras.metrics.Precision(name = "precision"),
                tf.keras.metrics.Recall(name = "recall")
            ]
        }
    )

    # Dummy target for embedding (required because model has two outputs)
    dummy_emb_train = np.zeros((len(y_train), emb_dim), dtype = np.float32)

    # Stop once validation AUC of the "prob" head has not improved for 3
    # epochs and roll back to the best weights seen.
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor = "val_prob_auc",
            mode = "max",
            patience = 3,
            restore_best_weights = True
        )
    ]

    # 8. fitting the model
    model.fit(
        x = {"tfidf": X_train_tfidf_tf,
             "handcrafted": X_train_hand},
        y = {"prob": y_train,
             "embedding": dummy_emb_train},
        validation_split = 0.2,
        epochs = 15,
        batch_size = 64,
        callbacks = callbacks,
        verbose = 1
    )

    # 9. predicting (prob + embedding)
    preds = model.predict({"tfidf": X_test_tfidf_tf,
                           "handcrafted": X_test_hand},
                          verbose=0)
    prob = preds["prob"].reshape(-1)
    emb = preds["embedding"]

    # Hard labels at the 0.5 decision threshold.
    y_pred = (prob >= 0.5).astype(int)

    # 10. evaluating
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits = 4)
    auc = roc_auc_score(y_test, prob)

    print("\nConfusion Matrix:\n", cm)
    print("\nClassification Report:\n", report)
    print("ROC-AUC:", auc)

    # 11. saving the model
    model_path = os.path.join(OUTPUT_DIR, "scamshield_nn_tf.keras")
    emb_path = os.path.join(OUTPUT_DIR, "test_embeddings.npy")
    metrics_path = os.path.join(OUTPUT_DIR, "metrics.json")

    model.save(model_path)
    np.save(emb_path, emb)

    with open(metrics_path, "w", encoding = "utf-8") as f:
        json.dump({"roc_auc": float(auc)}, f, indent = 2)

    print("\nSaved:")
    print(" - Model:", model_path)
    print(" - Test embeddings:", emb_path)
    print(" - Metrics:", metrics_path)

In [4]:
# Entry point: run the whole train/evaluate/save pipeline when this code is
# executed directly (as a notebook cell or a script), but not when imported.
if __name__ == "__main__":
    main()

Epoch 1/15
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - embedding_loss: 0.6937 - loss: 0.4069 - prob_auc: 0.6377 - prob_loss: 0.4048 - prob_precision: 0.7424 - prob_recall: 0.3585 - val_embedding_loss: 1.0634 - val_loss: 0.1660 - val_prob_auc: 0.9153 - val_prob_loss: 0.1661 - val_prob_precision: 0.9302 - val_prob_recall: 0.7692
Epoch 2/15
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - embedding_loss: 1.6064 - loss: 0.1400 - prob_auc: 0.9309 - prob_loss: 0.1394 - prob_precision: 0.9241 - prob_recall: 0.8317 - val_embedding_loss: 1.0254 - val_loss: 0.1132 - val_prob_auc: 0.9655 - val_prob_loss: 0.1131 - val_prob_precision: 0.8980 - val_prob_recall: 0.8462
Epoch 3/15
[1m52/52[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - embedding_loss: 1.2613 - loss: 0.0804 - prob_auc: 0.9794 - prob_loss: 0.0799 - prob_precision: 0.9377 - prob_recall: 0.8805 - val_embedding_loss: 1.0358 - val_loss: 0.0874 - val_prob_auc: 0.9