In [1]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import nltk
import pickle
import joblib
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Attention, Input, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

2025-05-15 23:12:22.366271: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747350742.857247      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747350742.978123      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df = pd.read_csv('/kaggle/input/email-dataset/Email_Dataset.csv')

In [3]:
df.drop(columns=['message_id'], inplace=True, errors='ignore')
df.drop(columns=['date'], inplace=True, errors='ignore')

In [4]:
df.drop_duplicates()
df["sender"].fillna("")
df["receiver"].fillna("")
df["subject"].fillna("")
df["body"].fillna("")

0        Buck up, your troubles caused by small dimensi...
1        \nUpgrade your sex and pleasures with these te...
2        >+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...
3        Would anyone object to removing .so from this ...
4        \nWelcomeFastShippingCustomerSupport\nhttp://7...
                               ...                        
49855    \n\n  \n---  \n![](http://images.pcdi-homestud...
49856    This is a multi-part message in MIME format. -...
49857    Dear Subscriber,\n\nIf I could show you a way ...
49858    ****Mid-Summer Customer Appreciation SALE!****...
49859    ATTN:SIR/MADAN      \n\n                      ...
Name: body, Length: 49860, dtype: object

In [5]:
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
stop_words = set(stopwords.words("english"))
lemmatizer = nltk.WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"\W+", " ", text)
    text = re.sub(r"\d+", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [7]:
#Preprocess Columns
df["processed_body"] = df["body"].apply(preprocess_text)
df["processed_subject"] = df["subject"].apply(preprocess_text)
df["processed_sender"] = df["sender"].apply(preprocess_text)
df["processed_receiver"] = df["receiver"].apply(preprocess_text)

In [8]:
# Sentiment Feature
df["body_sentiment"] = df["processed_body"].apply(lambda x: TextBlob(x).sentiment.polarity)
df["subject_sentiment"] = df["processed_subject"].apply(lambda x: TextBlob(x).sentiment.polarity)

In [9]:
#Keyword-Based Features
phishing_keywords = ['verify', 'login', 'bank', 'password', 'account', 'click', 'urgent', 'suspended']
def keyword_feature(text):
    return int(any(kw in text for kw in phishing_keywords))

df["body_keyword_flag"] = df["processed_body"].apply(keyword_feature)
df["subject_keyword_flag"] = df["processed_subject"].apply(keyword_feature)

In [10]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=100)
tfidf_body = tfidf.fit_transform(df["processed_body"]).toarray()
tfidf_subject = tfidf.fit_transform(df["processed_subject"]).toarray()

In [11]:
# ===== Combine Traditional Features =====
meta_features = np.concatenate([
    tfidf_body,
    tfidf_subject,
    df[["body_sentiment", "subject_sentiment", "body_keyword_flag", "subject_keyword_flag"]].values
], axis=1)

In [12]:
scaler = StandardScaler()
meta_features_scaled = scaler.fit_transform(meta_features)

In [13]:
df["sender_encoded"] = df["processed_sender"].astype("category").cat.codes
df["receiver_encoded"] = df["processed_receiver"].astype("category").cat.codes

In [17]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_text(texts, max_len=128):
    return bert_tokenizer(
        texts.tolist(), padding="max_length", truncation=True,
        max_length=max_len, return_tensors="tf"
    )

X_body = tokenize_text(df["processed_body"])
X_subject = tokenize_text(df["processed_subject"])

X_body_input = X_body["input_ids"].numpy()
X_subject_input = X_subject["input_ids"].numpy()
X_sender = df[["sender_encoded"]].values
X_receiver = df[["receiver_encoded"]].values
X_meta = meta_features_scaled
y_labels = df["label"].values

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

I0000 00:00:1747351317.615528      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1747351317.616256      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [18]:
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(X_sender, 'sender_encoder.pkl')
joblib.dump(X_receiver, 'receiver_encoder.pkl')

['receiver_encoder.pkl']

In [None]:
class PhishingDetector(Model):
    def __init__(self, bert_model):
        super(PhishingDetector, self).__init__()
        self.bert = bert_model
        self.bi_lstm = Bidirectional(LSTM(64, return_sequences=True))
        self.attention = Attention()
        self.pooling = GlobalAveragePooling1D()

        self.sender_dense = Dense(16, activation='relu')
        self.receiver_dense = Dense(16, activation='relu')
        self.meta_dense = Dense(64, activation='relu')

        self.concat = Concatenate()
        self.final_dense = Dense(1, activation='sigmoid')

    def call(self, inputs):
        body_input, subj_input, sender_input, receiver_input, meta_input = inputs

        body_emb = self.bert(body_input)[0]
        body_lstm = self.bi_lstm(body_emb)
        body_att = self.attention([body_lstm, body_lstm])
        body_feat = self.pooling(body_att)

        subj_emb = self.bert(subj_input)[0]
        subj_lstm = self.bi_lstm(subj_emb)
        subj_att = self.attention([subj_lstm, subj_lstm])
        subj_feat = self.pooling(subj_att)

        sender_feat = self.sender_dense(sender_input)
        receiver_feat = self.receiver_dense(receiver_input)
        meta_feat = self.meta_dense(meta_input)

        merged = self.concat([body_feat, subj_feat, sender_feat, receiver_feat, meta_feat])
        return self.final_dense(merged)

In [None]:
X_train_body, X_test_body, \
X_train_subject, X_test_subject, \
X_train_sender, X_test_sender, \
X_train_receiver, X_test_receiver, \
X_train_meta, X_test_meta, \
y_train, y_test = train_test_split(
    X_body_input, X_subject_input, X_sender, X_receiver, X_meta, y_labels,
    test_size=0.2, random_state=42
)

In [None]:
# ===== Initialize and Compile Model =====
bert_model = TFBertModel.from_pretrained("bert-base-uncased")
model = PhishingDetector(bert_model)
model.compile(optimizer=Adam(2e-5), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(
    [X_train_body, X_train_subject, X_train_sender, X_train_receiver, X_train_meta],
    y_train,
    validation_split=0.1,
    batch_size=16,
    epochs=14
)

# ===== Predict on Test Set =====
y_pred_probs = model.predict([X_test_body, X_test_subject, X_test_sender, X_test_receiver, X_test_meta])
y_pred = (y_pred_probs > 0.5).astype(int)

# ===== Print Evaluation Metrics =====
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Create folders to save models
vit_model_save_path = "/kaggle/working/vit-finetuned"
swin_model_save_path = "/kaggle/working/swin-finetuned"

vit_model.save_pretrained(vit_model_save_path)
swin_model.save_pretrained(swin_model_save_path)

In [None]:
# Save the trained model
model.save_weights('phishing_detector.weights.h5')
model.save('phishing_detector_model.h5')
model.save('phishing_detector_model.keras')
model.save_pretrained

In [None]:
# ===== Predict on Test Set =====
y_pred_probs = model.predict([X_test_body, X_test_subject, X_test_sender, X_test_receiver, X_test_meta])
y_pred = (y_pred_probs > 0.5).astype(int)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Phishing', 'Phishing'], yticklabels=['Not Phishing', 'Phishing'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
roc_auc = roc_auc_score(y_test, y_pred_probs)

# Plot ROC Curve
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})', color='darkorange')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.grid()
plt.show()