In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import pickle
from bs4 import BeautifulSoup
from urllib.parse import unquote
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from imblearn.over_sampling import BorderlineSMOTE
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from xgboost import XGBClassifier, plot_importance
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources this notebook relies on; 'stopwords' backs the
# stopwords.words('english') call made when DataProcessor is constructed.
# NOTE(review): no visible code references 'punkt' directly — confirm it is still
# needed before removing this download.
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# This notebook runs on Google Colab and the email CSV files are kept on Google Drive,
# so Drive has to be mounted before any dataset can be read.
from google.colab import drive
drive.mount('/content/drive')

# Folder inside Google Drive that holds every dataset CSV.
base_path = "/content/drive/My Drive/email_datasets"

# Each dataset lives in "<name>.csv" under base_path; the dict is built in this order.
dataset_names = (
    'CEAS_08',
    'Enron',
    'Ling',
    'Nigerian_Fraud',
    'phishing_email',
    'SpamAssasin',
    'TREC_05',
    'TREC_06',
    'TREC_07',
)
paths = {dataset: f"{base_path}/{dataset}.csv" for dataset in dataset_names}

Mounted at /content/drive


In [None]:
# Read only the first three rows of the phishing dataset to inspect its column names.
phish_path = "/content/drive/My Drive/email_datasets/phishing_email.csv"
phish_df = pd.read_csv(phish_path, nrows=3)
print(list(phish_df.columns))


['text_combined', 'label']


In [None]:

import os
import re
import pickle
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import unquote
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from collections import Counter

class DataProcessor:
    """Load several email CSV datasets, merge them, clean the text and normalise labels.

    After ``process_all()`` the merged result is available as ``self.df_combined``
    with the columns ``text``, ``label`` (integer 0/1) and ``clean_text``, and the
    fitted ``LabelEncoder`` is available as ``self.le``.

    Args:
        paths: Mapping of dataset name -> CSV file path.
        max_vocab: ``num_words`` passed to the Keras ``Tokenizer`` created here.
        max_seq_len: Sequence length stored for callers that pad sequences.
    """

    def __init__(self, paths, max_vocab=10000, max_seq_len=200):
        self.paths = paths
        self.max_vocab = max_vocab
        self.max_seq_len = max_seq_len
        self.tokenizer_nltk = TreebankWordTokenizer()
        self.stop_words = set(stopwords.words('english'))
        self.text_tokenizer = Tokenizer(num_words=self.max_vocab)
        self.df_combined = None
        # Populated by process_all(); defined here so the attribute always exists.
        self.le = None

    def clean_text(self, text):
        """Return a cleaned, lower-cased, stop-word-free version of one email text.

        Non-string input (for example NaN from a missing CSV cell) yields "".
        """
        if not isinstance(text, str):
            return ""
        # Strip HTML markup, keeping only the visible text.
        text = BeautifulSoup(text, "html.parser").get_text()
        # Decode URL-encoded characters such as %20.
        text = unquote(text)
        # Replace punctuation and special characters with spaces, then lower-case.
        text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        text = text.lower()
        words = self.tokenizer_nltk.tokenize(text)
        # Keep purely alphabetic tokens that are not English stop words.
        filtered_words = [w for w in words if w.isalpha() and w not in self.stop_words]
        return ' '.join(filtered_words)

    def _handle_special_cases(self, name, path):
        """Load datasets whose layout differs from the subject/body/label convention.

        Returns a ``text``/``label`` frame for the phishing_email dataset when it has
        the ``text_combined`` and ``label`` columns, otherwise ``None`` so that the
        caller falls back to the generic loader.
        """
        if "phishing_email" in name:
            df = pd.read_csv(path)
            if 'text_combined' in df.columns and 'label' in df.columns:
                # There is no subject column, so the text is an empty subject, a
                # space and the body. Missing bodies become '' instead of NaN.
                df['text'] = ' ' + df['text_combined'].fillna('').astype(str)
                return df[['text', 'label']]
        return None

    def load_and_prepare_dataset(self, path, subject_col='subject', body_col='body', label_col='label', combine_subject_body=True):
        """Read one CSV and return a frame with exactly the columns ``text`` and ``label``.

        Args:
            path: CSV file to read.
            subject_col: Name of the subject column.
            body_col: Name of the body column.
            label_col: Name of the label column; renamed to ``label`` in the result.
            combine_subject_body: Join subject and body when both columns exist.

        Returns:
            DataFrame with ``text`` and ``label``. An empty frame with those columns
            is returned (and a message printed) when the file cannot be read.

        Raises:
            ValueError: If none of the subject/body/text_combined columns exist.
            KeyError: If ``label_col`` is missing from the file.
        """
        name = os.path.basename(path).lower()
        df_special = self._handle_special_cases(name, path)
        if df_special is not None:
            return df_special

        try:
            df = pd.read_csv(path, encoding='utf-8', engine='python', on_bad_lines='skip')
        except Exception as e:
            # Best effort: one unreadable dataset must not abort the whole run.
            print(f"\u274c Failed to load {path}: {e}")
            return pd.DataFrame(columns=['text', 'label'])

        # astype(str) keeps the concatenation working when a column was parsed as
        # numeric rather than as text.
        if combine_subject_body and subject_col in df.columns and body_col in df.columns:
            df['text'] = df[subject_col].fillna('').astype(str) + ' ' + df[body_col].fillna('').astype(str)
        elif 'text_combined' in df.columns:
            df['text'] = df['text_combined'].fillna('').astype(str)
        elif body_col in df.columns:
            df['text'] = df[body_col].fillna('').astype(str)
        else:
            raise ValueError(f"Could not find subject/body/text_combined columns in {path}")

        df = df[['text', label_col]].rename(columns={label_col: 'label'})
        return df

    @staticmethod
    def _normalize_labels(df):
        """Return a copy of ``df`` keeping only rows whose label is 0 or 1, as ints.

        Labels arrive as a mix of ints, floats and strings ('0', '1.0', ' 1 ', ...).
        They are stringified, stripped and parsed numerically; rows whose label is
        missing, unparsable or not 0/1 are dropped.
        """
        labelled = df.dropna(subset=['label']).copy()
        numeric = pd.to_numeric(labelled['label'].astype(str).str.strip(), errors='coerce')
        keep = numeric.isin([0, 1])
        labelled = labelled.loc[keep].copy()
        labelled['label'] = numeric.loc[keep].astype(int)
        return labelled

    def process_all(self):
        """Load every dataset in ``self.paths``, clean the text and encode the labels.

        The result is stored in ``self.df_combined`` and the fitted encoder in
        ``self.le``. Labels are normalised to integer 0/1 before the rare-class filter
        and the "before encoding" print, so that 0, '0' and '0.0' count as one class.
        """
        df_list = []
        for name, path in self.paths.items():
            df = self.load_and_prepare_dataset(path)
            print(f"Loaded {name}: {len(df)} emails")
            df_list.append(df)
        combined = pd.concat(df_list, ignore_index=True)
        print(f"Total combined emails: {len(combined)}")

        combined['clean_text'] = combined['text'].apply(self.clean_text)
        # .copy() so later column assignments do not write into a slice of the frame.
        combined = combined[combined['clean_text'].str.strip() != ''].copy()
        print(f"After cleaning, emails: {len(combined)}")

        before_labels = len(combined)
        combined = self._normalize_labels(combined)
        print(f"Dropped rows with missing or non-binary labels: {before_labels - len(combined)}")

        min_samples_per_class = 10
        label_counts = combined['label'].value_counts()
        valid_labels = label_counts[label_counts >= min_samples_per_class].index
        initial_len = len(combined)
        combined = combined[combined['label'].isin(valid_labels)].copy()
        print(f"\U0001f9f9 Removed rare classes (<{min_samples_per_class} samples): {initial_len - len(combined)} rows dropped.")

        print("\n\U0001f50d Unique label values BEFORE encoding:")
        print(combined['label'].value_counts())

        self.le = LabelEncoder()
        combined['label'] = self.le.fit_transform(combined['label'])
        self.df_combined = combined

        print("\n\U0001f501 Label Encoding Mapping:")
        for i, class_label in enumerate(self.le.classes_):
            print(f"{i} → {class_label}")

# 70/15/15 Split + Tokenization + SMOTE
# Initialize processor and build the cleaned, label-encoded corpus.
processor = DataProcessor(paths)
processor.process_all()

# Get cleaned data
df_cleaned = processor.df_combined.copy()
X_raw = df_cleaned['clean_text'].values
y_raw = df_cleaned['label'].values

# 70/15/15 split. The test set is 15% of everything; the validation set is
# 0.15 / 0.85 of the remaining 85%, i.e. 15% of the total. Both splits are
# stratified so each subset keeps the class proportions of the full corpus.
X_temp, X_test_text, y_temp, y_test = train_test_split(
    X_raw, y_raw, test_size=0.15, random_state=42, stratify=y_raw
)
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.15 / 0.85, random_state=42, stratify=y_temp
)

# Tokenizer — fitted on the training texts only, so no vocabulary leaks from val/test.
tokenizer = Tokenizer(num_words=processor.max_vocab)
tokenizer.fit_on_texts(X_train_text)

# Convert to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_val_seq = tokenizer.texts_to_sequences(X_val_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=processor.max_seq_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=processor.max_seq_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=processor.max_seq_len, padding='post')

# SMOTE is applied to the training split only; random_state makes the resampling
# repeatable between runs.
# NOTE(review): SMOTE builds synthetic rows from the padded token-id vectors, so the
# new rows are numeric blends of vocabulary indices rather than real word sequences —
# confirm this is acceptable, or consider class weights instead.
smote = BorderlineSMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_pad, y_train)

# Save datasets. The fitted label encoder is written as well, because the export
# cells later in the notebook look for label_encoder.pkl.
save_dir = "/content/drive/My Drive/email_datasets"
artifacts_to_save = {
    "X_train_smote.pkl": X_train_smote,
    "y_train_smote.pkl": y_train_smote,
    "X_val_pad.pkl": X_val_pad,
    "y_val.pkl": y_val,
    "X_test_pad.pkl": X_test_pad,
    "y_test.pkl": y_test,
    "tokenizer.pkl": tokenizer,
    "label_encoder.pkl": processor.le,
}
for artifact_name, artifact in artifacts_to_save.items():
    with open(f"{save_dir}/{artifact_name}", 'wb') as f:
        pickle.dump(artifact, f)

print("\n✅ Data preparation complete: 70/15/15 split with safe SMOTE and tokenization.")


def counts_to_df(counter, name):
    """Turn a label -> count mapping into a two-column frame sorted by label.

    Args:
        counter: Mapping (e.g. ``collections.Counter``) from label to count.
        name: Column name to use for the counts.

    Returns:
        DataFrame with a ``label`` column and a ``name`` column, ordered by label.
    """
    table = pd.DataFrame.from_dict(counter, orient='index', columns=[name])
    table = table.rename_axis('label')
    ordered = table.sort_index()
    return ordered.reset_index()

# Class frequencies for the whole cleaned corpus and for the training split
# before and after SMOTE.
overall_counts = Counter(df_cleaned['label'])
pre_smote_counts = Counter(y_train)
post_smote_counts = Counter(y_train_smote)


def _counts_with_percent(counter, count_col, percent_col):
    """Build the count table for `counter` and add each row's share as a percentage."""
    table = counts_to_df(counter, count_col)
    table[percent_col] = (table[count_col] / table[count_col].sum() * 100).round(2)
    return table


df_overall = _counts_with_percent(overall_counts, 'count_overall', 'percent_overall')
df_pre = _counts_with_percent(pre_smote_counts, 'count_pre_smote_train', 'percent_pre')
df_post = _counts_with_percent(post_smote_counts, 'count_post_smote_train', 'percent_post')

# Side-by-side view of all three tables, joined on the label value.
df_compare = df_overall.merge(df_pre, on='label').merge(df_post, on='label')

# Save CSVs
count_tables = (
    (df_overall, "class_counts_overall.csv"),
    (df_pre, "class_counts_train_pre_smote.csv"),
    (df_post, "class_counts_train_post_smote.csv"),
    (df_compare, "class_counts_compare.csv"),
)
for count_table, csv_name in count_tables:
    count_table.to_csv(f"{save_dir}/{csv_name}", index=False)

# Charts
# Every label seen in the training split, before or after SMOTE, in sorted order.
labels = sorted(set(pre_smote_counts) | set(post_smote_counts))
x = np.arange(len(labels))
width = 0.35

# Pre vs Post SMOTE: two bars per label, drawn side by side.
pre_heights = [pre_smote_counts.get(l, 0) for l in labels]
post_heights = [post_smote_counts.get(l, 0) for l in labels]
fig_smote, ax_smote = plt.subplots(figsize=(7, 5))
ax_smote.bar(x - width/2, pre_heights, width, label='Pre-SMOTE')
ax_smote.bar(x + width/2, post_heights, width, label='Post-SMOTE')
ax_smote.set_xticks(x)
ax_smote.set_xticklabels(labels)
ax_smote.set_ylabel('Count')
ax_smote.set_xlabel('Label')
ax_smote.set_title('Class Balance: Train Pre- vs Post-SMOTE')
ax_smote.legend()
fig_smote.tight_layout()
fig_smote.savefig(f"{save_dir}/class_balance_train_pre_vs_post_smote.png", dpi=200)
plt.close(fig_smote)

# Overall: one bar per label for the whole cleaned corpus.
overall_heights = [overall_counts.get(l, 0) for l in labels]
fig_overall, ax_overall = plt.subplots(figsize=(6, 5))
ax_overall.bar(labels, overall_heights)
ax_overall.set_ylabel('Count')
ax_overall.set_xlabel('Label')
ax_overall.set_title('Overall Dataset Distribution (Pre-SMOTE)')
fig_overall.tight_layout()
fig_overall.savefig(f"{save_dir}/class_balance_overall.png", dpi=200)
plt.close(fig_overall)

# Per-source (optional): only runs when the cleaned frame carries a 'source' column.
has_source_column = 'source' in df_cleaned.columns
if has_source_column:
    # Rows = source, columns = label, values = number of emails.
    source_label_sizes = df_cleaned.groupby(['source','label']).size()
    df_source = source_label_sizes.unstack(fill_value=0)
    df_source.to_csv(f"{save_dir}/per_source_class_counts_overall.csv")

    plt.figure(figsize=(10,6))
    # Running height of each stack, so every label is drawn on top of the previous one.
    bottom = np.zeros(len(df_source))
    for lab in sorted(df_source.columns):
        vals = df_source[lab].values
        plt.bar(df_source.index, vals, bottom=bottom, label=f"Label {lab}")
        bottom += vals
    plt.xticks(rotation=30, ha='right')
    plt.xlabel('Source')
    plt.ylabel('Count')
    plt.title('Per-Source Class Distribution (Pre-SMOTE)')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{save_dir}/per_source_class_distribution_overall.png", dpi=220)
    plt.close()

print("\n✅ All tables and charts saved to Google Drive:", save_dir)


Loaded CEAS_08: 39153 emails
Loaded Enron: 31275 emails
Loaded Ling: 2859 emails
Loaded Nigerian_Fraud: 3332 emails
Loaded phishing_email: 82486 emails
Loaded SpamAssasin: 7863 emails
Loaded TREC_05: 59015 emails
Loaded TREC_06: 16439 emails
Loaded TREC_07: 68221 emails
Total combined emails: 310643
After cleaning, emails: 290429
🧹 Removed rare classes (<10 samples): 2014 rows dropped.

🔍 Unique label values BEFORE encoding:
label
0    75789
1    74165
0    72413
1    66048
Name: count, dtype: int64


  self.df_combined['label'] = self.df_combined['label'].astype(str).str.strip().replace({'0.0': 0, '1.0': 1, '0': 0, '1': 1})



🔁 Label Encoding Mapping:
0 → 0
1 → 1

✅ Data preparation complete: 70/15/15 split with safe SMOTE and tokenization.

✅ All tables and charts saved to Google Drive: /content/drive/My Drive/email_datasets


In [None]:
# Alias the Keras Tokenizer that DataProcessor.__init__ stores on the processor.
# NOTE(review): no visible code fits this tokenizer or reads `tk`; the tokenizer that is
# fitted and pickled is the module-level `tokenizer` — confirm this cell is still needed.
tk = processor.text_tokenizer


In [None]:
# 📦 Imports
import os, pickle, zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)
from xgboost import XGBClassifier, plot_importance

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

# Load saved datasets
base_path = "/content/drive/My Drive/email_datasets"


def _load_pickle(filename):
    """Unpickle one artifact stored under base_path."""
    # These pickles are written by the data-preparation cell of this notebook;
    # pickle.load must not be pointed at files from an untrusted source.
    with open(f"{base_path}/{filename}", 'rb') as f:
        return pickle.load(f)


# Training data is the SMOTE-resampled split; validation and test are the padded originals.
X_train = _load_pickle("X_train_smote.pkl")
y_train = _load_pickle("y_train_smote.pkl")
X_val = _load_pickle("X_val_pad.pkl")
y_val = _load_pickle("y_val.pkl")
X_test = _load_pickle("X_test_pad.pkl")
y_test = _load_pickle("y_test.pkl")
tokenizer = _load_pickle("tokenizer.pkl")

# Params
embedding_dim = 64
batch_size = 64
total_epochs = 5
vocab_size = 10001
sequence_length = X_train.shape[1]        # padded length of every input row
num_classes = len(np.unique(y_train))     # number of distinct labels in the training set

# Build LSTM Model
def build_model():
    """Create and compile the bidirectional-LSTM classifier.

    Reads the module-level settings ``sequence_length``, ``vocab_size``,
    ``embedding_dim`` and ``num_classes``. The 32-unit dense layer is named
    'lstm_features' so it can be looked up by name after training.

    Returns:
        A compiled Keras ``Model`` with a softmax output over ``num_classes``.
    """
    token_ids = Input(shape=(sequence_length,))
    embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(token_ids)
    encoded = Bidirectional(LSTM(units=48))(embedded)
    regularised = Dropout(0.4)(encoded)
    features = Dense(32, activation='relu', name='lstm_features')(regularised)
    class_probs = Dense(num_classes, activation='softmax')(features)

    network = Model(inputs=token_ids, outputs=class_probs)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Train Model
model = build_model()


#  Paths
def _artifact_path(filename):
    """Location of one output file inside base_path."""
    return f"{base_path}/{filename}"


checkpoint_path = _artifact_path("lstm_checkpoint.keras")
csv_log_path = _artifact_path("lstm_training_log.csv")
lstm_final_path = _artifact_path("lstm_final_model.keras")
xgb_model_path = _artifact_path("lstm_xgb_model.pkl")
tokenizer_path = _artifact_path("text_tokenizer.pkl")
xgb_pred_path = _artifact_path("xgb_predictions.csv")
plot_acc_path = _artifact_path("training_accuracy.png")
plot_loss_path = _artifact_path("training_loss.png")
conf_matrix_path = _artifact_path("confusion_matrix_named.png")
feature_importance_path = _artifact_path("xgb_feature_importance.png")
report_csv_path = _artifact_path("classification_report.csv")
zip_path = _artifact_path("hybrid_model_export.zip")

# Callbacks
# Delete outputs left over from a previous run before training starts.
for path in (checkpoint_path, csv_log_path, lstm_final_path):
    if os.path.exists(path):
        os.remove(path)

# The checkpoint is written with save_best_only=False; the CSV logger starts a new file.
checkpoint_cb = ModelCheckpoint(checkpoint_path, save_best_only=False)
csv_logger = CSVLogger(csv_log_path, append=False)
training_callbacks = [checkpoint_cb, csv_logger]

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=total_epochs,
    batch_size=batch_size,
    callbacks=training_callbacks,
)

# Extract Features
# Sub-model that stops at the 'lstm_features' dense layer of the trained network.
feature_layer_output = model.get_layer('lstm_features').output
feature_model = Model(inputs=model.input, outputs=feature_layer_output)
features_train = feature_model.predict(X_train)
features_test = feature_model.predict(X_test)

# Train XGBoost on the extracted features.
xgb_settings = dict(
    objective='multi:softmax',
    num_class=num_classes,
    learning_rate=0.2,
    max_depth=8,
    n_estimators=250,
    eval_metric='mlogloss',
    random_state=42,
)
xgb = XGBClassifier(**xgb_settings)
xgb.fit(features_train, y_train)
y_pred = xgb.predict(features_test)

# 🧾 Evaluation on the held-out test split; the averaged scores use weighted averaging.
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
weighted_scorers = (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for score_name, scorer in weighted_scorers:
    print(f"🎯 {score_name}:", scorer(y_test, y_pred, average='weighted'))
print("\n📋 Classification Report:\n", classification_report(y_test, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 💾 Save Everything
model.save(lstm_final_path)

# Pickle the XGBoost classifier and the tokenizer next to the Keras model.
for pickle_path, payload in ((xgb_model_path, xgb), (tokenizer_path, tokenizer)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

# Test-set ground truth next to the XGBoost predictions, one row per sample.
predictions_table = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predictions_table.to_csv(xgb_pred_path, index=False)

# 📈 Accuracy & Loss Plots from Log
if os.path.exists(csv_log_path):
    df_log = pd.read_csv(csv_log_path)

    # (log column, display name, output file) for each curve that may be drawn.
    curve_specs = (
        ('accuracy', 'Accuracy', plot_acc_path),
        ('loss', 'Loss', plot_loss_path),
    )
    for metric_col, metric_name, curve_path in curve_specs:
        val_col = f'val_{metric_col}'
        # Skip a curve when either its training or its validation column is absent.
        if metric_col not in df_log.columns or val_col not in df_log.columns:
            continue
        plt.figure()
        plt.plot(df_log[metric_col], label=f'Training {metric_name}')
        plt.plot(df_log[val_col], label=f'Validation {metric_name}')
        plt.title(f'Training vs Validation {metric_name}')
        plt.xlabel('Epoch')
        plt.ylabel(metric_name)
        plt.legend()
        plt.grid(True)
        plt.savefig(curve_path)
        plt.close()
        print(f"✅ {metric_name} plot saved: {curve_path}")

# 📊 Confusion Matrix with Class Labels
cm = confusion_matrix(y_test, y_pred)
# assumes encoded label 0 = legitimate and 1 = phishing — TODO confirm against the datasets
class_labels = ['Legitimate', 'Phishing']
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(cmap=plt.cm.Blues, ax=ax, values_format='d')
ax.set_title("Confusion Matrix with Class Names")
fig.tight_layout()
fig.savefig(conf_matrix_path)
plt.close(fig)
print(f"✅ Confusion matrix saved: {conf_matrix_path}")

# 📋 Classification Report CSV
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv(report_csv_path)
print(f"✅ Classification report saved: {report_csv_path}")

# 📈 XGBoost Feature Importance
# plot_importance draws on its own new figure unless an axes is passed in. Without
# ax=, the 10x6 figure created here stayed empty and open ("<Figure size 1000x600
# with 0 Axes>" in the cell output) and the saved plot ignored the requested size.
fig_importance, ax_importance = plt.subplots(figsize=(10, 6))
plot_importance(xgb, ax=ax_importance)
ax_importance.set_title("XGBoost Feature Importance")
fig_importance.tight_layout()
fig_importance.savefig(feature_importance_path)
plt.close(fig_importance)
print(f"✅ Feature importance plot saved: {feature_importance_path}")

# 🗜️ Zip Everything
# Candidate files for the export archive; any that do not exist are reported and skipped.
files_to_zip = [
    checkpoint_path,
    lstm_final_path,
    xgb_model_path,
    tokenizer_path,
    xgb_pred_path,
    csv_log_path,
    plot_acc_path,
    plot_loss_path,
    conf_matrix_path,
    feature_importance_path,
    report_csv_path,
    f"{base_path}/label_encoder.pkl"  # Only if it exists
]

with zipfile.ZipFile(zip_path, 'w') as zipf:
    for file_path in files_to_zip:
        # Files are stored flat in the archive, under their base name only.
        archive_name = os.path.basename(file_path)
        if not os.path.exists(file_path):
            print(f" Missing file (not zipped): {archive_name}")
            continue
        zipf.write(file_path, archive_name)
        print(f"📦 Added to ZIP: {archive_name}")

print(f"\n✅ All artifacts zipped to: {zip_path}")



Epoch 1/5
[1m3241/3241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m691s[0m 211ms/step - accuracy: 0.9321 - loss: 0.1604 - val_accuracy: 0.9811 - val_loss: 0.0537
Epoch 2/5
[1m3241/3241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m749s[0m 213ms/step - accuracy: 0.9840 - loss: 0.0477 - val_accuracy: 0.9790 - val_loss: 0.0584
Epoch 3/5
[1m3241/3241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m739s[0m 212ms/step - accuracy: 0.9902 - loss: 0.0291 - val_accuracy: 0.9840 - val_loss: 0.0536
Epoch 4/5
[1m3241/3241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m742s[0m 212ms/step - accuracy: 0.9942 - loss: 0.0176 - val_accuracy: 0.9837 - val_loss: 0.0584
Epoch 5/5
[1m3241/3241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 211ms/step - accuracy: 0.9962 - loss: 0.0125 - val_accuracy: 0.9842 - val_loss: 0.0580
[1m6482/6482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 36ms/step
[1m1352/1352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 35ms/step
✅ Accu

<Figure size 1000x600 with 0 Axes>

In [None]:
# Imports
import os
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)
from xgboost import plot_importance

# Paths
base_path = "/content/drive/My Drive/email_datasets"
zip_path = f"{base_path}/hybrid_model_export.zip"
csv_log_path = f"{base_path}/lstm_training_log.csv"
report_csv_path = f"{base_path}/classification_report.csv"
plot_acc_path = f"{base_path}/training_accuracy.png"
plot_loss_path = f"{base_path}/training_loss.png"
conf_matrix_path = f"{base_path}/confusion_matrix_named.png"
feature_importance_path = f"{base_path}/xgb_feature_importance.png"

# This cell reuses y_test, y_pred and xgb from the training cell when they are in
# scope; the sections that need them are skipped with a warning otherwise.

# 📈 Accuracy and loss curves, drawn from the CSV training log when it exists.
if os.path.exists(csv_log_path):
    df_log = pd.read_csv(csv_log_path)

    # (log column, display name, output file, message prefix) for each curve.
    curve_specs = (
        ('accuracy', 'Accuracy', plot_acc_path, ''),
        ('loss', 'Loss', plot_loss_path, '✅ '),
    )
    for metric_col, metric_name, curve_path, message_prefix in curve_specs:
        val_col = f'val_{metric_col}'
        # Skip a curve when either its training or its validation column is absent.
        if metric_col not in df_log.columns or val_col not in df_log.columns:
            continue
        plt.figure()
        plt.plot(df_log[metric_col], label=f'Training {metric_name}')
        plt.plot(df_log[val_col], label=f'Validation {metric_name}')
        plt.title(f'Training vs Validation {metric_name}')
        plt.xlabel('Epoch')
        plt.ylabel(metric_name)
        plt.legend()
        plt.grid(True)
        plt.savefig(curve_path)
        plt.close()
        print(f"{message_prefix}{metric_name} plot saved: {curve_path}")

# Confusion Matrix with Class Labels
# Only runs when the predictions from the training cell are still in scope.
if 'y_test' in locals() and 'y_pred' in locals():
    cm = confusion_matrix(y_test, y_pred)
    # assumes encoded label 0 = legitimate and 1 = phishing — TODO confirm against the datasets
    class_labels = ['Legitimate', 'Phishing']
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(cmap=plt.cm.Blues, ax=ax, values_format='d')
    ax.set_title("Confusion Matrix with Class Names")
    fig.tight_layout()
    fig.savefig(conf_matrix_path)
    plt.close(fig)
    print(f"✅ Confusion matrix saved: {conf_matrix_path}")

    # Classification Report CSV
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.to_csv(report_csv_path)
    print(f"✅ Classification report saved: {report_csv_path}")
else:
    print("⚠️ Skipping confusion matrix and report (missing y_test or y_pred)")

# XGBoost Feature Importance
if 'xgb' in locals():
    # plot_importance draws on its own new figure unless an axes is passed in. Without
    # ax=, the 10x6 figure created here stayed empty and open ("<Figure size 1000x600
    # with 0 Axes>" in the cell output) and the saved plot ignored the requested size.
    fig_importance, ax_importance = plt.subplots(figsize=(10, 6))
    plot_importance(xgb, ax=ax_importance)
    ax_importance.set_title("XGBoost Feature Importance")
    fig_importance.tight_layout()
    fig_importance.savefig(feature_importance_path)
    plt.close(fig_importance)
    print(f"✅ Feature importance plot saved: {feature_importance_path}")
else:
    print("⚠️ Skipping feature importance (missing xgb model)")

# Zip Everything
# Model files, predictions and report outputs to bundle; missing ones are reported and skipped.
model_artifact_names = (
    "lstm_checkpoint.keras",
    "lstm_final_model.keras",
    "lstm_xgb_model.pkl",
    "text_tokenizer.pkl",
    "label_encoder.pkl",
    "xgb_predictions.csv",
)
files_to_zip = [f"{base_path}/{artifact_name}" for artifact_name in model_artifact_names]
files_to_zip += [
    csv_log_path,
    plot_acc_path,
    plot_loss_path,
    conf_matrix_path,
    feature_importance_path,
    report_csv_path,
]

with zipfile.ZipFile(zip_path, 'w') as zipf:
    for file_path in files_to_zip:
        # Files are stored flat in the archive, under their base name only.
        archive_name = os.path.basename(file_path)
        if not os.path.exists(file_path):
            print(f" Missing file (not zipped): {archive_name}")
            continue
        zipf.write(file_path, archive_name)
        print(f"Added to ZIP: {archive_name}")

print(f"\n✅ All artifacts zipped to: {zip_path}")



✅ Confusion matrix saved: /content/drive/My Drive/email_datasets/confusion_matrix_named.png
✅ Classification report saved: /content/drive/My Drive/email_datasets/classification_report.csv
✅ Feature importance plot saved: /content/drive/My Drive/email_datasets/xgb_feature_importance.png
📦 Added to ZIP: lstm_checkpoint.keras
📦 Added to ZIP: lstm_final_model.keras
📦 Added to ZIP: lstm_xgb_model.pkl
📦 Added to ZIP: text_tokenizer.pkl
❌ Missing file (not zipped): label_encoder.pkl
📦 Added to ZIP: xgb_predictions.csv
❌ Missing file (not zipped): lstm_training_log.csv
❌ Missing file (not zipped): training_accuracy.png
❌ Missing file (not zipped): training_loss.png
📦 Added to ZIP: confusion_matrix_named.png
📦 Added to ZIP: xgb_feature_importance.png
📦 Added to ZIP: classification_report.csv

✅ All artifacts zipped to: /content/drive/My Drive/email_datasets/hybrid_model_export.zip


<Figure size 1000x600 with 0 Axes>