In [None]:
import re
import email
import tldextract
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
import string
import pickle
from prettytable import PrettyTable  # For displaying results in a table format

# Fetch the NLTK data packages used by the text preprocessing in this file
# (tokenizer models, stop-word lists and the WordNet corpus), in this order.
for _nltk_resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
# Function to preprocess and clean the email text
def transform_text(text):
    """Normalize raw message text into a space-separated string of lemmas.

    The text is lower-cased, tokenized with ``nltk.word_tokenize``, reduced to
    purely alphanumeric tokens that are not English stop words, and each
    remaining token is lemmatized as a verb.

    Args:
        text: The raw message text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    tokens = nltk.word_tokenize(text.lower())

    # Build the stop-word lookup once per call. Calling stopwords.words()
    # inside the filter would reload and linearly scan the list per token.
    stop_words = set(stopwords.words('english'))

    # An alphanumeric token can never be a punctuation mark, so a separate
    # punctuation filter is unnecessary after the isalnum() check.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(tok, pos='v') for tok in kept)

# Metadata extraction function
# Matches http(s) URLs up to the next whitespace character.
_URL_PATTERN = re.compile(r"https?://[^\s]+")
# Registered domains (name + public suffix) of the URL shorteners to flag.
_SHORTENER_DOMAINS = frozenset({"bit.ly", "tinyurl.com"})


def _message_body_text(msg):
    """Return the decoded body text of ``msg``, or "" when there is none."""
    if not msg.is_multipart():
        if not msg.get_payload():
            return ""
        raw = msg.get_payload(decode=True)
        return raw.decode(errors="ignore") if raw else ""

    # get_payload(decode=True) returns None on a multipart container, so the
    # text has to be collected from the individual non-attachment text parts.
    chunks = []
    for part in msg.walk():
        if part.is_multipart() or part.get_content_maintype() != "text":
            continue
        if part.get_content_disposition() == "attachment":
            continue
        raw = part.get_payload(decode=True)
        if raw:
            chunks.append(raw.decode(errors="ignore"))
    return "\n".join(chunks)


def _sender_domain(sender):
    """Return the lower-cased domain of a From header, or "unknown_domain"."""
    from email.utils import parseaddr

    # parseaddr strips display names and angle brackets, so
    # "Name <user@host>" yields "host" rather than "host>".
    address = parseaddr(sender)[1]
    candidate = address if "@" in address else sender
    if "@" not in candidate:
        return "unknown_domain"
    domain = candidate.rsplit("@", 1)[1].strip().rstrip(">").lower()
    return domain or "unknown_domain"


def _is_shortened_url(url):
    """Return True when ``url`` points at one of the known URL shorteners."""
    parts = tldextract.extract(url)
    # ``parts.domain`` alone is e.g. "bit" for "bit.ly"; the suffix must be
    # appended before comparing against the shortener list.
    return f"{parts.domain}.{parts.suffix}" in _SHORTENER_DOMAINS


def extract_metadata(email_content):
    """Extract header- and body-derived features from a raw email string.

    Args:
        email_content: The full message source (headers and body) as a ``str``.

    Returns:
        A dict with the keys ``Sender_Domain`` (str), ``Reply_To_Mismatch``,
        ``Subject_Length``, ``Is_HTML``, ``Num_Links``, ``Num_Attachments``
        and ``Is_Shortened_URL`` (all ints; the flags are 0 or 1).
    """
    msg = email.message_from_string(email_content)
    sender = msg["From"] or ""
    reply_to = msg["Reply-To"] or ""
    subject = msg["Subject"] or ""

    # Decode the body and scan it for URLs once; both link features reuse it.
    urls = _URL_PATTERN.findall(_message_body_text(msg))

    features = {
        "Sender_Domain": _sender_domain(sender),
        # NOTE(review): this compares the raw header strings, so a differing
        # display name alone counts as a mismatch - confirm that is intended.
        "Reply_To_Mismatch": int(reply_to != "" and reply_to != sender),
        "Subject_Length": len(subject),
        # walk() yields the message itself when it is not multipart, so this
        # also detects an HTML alternative nested inside a multipart message.
        "Is_HTML": int(any(part.get_content_type() == "text/html" for part in msg.walk())),
        "Num_Links": len(urls),
        "Num_Attachments": sum(1 for part in msg.walk() if part.get_content_disposition() == "attachment"),
        "Is_Shortened_URL": int(any(_is_shortened_url(url) for url in urls)),
    }

    return features

# Combine text and metadata features
def process_email(email_content, email_text):
    """Build one feature record for a message.

    Runs ``extract_metadata`` on ``email_content`` and adds the output of
    ``transform_text(email_text)`` to the resulting dict under the
    ``"Processed_Text"`` key.

    Returns:
        The metadata feature dict, extended with ``"Processed_Text"``.
    """
    record = extract_metadata(email_content)
    record["Processed_Text"] = transform_text(email_text)
    return record

# Load dataset
# The encoding name is spelled with a plain ASCII hyphen; a typographic dash in
# the name only resolves through codec-name normalization.
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)

# Apply text transformation and metadata extraction.
# The same 'text' column is used both as the raw email source and as the body text.
df['metadata_features'] = df.apply(lambda row: process_email(row['text'], row['text']), axis=1)

# Convert metadata features into separate columns
metadata_df = pd.json_normalize(df['metadata_features'])
df = pd.concat([df, metadata_df], axis=1)
df.drop(columns=['metadata_features'], inplace=True)

# Encode labels
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# Encode categorical features (Sender_Domain column)
domain_encoder = LabelEncoder()
df['Sender_Domain'] = domain_encoder.fit_transform(df['Sender_Domain'])

# Feature extraction (TF-IDF for text)
# NOTE(review): the vectorizer is fit on the full dataset before the
# train/test split, so test-set vocabulary and IDF statistics leak into
# training. Consider splitting first and fitting on the training rows only.
tf = TfidfVectorizer()
X_text = tf.fit_transform(df['Processed_Text']).toarray()

# Combine metadata features with text features
metadata_columns = ['Sender_Domain', 'Reply_To_Mismatch', 'Subject_Length', 'Is_HTML', 'Num_Links', 'Num_Attachments', 'Is_Shortened_URL']
X_metadata = df[metadata_columns].values
X = np.hstack([X_text, X_metadata])

y = df['target'].values

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Models to evaluate
mls = {
    'mnb': MultinomialNB(),
    'xg': XGBClassifier(eval_metric='logloss')
}

# Training and evaluation
results = []

# Create a PrettyTable object for formatted output
table = PrettyTable()
table.field_names = ["Model", "Accuracy", "Precision", "Confusion Matrix"]

for name, model in mls.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        ps = precision_score(y_test, y_pred)
    except Exception as e:
        # Best-effort evaluation: one failing model must not abort the others,
        # but the failure is shown in the printed table instead of being
        # recorded only in `results`.
        results.append([name, 'Error', 'Error', str(e)])
        table.add_row([name, 'Error', 'Error', str(e)])
    else:
        results.append([name, acc, ps, cm.tolist()])

        # Add the result to the table
        table.add_row([name, f"{acc:.4f}", f"{ps:.4f}", str(cm)])


print(table)

# Save model and vectorizer
# Context managers make sure both files are flushed and closed.
# NOTE(review): the saved model was trained on TF-IDF features plus the
# metadata columns (including the label-encoded Sender_Domain), but
# `domain_encoder` is not persisted - confirm how inference rebuilds them.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mls['mnb'], model_file)


+-------+----------+-----------+------------------+
| Model | Accuracy | Precision | Confusion Matrix |
+-------+----------+-----------+------------------+
|  mnb  |  0.9578  |   1.0000  |    [[957   0]    |
|       |          |           |    [ 47 111]]    |
|   xg  |  0.9614  |   0.9675  |    [[953   4]    |
|       |          |           |    [ 39 119]]    |
+-------+----------+-----------+------------------+
