# 🧠 Phishing Email Detection: Model Comparison
We compare classical ML and deep learning models on the `zefang-liu/phishing-email-dataset`.

### Models:
- ✅ SVM
- ✅ Logistic Regression
- ✅ Naive Bayes
- ✅ Random Forest
- ✅ KNN
- 🔄 LSTM (coming next)
- 🔄 CNN (coming next)


In [None]:
# STEP 1: Load dataset
from datasets import load_dataset

# Load the dataset and turn its 'train' split into a pandas DataFrame.
ds = load_dataset('zefang-liu/phishing-email-dataset')
df = ds['train'].to_pandas()

# Keep relevant columns, drop rows with a missing text or label, and use
# short column names for the rest of the notebook.
df = df[['Email Text', 'Email Type']]
df = df.dropna()
df = df.rename(columns={'Email Text': 'text', 'Email Type': 'label'})

# Convert label to 0 (legit) or 1 (phishing).
# NOTE(review): the dataset is understood to label rows 'Safe Email' /
# 'Phishing Email' - confirm against the dataset card. The shorter spellings
# that were mapped previously are still accepted.
label_map = {
    'safe email': 0,
    'phishing email': 1,
    'legitimate': 0,
    'phishing': 1,
}
normalized_labels = df['label'].str.strip().str.lower()

# Series.map silently turns unknown keys into NaN, which would only surface
# later as a confusing failure inside model.fit; fail here with the offending
# values instead.
unknown_labels = sorted(set(normalized_labels[~normalized_labels.isin(list(label_map))]))
if unknown_labels:
    raise ValueError(f"Unexpected 'Email Type' values: {unknown_labels}")

df['label'] = normalized_labels.map(label_map).astype(int)

In [None]:
# STEP 2: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = df['text']
y = df['label']

# Split the raw text BEFORE vectorizing so the vocabulary and IDF weights are
# learned from training emails only. Fitting TF-IDF on the full corpus leaks
# test-set statistics into the features and inflates the reported scores.
# The row partition depends only on the number of samples and random_state,
# so it is the same 80/20 split as when the split was done after vectorizing.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)  # learn vocabulary/IDF on train only
X_test = vectorizer.transform(X_test_text)        # reuse the fitted vocabulary/IDF

# Full-corpus matrix, kept so any later cell that refers to X_tfidf still
# works. It is produced by the train-fitted vectorizer; do not evaluate on it.
X_tfidf = vectorizer.transform(X)

In [None]:
# STEP 3: Define a function to evaluate models
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
def evaluate_model(name, model):
    """Train ``model`` and print how it scores on the held-out split.

    The estimator is fitted on the notebook-level ``X_train`` / ``y_train``
    and then asked to predict ``X_test``; the predictions are compared with
    ``y_test``.

    Args:
        name: Label shown in the results header.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. The accuracy (four decimals), the classification report and
        the confusion matrix are printed to stdout.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # One section per output line; print() stringifies each and joins them
    # with newlines, giving the same text as printing them one by one.
    report_sections = (
        f"\n📌 {name} Results:",
        f"Accuracy: {accuracy_score(y_test, predicted):.4f}",
        "Classification Report:",
        classification_report(y_test, predicted),
        "Confusion Matrix:",
        confusion_matrix(y_test, predicted),
    )
    print(*report_sections, sep="\n")

In [None]:
# STEP 4: Run classical models
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Each entry pairs a display name with an unfitted estimator; evaluate_model
# fits it and prints its metrics. Models are evaluated in the order listed.
classical_models = [
    ("SVM", SVC(kernel='linear')),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Naive Bayes", MultinomialNB()),
    # Seeded like the train/test split so the comparison is reproducible
    # from run to run; an unseeded forest gives different scores each time.
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("KNN", KNeighborsClassifier()),
]

for model_name, estimator in classical_models:
    evaluate_model(model_name, estimator)