<a href="https://colab.research.google.com/github/IamPrachiSharma/Phishing-Email-Detection/blob/main/SVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook classifies phishing emails using TF-IDF (Term Frequency-Inverse Document Frequency) vectors as features and a linear Support Vector Machine (SVM) as the classifier.

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Train and evaluate a linear SVM on TF-IDF features of the email text.

# Configuration: the values a reader is most likely to tune.
DATA_PATH = 'Phishing_Email.csv'
TEXT_COLUMN = 'Email Text'
LABEL_COLUMN = 'Email Type'
TEST_SIZE = 0.2        # fraction of rows held out for evaluation
RANDOM_STATE = 42      # fixed seed so the train/test split is reproducible
MAX_FEATURES = 1000    # number of terms kept by the TF-IDF vectorizer

# Load the dataset and drop rows that are missing either the email text or
# its label: a missing text cannot be vectorized, and a row without a label
# cannot be used for supervised training or scoring.
data = pd.read_csv(DATA_PATH).dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

# Split the dataset into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    data[TEXT_COLUMN],
    data[LABEL_COLUMN],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Encode the string labels as integers. The encoder is fitted on the training
# labels only, so label_encoder.classes_ lists the classes seen in training,
# indexed by their encoded id.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Convert text data to TF-IDF vectors. The vocabulary is learned from the
# training split only and then reused for the test split.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')
X_train = vectorizer.fit_transform(train_data)
X_test = vectorizer.transform(test_data)

# Train a Support Vector Machine (SVM) classifier
classifier = SVC(kernel='linear')
classifier.fit(X_train, train_labels_encoded)

# Predictions on the test set
predictions = classifier.predict(X_test)

# Evaluate the model. Passing the encoded ids and the original class names
# explicitly keeps the matrix/report rows in a fixed order (encoded id order)
# and labels the report with the real class names instead of 0/1.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

accuracy = accuracy_score(test_labels_encoded, predictions)
conf_matrix = confusion_matrix(test_labels_encoded, predictions, labels=class_ids)
class_report = classification_report(
    test_labels_encoded,
    predictions,
    labels=class_ids,
    target_names=class_names,
)

# Display the results
print(f'Accuracy: {accuracy:.2f}')
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)


Accuracy: 0.95

Confusion Matrix:
[[1461   57]
 [ 142 2067]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      1518
           1       0.97      0.94      0.95      2209

    accuracy                           0.95      3727
   macro avg       0.94      0.95      0.95      3727
weighted avg       0.95      0.95      0.95      3727

