In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import re
import pickle

In [2]:
def preprocess_text(text):
    """Normalize raw e-mail text before vectorization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is deleted, and runs of whitespace are collapsed to a single
    space (leading/trailing whitespace is removed).

    Args:
        text: The raw e-mail body. ``None`` and float NaN (what pandas uses
            for an empty CSV cell) are treated as an empty e-mail; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``''`` when the input is missing or contains no
        letters.
    """
    if text is None:
        return ''
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if isinstance(text, float) and text != text:
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers (deleted, not replaced by a space)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

In [3]:
# Location of the labelled e-mail dataset.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to edit it.
DATA_PATH = "C:/Users/M Amruth Sai/Downloads/extension 2/Phishing_validation_emails.csv"

# Numeric encoding of the target column.
LABEL_MAP = {'Safe Email': 0, 'Phishing Email': 1}

# Keep the raw frame untouched so this cell can be re-run safely.
raw_df = pd.read_csv(DATA_PATH)

# Clean the data
df = (
    raw_df
    # Rows without text or label cannot be used for training.
    .dropna(subset=['Email Text', 'Email Type'])
    .assign(**{
        # astype(str) guards against non-string cells reaching preprocess_text.
        'Email Text': lambda frame: frame['Email Text'].astype(str).apply(preprocess_text),
        'Email Type': lambda frame: frame['Email Type'].map(LABEL_MAP),
    })
    # .map() turns any label missing from LABEL_MAP into NaN; drop those rows
    # so the target column never contains NaN.
    .dropna(subset=['Email Type'])
    .astype({'Email Type': 'int64'})
)

# 3. Split features and target
X = df['Email Text']
y = df['Email Type']

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# TF-IDF features: the vocabulary is capped at 5000 entries and English stop
# words are removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# The vectorizer is fitted on the training split only; the test split is
# merely transformed with that fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Random forest with 100 trees; the seed is fixed so training is repeatable.
rf_classifier = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Train on the TF-IDF features of the training split.
rf_classifier.fit(X_train_tfidf, y_train)

In [7]:
# Predict labels for the TF-IDF features of the held-out test split.
y_pred = rf_classifier.predict(X_test_tfidf)

In [8]:
# Pin the label order explicitly so the report rows and the matrix
# rows/columns always line up with the display names, even when one class is
# absent from the evaluated split (without `labels`, classification_report
# raises if the number of observed classes differs from len(target_names)).
CLASS_LABELS = [0, 1]
CLASS_NAMES = ['Safe Email', 'Phishing Email']

print("Classification Report:")
print(classification_report(y_test, y_pred, labels=CLASS_LABELS, target_names=CLASS_NAMES))

# Rows are true labels, columns are predicted labels, both in CLASS_LABELS order.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=CLASS_LABELS))

Classification Report:
                precision    recall  f1-score   support

    Safe Email       1.00      1.00      1.00       211
Phishing Email       1.00      1.00      1.00       189

      accuracy                           1.00       400
     macro avg       1.00      1.00      1.00       400
  weighted avg       1.00      1.00      1.00       400


Confusion Matrix:
[[211   0]
 [  0 189]]


In [9]:
def predict_email(email_text, model=None, vectorizer=None):
    """Classify a single e-mail as safe or phishing.

    Args:
        email_text: Raw e-mail body; it is cleaned with ``preprocess_text``
            before being vectorized.
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
            Defaults to the notebook-level ``rf_classifier``.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``tfidf``.

    Returns:
        dict with two keys:
            ``'prediction'``: ``'Safe Email'`` when the predicted class is 0,
            otherwise ``'Phishing Email'``.
            ``'confidence'``: probability of the predicted class as a
            percentage (plain ``float``).
    """
    model = rf_classifier if model is None else model
    vectorizer = tfidf if vectorizer is None else vectorizer

    # Preprocess the input
    processed_text = preprocess_text(email_text)
    # Transform using the fitted TF-IDF vectorizer
    text_tfidf = vectorizer.transform([processed_text])

    # One pass through the model: the predicted class is the one with the
    # highest probability, so a separate predict() call is not needed.
    probabilities = model.predict_proba(text_tfidf)[0]
    best_index = int(probabilities.argmax())
    predicted_class = model.classes_[best_index]

    return {
        'prediction': 'Safe Email' if predicted_class == 0 else 'Phishing Email',
        'confidence': float(probabilities[best_index]) * 100
    }

In [10]:
# Persist the trained classifier and the fitted vectorizer as separate pickle
# files in the current working directory.
for artifact_path, artifact in (
    ('email_classifier.pkl', rf_classifier),
    ('tfidf_vectorizer.pkl', tfidf),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

In [17]:
# Demo: run the classifier on hand-written sample e-mails.
# NOTE(review): the `__name__` guard is presumably there so the demo is skipped
# if this code is exported and imported as a module — confirm.
if __name__ == "__main__":
    # Sample e-mails to classify; each entry is one raw e-mail body
    # (currently a single hosting-upgrade promotion).
    test_emails = [
        '''

As a free member of our services we can provide you a exclusive discount coupon which can be used when upgrading:

SUPERSSD Coupon Code

This coupon code will get you for a limited time a huge 20% off all premium hosting plans!

What are the benefits of Premium Hosting ?
- Unlimited daily hits.
- Super fast SSD powered premium servers.
- Unlimited disk space and traffic.
- Upto 21 free domain names !
- SMTP/IMAP/POP email accounts.
- Premium cPanel control panel.
- Choice of PHP versions from 5.2 to PHP 5.6.
- All PHP functions enabled / fully featured server.
- Free migration of free account data to premium,
- 100% satifaction guaranteed or we will refund.
- 100's of extra features in the premium cPanel.

So if your looking for more features, a faster server, free domains or just an awesome permanent home for your hobby or business website, our premium hosting is the ideal place for you: Running on our blazingly fast SSD (solid state drive) powered servers! 


''',
        
    ]
    
    # Echo each sample, then the label and confidence percentage returned by
    # predict_email (formatted to two decimals).
    print("\nPredictions for test emails:")
    for email in test_emails:
        result = predict_email(email)
        print(f"Email: {email}")
        print(f"Prediction: {result['prediction']}")
        print(f"Confidence: {result['confidence']:.2f}%\n")


Predictions for test emails:
Email: 

As a free member of our services we can provide you a exclusive discount coupon which can be used when upgrading:

SUPERSSD Coupon Code

This coupon code will get you for a limited time a huge 20% off all premium hosting plans!

What are the benefits of Premium Hosting ?
- Unlimited daily hits.
- Super fast SSD powered premium servers.
- Unlimited disk space and traffic.
- Upto 21 free domain names !
- SMTP/IMAP/POP email accounts.
- Premium cPanel control panel.
- Choice of PHP versions from 5.2 to PHP 5.6.
- All PHP functions enabled / fully featured server.
- Free migration of free account data to premium,
- 100% satifaction guaranteed or we will refund.
- 100's of extra features in the premium cPanel.

So if your looking for more features, a faster server, free domains or just an awesome permanent home for your hobby or business website, our premium hosting is the ideal place for you: Running on our blazingly fast SSD (solid state drive) power