<a href="https://colab.research.google.com/github/HarryNguyen30/Secure-Comunication-Password-Manager/blob/main/Phishing_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [79]:
#Install and import requirement libraries
!pip install wget
import wget
import zipfile
import tarfile
import os
import glob
import pandas as pd
import re
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical




In [80]:
#Download dataset from SpamAssassin Public Corpus (https://spamassassin.apache.org/old/publiccorpus/)

#Spam mail
wget.download("https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2", "spam.tar.bz2")
# Unzip with tar
tar = tarfile.open("spam.tar.bz2", "r:bz2")
tar.extractall("spamassassin")
tar.close()

#Ham mail (phishing mail)
wget.download("https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2", "ham.tar.bz2")
# Unzip with tar
tar = tarfile.open("ham.tar.bz2", "r:bz2")
tar.extractall("spamassassin")
tar.close()


In [81]:
import glob

# Load spam and ham file paths
spam_files = glob.glob("spamassassin/spam/*")
ham_files = glob.glob("spamassassin/hard_ham/*")

print(f"Spam: {len(spam_files)} | Ham: {len(ham_files)}")

Spam: 501 | Ham: 251


In [82]:
def load_emails(file_paths):
  emails = []
  for file_path in file_paths:
    try:
      with open(file_path, 'r', encoding='latin-1') as f:
        content = f.read()
        emails.append(content)
    except Exception as e:
      print(f"Error loading file {file_path}: {e}")
  return emails

spam_emails = load_emails(spam_files)
ham_emails = load_emails(ham_files)
print(f"Spam: {len(spam_emails)} | Ham: {len(ham_emails)}")

Spam: 501 | Ham: 251


In [83]:
import pandas as pd

# Create labeled dataset
df_spam = pd.DataFrame({'text': spam_emails, 'label': 1})
df_ham = pd.DataFrame({'text': ham_emails, 'label': 0})

# Combine
df = pd.concat([df_spam, df_ham], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

df.head()

Unnamed: 0,text,label
0,Return-Path: <bounce-lghtml-2534368@sprocket.l...,0
1,From rpm-list-admin@freshrpms.net Wed Oct 9 ...,0
2,From dus@insiq.us Thu Sep 19 11:15:00 2002\nR...,1
3,Return-Path: ler@lerami.lerctr.org\nDelivery-D...,1
4,From firstever001@44yes.onlineisbest.com Mon ...,1


In [84]:

def clean_text(text):
  text = text.lower()
  text = re.sub(r"http\S+", "", text) # remove URLs
  text = re.sub(r"\S+@\S+", "", text)
  text = re.sub(r"<.*?>", "", text)
  text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
  return text

df['clean_text'] = df['text'].apply(clean_text)

df.head()

Unnamed: 0,text,label,clean_text
0,Return-Path: <bounce-lghtml-2534368@sprocket.l...,0,returnpath \nreceived from lockergnomecom spro...
1,From rpm-list-admin@freshrpms.net Wed Oct 9 ...,0,from wed oct 9 105056 2002\nreturnpath \nde...
2,From dus@insiq.us Thu Sep 19 11:15:00 2002\nR...,1,from thu sep 19 111500 2002\nreturnpath \nde...
3,Return-Path: ler@lerami.lerctr.org\nDelivery-D...,1,returnpath \ndeliverydate sat sep 14 132641 20...
4,From firstever001@44yes.onlineisbest.com Mon ...,1,from mon sep 23 183345 2002\nreturnpath \nde...


In [85]:
# Hyperparameters
max_words = 10000 # max number of words to use in the vocabulary
max_len = 1000 # max length of each text (in terms of number of words)
embedding_dim = 100 # dimension of word embeddings
lstm_units = 64 # number of units in the LSTM layer
num_classes = len(set(df['label'])) # number of classes
print(num_classes)

2


In [86]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

Found 58577 unique tokens.


In [87]:
X = pad_sequences(sequences, maxlen=max_len)
y = df['label']
print(y)
y = to_categorical(y)
print(y)

0      0
1      0
2      1
3      1
4      1
      ..
747    1
748    1
749    1
750    1
751    1
Name: label, Length: 752, dtype: int64
[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [88]:
# Build and train LSTM model
model = Sequential()
model.add(Embedding(max_words, embedding_dim))
model.add(LSTM(lstm_units))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=32, epochs=10)

Epoch 1/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - accuracy: 0.7507 - loss: 0.6255
Epoch 2/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.8934 - loss: 0.2548
Epoch 3/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.9873 - loss: 0.0501
Epoch 4/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9991 - loss: 0.0124
Epoch 5/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9946 - loss: 0.0138
Epoch 6/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.9968 - loss: 0.0199
Epoch 7/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.9993 - loss: 0.0115
Epoch 8/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.9997 - loss: 0.0035
Epoch 9/10
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c96803a6590>

In [89]:

# Evaluate the model
loss, accuracy = model.evaluate(X, y, verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")


Loss: 0.0051
Accuracy: 0.9987


In [93]:
def predict_email(text):
  text = clean_text(text)
  sequence = tokenizer.texts_to_sequences([text])
  padded_sequence = pad_sequences(sequence, maxlen=max_len)
  prediction = model.predict(padded_sequence)
  predicted_class = np.argmax(prediction)
  return"Spam" if predicted_class == 1 else "Ham"

In [94]:
# prompt: test model

import numpy as np

# Example prediction (replace with your actual data)
example_text =  """Subject: Urgent Security Alert: Unusual Activity Detected on Your Account

Dear Valued Customer,

We have detected unusual activity on your account that requires your immediate attention. For your security, we have temporarily suspended certain features until you verify your account information.

To restore full access, please click on the following link and follow the on-screen instructions:

[MALICIOUS/FAKE LINK REMOVED - DO NOT INCLUDE A REAL LINK IN A PHISHING SIMULATION]

**Please note:** Ignoring this notification may result in permanent account suspension.

This is an automated message. Please do not reply directly to this email.

Thank you for your cooperation.

Sincerely,

[Fake Company Name - e.g., "Your Bank Security Team", "Online Service Support"]

**Why this is a phishing attempt:**

* **Urgency:** Creates a sense of panic and encourages immediate action.
* **Vague Language:** Mentions "unusual activity" without specific details.
* **Suspension Threat:** Threatens negative consequences for inaction.
* **Suspicious Link:** The provided link (if it were real in a malicious email) would likely lead to a fake login page designed to steal credentials.
* **Generic Greeting:** Uses a general salutation like "Dear Valued Customer."
* **Automated Message Claim:** Discourages direct replies, which is typical of phishing emails.

**Important Note for Testing AI Models:**

When testing AI models for phishing detection, it's crucial to:

* **Avoid using real links or personal information.**
* **Clearly label the email as a "phishing simulation" or "test email."**
* **Focus on the linguistic cues and structural elements that are characteristic of phishing attacks.**

This example is for educational and testing purposes only. Do not use this to attempt to deceive or harm anyone."""

print(predict_email(example_text))
# Preprocess the example text
example_text = clean_text(example_text) # Assuming clean_text function from your code is defined.
example_sequence = tokenizer.texts_to_sequences([example_text])
example_padded = pad_sequences(example_sequence, maxlen=max_len)

# Make prediction
prediction = model.predict(example_padded)
predicted_class = np.argmax(prediction)

# Print the prediction
print(f"Predicted class: {predicted_class}")

# Example prediction with probabilities:
print(f"Prediction probabilities: {prediction[0]}")

# Determine if spam or ham
if predicted_class == 1:
    print("The email is predicted to be spam.")
else:
    print("The email is predicted to be ham (not spam).")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Ham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Predicted class: 0
Prediction probabilities: [0.9887714  0.01122863]
The email is predicted to be ham (not spam).


In [95]:
spam_email_string = """Subject: 🎉 Congratulations! You've Won a Free Vacation! 🎉

Dear [Random Name],

You've been randomly selected as our lucky winner for an all-expenses-paid vacation for two to your dream destination! Imagine relaxing on sunny beaches, exploring exotic cultures, or enjoying thrilling adventures - all absolutely FREE!

To claim your prize, simply click the link below and enter your details:

[SUSPICIOUS/FAKE LINK REMOVED - DO NOT INCLUDE A REAL LINK IN A SPAM SIMULATION]

This is a limited-time offer, so don't miss out on this incredible opportunity! Claim your free vacation today!

Best regards,

The [Fake Company Name - e.g., "Travel Rewards Center", "Global Getaways"] Team

P.S. Share this amazing news with your friends and family!

**Why this is likely spam:**

* **Unsolicited Offer:** You likely didn't enter any contest or sign up for this.
* **Too Good to Be True:** Free vacations are rarely given away without strings attached.
* **Generic Greeting:** Uses a vague salutation like "Dear [Random Name]."
* **Sense of Urgency:** Encourages immediate action with "limited-time offer."
* **Suspicious Link:** The provided link (if real in a spam email) could lead to a website asking for personal information or installing malware.
* **Poor Grammar/Typos (Often Present but not explicitly included here for clarity):** Spam emails sometimes contain grammatical errors.
* **Unfamiliar Sender:** You likely don't recognize the sender or the company.

**Important Note for Testing AI Models:**

When testing AI models for spam detection, it's crucial to:

* **Avoid using real links or personal information.**
* **Clearly label the email as a "spam simulation" or "test email."**
* **Focus on the linguistic cues and structural elements that are characteristic of spam emails, such as exaggerated claims, unsolicited offers, and pressure to act quickly.**

This example is for educational and testing purposes only. Do not use this to send unsolicited emails to others."""


print(predict_email(spam_email_string))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Spam


In [101]:
import joblib
joblib.dump(model, "phishing_model.pkl")
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [102]:
model = joblib.load("phishing_model.pkl")
tokenizer = joblib.load("tokenizer.pkl")