In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import os
from time import sleep
nltk.download('stopwords')

#preprocess Phishing_Email.csv
def cleanData(text):
  """Normalize raw email text into a lowercase string without stop words.

  Cleaning steps, in order:
    1. replace HTML tags with a space
    2. replace URLs (tokens starting with http/https/www) with a space
    3. replace any run of 15 or more non-space characters with a space
    4. replace every character that is not an ASCII letter or whitespace
    5. collapse whitespace, strip, and lowercase
    6. drop NLTK English stop words

  Args:
    text: raw email body. Non-string values (for example None, or the float
      NaN that pandas yields for an empty CSV cell) are treated as empty.

  Returns:
    The cleaned text as a single space-separated string; "" when nothing
    survives the cleaning.

  NOTE(review): tag stripping used to run after step 4, where it could never
  match because '<' and '>' had already been replaced, so tag names leaked
  through as tokens. If the saved vectorizers were fitted on text cleaned the
  old way, consider re-fitting them -- TODO confirm.
  """
  #non-string input cannot be passed to re.sub; treat it as an empty email
  if not isinstance(text, str):
    return ""

  stop_words = set(stopwords.words('english'))

  #tags must go first: the later steps would destroy the angle brackets, and
  #the URL pattern would otherwise swallow text glued to a closing tag
  text = re.sub(r'<.*?>', " ", text)
  #matches a backslash immediately followed by a real newline; must run while
  #backslashes still exist. NOTE(review): a literal two-character "\n"
  #sequence is not matched by this pattern -- confirm which was intended
  text = re.sub(r'\\\n', " ", text)
  text = re.sub(r'http\S+|www\S+|https\S+', " ", text, flags=re.MULTILINE)
  text = re.sub(r'\S{15,}', " ", text)
  text = re.sub(r'[^a-zA-Z\s]', " ", text)
  text = re.sub(r'\s+', " ", text).strip()
  text = text.lower()
  text = ' '.join(word for word in text.split() if word not in stop_words)

  return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
!pip install openai
from openai import OpenAI
import openai
import pandas as pd
import re
from time import sleep
import numpy as np
import json

#openAI api phishing analysis
def openAITest(text):
  """Ask the gpt-4o-mini chat model whether an email is phishing.

  The API key is read from the 'key' field of OpenAIAPIKey.json in the
  current working directory on every call.

  Args:
    text: the email body; it is inserted verbatim into the prompt.

  Returns:
    1 if the first '0'/'1' character in the model's reply is '1' (phishing),
    0 if it is '0' (safe).

  Raises:
    ValueError: the reply contained neither '0' nor '1' (previously this
      surfaced as a bare IndexError, or a TypeError when the reply was None).
    Errors from opening/parsing the key file and from the OpenAI client
    propagate unchanged.
  """
  #get api key
  input_file = 'OpenAIAPIKey.json'
  with open(input_file, 'r') as file:
    api_key = json.load(file)['key']

  #set up api connection
  client = OpenAI(api_key=api_key)

  email = text

  completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": f"Determine if this email is phishing or safe. If an email is phishing respond with a 1, if it is safe respond with a 0.  Do not use any words. Here it is: {email}"}
    ]
  )
  #the SDK types message.content as optional; treat a missing reply as empty
  reply = completion.choices[0].message.content or ""

  #the first 0/1 digit anywhere in the reply is taken as the verdict
  for char in reply:
    if char == '1':
      return 1
    if char == '0':
      return 0

  raise ValueError(f"OpenAI reply contained no 0/1 verdict: {reply!r}")



In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

def load_model_and_predict(email_text):
    """Classify an email with the fine-tuned DistilBERT checkpoint.

    The model and tokenizer are loaded from ./distilbert-phishing-detection
    on every call. The text is tokenized with truncation at 512 tokens and
    the index of the largest logit is returned as a Python scalar.

    Args:
        email_text: the email body to classify (presumably a single string;
            `.item()` below requires exactly one prediction -- TODO confirm).

    Returns:
        The predicted class index; votingModel treats 1 as phishing.
    """
    # Weights and tokenizer come from the same checkpoint directory.
    classifier = DistilBertForSequenceClassification.from_pretrained('./distilbert-phishing-detection')
    bert_tokenizer = DistilBertTokenizer.from_pretrained('./distilbert-phishing-detection')

    # Encode as PyTorch tensors, cutting anything past 512 tokens.
    encoded = bert_tokenizer(
        email_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only: no gradient tracking, model in eval mode.
    with torch.no_grad():
        classifier.eval()
        scores = classifier(**encoded).logits

    # Highest-scoring class along the label dimension.
    return torch.argmax(scores, dim=1).item()

In [None]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

def votingModel(text_preprocessed, text):
  """Run five phishing classifiers on one email and print the majority vote.

  The three scikit-learn models (naive Bayes, decision tree, SVC) and their
  vectorizers are loaded from .pkl files in the working directory and receive
  the preprocessed text. DistilBERT and the OpenAI model receive the raw text.
  Each voter's verdict is printed, then the overall verdict. With five voters
  there is never a tie; the email is called phishing only when phishing votes
  outnumber safe votes.

  Args:
    text_preprocessed: the cleaned email text fed to the vectorizers.
    text: the raw email text fed to DistilBERT and OpenAI.

  Returns:
    None; all results are printed.
  """
  #load every pickled model/vectorizer pair before predicting anything
  #(joblib.load unpickles, so these files must come from a trusted source)
  nb_clf = joblib.load('naive_bayes_multinomial_model.pkl')
  nb_vec = joblib.load('naive_bayes_multinomial_vectorizer.pkl')

  tree_clf = joblib.load('decision_tree_model.pkl')
  tree_vec = joblib.load('decision_tree_vectorizer.pkl')

  svm_clf = joblib.load('SVC_model.pkl')
  svm_vec = joblib.load('SVC_vectorizer.pkl')

  #collect all five predictions first, in the same order as they are reported
  nb_vote = nb_clf.predict(nb_vec.transform([text_preprocessed]))
  tree_vote = tree_clf.predict(tree_vec.transform([text_preprocessed]))
  svm_vote = svm_clf.predict(svm_vec.transform([text_preprocessed]))
  bert_vote = load_model_and_predict(text)
  llm_vote = openAITest(text)

  phishing_votes = 0
  safe_votes = 0

  def record(flagged, phishing_msg, safe_msg):
    #print one voter's verdict and add it to the running totals
    nonlocal phishing_votes, safe_votes
    if flagged:
      print(phishing_msg)
      phishing_votes += 1
    else:
      print(safe_msg)
      safe_votes += 1

  #a prediction equal to 1 counts as a phishing vote, anything else as safe
  record(nb_vote[0] == 1, "naive bayes phishing", "naive bayes safe")
  record(tree_vote[0] == 1, "decision tree phishing", "decision tree safe")
  record(svm_vote[0] == 1, "svc phishing", "svc safe")
  record(bert_vote == 1, "distilbert phishing", "distilbert safe")
  record(llm_vote == 1, "openai phishing", "openai safe")

  #strict majority decides
  verdict = (
    "This email is predicted to be phishing."
    if phishing_votes > safe_votes
    else "This email is predicted to be safe."
  )
  print(verdict)




In [None]:
def main(text=""):
  """Classify one email with the voting ensemble and print the result.

  Args:
    text: the raw email text to classify. Defaults to "" (the value that was
      previously hard-coded), which only prints the invalid-input message.

  Returns:
    None; the outcome is printed.
  """
  #sample inputs that can be passed as `text`:
  #  "get your m 3 ds here - admix articulatory elastomer bogey condescend bohr slough affiance chasm thong dispensable johnsen coven dock hourglass ocean ciliate memphis personify"
  #  "re : driscoll ranch # 3 gas pricing and interconnect estimate"
  text_preprocessed = cleanData(text)

  #if the email is empty (or only whitespace) after preprocessing do not run
  if not text_preprocessed.strip():
    print("invalid input, the email does not have enough information")
  else:
    votingModel(str(text_preprocessed), str(text))

#run the demo; called with no email text, cleaning yields an empty string, so
#only the "invalid input" message is printed and no model is invoked
main()

naive bayes safe
decision tree safe
svc safe
distilbert safe
openai safe
This email is predicted to be safe.
