In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import os
from time import sleep
nltk.download('stopwords')

#preprocess Phishing_Email.csv for Naive Bayes, Decision Tree, SVC
def cleanDataNaiveDecisionSVC(data):
  """Clean the email text and build a class-balanced train/test split.

  Steps: drop NaN rows, duplicate rows and rows whose 'Email Text' is the
  literal string 'empty'; normalise the text (see _cleanEmailText); remove
  English stopwords; drop rows whose text became empty; downsample the
  larger class so both classes have the same number of rows; put the first
  half of each class into the training set and the rest into the test set.

  Args:
    data: DataFrame with an 'Email Text' column and an 'Email Type' column
      whose values include 'Safe Email' and 'Phishing Email'.

  Returns:
    A list [training_df, test_df].

  Side effects:
    Writes data.csv, train_data_NDS_shuffle.csv, train_data_NDS.csv and
    test_data_NDS.csv to the current working directory.
  """
  #remove all duplicates and empty lines; copy so the column assignments
  #below work on an independent frame (avoids SettingWithCopyWarning)
  data = data.dropna()
  data = data.drop_duplicates()
  data = data[data['Email Text'] != 'empty'].copy()

  #remove html tags, urls, special characters, newlines and extra spaces
  data['Email Text'] = data['Email Text'].apply(_cleanEmailText)

  #remove all stopwords
  stop_words = set(stopwords.words('english'))
  data['Email Text'] = data['Email Text'].apply(
      lambda x: ' '.join(word for word in x.split() if word not in stop_words))

  #round-trip through a csv: texts that are now empty strings are read back
  #as NaN, so the dropna below removes them (to_csv has finished writing by
  #the time it returns, so no wait is needed before reading)
  data.to_csv('data.csv', index=False)
  data = pd.read_csv("data.csv")
  data = data.dropna()

  #split data into safe and phishing groups
  safe_df = data[data["Email Type"] == 'Safe Email']
  phishing_df = data[data["Email Type"] == 'Phishing Email']

  #even out the size of each group by dropping a seeded random sample of
  #the surplus rows from the larger one
  surplus = len(safe_df) - len(phishing_df)
  if surplus > 0:
    safe_df = safe_df.drop(safe_df.sample(n=surplus, random_state=42).index)
  elif surplus < 0:
    phishing_df = phishing_df.drop(phishing_df.sample(n=-surplus, random_state=42).index)

  #first half of each group -> training, second half -> testing
  #(rows keep their file order here; only the *_shuffle csv is shuffled)
  safe_midpoint = len(safe_df) // 2
  phishing_midpoint = len(phishing_df) // 2

  training_df = pd.concat([safe_df.iloc[:safe_midpoint], phishing_df.iloc[:phishing_midpoint]])
  test_df = pd.concat([safe_df.iloc[safe_midpoint:], phishing_df.iloc[phishing_midpoint:]])

  #create csv and return array of training and test dataset
  training_df_shuffle = training_df.sample(frac=1, random_state=42).reset_index(drop=True)
  training_df_shuffle.to_csv('train_data_NDS_shuffle.csv', index=False)
  training_df.to_csv('train_data_NDS.csv', index=False)
  test_df.to_csv('test_data_NDS.csv', index=False)

  return [training_df, test_df]


#normalise one email body for the bag-of-words models
def _cleanEmailText(text):
  """Return text lower-cased and reduced to space-separated letter-only words.

  HTML tags are removed first: the later letters-only substitution replaces
  '<' and '>' with spaces, so running the tag pattern after it would never
  match and tag names would be left behind as ordinary words.
  """
  text = re.sub(r'<.*?>', " ", text)                     #html tags
  text = re.sub(r'http\S+|www\S+|https\S+', " ", text)   #urls
  text = re.sub(r'\S{15,}', " ", text)                   #tokens of 15+ non-space chars
  text = re.sub(r'[^a-zA-Z\s]', " ", text)               #anything that is not a letter or whitespace
  text = re.sub(r'\s+', ' ', text).strip()               #newlines and repeated spaces
  return text.lower()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import os
from time import sleep
nltk.download('stopwords')

#preprocess Phishing_Email.csv for BERT
def cleanDataBert(data):
  """Build balanced, reproducible train/test csv files for BERT.

  Drops NaN rows, duplicate rows and rows whose 'Email Text' is the literal
  string 'empty', then samples 6972 rows of each email type and splits each
  sample in half, so the training and test sets each hold 3486 safe and
  3486 phishing emails. The text itself is not modified.

  Args:
    data: DataFrame with an 'Email Text' column and an 'Email Type' column
      whose values include 'Safe Email' and 'Phishing Email'.

  Returns:
    None.

  Raises:
    ValueError: raised by DataFrame.sample when either type has fewer than
      6972 rows left after filtering.

  Side effects:
    Writes train_data_BERT.csv and test_data_BERT.csv to the current
    working directory.
  """
  data = data.dropna()
  data = data.drop_duplicates()
  data = data[data['Email Text'] != 'empty']

  #get 6972 of each type; halving them below leaves 3486 per type per split
  safe_df = data[data["Email Type"] == 'Safe Email'].sample(n=6972, random_state=42)
  phishing_df = data[data["Email Type"] == 'Phishing Email'].sample(n=6972, random_state=42)

  safe_midpoint = len(safe_df) // 2
  phishing_midpoint = len(phishing_df) // 2

  #combine the groups together
  train_df = pd.concat([safe_df.iloc[:safe_midpoint], phishing_df.iloc[:phishing_midpoint]])
  test_df = pd.concat([safe_df.iloc[safe_midpoint:], phishing_df.iloc[phishing_midpoint:]])

  #shuffle both with a fixed seed (like every other sample in this notebook)
  #so re-running produces identical files, then reset the index
  train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
  test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

  #turn to csv
  train_df.to_csv("train_data_BERT.csv", index=False)
  test_df.to_csv("test_data_BERT.csv", index=False)


#run the BERT preprocessing on the raw csv as soon as this cell executes
cleanDataBert(pd.read_csv("Phishing_Email.csv"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import os
from time import sleep
nltk.download('stopwords')

#preprocess Phishing_Email.csv for OpenAI
def cleanDataOpenAI(data):
  """Write a balanced, shuffled evaluation csv for the OpenAI model.

  Rows containing NaN, duplicate rows and rows whose 'Email Text' is the
  literal string 'empty' are discarded. From what remains, 3486 rows of
  each email type are sampled with a fixed seed, stacked (safe first,
  phishing second), shuffled with the same seed and saved to
  test_data_openai.csv without the index. Nothing is returned.
  """
  usable = data.dropna().drop_duplicates()
  usable = usable[usable['Email Text'] != 'empty']

  #3486 rows per type, matching the size of the other test sets
  per_type_samples = [
      usable[usable["Email Type"] == email_type].sample(n=3486, random_state=42)
      for email_type in ('Safe Email', 'Phishing Email')
  ]

  #stack the two samples, shuffle the rows and renumber them
  stacked = pd.concat(per_type_samples)
  shuffled = stacked.sample(frac=1, random_state=42)
  shuffled = shuffled.reset_index(drop=True)

  shuffled.to_csv('test_data_openai.csv', index=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import pandas as pd

def main():
  """Read Phishing_Email.csv once and run all three preprocessing routines on it."""
  raw_emails = pd.read_csv("Phishing_Email.csv")

  #each routine writes its own csv files; the frame itself is reused as-is
  cleanDataNaiveDecisionSVC(raw_emails)
  cleanDataBert(raw_emails)
  cleanDataOpenAI(raw_emails)

  print("Done")


#only run the pipeline when executed directly, not when imported
if __name__ == "__main__":
  main()


Done
