# Imports

In [1]:
import mailbox
import email

# import gensim
import gensim.parsing.preprocessing as preprocessing

import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Data preprocessing

## Extract Email

### Functions: Extract emails from .mbox and .eml files


In [2]:
def extract_emails_from_mbox(mbox_file_name):
  """Load every message stored in an mbox archive.

  Args:
    mbox_file_name: Path to the .mbox file.

  Returns:
    A list with one message object per entry of the archive, in file order.
    The list is empty when the archive does not exist; in that case a
    "File not found" notice is printed and no file is created.
  """
  messages = []
  mbox = None
  try:
    # create=False matters: with the default (create=True) a missing path is
    # silently turned into a new, empty mailbox file, so a wrong path would
    # look like "0 messages" instead of being reported.
    mbox = mailbox.mbox(mbox_file_name, create=False)
    for message in mbox:
      messages.append(message)
  except (FileNotFoundError, mailbox.NoSuchMailboxError):
    print(f"File not found: {mbox_file_name}")
  finally:
    # Release the underlying file handle; the messages collected above are
    # already parsed and stay usable after the mailbox is closed.
    if mbox is not None:
      mbox.close()
  return messages


def extract_email_from_eml(eml_file_name):
  """Parse a single .eml file into an email message object.

  The file is read as bytes so that the email parser, not the platform's
  default text encoding, deals with whatever charset the message uses.

  Args:
    eml_file_name: Path to the .eml file.

  Returns:
    The parsed message, or None when the file is missing or could not be
    read/parsed (a notice is printed in both cases).
  """
  email_message = None
  try:
    with open(eml_file_name, "rb") as email_file:
      email_message = email.message_from_binary_file(email_file)
  except FileNotFoundError:
    print(f"File not found: {eml_file_name}")
  except Exception as e:
    # Best effort by design: report the problem and fall through to None.
    print(f"An error occurred: {e}")
  return email_message

# Adapted from: https://stackoverflow.com/questions/7166922/extracting-the-body-of-an-email-from-mbox-file-decoding-it-to-plain-text-regard

### Functions: Extract components of an email

In [3]:
def extract_sender_email(email_message):
  """Look up the "From" header of the message via its ``get`` method.

  Returns whatever ``email_message.get("From")`` yields, unmodified.
  """
  sender = email_message.get("From")
  return sender


def extract_subject_email(email_message):
  """Look up the "Subject" header of the message via its ``get`` method.

  Returns whatever ``email_message.get("Subject")`` yields, unmodified.
  """
  subject = email_message.get("Subject")
  return subject


def _decode_email_payload(payload, declared_charset):
  """Convert payload bytes to text without raising on unexpected bytes."""
  # Try the charset the part declares first, then UTF-8 (the previous
  # hard-coded behaviour), both strictly.
  for charset in (declared_charset, "utf-8"):
    if not charset:
      continue
    try:
      return payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
      # Unknown charset name, or bytes that do not match it: try the next one.
      continue
  # Last resort: keep the readable part and mark undecodable bytes.
  return payload.decode("utf-8", errors="replace")


def extract_content_email(email_message):
  """Return the plain-text body of an email as a string.

  For a multipart message the payload of the last "text/plain" part (in
  ``walk()`` order) is used; other parts, e.g. "text/html", are ignored.
  For a non-multipart message the payload is used whatever its content type.

  Args:
    email_message: A parsed email message object.

  Returns:
    The decoded body text, or None when no suitable payload was found.
  """
  body = None
  charset = None
  if email_message.is_multipart():
    # walk() already descends into nested multipart containers, so a single
    # pass visits every leaf part; no inner walk is needed.
    for part in email_message.walk():
      if part.is_multipart():
        continue
      if part.get_content_type() == "text/plain":
        body = part.get_payload(decode=True)
        charset = part.get_content_charset()
  else:
    body = email_message.get_payload(decode=True)
    charset = email_message.get_content_charset()
  if body is None:
    return None
  # Convert the byte string to text, honouring the declared charset instead of
  # assuming UTF-8 (which raised UnicodeDecodeError on e.g. Latin-1 emails).
  return _decode_email_payload(body, charset)

### Process: extract phishing emails

In [10]:
# Collect the non-blank plain-text body of every message in the archive.
phishing_message_bodies = []

phishing_messages = extract_emails_from_mbox("/content/emails-enron-legal-mails.mbox")
for message in phishing_messages:
  body = extract_content_email(message)
  if body is None or not body.strip():
    continue  # nothing usable was extracted for this message
  phishing_message_bodies.append(body)

# Kept bodies first, then total messages read, one count per line.
print(len(phishing_message_bodies), len(phishing_messages), sep="\n")

4279
4279


### Process: extract benign emails

In [9]:
# Collect the non-blank plain-text body of every message in the archive.
benign_message_bodies = []

benign_messages = extract_emails_from_mbox("/content/emails-enron-ham.mbox")
for message in benign_messages:
  body = extract_content_email(message)
  if body is None or not body.strip():
    continue  # nothing usable was extracted for this message
  benign_message_bodies.append(body)

# Kept bodies first, then total messages read, one count per line.
print(len(benign_message_bodies), len(benign_messages), sep="\n")

0
0


### Check

In [None]:
# Show the first loaded message, then the first extracted body.
for collection in (phishing_messages, phishing_message_bodies):
  print(collection[0])

In [None]:
# Show the first loaded message, then the first extracted body.
for collection in (benign_messages, benign_message_bodies):
  print(collection[0])

## Simple preprocessing

In [12]:
# Custom stop words and preprocessing filters

# NLTK's English stop-word list plus extra tokens to drop (they look like
# markup/URL residue and corpus names).
stopWords = nltk.corpus.stopwords.words("english") + [
    "nbsp", "font", "sans", "serif", "bold", "arial", "verdana", "helvetica",
    "http", "https", "www", "html", "enron", "margin", "spamassassin",
]

def remove_custom_stopwords(p):
    """Remove every word of the module-level ``stopWords`` list from text ``p``.

    Args:
        p: The text to filter.

    Returns:
        The result of gensim's ``remove_stopwords`` for ``p`` using the custom list.
    """
    # Hand gensim a set instead of the list: it tests each token for membership,
    # which is a constant-time lookup on a set but a linear scan on a list.
    # Built per call so later changes to stopWords are still picked up.
    return preprocessing.remove_stopwords(p, stopwords=frozenset(stopWords))

# Filters are applied in list order by preprocessing.preprocess_string.
CUSTOM_FILTERS = [
    lambda x: x.lower(),                         # lower-case the text
    preprocessing.strip_tags,
    preprocessing.strip_punctuation,
    preprocessing.strip_multiple_whitespaces,
    preprocessing.strip_numeric,
    remove_custom_stopwords,                     # custom stopWords list
    preprocessing.remove_stopwords,              # called without a custom list
    preprocessing.strip_short,
    preprocessing.stem_text,
]


In [13]:
#process phishing emails
# Run every phishing body through the CUSTOM_FILTERS pipeline.
phishing_preprocessed = [
  preprocessing.preprocess_string(message, filters=CUSTOM_FILTERS)
  for message in phishing_message_bodies
]

print(len(phishing_preprocessed))

4279


In [14]:
#process benign emails
# Run every benign body through the CUSTOM_FILTERS pipeline.
benign_preprocessed = [
  preprocessing.preprocess_string(message, filters=CUSTOM_FILTERS)
  for message in benign_message_bodies
]

print(len(benign_preprocessed))

0


In [None]:
# Spot check: print the preprocessing result for the second phishing body (index 1).
print(phishing_preprocessed[1])

# TEST

In [None]:
# Smoke test: parse one .eml file and one mbox archive, then print the sender,
# subject and plain-text body of the parsed .eml message.
msg = extract_email_from_eml("/content/sample1.eml")
# Uses the same /content location as the archive loaded earlier in the
# notebook; the path was previously missing the "/content" prefix.
messages = extract_emails_from_mbox("/content/emails-enron-legal-mails.mbox")
# Compare with None explicitly: email messages define __len__ as their header
# count, so a parsed message without headers would be falsy under `if msg`.
if msg is not None:
  print(extract_sender_email(msg))
  print(extract_subject_email(msg))
  print(extract_content_email(msg))

File not found: /content/sample1.eml
