# Import

In [None]:
import mailbox
import email

import nltk
nltk.download('stopwords')

# import gensim
import gensim.parsing.preprocessing as preprocessing
import gensim.corpora as corpora
import gensim.models as models
# from gensim.models import Word2Vec
# from gensim.models import LdaMulticore


import sklearn.feature_extraction.text as text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data preprocessing

## Extract Email

### Functions: Extract email from .mbox file and .eml file


In [None]:
def extract_emails_from_mbox(mbox_file_name):
  """Load every message stored in an mbox file.

  Args:
    mbox_file_name: Path of the ``.mbox`` file to read.

  Returns:
    A list of ``mailbox.mboxMessage`` objects in file order. The list is
    empty when the file does not exist (a notice is printed in that case).
  """
  messages = []
  try:
    # create=False: by default mailbox.mbox silently creates an empty file for
    # a missing path, which hides typos and never reaches the handler below.
    mbox = mailbox.mbox(mbox_file_name, create=False)
  except (FileNotFoundError, mailbox.NoSuchMailboxError):
    print(f"File not found: {mbox_file_name}")
    return messages
  try:
    for message in mbox:
      messages.append(message)
  finally:
    # Release the underlying file handle once every message has been read.
    mbox.close()
  return messages


def extract_email_from_eml(eml_file_name):
  """Parse a single ``.eml`` file into an ``email.message.Message``.

  Args:
    eml_file_name: Path of the ``.eml`` file to read.

  Returns:
    The parsed message, or ``None`` when the file is missing or cannot be
    read (a notice is printed in both cases).
  """
  email_message = None
  try:
    # Read raw bytes: an .eml file can carry bodies in any charset, so decoding
    # the whole file as text with the platform default codec may fail.
    with open(eml_file_name, "rb") as email_file:
      email_message = email.message_from_binary_file(email_file)
  except FileNotFoundError:
    print(f"File not found: {eml_file_name}")
  except Exception as e:
    # Deliberate best-effort: any other failure is reported and None returned.
    print(f"An error occurred: {e}")
  return email_message

# From: https://stackoverflow.com/questions/7166922/extracting-the-body-of-an-email-from-mbox-file-decoding-it-to-plain-text-regard

### Functions: Extract component of email

In [None]:
def extract_sender_email(email_message):
  """Return the value of the ``From`` header, or ``None`` if it is absent."""
  sender = email_message.get("From")
  return sender


def extract_subject_email(email_message):
  """Return the value of the ``Subject`` header, or ``None`` if it is absent."""
  subject = email_message.get("Subject")
  return subject


def extract_content_email(email_message):
  """Return the text body of an e-mail as a ``str``.

  For a multipart message the last ``text/plain`` leaf part (in document
  order) is used. For a single-part message the payload is used whatever its
  content type is.

  Args:
    email_message: An ``email.message.Message`` (or subclass) instance.

  Returns:
    The decoded body text, or ``None`` when the message has no usable payload
    (e.g. a multipart message without any ``text/plain`` part).
  """
  body_part = None
  if email_message.is_multipart():
    # walk() already visits nested parts depth-first, so one pass is enough.
    for part in email_message.walk():
      if not part.is_multipart() and part.get_content_type() == "text/plain":
        body_part = part
  else:
    body_part = email_message

  if body_part is None:
    return None

  # decode=True undoes the Content-Transfer-Encoding and yields bytes.
  body = body_part.get_payload(decode=True)
  if body is None:
    return None

  # Convert the byte string to str using the charset the part declares,
  # falling back to UTF-8; undecodable bytes are replaced instead of raising.
  charset = body_part.get_content_charset() or "utf-8"
  try:
    return body.decode(charset, errors="replace")
  except LookupError:
    # The declared charset is unknown to Python.
    return body.decode("utf-8", errors="replace")

### Process: Extract phishing emails

In [None]:
# Read the mbox used as the phishing set and keep the text body of every
# message that has non-blank content.
# NOTE(review): the file is named "enron-legal-mails" yet feeds the phishing
# set — confirm this is the intended source.
phishing_messages = extract_emails_from_mbox("/content/emails-enron-legal-mails.mbox")
phishing_message_bodies = []
for message in phishing_messages:
  body = extract_content_email(message)
  if body is None or not body.strip():
    continue  # nothing usable in this message
  phishing_message_bodies.append(body)

# Number of bodies kept, then number of messages read.
print(len(phishing_message_bodies))
print(len(phishing_messages))

4279
4279


### Process: Extract benign emails

In [None]:
# Read the mbox used as the benign set and keep the text body of every
# message that has non-blank content.
# NOTE(review): the recorded output of this cell is 0 / 0, i.e. no message was
# read from this path — verify the file exists before relying on the results.
benign_messages = extract_emails_from_mbox("/content/emails-enron-ham.mbox")
benign_message_bodies = []
for message in benign_messages:
  body = extract_content_email(message)
  if body is None or not body.strip():
    continue  # nothing usable in this message
  benign_message_bodies.append(body)

# Number of bodies kept, then number of messages read.
print(len(benign_message_bodies))
print(len(benign_messages))

0
0


### ===TEST===

In [None]:
# Smoke test: parse one sample .eml file and show its sender, subject and body.
msg = extract_email_from_eml("/content/sample1.eml")
# Uses the same /content path as the phishing extraction cell.
messages = extract_emails_from_mbox("/content/emails-enron-legal-mails.mbox")
# Compare with None explicitly: a Message's truthiness depends on its header
# count, so a parsed message without headers would otherwise be skipped.
if msg is not None:
  print(extract_sender_email(msg))
  print(extract_subject_email(msg))
  print(extract_content_email(msg))

In [None]:
# Show the first phishing message and its extracted body. Guarded so that an
# empty data set produces a notice instead of an IndexError.
if phishing_messages and phishing_message_bodies:
  print(phishing_messages[0])
  print(phishing_message_bodies[0])
else:
  print("No phishing messages loaded")

In [None]:
# Show the first benign message and its extracted body. Guarded so that an
# empty data set produces a notice instead of an IndexError.
if benign_messages and benign_message_bodies:
  print(benign_messages[0])
  print(benign_message_bodies[0])
else:
  print("No benign messages loaded")

## Simple preprocessing

### Custom stop words and preprocessing filters

In [None]:
# Custom stop words and preprocessing filters
# Stop-word list: NLTK's English stop words followed by extra tokens
# (markup/font names, URL fragments, corpus-specific words) to discard.
stopWords = nltk.corpus.stopwords.words("english") + [
    "nbsp", "font", "sans", "serif", "bold", "arial", "verdana", "helvetica",
    "http", "https", "www", "html", "enron", "margin", "spamassassin",
]

def remove_custom_stopwords(p):
    """Run gensim's ``remove_stopwords`` on *p* using the ``stopWords`` list."""
    cleaned = preprocessing.remove_stopwords(p, stopwords=stopWords)
    return cleaned

# Filters applied in this order by preprocess_string().
CUSTOM_FILTERS = [
    lambda s: s.lower(),
    preprocessing.strip_tags,
    preprocessing.strip_punctuation,
    preprocessing.strip_multiple_whitespaces,
    preprocessing.strip_numeric,
    remove_custom_stopwords,
    preprocessing.remove_stopwords,
    preprocessing.strip_short,
    preprocessing.stem_text,
]


In [None]:
def custom_preprocessing(messages):
  """Tokenise *messages* with ``CUSTOM_FILTERS``, dropping empty and duplicate results.

  Args:
    messages: Iterable of raw text strings.

  Returns:
    A list of token lists in first-seen order. A message whose token list is
    empty, or equal to one already kept, is omitted — so the result can be
    shorter than *messages* and is not index-aligned with it.
  """
  preprocessed_messages = []
  # Token tuples already kept: constant-time duplicate check instead of
  # scanning the whole result list for every message.
  seen = set()
  for message in messages:
    preprocessed = preprocessing.preprocess_string(message, filters=CUSTOM_FILTERS)
    key = tuple(preprocessed)
    if preprocessed and key not in seen:
      seen.add(key)
      preprocessed_messages.append(preprocessed)

  return preprocessed_messages

### Preprocess messages

In [None]:
# Tokenise the phishing message bodies; custom_preprocessing() omits empty and
# duplicate token lists, so the count printed below can be lower than the
# number of bodies.
phishing_preprocessed = custom_preprocessing(phishing_message_bodies)

print(len(phishing_preprocessed))

4088


In [None]:
# Tokenise the benign message bodies; custom_preprocessing() omits empty and
# duplicate token lists, so the count printed below can be lower than the
# number of bodies.
benign_preprocessed = custom_preprocessing(benign_message_bodies)

print(len(benign_preprocessed))

0


### ===TEST===

In [None]:
# Test: print the token list of the second preprocessed phishing message.
print(phishing_preprocessed[1])

['trade', 'profit', 'allen', 'grigsbi', 'rest', 'desk', 'total', 'view', 'bonu', 'partli', 'attribut', 'trade', 'partli', 'group', 'perform', 'thought', 'minimum', 'market', 'maximum', 'cash', 'equiti', 'mike', 'number', 'adjust', 'minimum', 'market', 'maximum', 'cash', 'equiti', 'given', 'expect', 'speech', 'point', 'phillip']


# Word2Vec Embedding


In [None]:
# Fit a Word2Vec embedding on the token lists of both classes together.
model = models.Word2Vec(
    sentences=phishing_preprocessed + benign_preprocessed,
    vector_size=100,
    window=5,
    min_count=1,
    workers=3,
)

#From: https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/

In [None]:
# Query the 20 vocabulary entries most similar to "dollar" in the trained embedding.
model.wv.most_similar("dollar", topn=20)

In [None]:
# Look up the embedding vector stored for the token "dollar".
model.wv["dollar"]

# LDA Topic Modeling

## Init the number of topics

In [None]:
# Number of topics the LDA model is trained with (passed as num_topics) and
# the number of topic features emitted per message.
numTopics = 1024

## Create dictionary and corpus

In [None]:
# Token lists of both classes in one list: phishing first, then benign.
all_message_preprocessed = phishing_preprocessed + benign_preprocessed

In [None]:
# Build the token dictionary, then encode every token list as a bag of words.
dictionary = corpora.Dictionary(all_message_preprocessed)
corpus = list(map(dictionary.doc2bow, all_message_preprocessed))

##  Create LDA model

In [None]:
# Train LDA on the bag-of-words corpus with numTopics topics.
# NOTE(review): no random_state is passed, so results presumably vary between
# runs — confirm whether reproducibility matters here.
LDA_model = models.LdaMulticore(corpus=corpus, num_topics=numTopics, id2word=dictionary)

In [None]:
# Test
# Print the keywords of the topics returned by print_topics()
print(LDA_model.print_topics())

# Doc2Vec

In [None]:
# Wrap every token list in a TaggedDocument tagged with its list position.
tagged_data = [
    models.doc2vec.TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(all_message_preprocessed)
]

In [None]:
# Initialise and train in a single call
doc2vec_model = models.Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4)

# Alternative: separate initialisation from training
# doc2vec_model = models.Doc2Vec(tagged_data, vector_size=100, min_count=1, epochs=10)
# doc2vec_model.build_vocab(tagged_data)
# doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Classification

In [None]:
# Raw (unprocessed) bodies of both classes in one list: phishing first, then benign.
all_message_bodies = phishing_message_bodies + benign_message_bodies

In [None]:
def get_file_lines(file_path):
  """Read a text file and return its lines with surrounding whitespace removed.

  Args:
    file_path: Path of the text file to read.

  Returns:
    One string per line of the file, each stripped of leading/trailing
    whitespace (blank lines are kept as empty strings). An empty list is
    returned, and a notice printed, when the file does not exist.
  """
  try:
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
      return [line.strip() for line in file]
  except FileNotFoundError:
    print(f"File not found: {file_path}")
    return []

In [None]:
# Load the spam word list (one entry per line) and tokenise every entry with
# the same pipeline as the messages; each black_list item is a list of tokens.
black_list_words = get_file_lines("/content/spam_wordlist.txt")
black_list = custom_preprocessing(black_list_words)

# Raw entries read, then entries left after preprocessing.
print(len(black_list_words))
print(len(black_list))

582
369


In [None]:
# TF-IDF: number of terms kept by the vectorizer (passed as max_features).
max_term=6

In [None]:
def list_to_string(lst):
    """Join the strings in *lst* into one string separated by single spaces."""
    separator = ' '
    return separator.join(lst)

def count_all_upper_words(text):
    """Count the whitespace-separated words of *text* for which ``str.isupper()`` is true."""
    return sum(1 for token in text.split() if token.isupper())

In [None]:
# Fit a TF-IDF model limited to max_term features. The documents are token
# lists, so list_to_string is used as the preprocessor to join each list into
# one space-separated string.
tfidfVectorizer = text.TfidfVectorizer(max_features=max_term, preprocessor=list_to_string,sublinear_tf=True)
tfidf_matrix = tfidfVectorizer.fit_transform(all_message_preprocessed).toarray()

# Selected terms, then the (documents, terms) shape of the dense matrix.
print(tfidfVectorizer.get_feature_names_out())
print(tfidf_matrix.shape)

In [None]:
# Build one feature vector per raw e-mail body.
#
# custom_preprocessing() drops empty and duplicate token lists, so
# all_message_preprocessed / corpus / tfidf_matrix are NOT index-aligned with
# all_message_bodies (the recorded outputs show 4279 bodies vs 4088 token
# lists). Every body is therefore re-tokenised here, so that all features of a
# row describe the same e-mail.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # lexicon needed by SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()  # built once, not once per message

# Token list and TF-IDF row of every body, in the order of all_message_bodies.
body_tokens = [preprocessing.preprocess_string(body, filters=CUSTOM_FILTERS)
               for body in all_message_bodies]
body_tfidf = tfidfVectorizer.transform(body_tokens).toarray()

allVectors = []

for body, tokens, tfidf_row in zip(all_message_bodies, body_tokens, body_tfidf):
  # LDA topic distribution. The result is a list of (topic_id, probability)
  # pairs that may leave out very unlikely topics, so it is expanded into a
  # dense list of exactly numTopics values.
  topic_probs = dict(LDA_model.get_document_topics(dictionary.doc2bow(tokens),
                                                   minimum_probability=0.0))
  vec = [topic_probs.get(topic_id, 0.0) for topic_id in range(numTopics)]

  # Doc2Vec embedding inferred for this message
  vec.extend(doc2vec_model.infer_vector(tokens))

  # Sentiment scores of the preprocessed text (one feature per score)
  polarity = sia.polarity_scores(" ".join(tokens))
  vec.extend(polarity.values())

  lowered_body = body.lower()

  # Contains HTML (0/1 flag)
  vec.append(1 if "<html>" in lowered_body else 0)

  # Contains a link (0/1 flag)
  vec.append(1 if "http://" in lowered_body or "https://" in lowered_body else 0)

  # For every blacklisted phrase: how many of its words appear in this email
  token_set = set(tokens)  # constant-time membership tests
  for phrase in black_list:
    vec.append(sum(1 for word in phrase if word in token_set))

  # TF-IDF for top terms
  vec.extend(tfidf_row)

  # Number of all-caps words
  vec.append(count_all_upper_words(body))

  # Number of exclamation marks
  vec.append(body.count("!"))

  # Total length of the raw body in characters
  vec.append(len(body))

  # Number of tokens left after preprocessing
  vec.append(len(tokens))

  allVectors.append(vec)
