# Import

In [None]:
import mailbox
import email

import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')
import nltk.sentiment as sentiment                          #from nltk.sentiment import SentimentIntensityAnalyzer

# import gensim
import gensim.parsing.preprocessing as gs_preprocessing     #from gensim.parsing.preprocessing import *
import gensim.corpora as corpora
import gensim.models as models                              #from gensim.models import Word2Vec,LdaMulticore
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# import sklearn
import sklearn.feature_extraction.text as text              #from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.model_selection as model_selection           #from sklearn.model_selection import RandomizedSearchCV, train_test_split
import sklearn.preprocessing as sk_preprocessing            #from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sklearn.ensemble as ensemble
import sklearn.metrics as metrics                           #from sklearn.metrics import accuracy_score, precision_score, recall_score


import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Data preprocessing

## Extract Email

### Functions: Extract email from .mbox file and .eml file


In [None]:
def extract_emails_from_mbox(mbox_file_name):
  """Load every message stored in an mbox file.

  Args:
    mbox_file_name: Path of the .mbox file to read.

  Returns:
    A list with one ``mailbox.mboxMessage`` per stored email, or an empty
    list when the file does not exist (a notice is printed in that case).
  """
  try:
    # create=False: with the default (create=True) a wrong path silently
    # creates an empty mailbox, so a typo yields zero messages, no warning,
    # and a stray file on disk.
    mbox = mailbox.mbox(mbox_file_name, create=False)
  except (FileNotFoundError, mailbox.NoSuchMailboxError):
    print(f"File not found: {mbox_file_name}")
    return []
  try:
    # Read all messages into memory before the file handle is released.
    return list(mbox)
  finally:
    mbox.close()


def extract_email_from_eml(eml_file_name):
  """Parse a single .eml file into an email message object.

  Args:
    eml_file_name: Path of the .eml file to read.

  Returns:
    The parsed message, or None when the file is missing or cannot be
    parsed (a notice is printed in either case).
  """
  email_message = None
  try:
    # Read as bytes: an email may contain 8-bit data in any charset, and a
    # text-mode open would try to decode the whole file with the platform
    # default encoding and fail on anything else.
    with open(eml_file_name, "rb") as email_file:
      email_message = email.message_from_binary_file(email_file)
  except FileNotFoundError:
    print(f"File not found: {eml_file_name}")
  except Exception as e:
    # Best-effort loader: report the problem and fall through to return None.
    print(f"An error occurred: {e}")
  return email_message

#From:https://stackoverflow.com/questions/7166922/extracting-the-body-of-an-email-from-mbox-file-decoding-it-to-plain-text-regard

### Functions: Extract component of email

In [None]:
def extract_sender_email(email_message):
  """Return the value of the message's "From" header, or None if absent."""
  sender = email_message.get("From")
  return sender


def extract_subject_email(email_message):
  """Return the value of the message's "Subject" header, or None if absent."""
  subject = email_message.get("Subject")
  return subject


def extract_content_email(email_message):
  """Return the plain-text body of an email as a str.

  For a multipart message the last non-container ``text/plain`` part found
  by ``walk()`` is used; for a single-part message its payload is used
  whatever its content type.

  Args:
    email_message: A parsed ``email.message.Message`` (or subclass).

  Returns:
    The decoded body text, or None when no usable payload exists.
  """
  if email_message.is_multipart():
    # walk() already descends into nested multiparts, so a single pass
    # visits every leaf; no inner walk is needed.
    text_part = None
    for part in email_message.walk():
      if not part.is_multipart() and part.get_content_type() == "text/plain":
        text_part = part
  else:
    text_part = email_message

  if text_part is None:
    return None
  payload = text_part.get_payload(decode=True)
  if payload is None:
    return None

  # Convert the byte string to str. Prefer the charset the part declares,
  # then strict UTF-8 (covers mislabelled mail), and only as a last resort
  # decode lossily so one odd byte cannot abort a whole extraction run.
  for charset in (text_part.get_content_charset(), "utf-8"):
    if not charset:
      continue
    try:
      return payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
      continue
  return payload.decode("utf-8", errors="replace")

### Process: Extract phishing emails

In [None]:
# Load the mailbox used as the phishing class and keep every non-blank
# plain-text body.
phishing_messages = extract_emails_from_mbox("/content/emails-enron-legal-mails.mbox")

phishing_message_bodies = []
for message in phishing_messages:
  body = extract_content_email(message)
  if body is None or not body.strip():
    continue
  phishing_message_bodies.append(body)

# Kept bodies vs. messages read: a gap means some emails had no usable text.
print(len(phishing_message_bodies))
print(len(phishing_messages))

4279
4279


### Process: Extract benign emails

In [None]:
# Load the mailbox used as the benign class and keep every non-blank
# plain-text body.
benign_messages = extract_emails_from_mbox("/content/emails-enron-ham.mbox")

benign_message_bodies = []
for message in benign_messages:
  body = extract_content_email(message)
  if body is None or not body.strip():
    continue
  benign_message_bodies.append(body)

# Kept bodies vs. messages read: a gap means some emails had no usable text.
print(len(benign_message_bodies))
print(len(benign_messages))

0
0


### ===TEST===

In [None]:
# Smoke test: parse one .eml file and, if it loaded, print its sender,
# subject and plain-text body.
msg = extract_email_from_eml("/content/sample1.eml")
# NOTE(review): this path has no "/content" prefix, unlike the other mbox
# paths in this notebook, and `messages` does not appear to be read in the
# visible cells — confirm whether this line is still needed.
messages = extract_emails_from_mbox("/emails-enron-legal-mails.mbox")
if(msg):
  print(extract_sender_email(msg))
  print(extract_subject_email(msg))
  print(extract_content_email(msg))

File not found: /content/sample1.eml


In [None]:
print(phishing_messages[0])
print(phishing_message_bodies[0])

Message-ID: <27151276.1075857703081.JavaMail.evans@thyme>
Date: Thu, 28 Dec 2000 16:37:00 -0800 (PST)
From: phillip.allen@enron.com
To: john.lavorato@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=ANSI_X3.4-1968
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Lavorato, John </o=ENRON/ou=NA/cn=Recipients/cn=Jlavora>
X-cc: 
X-bcc: 
X-Folder: \jlavora\COMP
X-Origin: Lavorado-J
X-FileName: jlavora.pst

?	Pay well ? and pay for performance (with bonuses based on merit, not entitlement).




Just a soundbite from a PRC email.  I am getting worried about Mike G. and myself.  Are you open to more discussions?

Phillip


?	Pay well ? and pay for performance (with bonuses based on merit, not entitlement).




Just a soundbite from a PRC email.  I am getting worried about Mike G. and myself.  Are you open to more discussions?

Phillip




In [None]:
print(benign_messages[0])
print(benign_message_bodies[0])

## Simple preprocessing

### Custom stop words and preprocessing filters

In [None]:
# Stop-word list: NLTK's English list plus markup, URL and corpus-specific
# tokens that should not reach the models.
stopWords = nltk.corpus.stopwords.words("english")
stopWords += [
    "nbsp", "font", "sans", "serif", "bold", "arial", "verdana", "helvetica",
    "http", "https", "www", "html", "enron", "margin", "spamassassin",
]


def remove_custom_stopwords(p):
    """Return string *p* with every word listed in ``stopWords`` removed."""
    cleaned = gs_preprocessing.remove_stopwords(p, stopwords=stopWords)
    return cleaned


# Filter chain handed to gensim's preprocess_string; filters run in order.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    gs_preprocessing.strip_tags,
    gs_preprocessing.strip_punctuation,
    gs_preprocessing.strip_multiple_whitespaces,
    gs_preprocessing.strip_numeric,
    remove_custom_stopwords,
    gs_preprocessing.remove_stopwords,
    gs_preprocessing.strip_short,
    gs_preprocessing.stem_text,
]


In [None]:
def custom_preprocessing(messages):
  """Run each message through the CUSTOM_FILTERS chain.

  Returns one token list per input message, in input order. Empty results
  and duplicates are kept on purpose: de-duplication is only needed for the
  blacklist and is done separately.
  """
  return [
      gs_preprocessing.preprocess_string(message, filters=CUSTOM_FILTERS)
      for message in messages
  ]

def duplicate_filter(texts):
    """Remove empty elements and duplicates, keeping first occurrences in order.

    Args:
        texts: Iterable of items to filter; strings and token lists are both
            supported.

    Returns:
        A new list holding each distinct, non-empty item once.
    """
    unique_texts = []
    seen = set()
    for item in texts:
        if not item:
            continue
        # Lists are unhashable, so track them through a tuple snapshot; the
        # `list` tag stops a list from colliding with an equal-content tuple.
        key = (list, tuple(item)) if isinstance(item, list) else item
        try:
            if key in seen:
                continue
            seen.add(key)
        except TypeError:
            # Still unhashable (e.g. nested lists): fall back to the linear
            # equality scan so the result stays the same.
            if item in unique_texts:
                continue
        unique_texts.append(item)
    return unique_texts


### Preprocess messages

In [None]:
# Tokenise the phishing bodies with the shared filter chain.
phishing_preprocessed = custom_preprocessing(phishing_message_bodies)

print(len(phishing_preprocessed))

4279


In [None]:
# Tokenise the benign bodies with the shared filter chain.
benign_preprocessed = custom_preprocessing(benign_message_bodies)

print(len(benign_preprocessed))

0


### ===TEST===

In [None]:
#Test
print(len(phishing_message_bodies))
print(len(duplicate_filter(phishing_message_bodies)))
print(len(phishing_preprocessed))
print(len(duplicate_filter(phishing_preprocessed)))

print(phishing_preprocessed[0])

4279
4246
4279
4088
['pai', 'pai', 'perform', 'bonus', 'base', 'merit', 'entitl', 'soundbit', 'prc', 'email', 'get', 'worri', 'mike', 'open', 'discuss', 'phillip']


# Word2Vec Embedding


In [None]:
all_message_preprocessed = phishing_preprocessed + benign_preprocessed

print(len(all_message_preprocessed))

4279


In [None]:
# Train the model on all messages
word2vec_model = models.Word2Vec(all_message_preprocessed, vector_size=100, min_count=1, workers=3, window=5)
#From: https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/

In [None]:
word2vec_model.wv.most_similar("dollar", topn=20)

[('billion', 0.9507895112037659),
 ('win', 0.9110546112060547),
 ('ubid', 0.903036892414093),
 ('maximum', 0.8959468603134155),
 ('tend', 0.8940055966377258),
 ('quantiti', 0.8918537497520447),
 ('supersit', 0.8888633251190186),
 ('defici', 0.8846665024757385),
 ('prioriti', 0.8796712160110474),
 ('half', 0.8794583082199097),
 ('auction', 0.8794552087783813),
 ('size', 0.8770312666893005),
 ('dissent', 0.8762837648391724),
 ('entri', 0.8752008080482483),
 ('dcq', 0.8745139837265015),
 ('store', 0.8734740018844604),
 ('majeur', 0.8722878098487854),
 ('grab', 0.8720036149024963),
 ('shorter', 0.8716081380844116),
 ('furthermor', 0.8704047203063965)]

In [None]:
word2vec_model.wv["dollar"]

# LDA Topic Modeling

## Init the number of topics

In [None]:
numTopics = 1024

## Create dictionary and corpus

In [None]:
dictionary = corpora.Dictionary(all_message_preprocessed)
corpus = [dictionary.doc2bow(text) for text in all_message_preprocessed]

In [None]:
print(dictionary)
print(corpus[0])

Dictionary<31032 unique tokens: ['base', 'bonus', 'discuss', 'email', 'entitl']...>
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]


## Create LDA model

In [None]:
LDA_model = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=numTopics)



In [None]:
#Test
# Print keyword for the topics
print(LDA_model.print_topics())

[(803, '0.011*"com" + 0.010*"mail" + 0.010*"messag" + 0.009*"offic" + 0.009*"box" + 0.008*"power" + 0.008*"energi" + 0.007*"ferc" + 0.007*"compani" + 0.007*"gener"'), (363, '0.010*"com" + 0.008*"ee" + 0.007*"hou" + 0.005*"gener" + 0.005*"mail" + 0.004*"power" + 0.004*"leagu" + 0.004*"fantasi" + 0.004*"basketbal" + 0.003*"updat"'), (744, '0.024*"com" + 0.024*"mari" + 0.019*"rate" + 0.018*"file" + 0.018*"lft" + 0.017*"negoti" + 0.017*"subject" + 0.014*"dai" + 0.014*"know" + 0.013*"darveaux"'), (657, '0.018*"com" + 0.012*"need" + 0.009*"agreement" + 0.009*"continu" + 0.008*"issu" + 0.008*"power" + 0.007*"rick" + 0.006*"number" + 0.006*"state" + 0.006*"peopl"'), (579, '0.094*"com" + 0.008*"hotmail" + 0.007*"scott" + 0.006*"susan" + 0.005*"john" + 0.005*"pipelin" + 0.005*"lisa" + 0.005*"laura" + 0.005*"iso" + 0.004*"project"'), (421, '0.078*"com" + 0.006*"price" + 0.006*"richard" + 0.005*"new" + 0.005*"power" + 0.005*"robert" + 0.004*"access" + 0.004*"john" + 0.004*"mari" + 0.004*"plan"'), 

# Doc2Vec

In [None]:
tagged_data = [models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(all_message_preprocessed)]

In [None]:
# Initialise and train in a single call
doc2vec_model = models.Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, workers=4)

# Alternative: separate initialisation and training
# doc2vec_model = models.Doc2Vec(tagged_data, vector_size=100, min_count=1, epochs=10)
# doc2vec_model.build_vocab(tagged_data)
# doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Classification

In [None]:
all_message_bodies = phishing_message_bodies + benign_message_bodies

## Blacklist words

In [None]:
def get_file_lines(file_path):
  """Read a text file and return its lines with surrounding whitespace stripped.

  Blank lines are kept as empty strings. When the file does not exist a
  notice is printed and an empty list is returned.
  """
  try:
    with open(file_path, 'r') as handle:
      return [raw_line.strip() for raw_line in handle]
  except FileNotFoundError:
    print(f"File not found: {file_path}")
    return []

In [None]:
black_list_words = get_file_lines("/content/spam_wordlist.txt")
black_list = custom_preprocessing(black_list_words)
black_list = duplicate_filter(black_list)

print(len(black_list_words))
print(len(black_list))

582
369


## TF-IDF

In [None]:
#TF-IDF
max_term=6

In [None]:
def list_to_string(lst):
    """Join the strings in *lst* into one space-separated string."""
    separator = " "
    return separator.join(lst)

def count_all_upper_words(text):
    """Count the whitespace-separated words of *text* for which ``str.isupper()`` is true."""
    return sum(1 for token in text.split() if token.isupper())

In [None]:
tfidfVectorizer = text.TfidfVectorizer(max_features=max_term, preprocessor=list_to_string, sublinear_tf=True)
tfidf_matrix = tfidfVectorizer.fit_transform(all_message_preprocessed).toarray()

print(tfidfVectorizer.get_feature_names_out())
print(tfidf_matrix.shape)
print(tfidf_matrix[2])

['com' 'content' 'ect' 'hou' 'mail' 'subject']
(4279, 6)
[0. 0. 0. 0. 0. 0.]


## Vector

In [2]:
def create_vectors_from_messages(messages, messages_preprocessed):
    """Build one numeric feature vector per message.

    Feature layout (the order is unchanged from the previous version):
      1. ``numTopics`` LDA topic probabilities, indexed by topic id
      2. the Doc2Vec vector inferred for the token list
      3. the sentiment polarity scores, in the order the analyzer returns them
      4. 1/0 flag: body contains "<html>"
      5. 1/0 flag: body contains "http://" or "https://"
      6. one count per ``black_list`` entry (how many of its words occur)
      7. TF-IDF weights for the vectorizer's terms
      8. all-uppercase word count, "!" count, character length, token count

    Args:
        messages: Raw message bodies (strings).
        messages_preprocessed: Token lists aligned index-for-index with
            ``messages``.

    Returns:
        A list of feature vectors (lists of numbers), one per message.
    """
    if len(messages) == 0:
        return []

    # Loop-invariant: build the analyzer once instead of once per message.
    sia = sentiment.SentimentIntensityAnalyzer()

    # Use the fitted vectorizer on the messages actually passed in. Indexing
    # the global tfidf_matrix by position is only right when the input is the
    # exact corpus the vectorizer was fitted on.
    tfidf_rows = tfidfVectorizer.transform(messages_preprocessed).toarray()

    all_vectors = []
    for raw, tokens, tfidf_row in zip(messages, messages_preprocessed, tfidf_rows):
        # Topics: place each probability at its topic id so the columns stay
        # aligned even if the model returns fewer than numTopics pairs
        # (assumption: very low-probability topics can be omitted — TODO confirm).
        doc_topics = LDA_model.get_document_topics(
            dictionary.doc2bow(tokens), minimum_probability=0.0)
        vec = [0.0] * numTopics
        for topic_id, probability in doc_topics:
            vec[topic_id] = probability

        # Doc2Vec
        vec.extend(doc2vec_model.infer_vector(tokens))

        # Sentiment analysis of polarity
        polarity = sia.polarity_scores(" ".join(tokens))
        vec.extend(polarity.values())

        lowered = raw.lower()

        # Contains HTML
        vec.append(1 if "<html>" in lowered else 0)

        # Contains a link
        vec.append(1 if "http://" in lowered or "https://" in lowered else 0)

        # How many blacklisted phrases/words appear in this email
        token_set = set(tokens)
        for phrase in black_list:
            vec.append(sum(1 for word in phrase if word in token_set))

        # TF-IDF for top terms
        vec.extend(tfidf_row)

        # Number of all-caps words
        vec.append(count_all_upper_words(raw))

        # Number of exclamation marks
        vec.append(raw.count("!"))

        # Total length
        vec.append(len(raw))

        # Num words
        vec.append(len(tokens))

        all_vectors.append(vec)

    return all_vectors

In [None]:
# Build the feature matrix through create_vectors_from_messages instead of
# repeating its body inline, so the two code paths cannot drift apart.
# TODO: the inline version marked the Doc2Vec features as "need to fix".
all_vectors = create_vectors_from_messages(all_message_bodies, all_message_preprocessed)


In [None]:
print(np.array(all_vectors).shape)
print(all_vectors[0])

In [None]:
# Labels in the same order as the concatenated messages:
# 1 for each phishing message, then 0 for each benign message.
all_labels = [1] * len(phishing_preprocessed) + [0] * len(benign_preprocessed)

In [None]:
print(len(all_labels))

4279


In [None]:
# Split and scale data.
# Split first, then fit the scaler on the training portion only: fitting it
# on all vectors lets test-set statistics leak into training and makes the
# reported scores optimistic.
X_train_raw, X_test_raw, y_train, y_test = model_selection.train_test_split(
    all_vectors, all_labels, test_size=0.2, shuffle=True)

#scaler = MinMaxScaler()
scaler = sk_preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


## Random Forest

In [None]:
rf = ensemble.RandomForestClassifier()
#rf = make_pipeline(StandardScaler(), RandomForestClassifier())
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)
rfc_accuracy = metrics.accuracy_score(y_test, y_pred)
rfc_precision = metrics.precision_score(y_test, y_pred)
rfc_recall = metrics.recall_score(y_test, y_pred)

print("Accuracy:", rfc_accuracy)
print("Precision:", rfc_precision)
print("Recall:", rfc_recall)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [None]:
cm = metrics.confusion_matrix(y_test, y_pred)
metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot()

## SVC - Support Vector Classifier