In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
import joblib

#Baseline Naive Bayes used in analysis
def naiveBayesMultinomial():
    """Train and evaluate a Multinomial Naive Bayes email classifier.

    Reads 'train_data_NDS.csv' and 'test_data_NDS.csv', turns the
    "Email Text" column into TF-IDF features (CountVectorizer followed by
    TfidfTransformer, both fitted on the training file only), fits a
    MultinomialNB model on the training file and prints the confusion
    matrix, classification report, accuracy, precision and recall computed
    on the test file. The fitted model and CountVectorizer are then written
    to disk with joblib.

    Returns:
        The fitted MultinomialNB model.
    """
    print("Naive Bayes Multinomial\n")
    train_df = pd.read_csv('train_data_NDS.csv')
    test_df = pd.read_csv('test_data_NDS.csv')

    # Encode the target as 0 = safe, 1 = phishing.
    label_map = {'Safe Email': 0, 'Phishing Email': 1}
    X = train_df["Email Text"]
    X2 = test_df["Email Text"]
    Y = train_df["Email Type"].map(label_map)
    Y2 = test_df["Email Type"].map(label_map)

    # Fit the feature pipeline on the training text only.
    vectorizer = CountVectorizer()
    X_vec = vectorizer.fit_transform(X)

    tfidf = TfidfTransformer()
    X_tfidf = tfidf.fit_transform(X_vec)

    model = MultinomialNB()
    model.fit(X_tfidf, Y)

    # Re-use the already fitted transformers for the test text (transform, not fit).
    X_test_vec = vectorizer.transform(X2)
    X_test_tfidf = tfidf.transform(X_test_vec)

    Y_pred = model.predict(X_test_tfidf)

    # All metrics compare test predictions against the TEST labels (Y2).
    # The report previously used the training labels (Y), which made it
    # disagree with the confusion matrix and the scalar scores.
    conf_matrix = confusion_matrix(Y2, Y_pred)
    report = classification_report(Y2, Y_pred, target_names=['Safe Email', 'Phishing Email'])

    accuracy = metrics.accuracy_score(Y2, Y_pred)
    precision = metrics.precision_score(Y2, Y_pred)
    recall = metrics.recall_score(Y2, Y_pred)

    print(conf_matrix)
    print(report)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)

    print("\n-----------------\n")

    # NOTE(review): only the model and the CountVectorizer are persisted; the
    # fitted TfidfTransformer is not saved, so these two files alone do not
    # reproduce the feature pipeline the model was trained on.
    joblib.dump(model, 'naive_bayes_multinomial_model.pkl')
    joblib.dump(vectorizer, 'naive_bayes_multinomial_vectorizer.pkl')

    return model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
import joblib

def naiveBayesMultinomialShuffle():
    """Train Multinomial Naive Bayes on the shuffled training file and evaluate it.

    Reads 'train_data_NDS_shuffle.csv' for training and 'test_data_NDS.csv'
    for evaluation, turns the "Email Text" column into TF-IDF features
    (CountVectorizer followed by TfidfTransformer, both fitted on the training
    file only), fits a MultinomialNB model and prints the confusion matrix,
    classification report, accuracy, precision and recall computed on the
    test file. The fitted model and CountVectorizer are then written to disk
    with joblib.

    Returns:
        The fitted MultinomialNB model.
    """
    print("Naive Bayes Mulitnomial Shuffle\n")
    train_df = pd.read_csv('train_data_NDS_shuffle.csv')
    test_df = pd.read_csv('test_data_NDS.csv')

    # Encode the target as 0 = safe, 1 = phishing.
    label_map = {'Safe Email': 0, 'Phishing Email': 1}
    X = train_df["Email Text"]
    X2 = test_df["Email Text"]
    Y = train_df["Email Type"].map(label_map)
    Y2 = test_df["Email Type"].map(label_map)

    # Fit the feature pipeline on the training text only.
    vectorizer = CountVectorizer()
    X_vec = vectorizer.fit_transform(X)

    tfidf = TfidfTransformer()
    X_tfidf = tfidf.fit_transform(X_vec)

    model = MultinomialNB()
    model.fit(X_tfidf, Y)

    # Re-use the already fitted transformers for the test text (transform, not fit).
    X_test_vec = vectorizer.transform(X2)
    X_test_tfidf = tfidf.transform(X_test_vec)

    Y_pred = model.predict(X_test_tfidf)

    # All metrics compare test predictions against the TEST labels (Y2).
    # The report previously used the (shuffled) training labels (Y), so it
    # showed chance-level scores while accuracy/precision/recall were correct.
    conf_matrix = confusion_matrix(Y2, Y_pred)
    report = classification_report(Y2, Y_pred, target_names=['Safe Email', 'Phishing Email'])

    accuracy = metrics.accuracy_score(Y2, Y_pred)
    precision = metrics.precision_score(Y2, Y_pred)
    recall = metrics.recall_score(Y2, Y_pred)

    print(conf_matrix)
    print(report)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)

    print("\n-----------------\n")

    # NOTE(review): only the model and the CountVectorizer are persisted; the
    # fitted TfidfTransformer is not saved, so these two files alone do not
    # reproduce the feature pipeline the model was trained on.
    joblib.dump(model, 'naive_bayes_model_multinomial_shuffle.pkl')
    joblib.dump(vectorizer, 'naive_bayes_vectorizer_multinomial_shuffle.pkl')

    return model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
import joblib

def naiveBayesComplement():
    """Train and evaluate a Complement Naive Bayes email classifier.

    Reads 'train_data_NDS.csv' and 'test_data_NDS.csv', turns the
    "Email Text" column into TF-IDF features (CountVectorizer followed by
    TfidfTransformer, both fitted on the training file only), fits a
    ComplementNB model on the training file and prints the confusion matrix,
    classification report, accuracy, precision and recall computed on the
    test file. The fitted model and CountVectorizer are then written to disk
    with joblib.

    Returns:
        The fitted ComplementNB model.
    """
    print("Naive Bayes Complement\n")
    train_df = pd.read_csv('train_data_NDS.csv')
    test_df = pd.read_csv('test_data_NDS.csv')

    # Encode the target as 0 = safe, 1 = phishing.
    label_map = {'Safe Email': 0, 'Phishing Email': 1}
    X = train_df["Email Text"]
    X2 = test_df["Email Text"]
    Y = train_df["Email Type"].map(label_map)
    Y2 = test_df["Email Type"].map(label_map)

    # Fit the feature pipeline on the training text only.
    vectorizer = CountVectorizer()
    X_vec = vectorizer.fit_transform(X)

    tfidf = TfidfTransformer()
    X_tfidf = tfidf.fit_transform(X_vec)

    model = ComplementNB()
    model.fit(X_tfidf, Y)

    # Re-use the already fitted transformers for the test text (transform, not fit).
    X_test_vec = vectorizer.transform(X2)
    X_test_tfidf = tfidf.transform(X_test_vec)

    Y_pred = model.predict(X_test_tfidf)

    # All metrics compare test predictions against the TEST labels (Y2).
    # The report previously used the training labels (Y), which made it
    # disagree with the confusion matrix and the scalar scores.
    conf_matrix = confusion_matrix(Y2, Y_pred)
    report = classification_report(Y2, Y_pred, target_names=['Safe Email', 'Phishing Email'])

    accuracy = metrics.accuracy_score(Y2, Y_pred)
    precision = metrics.precision_score(Y2, Y_pred)
    recall = metrics.recall_score(Y2, Y_pred)

    print(conf_matrix)
    print(report)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)

    print("\n-----------------\n")

    # NOTE(review): only the model and the CountVectorizer are persisted; the
    # fitted TfidfTransformer is not saved, so these two files alone do not
    # reproduce the feature pipeline the model was trained on.
    joblib.dump(model, 'naive_bayes_complement_model.pkl')
    joblib.dump(vectorizer, 'naive_bayes_complement_vectorizer.pkl')

    return model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
import joblib

def naiveBayesBernoulli():
    """Train and evaluate a Bernoulli Naive Bayes email classifier.

    Reads 'train_data_NDS.csv' and 'test_data_NDS.csv', turns the
    "Email Text" column into TF-IDF features (CountVectorizer followed by
    TfidfTransformer, both fitted on the training file only), fits a
    BernoulliNB model on the training file and prints the confusion matrix,
    classification report, accuracy, precision and recall computed on the
    test file. The fitted model and CountVectorizer are then written to disk
    with joblib.

    Returns:
        The fitted BernoulliNB model.
    """
    print("Naive Bayes Bernoulli\n")
    train_df = pd.read_csv('train_data_NDS.csv')
    test_df = pd.read_csv('test_data_NDS.csv')

    # Encode the target as 0 = safe, 1 = phishing.
    label_map = {'Safe Email': 0, 'Phishing Email': 1}
    X = train_df["Email Text"]
    X2 = test_df["Email Text"]
    Y = train_df["Email Type"].map(label_map)
    Y2 = test_df["Email Type"].map(label_map)

    # Fit the feature pipeline on the training text only.
    vectorizer = CountVectorizer()
    X_vec = vectorizer.fit_transform(X)

    tfidf = TfidfTransformer()
    X_tfidf = tfidf.fit_transform(X_vec)

    model = BernoulliNB()
    model.fit(X_tfidf, Y)

    # Re-use the already fitted transformers for the test text (transform, not fit).
    X_test_vec = vectorizer.transform(X2)
    X_test_tfidf = tfidf.transform(X_test_vec)

    Y_pred = model.predict(X_test_tfidf)

    # All metrics compare test predictions against the TEST labels (Y2).
    # The report previously used the training labels (Y), which made it
    # disagree with the confusion matrix and the scalar scores.
    conf_matrix = confusion_matrix(Y2, Y_pred)
    report = classification_report(Y2, Y_pred, target_names=['Safe Email', 'Phishing Email'])

    accuracy = metrics.accuracy_score(Y2, Y_pred)
    precision = metrics.precision_score(Y2, Y_pred)
    recall = metrics.recall_score(Y2, Y_pred)

    print(conf_matrix)
    print(report)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)

    print("\n-----------------\n")

    # NOTE(review): only the model and the CountVectorizer are persisted; the
    # fitted TfidfTransformer is not saved, so these two files alone do not
    # reproduce the feature pipeline the model was trained on.
    joblib.dump(model, 'naive_bayes_bernoulli_model.pkl')
    joblib.dump(vectorizer, 'naive_bayes_bernoulli_vectorizer.pkl')

    return model

In [None]:
def main():
  """Run every Naive Bayes experiment in this notebook, one after another."""
  experiments = (
    naiveBayesMultinomial,
    naiveBayesMultinomialShuffle,
    naiveBayesComplement,
    naiveBayesBernoulli,
  )
  # Each experiment prints its own metrics; the returned models are discarded.
  for run_experiment in experiments:
    run_experiment()

# Entry point: run all experiments when this code is executed as the main module.
if __name__ == "__main__":
  main()

Naive Bayes Multinomial

[[3390   92]
 [ 101 3381]]
                precision    recall  f1-score   support

    Safe Email       0.97      0.97      0.97      3482
Phishing Email       0.97      0.97      0.97      3482

      accuracy                           0.97      6964
     macro avg       0.97      0.97      0.97      6964
  weighted avg       0.97      0.97      0.97      6964

Accuracy:  0.9722860425043078
Precision:  0.9735099337748344
Recall:  0.9709936817920736

-----------------

Naive Bayes Mulitnomial Shuffle

[[3390   92]
 [ 101 3381]]
                precision    recall  f1-score   support

    Safe Email       0.51      0.51      0.51      3482
Phishing Email       0.51      0.51      0.51      3482

      accuracy                           0.51      6964
     macro avg       0.51      0.51      0.51      6964
  weighted avg       0.51      0.51      0.51      6964

Accuracy:  0.9722860425043078
Precision:  0.9735099337748344
Recall:  0.9709936817920736

-----------