In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
import joblib
from sklearn.metrics import confusion_matrix, classification_report

def decisionTree():
    """Train a decision tree on the NDS training set and score it on the test set.

    Reads ``train_data_NDS.csv`` and ``test_data_NDS.csv`` (paths relative to
    the current working directory), turns the "Email Text" column into
    bag-of-words counts, and encodes the "Email Type" column as
    0 ('Safe Email') / 1 ('Phishing Email').

    Side effects:
        * Prints the confusion matrix, classification report, accuracy,
          precision and recall for the test set.
        * Writes the fitted classifier to ``decision_tree_model.pkl`` and the
          fitted vectorizer to ``decision_tree_vectorizer.pkl``.

    Returns:
        The fitted ``tree.DecisionTreeClassifier``.
    """
    print("Decision Tree\n")
    vectorizer = CountVectorizer()

    data = pd.read_csv('train_data_NDS.csv')
    data2 = pd.read_csv('test_data_NDS.csv')

    # Training split: the vocabulary is learned from the training text only.
    X = data["Email Text"]
    Y = data["Email Type"].map({'Safe Email': 0, 'Phishing Email': 1})
    X_vectorized = vectorizer.fit_transform(X)

    # Test split: reuse the fitted vocabulary (transform, not fit_transform).
    X2 = data2["Email Text"]
    Y2 = data2["Email Type"].map({'Safe Email': 0, 'Phishing Email': 1})
    X2_vectorized = vectorizer.transform(X2)

    decision_tree_model = tree.DecisionTreeClassifier()
    decision_tree_model.fit(X_vectorized, Y)

    predictions = decision_tree_model.predict(X2_vectorized)

    conf_matrix = confusion_matrix(Y2, predictions)
    # The predictions are for the test set, so the report must be computed
    # against the test labels (Y2), not the training labels (Y).
    report = classification_report(Y2, predictions, target_names=['Safe Email', 'Phishing Email'])

    accuracy = metrics.accuracy_score(Y2, predictions)
    precision = metrics.precision_score(Y2, predictions)
    recall = metrics.recall_score(Y2, predictions)

    print(conf_matrix)
    print(report)

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)

    print("\n-----------------\n")

    # Persist both artifacts: the model is only usable together with the
    # vectorizer that produced its feature columns.
    joblib.dump(decision_tree_model, 'decision_tree_model.pkl')
    joblib.dump(vectorizer, 'decision_tree_vectorizer.pkl')

    return decision_tree_model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
import joblib
from sklearn.metrics import confusion_matrix, classification_report

def decisionTreeShuffle():
    """Fit a decision tree on the shuffled NDS training file and evaluate it.

    Loads ``train_data_NDS_shuffle.csv`` for training and ``test_data_NDS.csv``
    for evaluation, vectorizes the "Email Text" column with a
    ``CountVectorizer`` fitted on the training text, and encodes "Email Type"
    as 0 ('Safe Email') / 1 ('Phishing Email').

    Prints the confusion matrix, classification report, accuracy, precision
    and recall for the test set, then saves the classifier to
    ``decision_tree_shuffle_model.pkl`` and the vectorizer to
    ``decision_tree_shuffle_vectorizer.pkl``.

    Returns:
        The fitted ``tree.DecisionTreeClassifier``.
    """
    print("Decision Tree Shuffle\n")
    bag_of_words = CountVectorizer()

    train_frame = pd.read_csv('train_data_NDS_shuffle.csv')
    test_frame = pd.read_csv('test_data_NDS.csv')

    # Insertion order doubles as the class order used for the report names.
    label_codes = {'Safe Email': 0, 'Phishing Email': 1}

    train_text = train_frame["Email Text"]
    train_labels = train_frame["Email Type"].map(label_codes)
    train_counts = bag_of_words.fit_transform(train_text)

    test_text = test_frame["Email Text"]
    test_labels = test_frame["Email Type"].map(label_codes)
    # Only transform here: the vocabulary comes from the training text.
    test_counts = bag_of_words.transform(test_text)

    classifier = tree.DecisionTreeClassifier()
    classifier.fit(train_counts, train_labels)

    predicted = classifier.predict(test_counts)

    # Compute every metric before printing anything.
    matrix = confusion_matrix(test_labels, predicted)
    summary = classification_report(
        test_labels,
        predicted,
        target_names=list(label_codes),
    )
    scalar_scores = [
        ("Accuracy: ", metrics.accuracy_score(test_labels, predicted)),
        ("Precision: ", metrics.precision_score(test_labels, predicted)),
        ("Recall: ", metrics.recall_score(test_labels, predicted)),
    ]

    print(matrix)
    print(summary)

    for caption, value in scalar_scores:
        print(caption, value)

    print("\n-----------------\n")

    # Save the model together with the vectorizer that defines its features.
    joblib.dump(classifier, 'decision_tree_shuffle_model.pkl')
    joblib.dump(bag_of_words, 'decision_tree_shuffle_vectorizer.pkl')

    return classifier

In [None]:
def main():
    """Run both decision-tree experiments in order: plain, then shuffled."""
    decisionTree()
    decisionTreeShuffle()


if __name__ == '__main__':
    main()

Decision Tree

[[3163  319]
 [ 274 3208]]
                precision    recall  f1-score   support

    Safe Email       0.92      0.91      0.91      3482
Phishing Email       0.91      0.92      0.92      3482

      accuracy                           0.91      6964
     macro avg       0.91      0.91      0.91      6964
  weighted avg       0.91      0.91      0.91      6964

Accuracy:  0.9148477886272257
Precision:  0.9095548624893678
Recall:  0.9213095921883975

-----------------

Decision Tree Shuffle

[[3155  327]
 [ 297 3185]]
                precision    recall  f1-score   support

    Safe Email       0.91      0.91      0.91      3482
Phishing Email       0.91      0.91      0.91      3482

      accuracy                           0.91      6964
     macro avg       0.91      0.91      0.91      6964
  weighted avg       0.91      0.91      0.91      6964

Accuracy:  0.9103963239517519
Precision:  0.9068906605922551
Recall:  0.914704192992533

-----------------

