In [1]:
import openpyxl
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
from bs4 import BeautifulSoup
import re
import nltk

def cleanString(myString):
    """Normalise raw message text into a single-spaced string of stemmed words.

    Steps, in order: lowercase; replace URLs with ``httpaddr`` and e-mail
    addresses with ``emailaddr``; strip HTML markup and append one
    ``linktag`` per ``<a>`` element and one ``imgtag`` per ``<img>`` element;
    replace digit runs with ``number``; spell out ``$``, ``!`` and ``?``;
    blank out the remaining punctuation; mark blank lines and line breaks;
    collapse whitespace; stem every word with the English Snowball stemmer.

    Parameters
    ----------
    myString : str
        Raw message text (may contain HTML markup).

    Returns
    -------
    str
        The cleaned text: stemmed words joined by single spaces
        (an empty string when nothing is left after cleaning).
    """

    # convert text to lowercase
    myString = myString.lower()

    # convert URLs to 'httpaddr'
    myString = re.sub(r'(http|https)://[^\s]*', r' httpaddr ', myString)

    # convert email addresses to 'emailaddr'
    myString = re.sub(r'[^\s]+@[^\s]+[.][^\s]+', r' emailaddr ', myString)

    # strip HTML markup, then append one 'linktag' per <a> and one 'imgtag' per <img>
    soup = BeautifulSoup(myString, 'html.parser')
    myString = soup.get_text()
    numberLink = len(soup.find_all('a'))
    numberImg = len(soup.find_all('img'))
    myString = myString + numberLink * " linktag " + numberImg * " imgtag "

    # convert numbers to 'number'
    myString = re.sub(r'[0-9]+', r' number ', myString)

    # convert $, ! and ? to proper words
    myString = re.sub(r'[$]', r' dollar ', myString)
    myString = re.sub(r'[!]', r' exclammark ', myString)
    myString = re.sub(r'[?]', r' questmark ', myString)

    # convert other punctuation to whitespace
    myString = re.sub(r'([^\w\s]+)|([_-]+)', r' ', myString)

    # Mark blank lines BEFORE single newlines: once every '\n' has been
    # rewritten to ' newline ' there is no '\n\n' left to match, so the
    # blank-line marker would never be produced.
    myString = re.sub(r'\n\n', r' blankline ', myString)
    myString = re.sub(r'\n', r' newline ', myString)

    # collapse runs of whitespace to a single space and trim the ends
    myString = re.sub(r'\s+', r' ', myString)
    myString = myString.strip(' ')

    # perform word stemming; split() (no argument) yields no empty token
    # when the cleaned text is empty
    myStringWords = myString.split()
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    stemWords = [stemmer.stem(word) for word in myStringWords]
    myString = ' '.join(stemWords)

    return myString

In [3]:
# Load the data: upload the dataset file into the Colab runtime.
# This cell only works inside Google Colab (google.colab is Colab-specific).
from google.colab import files
uploaded = files.upload( )

Saving mail_data.csv to mail_data.csv


In [16]:
# Read the uploaded CSV into a DataFrame
df=pd.read_csv('mail_data.csv')

In [17]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(5572, 2)

In [18]:
# Column labels of the loaded dataset
df.columns

Index(['Category', 'Message'], dtype='object')

In [22]:
# Count the missing (null/NaN) values in each column
df.isnull().sum()

Category    0
Message     0
dtype: int64

In [8]:
def store(path='mail_data.csv', textColumn='Message', labelColumn='Category',
          positiveLabels=('1', 'spam')):
    """Load the labelled messages, clean them and return an 80/20 split.

    The previous version passed the ``.csv`` path to
    ``openpyxl.load_workbook``, which only opens Excel workbooks, and read
    columns by position from a sheet named 'Data set'. This version reads the
    CSV with pandas and selects the columns by name.

    NOTE: a later cell of this notebook defines another ``store()``; the
    definition executed last is the one in effect.

    Parameters
    ----------
    path : str
        CSV file to read.
    textColumn : str
        Column holding the message text; each value is passed through
        ``cleanString``.
    labelColumn : str
        Column holding the class label. Rows with a missing label are dropped.
    positiveLabels : iterable of str
        Label values (compared case-insensitively, surrounding whitespace
        ignored) that are encoded as 1; everything else is encoded as 0.
        The default also accepts 'spam' -- this assumes the dataset marks
        positive rows that way; TODO confirm against the data.

    Returns
    -------
    tuple
        ``(xTrain, xTest, yTrain, yTest)``: lists of cleaned strings and of
        0/1 integer labels, split 80/20 with ``random_state=0``.
    """
    # dtype=str keeps labels such as "1" as text instead of letting pandas
    # infer a numeric type (which would turn them into 1 or 1.0).
    dataFrame = pd.read_csv(path, dtype=str)

    # Rows without a label cannot be used for supervised training.
    dataFrame = dataFrame.dropna(subset=[labelColumn])

    positive = {str(label).strip().lower() for label in positiveLabels}

    # A missing message is treated as empty text rather than crashing cleanString.
    xData = [cleanString(text) for text in dataFrame[textColumn].fillna('')]
    yData = [
        1 if label.strip().lower() in positive else 0
        for label in dataFrame[labelColumn]
    ]

    xTrain, xTest, yTrain, yTest = train_test_split(xData, yData, test_size=0.2, random_state=0)
    return xTrain, xTest, yTrain, yTest

In [9]:
def calcFScore(xTest, yTest, model, vectorizer):
    """Evaluate `model` on held-out texts.

    The texts are vectorised with the already-fitted `vectorizer`, predicted
    with `model`, and compared with `yTest`. F1, precision and recall are all
    computed with label 0 as the positive class (``pos_label=0``).

    Returns
    -------
    tuple
        ``(fScore, precision, recall, matrix)`` where `matrix` is the
        confusion matrix of true versus predicted labels.
    """
    features = vectorizer.transform(xTest)
    expected = np.asarray(yTest)
    predicted = model.predict(features)

    matrix = confusion_matrix(expected, predicted)

    # Same arguments for each metric; only the scoring function differs.
    fScore, precision, recall = (
        metric(expected, predicted, pos_label=0)
        for metric in (f1_score, precision_score, recall_score)
    )
    return fScore, precision, recall, matrix

In [10]:
def predict(emailBody, model, vectorizer):
    """Classify a single message body as "Spam" or "Not Spam".

    The body is cleaned with ``cleanString``, vectorised with the fitted
    `vectorizer` and passed to ``model.predict``. The message is reported as
    "Spam" when the label 1 appears in the prediction, otherwise "Not Spam".
    """
    cleaned = cleanString(emailBody)
    labels = model.predict(vectorizer.transform([cleaned]))
    print("Predicting...")

    return "Spam" if 1 in labels else "Not Spam"

# Linear SVM classifier; class_weight='balanced' asks scikit-learn to weight
# classes inversely to their frequency in the training labels.
# NOTE(review): no visible cell fits this instance, and `model` is rebound to
# an SVC in a later cell -- confirm whether this line is still needed.
model = LinearSVC(class_weight='balanced')

In [2]:
import pandas as pd

# Read the CSV file into a DataFrame
data = pd.read_csv('mail_data.csv')

# Write the DataFrame to an Excel workbook. to_excel() returns None, so its
# result is deliberately not assigned (assigning it would rebind `data` to None).
data.to_excel('output_data.xlsx', index=False)

# Reload the data from the workbook that was just written
data = pd.read_excel('output_data.xlsx')

In [6]:
import pandas as pd

def store():
    """Read 'output_data.xlsx' and return an 80/20 train/test split.

    The 'Message' column is used as the input text and the 'Category' column
    as the target. The split uses ``test_size=0.2`` and ``random_state=42``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    frame = pd.read_excel('output_data.xlsx')

    messages = frame['Message']      # features (input text)
    categories = frame['Category']   # target labels

    trainX, testX, trainY, testY = train_test_split(
        messages, categories, test_size=0.2, random_state=42
    )
    return trainX, testX, trainY, testY

# Call store() to obtain the training and testing splits
xTrain, xTest, yTrain, yTest = store()


In [7]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Requires store() to have been defined by an earlier cell.
# Create training data
xTrain, xTest, yTrain, yTest = store()
# NOTE(review): max_df=75 is an integer, which scikit-learn treats as an
# absolute document count (a float such as 0.75 would be a proportion of
# documents) -- confirm that a count of 75 is intended.
vectorizer = TfidfVectorizer(stop_words='english', max_df=75)
yTrainMatrix = np.asarray(yTrain)
# Learn the vocabulary/IDF weights from the training texts and vectorise them
xTrainMatrix = vectorizer.fit_transform(xTrain)

In [8]:
# The 80/20 train/test split was already produced by store(); this cell only
# trains on the training part and evaluates on the test part.
# Training SVM classifier
model = SVC(kernel='linear')  # SVM with a linear kernel (other kernels can be selected here)
model.fit(xTrainMatrix, yTrainMatrix)

# Calculate predictions for the test data
# (transform only: the vectorizer was fitted on the training texts)
xTestMatrix = vectorizer.transform(xTest)
yTestPred = model.predict(xTestMatrix)

# Calculate accuracy, F1-score, precision, recall, and confusion matrix
# (F1/precision/recall use average='weighted')
accuracy = accuracy_score(yTest, yTestPred)
fScore = f1_score(yTest, yTestPred, average='weighted')
precision = precision_score(yTest, yTestPred, average='weighted')
recall = recall_score(yTest, yTestPred, average='weighted')
matrix = confusion_matrix(yTest, yTestPred)

print("Accuracy:", accuracy)
print("F1-Score:", fScore)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", matrix)

# Export the train/test splits to an Excel file
# NOTE(review): this rebinds `data` and `df`, which earlier cells use for the
# loaded dataset. The four splits presumably keep their original row labels, in
# which case pandas aligns them by index and fills the gaps with NaN -- confirm
# this layout is what the export should contain.
data = {'xTrain': xTrain, 'xTest': xTest, 'yTrain': yTrain, 'yTest': yTest}
df = pd.DataFrame(data)
df.to_excel('Phishing_Email_dataset.xlsx', index=False)


Accuracy: 0.9865470852017937
F1-Score: 0.986285222665064
Precision: 0.986628085872971
Recall: 0.9865470852017937
Confusion Matrix:
 [[965   1]
 [ 14 135]]
