In [1]:
import openpyxl
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
from bs4 import BeautifulSoup
import re
import nltk

def cleanString(myString):
    """Normalise a raw e-mail body into a space-separated string of stemmed tokens.

    Steps, in order: lowercase; replace URLs and e-mail addresses with the
    placeholders 'httpaddr' / 'emailaddr'; strip HTML markup while appending
    one 'linktag' per <a> element and one 'imgtag' per <img> element; replace
    digit runs with 'number'; spell out '$', '!' and '?'; blank out remaining
    punctuation; mark blank lines and newlines; collapse whitespace; stem
    every token with the English Snowball stemmer.

    Parameters
    ----------
    myString : str or None
        Raw e-mail text. None is treated as an empty message; any other
        non-string value is converted with str().

    Returns
    -------
    str
        The cleaned, stemmed text with tokens separated by single spaces.
    """

    # tolerate missing / non-string cells instead of failing on .lower()
    if myString is None:
        myString = ''
    elif not isinstance(myString, str):
        myString = str(myString)

    # convert text to lowercase
    myString = myString.lower()

    # convert URLs to 'httpaddr'
    # NOTE(review): this runs before the HTML is parsed, so a URL inside an
    # attribute may swallow the rest of the tag up to the next whitespace --
    # confirm that is acceptable for the link/image counts below.
    myString = re.sub(r'(http|https)://[^\s]*', r' httpaddr ', myString)

    # convert email addresses to 'emailaddr'
    myString = re.sub(r'[^\s]+@[^\s]+[.][^\s]+', r' emailaddr ', myString)

    # convert all hyperlinks to 'linktag'
    soup = BeautifulSoup(myString, 'html.parser')
    myString = soup.get_text()
    numberLink = len(soup.find_all('a'))
    numberImg = len(soup.find_all('img'))
    myString = myString + numberLink * " linktag " + numberImg * " imgtag "

    # convert numbers to 'number'
    myString = re.sub(r'[0-9]+', r' number ', myString)

    # convert $, ! and ? to proper words
    myString = re.sub(r'[$]', r' dollar ', myString)
    myString = re.sub(r'[!]', r' exclammark ', myString)
    myString = re.sub(r'[?]', r' questmark ', myString)

    # convert other punctuation to whitespace
    myString = re.sub(r'([^\w\s]+)|([_-]+)', r' ', myString)

    # convert blanklines and newlines to special strings and extra whitespace to single.
    # The double-newline pattern must be handled first: once every '\n' has
    # been replaced there is nothing left for r'\n\n' to match.
    myString = re.sub(r'\n\n', r' blankline ', myString)
    myString = re.sub(r'\n', r' newline ', myString)
    myString = re.sub(r'\s+', r' ', myString)
    myString = myString.strip(' ')

    # perform word stemming
    myStringWords = myString.split(' ')
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    stemWords = [stemmer.stem(word) for word in myStringWords]
    myString = ' '.join(stemWords)

    return myString

In [3]:
# Load the data: ask Colab for a file upload so the CSV is available to the
# notebook. `uploaded` keeps whatever files.upload() returns.
# This cell depends on the google.colab package.
from google.colab import files
uploaded = files.upload( )

Saving Phishing_Email.csv to Phishing_Email.csv


In [14]:
def store(path='Phishing_Email.csv', textColumn='Email Text', labelColumn='Email Type'):
    """Load the labelled e-mail CSV, clean every message and split it for training.

    The file is read with pandas; the previous openpyxl-based loader could not
    open a .csv file at all. Rows whose label is missing are skipped, and a
    missing message body is treated as an empty string.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    textColumn : str
        Name of the column holding the raw e-mail text.
    labelColumn : str
        Name of the column holding the class label. The values '1' and
        'Phishing Email' (case-insensitive) map to 1; anything else maps to 0.

    Returns
    -------
    tuple
        (xTrain, xTest, yTrain, yTest) from an 80/20 split with random_state=0,
        where the x lists hold cleaned strings and the y lists hold 0/1 ints.
    """

    frame = pd.read_csv(path)

    # a row without a label cannot be used for supervised training
    frame = frame.dropna(subset=[labelColumn])

    # missing bodies become empty messages rather than crashing cleanString
    rawTexts = frame[textColumn].fillna('')
    xData = [str(cleanString(str(text))) for text in rawTexts]

    positiveLabels = {'1', '1.0', 'phishing email'}
    yData = [
        1 if str(label).strip().lower() in positiveLabels else 0
        for label in frame[labelColumn]
    ]

    xTrain, xTest, yTrain, yTest = train_test_split(xData, yData, test_size=0.2, random_state=0)
    return xTrain, xTest, yTrain, yTest

In [15]:
def calcFScore(xTest, yTest, model, vectorizer):
    """Score a fitted model on held-out text.

    The texts in xTest are vectorised with `vectorizer`, classified by
    `model`, and compared against yTest.

    Returns
    -------
    tuple
        (fScore, precision, recall, matrix), where the three scores are
        computed with label 0 as the positive class and `matrix` is the
        confusion matrix of true versus predicted labels.
    """
    expected = np.asarray(yTest)
    predicted = model.predict(vectorizer.transform(xTest))

    # all three scores share the same arguments; label 0 is the positive class
    fScore, precision, recall = [
        metric(expected, predicted, pos_label=0)
        for metric in (f1_score, precision_score, recall_score)
    ]

    return fScore, precision, recall, confusion_matrix(expected, predicted)

In [16]:
def predict(emailBody, model, vectorizer):
    """Classify a single e-mail body.

    The text is cleaned with cleanString, vectorised with `vectorizer` and
    passed to `model`. Prints "Predicting..." and returns "Spam" when the
    prediction contains label 1, otherwise "Not Spam".
    """
    cleanedBody = cleanString(emailBody)
    labels = model.predict(vectorizer.transform([cleanedBody]))
    print("Predicting...")

    return "Spam" if 1 in labels else "Not Spam"

# Linear SVM configured with class_weight='balanced'; it is only constructed
# here, not fitted. The name `model` is reassigned to an SVC in the final
# training cell of this notebook.
model = LinearSVC(class_weight='balanced')

In [24]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Read the CSV file into the DataFrame `data`. The relative path means the
# file must be in the current working directory.
data = pd.read_csv('Phishing_Email.csv')
# Plain-text dump of the whole frame.
print(data)

       Unnamed: 0                                         Email Text  \
0               0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1               1  the other side of * galicismos * * galicismo *...   
2               2  re : equistar deal tickets are you still avail...   
3               3  \nHello I am your hot lil horny toy.\n    I am...   
4               4  software at incredibly low prices ( 86 % lower...   
...           ...                                                ...   
18645       18646  date a lonely housewife always wanted to date ...   
18646       18647  request submitted : access request for anita ....   
18647       18648  re : important - prc mtg hi dorn & john , as y...   
18648       18649  press clippings - letter on californian utilit...   
18649       18650                                              empty   

           Email Type  
0          Safe Email  
1          Safe Email  
2          Safe Email  
3      Phishing Email  
4      Phishing

In [27]:

# Print column names
print(data.columns)

# Create training data. Both are columns selected from `data`.
# NOTE(review): no missing-value handling has been applied to `data` at this
# point; the notebook's later count reports 16 missing 'Email Text' entries.
X = data['Email Text']  # Features (input data): raw e-mail text
y = data['Email Type']  # Target (output labels): e.g. 'Safe Email' / 'Phishing Email'
print(X)
print(y)

Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')
0        re : 6 . 1100 , disc : uniformitarianism , re ...
1        the other side of * galicismos * * galicismo *...
2        re : equistar deal tickets are you still avail...
3        \nHello I am your hot lil horny toy.\n    I am...
4        software at incredibly low prices ( 86 % lower...
                               ...                        
18645    date a lonely housewife always wanted to date ...
18646    request submitted : access request for anita ....
18647    re : important - prc mtg hi dorn & john , as y...
18648    press clippings - letter on californian utilit...
18649                                                empty
Name: Email Text, Length: 18650, dtype: object
0            Safe Email
1            Safe Email
2            Safe Email
3        Phishing Email
4        Phishing Email
              ...      
18645    Phishing Email
18646        Safe Email
18647        Safe Email
18648        Safe Emai

In [29]:
# Second read of the same CSV, bound to `df` (separate from the `data` frame
# loaded earlier in the notebook).
df=pd.read_csv('Phishing_Email.csv')

In [30]:
# Show the shape of df as a (rows, columns) tuple
df.shape

(18650, 3)

In [31]:
# Show the column names of the data set
df.columns

Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')

In [32]:
# Remove duplicated rows from df. inplace=True modifies df itself, so
# nothing is returned or displayed by this cell.
df.drop_duplicates(inplace=True)

In [33]:
# Show the shape (rows, columns) again, after the duplicate removal
df.shape

(18650, 3)

In [43]:
# Show the number of missing (NaN / None) values in each column of df.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so its output depends on whichever `df` was live when it ran.
df.isnull().sum()

Unnamed: 0    0
Email Text    0
Email Type    0
dtype: int64

In [35]:
import pandas as pd

# Rebind `df` to a DataFrame built from `data`, the frame read from the CSV
# earlier in the notebook (this is the real data set, not a synthetic sample).
# NOTE(review): pd.DataFrame(data) is not an explicit copy, so the in-place
# fillna below may also change `data` and the columns taken from it --
# confirm whether later cells rely on that.

df = pd.DataFrame(data)

# Show the number of missing (NaN / None) values in each column
missing_counts = df.isnull().sum()
print("Number of missing data for each column:")
print(missing_counts)

# Replace missing values in every column with an empty string, modifying df
# in place
df.fillna('', inplace=True)

# Display the DataFrame with missing values replaced by empty strings
print("\nDataFrame with missing values replaced by empty strings:")
print(df)


Number of missing data for each column:
Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64

DataFrame with missing values replaced by empty strings:
       Unnamed: 0                                         Email Text  \
0               0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1               1  the other side of * galicismos * * galicismo *...   
2               2  re : equistar deal tickets are you still avail...   
3               3  \nHello I am your hot lil horny toy.\n    I am...   
4               4  software at incredibly low prices ( 86 % lower...   
...           ...                                                ...   
18645       18646  date a lonely housewife always wanted to date ...   
18646       18647  request submitted : access request for anita ....   
18647       18648  re : important - prc mtg hi dorn & john , as y...   
18648       18649  press clippings - letter on californian utilit...   
18649       18650                             

In [36]:
# Download the NLTK 'stopwords' corpus so the stop-word list is available
# locally (the call returns True on success, as shown in the cell output).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [37]:
def process_text(text):
  """Tokenise `text` after removing punctuation and English stop words.

  Parameters
  ----------
  text : str
      The message to tokenise.

  Returns
  -------
  list of str
      The remaining words in their original order and casing; stop words
      are matched case-insensitively.
  """
  # local imports keep this cell runnable on its own
  import string
  from nltk.corpus import stopwords

  #1 remove punctuation; join with '' so the surviving characters keep
  #  their original word boundaries
  nopunc = ''.join(char for char in text if char not in string.punctuation)

  #2 remove stopwords; the list is loaded once instead of once per word
  stop_words = set(stopwords.words('english'))
  clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

  #3 return a list of clean text words
  return clean_words

In [39]:
import string

def process_text(text):
    """Split `text` into whitespace-separated tokens with punctuation removed.

    Non-string input is converted with str() first. Every character listed
    in string.punctuation is deleted before splitting.

    Returns
    -------
    list of str
        The tokens in their original order.
    """
    source = text if isinstance(text, str) else str(text)

    # a translation table that deletes each punctuation character
    strip_table = str.maketrans('', '', string.punctuation)

    return source.translate(strip_table).split()



In [40]:
#Example: bag-of-words counts for two short messages

message4= 'hello world hello hello world play'
message5= 'test test test test one hello'
print(message4)
print()

#convert the text to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
# Pass each document as a plain string. Wrapping a message in its own list
# only worked because process_text falls back to str(), which would turn the
# list's repr (including escape sequences such as \n) into token text.
bow4 =  CountVectorizer(analyzer=process_text).fit_transform([message4, message5])
print(bow4)
print()

# (number of documents, number of distinct tokens)
print(bow4.shape)

hello world hello hello world play

  (0, 0)	3
  (0, 4)	2
  (0, 2)	1
  (1, 0)	1
  (1, 3)	4
  (1, 1)	1

(2, 5)


In [46]:
# The vectorizer rejects NaN documents, so missing e-mail bodies are turned
# into empty strings here instead of depending on an earlier cell having
# filled the underlying frame in place.
X_text = X.fillna('').astype(str)

# NOTE(review): an integer max_df is an absolute document count (terms found
# in more than 75 e-mails are dropped), not a proportion -- confirm that 75
# rather than 0.75 is intended. Left unchanged so the reported metrics stay
# comparable.
vectorizer = TfidfVectorizer(stop_words='english', max_df=75)
X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

# Fit the vocabulary / IDF weights on the training split only, then reuse
# them for the test split.
X_train_matrix = vectorizer.fit_transform(X_train)
X_test_matrix = vectorizer.transform(X_test)

# Training SVM classifier
model = SVC(kernel='linear')
model.fit(X_train_matrix, y_train)

# Calculate predictions for the test data
y_test_pred = model.predict(X_test_matrix)

# Calculate accuracy, F1-score, precision, recall, and confusion matrix.
# The per-class scores are averaged weighted by class support.
accuracy = accuracy_score(y_test, y_test_pred)
fScore = f1_score(y_test, y_test_pred, average='weighted')
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
matrix = confusion_matrix(y_test, y_test_pred)

print("Accuracy:", accuracy)
print("F1-Score:", fScore)
print("Precision:", precision)
print("Recall:", recall)
print("Confusion Matrix:\n", matrix)


Accuracy: 0.9351206434316354
F1-Score: 0.9342564749636727
Precision: 0.9378496749939872
Recall: 0.9351206434316354
Confusion Matrix:
 [[1249  208]
 [  34 2239]]
