In [8]:
import openpyxl
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
from bs4 import BeautifulSoup
import re
import nltk

def cleanString(myString):

    # convert text to lowercase
    myString = myString.lower()

    # convert URLs to 'httpaddr'
    myString = re.sub(r'(http|https)://[^\s]*', r' httpaddr ', myString)

    # convert email addresses to 'emailaddr'
    myString = re.sub(r'[^\s]+@[^\s]+[.][^\s]+', r' emailaddr ', myString)

    # convert all hyperlinks to 'linktag'
    soup = BeautifulSoup(myString, 'html.parser')
    myString = soup.get_text()
    numberLink = len(soup.find_all('a'))
    numberImg = len(soup.find_all('img'))
    myString = myString + numberLink * " linktag " + numberImg * " imgtag "

    # convert numbers to 'number'
    myString = re.sub(r'[0-9]+', r' number ', myString)

    # convert $, ! and ? to proper words
    myString = re.sub(r'[$]', r' dollar ', myString)
    myString = re.sub(r'[!]', r' exclammark ', myString)
    myString = re.sub(r'[?]', r' questmark ', myString)

    # convert other punctuation to whitespace
    myString = re.sub(r'([^\w\s]+)|([_-]+)', r' ', myString)

    # convert newlines and blanklines to special strings and extra whitespace to single
    myString = re.sub(r'\n', r' newline ', myString)
    myString = re.sub(r'\n\n', r' blankline ', myString)
    myString = re.sub(r'\s+', r' ', myString)
    myString = myString.strip(' ')

    # perform word stemming
    myStringWords = myString.split(' ')
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    stemWords = [stemmer.stem(word) for word in myStringWords]
    myString = ' '.join(stemWords)

    return myString

In [10]:
#load the data
from google.colab import files
uploaded = files.upload( )

Saving DataSet.xlsx to DataSet (1).xlsx


In [12]:
def store():

    workBookOld = openpyxl.load_workbook('DataSet.xlsx')
    dataSheetOld = workBookOld['Data set']

    xData = []
    yData = []

    rows = dataSheetOld.max_row

    for i in range(2, rows+1):

        if (str(dataSheetOld.cell(row = i, column = 2).value) != 'None'):
            xData.append(str(cleanString(dataSheetOld.cell(row = i, column = 1).value)))
            if (str(dataSheetOld.cell(row = i, column = 2).value) == "1"):
                yData.append(1)
            else:
                yData.append(0)
    xTrain, xTest, yTrain, yTest = train_test_split(xData, yData, test_size=0.2, random_state=0)
    return xTrain, xTest, yTrain, yTest

In [13]:
def calcFScore(xTest, yTest, model, vectorizer):

    xTestMatrix = vectorizer.transform(xTest)
    yTestMatrix = np.asarray(yTest)

    result = model.predict(xTestMatrix)
    matrix = confusion_matrix(yTestMatrix, result)

    fScore = f1_score(yTestMatrix, result, pos_label = 0)
    precision = precision_score(yTestMatrix, result, pos_label=0)
    recall = recall_score(yTestMatrix, result, pos_label=0)
    return fScore, precision, recall, matrix

In [14]:
def predict(emailBody, model, vectorizer):

    featureMatrix = vectorizer.transform([cleanString(emailBody)])
    result = model.predict(featureMatrix)
    print("Predicting...")

    if (1 in result):
        return "Spam"
    else:
        return "Not Spam"

model = LinearSVC(class_weight='balanced')

In [20]:
# Create training data
xTrain, xTest, yTrain, yTest = store()
vectorizer = TfidfVectorizer(stop_words='english', max_df=75)
yTrainMatrix = np.asarray(yTrain)
xTrainMatrix = vectorizer.fit_transform(xTrain)
print(" ")

#The code by itself randomly splits the given dataset into an 80-20 ratio for training and testing respectively
# Training SVM classifier
model.fit(xTrainMatrix, yTrainMatrix)
fScore, precision, recall, matrix = calcFScore(xTest, yTest, model, vectorizer)
print(" ")
print(fScore, precision, recall, matrix)
print(" ")

  soup = BeautifulSoup(myString, 'html.parser')


 
 
0.9910979228486648 0.9881656804733728 0.9940476190476191 [[167   1]
 [  2  16]]
 
