In [1]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load Data

In [2]:
spam_mail = pd.read_csv("../datasets/spam_mail.csv")
spam_mail

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


# Pre-process Data

In [3]:
def text_preprocessing(txt):
    txt = txt.lower()
    txt = txt.replace("subject: ", "")
    txt = txt.replace("re :", "reply")
    txt = txt.encode('ascii', 'ignore').decode()
    txt = re.sub(r'https*\S+', ' ', txt)
    txt = re.sub(r'@\S+', ' ', txt)
    txt = re.sub(r'#\S+', ' ', txt)
    txt = re.sub(r'\'\w+', '', txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), ' ', txt)
    txt = re.sub(r'\w*\d+\w*', '', txt)
    txt = re.sub(r'\s{2,}', ' ', txt)
    return txt

In [4]:
spam_mail = spam_mail.drop(["Unnamed: 0","label"], axis=1)
spam_mail['text'] = spam_mail.text.apply(text_preprocessing)
spam_mail = spam_mail.rename(columns = {'label_num' : 'label'})

# Vectorize text using TF-IDF

In [5]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(spam_mail['text'])
y = spam_mail['label']

# Prepare train and test sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle= True, random_state=42)

# Build model and evaluate

In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)


0.8821256038647343