In [6]:
# import packages
import pandas as pd # manipulating the data purpose
import numpy as np
from sklearn.model_selection import train_test_split # Split test/train data randomly
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts
from sklearn.naive_bayes import MultinomialNB # add classifier for classification of words

In [5]:
# import data
spam_df = pd.read_csv("spam_assassin.csv")

In [4]:
# inspect data
spam_df.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [3]:
# inspect data
spam_df.shape

(5796, 2)

In [7]:
# goup by category. 0 - no spam, 1 - spam
spam_df.groupby('target').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,3900,3638,From iiu-admin@taint.org Tue Aug 6 11:14:56 20...,5
1,1896,1691,Return-Path: ler@lerami.lerctr.org Delivery-Da...,6


In [10]:
# create train/test split w proporcji 25%
x_train, x_test, y_train, y_test = train_test_split(spam_df.text, spam_df.target, test_size = 0.25)

In [11]:
# check training data
x_train.head()

4028    From ilug-admin@linux.ie Tue Aug 13 10:29:59 2...
1320    From rssfeeds@jmason.org Wed Oct 9 10:52:34 20...
654     From cheapbargain@yahoo.com Mon Jul 29 11:40:0...
4594    From rssfeeds@jmason.org Tue Oct 1 10:36:38 20...
1672    From rpm-list-admin@freshrpms.net Wed Sep 18 1...
Name: text, dtype: object

In [12]:
# check training data
x_train.describe()

count                                                  4347
unique                                                 4056
top       Return-Path: ler@lerami.lerctr.org Delivery-Da...
freq                                                      5
Name: text, dtype: object

In [13]:
# find key words and store data as matrix of words
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values) # all text transforming to matrix of words

In [14]:
# see matrix shape
x_train_count

<4347x105931 sparse matrix of type '<class 'numpy.int64'>'
	with 1106120 stored elements in Compressed Sparse Row format>

In [15]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [3, 0, 1, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
# train model basis on word matrix and given information spam - 1 , not spam - 0
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [22]:
# test trained model on my custom NON SPAM email message
email_ham = ["hey lets have a beer together today. Are you in?"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [21]:
# test trained model on my custom SPAM email message
email_spam = ["reward money click now you will be reach"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [23]:
# test model accuracy
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9620427881297446