In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Load the email dataset; the path is relative to the current working directory
df = pd.read_csv("emails.csv")
# Preview the first rows
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
# Preview the last rows of the dataset
df.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [4]:
# List the column names
df.columns

Index(['text', 'spam'], dtype='object')

In [5]:
# (rows, columns) of the raw data, before duplicates are dropped
df.shape

(5728, 2)

In [6]:
# Summary statistics; the mean of the 0/1 `spam` column is the share of rows labelled 1
df.describe()

Unnamed: 0,spam
count,5728.0
mean,0.238827
std,0.426404
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [8]:
# Drop exact duplicate rows. Rebinding `df` instead of using inplace=True
# avoids mutating the object in place; the resulting frame is the same.
df = df.drop_duplicates()
df.shape

(5695, 2)

In [9]:
# Count missing values per column
df.isna().sum()

text    0
spam    0
dtype: int64

In [10]:
# Number of distinct label values
df["spam"].nunique()

2

In [11]:
# Class balance: number of rows per label value
df["spam"].value_counts()

spam
0    4327
1    1368
Name: count, dtype: int64

In [12]:
# Download the NLTK stopwords corpus that stopwords.words('english') reads from
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Define a function that cleans and tokenizes text by removing punctuation and filtering out common English stopwords.

In [13]:
def process(text):
    """Tokenize a message into a list of cleaned words.

    Every character found in ``string.punctuation`` is deleted, the result
    is split on whitespace, and any word whose lowercase form is an NLTK
    English stopword is dropped. Surviving words keep their original casing.

    Args:
        text: The raw message text (a ``str``).

    Returns:
        list[str]: The remaining words, in their original order.
    """
    # Delete all punctuation characters in a single pass.
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    # Fetch the stopword list once per call and keep it in a set. Calling
    # stopwords.words('english') inside the comprehension condition would
    # re-evaluate it, and linearly search the returned list, for every word.
    stop_words = set(stopwords.words('english'))

    return [word for word in nopunc.split() if word.lower() not in stop_words]

# Preview the tokenizer output on the first few emails
df['text'].head().apply(process)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

Convert the text into a sparse matrix of token counts (one row per email, one column per distinct token).

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Pass `process` as the analyzer so each email is tokenized by it, then learn
# the vocabulary and build the count matrix in one step.
# NOTE(review): the vocabulary is fit on every row of df['text'], before the
# train/test split performed later in the notebook, so test-set tokens
# contribute to the feature space. Consider fitting on the training rows only.
message = CountVectorizer(analyzer=process).fit_transform(df['text'])

In [15]:
# Inspect the resulting sparse matrix (shape and number of stored entries)
message

<5695x37229 sparse matrix of type '<class 'numpy.int64'>'
	with 562930 stored elements in Compressed Sparse Row format>

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    message,
    df["spam"],
    test_size=0.2,
    random_state=42,
)

In [22]:
# Test split dimensions: (rows, token columns)
X_test.shape

(1139, 37229)

In [24]:
# Training split dimensions: (rows, token columns)
X_train.shape

(4556, 37229)

In [25]:
# Full matrix dimensions; its row count should equal the train and test rows combined
message.shape

(5695, 37229)

In [26]:
# Create a multinomial Naive Bayes classifier with default settings and
# train it on the training split
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

In [31]:
# Spot check: predicted labels for the first 10 training rows
classifier.predict(X_train[:10])

array([0, 1, 0, 0, 0, 0, 1, 1, 0, 0], dtype=int64)

In [32]:
# True labels for the same 10 training rows, for comparison with the predictions
y_train[:10]

2862    0
43      1
1891    0
3201    0
3451    0
5006    0
1071    1
718     1
5356    0
4448    0
Name: spam, dtype: int64

In [30]:
# Spot check: predicted labels for the first 10 held-out test rows
classifier.predict(X_test[:10])

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [29]:
# True labels for the same 10 test rows, for comparison with the predictions
y_test[:10]

1436    0
748     1
4126    0
1448    0
3905    0
944     1
1595    0
1666    0
2382    0
4820    0
Name: spam, dtype: int64