### **Import the required packages**

In [1]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

### **Reading and Exploring the data**

In [2]:
spam_df = pd.read_csv('/content/emails.csv')

In [3]:
spam_df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
spam_df.tail()

Unnamed: 0,text,spam
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0
5727,Subject: news : aurora 5 . 2 update aurora ve...,0


In [5]:
spam_df.shape #checking the shape of the data

(5728, 2)

In [6]:
spam_df.isnull().sum()  #checking for the null values column-wise

Unnamed: 0,0
text,0
spam,0


In [7]:
spam_df.dtypes

Unnamed: 0,0
text,object
spam,int64


In [8]:
spam_df.duplicated().sum()   #check for the duplicate rows

np.int64(33)

In [9]:
spam_df[spam_df.duplicated()]   #print all the duplicate rows

Unnamed: 0,text,spam
2155,Subject: research allocations to egm hi becky...,0
2260,Subject: departure of grant masson the resear...,0
2412,"Subject: re : schedule and more . . jinbaek ,...",0
2473,"Subject: day off tuesday stinson , i would l...",0
2763,"Subject: re : your mail zhendong , dr . kami...",0
3123,"Subject: re : grades pam , the students rese...",0
3152,Subject: tiger evals - attachment tiger hosts...,0
3248,"Subject: re : i am zhendong zhendong , thank...",0
3249,Subject: hello from enron dear dr . mcmullen ...,0
3387,"Subject: term paper dr . kaminski , attached...",0


In [10]:
spam_df.drop_duplicates(inplace = True)

In [11]:
spam_df['spam'].value_counts()    #to check for data imbalance

Unnamed: 0_level_0,count
spam,Unnamed: 1_level_1
0,4327
1,1368


In [12]:
spam_df.shape

(5695, 2)

#### **Apply Count Vectorization on the `text` column**

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
vectorizer = CountVectorizer()

vectorized_input_cols = vectorizer.fit_transform(spam_df['text'])

In [15]:
vectorized_input_cols

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 704610 stored elements and shape (5695, 37303)>

### **Machine Learning Process**

In [16]:
X = vectorized_input_cols
y = spam_df['spam']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100, stratify = y)

#### **Apply Naive Bayes Classifier to the data**

There are three variants of Naive Bayes algorithm:

1. Gaussian Naive Bayes
2. Bernoulli Naive Bayes.
3. Multinomial Naive Bayes

In [18]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [19]:
y_pred = nb_classifier.predict(X_test)

In [20]:
accuracy_score(y_test, y_pred)

0.9894644424934153

In [21]:
roc_auc_score(y_test, y_pred)

np.float64(0.9918168009788616)

#### **Let's take some new unseen emails and check whether our model is performing well enough or not**

In [22]:
emails_new = ['Congratulations, you have a pre-approved loan of 5000 US Dollar. No Documentation required.',
              'Can we have quick meeting tomorrow at 5 PM IST?']

In [23]:
vectorized_emails = vectorizer.transform(emails_new)

In [24]:
nb_classifier.predict(vectorized_emails)

array([1, 0])