In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,spam
0,0,Subject: naturally irresistible your corporate...,1
1,1,Subject: the stock trading gunslinger fanny i...,1
2,2,Subject: unbelievable new homes made easy im ...,1
3,3,Subject: 4 color printing special request add...,1
4,4,"Subject: do not have money , get software cds ...",1


# **Droping all extra column from dataframe**

In [3]:
# There is some garbage Null column after 2 column which have to remove
# df = df.drop(df.columns[2:], axis=1)
df = df.drop(['Unnamed: 0'], axis = 1)
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5725,Subject: re : research and development charges...,0
5726,"Subject: re : receipts from visit jim , than...",0
5727,Subject: re : enron case study update wow ! a...,0
5728,"Subject: re : interest david , please , call...",0


In [4]:
df['spam'].value_counts()

0                                                                                               4359
1                                                                                               1367
 its termination would not  have such a phenomenal impact on the power situation .  however        1
 mr suresh prabhu                                                                                  1
Name: spam, dtype: int64

# **Duplicates Remove**

In [5]:
df.drop_duplicates(inplace = True)
df.shape

(5697, 2)

In [6]:
df['spam'].value_counts()

0                                                                                               4326
1                                                                                               1367
 its termination would not  have such a phenomenal impact on the power situation .  however        1
 mr suresh prabhu                                                                                  1
Name: spam, dtype: int64

# **Missing Value Removal**

In [7]:
df.isnull().sum()

text    0
spam    2
dtype: int64

In [8]:
df = df.dropna()
df.isnull().sum()

text    0
spam    0
dtype: int64

In [9]:
df['spam'].value_counts()

0                                                                                               4326
1                                                                                               1367
 its termination would not  have such a phenomenal impact on the power situation .  however        1
 mr suresh prabhu                                                                                  1
Name: spam, dtype: int64

In [10]:
# df = df[~df['spam'].isin([' its termination would not have such a phenomenal impact on the power situation . however', 'mr suresh prabhu'])]
# df['spam'].value_counts()

# **X Y separation**

In [11]:
X = df.text.values
Y = df.spam.values
Y

array(['1', '1', '1', ..., '0', '0', '0'], dtype=object)

# **Train Test Split**

In [12]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size = 0.2, random_state =42)

# **Data Preprocessing**

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

### **Fit Transform X train Data**

In [14]:
cv_xtrain = cv.fit_transform(xtrain)

In [15]:
cv_xtrain.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
cv_xtrain.shape

(4556, 33744)

# **ML Algorithms**
## **Multi-nomial Naive Bayes**

In [17]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()

In [18]:
MNB.fit(cv_xtrain, ytrain)

MultinomialNB()

### **Transform X test data**

In [19]:
cv_xtest = cv.transform(xtest)

In [20]:
cv_xtest.toarray()
cv_xtest.shape

(1139, 33744)

In [21]:
MNB.score(cv_xtest, ytest)

0.9877085162423178

In [22]:
emails = ['Hello, I am writing to follow up on the proposal we discussed in our last meeting. Do you have any further questions or concerns? Best regards, John', 'Dear customer, We are excited to announce a new product that we will be launching next month. It is designed to help improve your productivity and save you time. Stay tuned for more details! Best regards, The Product Team', 'This is a spam, do not open it']

In [23]:
cv_emails = cv.transform(emails)

In [24]:
MNB.predict(cv_emails)

array(['0', '0', '1'], dtype='<U92')

<h2 style='color:Blue'><b>So, Multinomial Naive Bayes Performing well with Accuracy of 98.8%</b></h2>