In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
df = pd.read_csv("spam.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [11]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=="spam" else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [12]:
from sklearn.model_selection import train_test_split
a_train, a_test, b_train, b_test = train_test_split(df['Message'],df['spam'],test_size = 0.25)

Converting our "Message" column into vector form in below cell because it is in textual form and Machine can't learn on textual data

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
a_train_count = v.fit_transform(a_train.values)
a_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
from sklearn.naive_bayes import MultinomialNB
myModel = MultinomialNB()
myModel.fit(a_train_count,b_train)

MultinomialNB()

In [19]:
myEmails = [
    'Hello there makesh. hope you are fine my friend', 
    'Get upto 50% discount on all nike stores'
]
email_count = v.transform(myEmails)
myModel.predict(email_count)

array([0, 1], dtype=int64)

As you can see in above cell that first mail is not a spam whereas second mail is spam

In [22]:
a_test_count = v.transform(a_test) #converting textual data to vector
print("Accuracy of my model: ", myModel.score(a_test_count, b_test)*100,"%")

Accuracy of my model:  98.34888729361091 %


We can also solve the above problem using Pipeline library from sklearn:

In [31]:
from sklearn.pipeline import Pipeline
classifier = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('NB', MultinomialNB())
])

Now we don't need to convert our 'Message' column into vector. it will be handled by pipeline 

In [32]:
classifier.fit(a_train, b_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('NB', MultinomialNB())])

In [33]:
classifier.score(a_test,b_test)

0.9834888729361091

In [35]:
classifier.predict(myEmails)

array([0, 1], dtype=int64)

So we can solve this problem by using pipeline or by simply using traditional method