In [1]:
import pandas as pd

In [2]:
# Read the labelled messages from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='spam.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Select the column labels for which at least one row is NaN
# (an empty Index means no column has missing values).
df.columns[df.isna().any(axis=0)]

Index([], dtype='object')

In [5]:
# The empty Index above means that no column of df contains missing values (NaN).

In [6]:
# Summarise the remaining columns separately for each value of Category
# (count, number of unique values, most frequent value and its frequency).
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
# Encode the label as a binary target: 1 where Category is 'spam', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# 'int64' keeps the column dtype identical on every platform.
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
from sklearn.model_selection import train_test_split
# Hold out the default 25% of the rows for testing.
# random_state pins the shuffle so the split (and every score below) is
# reproducible on Restart & Run All; stratify keeps the spam/ham ratio the
# same in both splits, which matters because spam is the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    df['Message'],
    df['spam'],
    random_state=42,
    stratify=df['spam'],
)

In [10]:
# Display the training split of the Message column.
x_train

855     Stop the story. I've told him i've returned it...
3215    Babe, have you got enough money to pick up bre...
1870                       Mom wants to know where you at
2623                                            Ok lor...
1474    Will do, you gonna be at blake's all night? I ...
                              ...                        
2066    Cos daddy arranging time c wat time fetch ü ma...
1468           I wont touch you with out your permission.
1090    Goodmorning today i am late for  &lt;DECIMAL&g...
4708                  Wif my family booking tour package.
1218    Damn, can you make it tonight or do you want t...
Name: Message, Length: 4179, dtype: object

In [11]:
# The same training messages as a plain array of strings; this is the form
# passed to the vectorizer below.
x_train.values

array(["Stop the story. I've told him i've returned it and he's saying i should not re order it.",
       "Babe, have you got enough money to pick up bread and milk ? And I'll give you it back when you get home ?",
       'Mom wants to know where you at', ...,
       'Goodmorning today i am late for  &lt;DECIMAL&gt; min.',
       'Wif my family booking tour package.',
       'Damn, can you make it tonight or do you want to just wait til tomorrow'],
      dtype=object)

#### Our input variable x (the Message column) is text, so we convert it to numeric word counts using CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training messages only, then encode those
# messages as a matrix of token counts (one row per message).
v = CountVectorizer()
x_train_count = v.fit_transform(x_train.values)

#### For discrete features such as word counts, we use Multinomial Naive Bayes

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training count matrix and
# the matching 0/1 spam labels.
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [14]:
# Two hand-written messages used as a quick sanity check of the fitted model.
example_emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# columns line up with the vocabulary the model was trained on.
email_count = v.transform(example_emails)
model.predict(email_count)

array([0, 1], dtype=int64)

In [15]:
# Encode the held-out messages with the vocabulary learned from the training
# data, then report the model's score on the test split.
x_test_count = v.transform(x_test)
model.score(x_test_count, y_test)

0.9842067480258435

### Every time you get a new list of emails, you first have to transform it into a count matrix with the fitted CountVectorizer (which introduces an extra intermediate variable) before MultinomialNB can use it. You can simplify these steps by chaining them in an sklearn Pipeline

In [16]:
from sklearn.pipeline import Pipeline

In [17]:
# Chain vectorization and classification into one estimator so raw text can be
# passed straight to fit / predict / score.
clf = Pipeline(steps=[
    ('Vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [18]:
# Train the pipeline on the raw training messages and score it on the raw test
# messages; fit() returns the fitted pipeline, so the two calls are chained.
clf.fit(x_train, y_train).score(x_test, y_test)

0.9842067480258435

In [19]:
# The pipeline takes raw text directly; no separate vectorization step is needed.
clf.predict(example_emails)

array([0, 1], dtype=int64)