In [1]:
import pandas as pd
import sklearn

In [37]:
# Read the labelled messages from the CSV file and preview the first five rows.
data = pd.read_csv(filepath_or_buffer="data/spam.csv")
data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
# Encode the target as a number: 1 where Category is "spam", 0 for anything else.
# (pd.get_dummies or sklearn's LabelEncoder would work here as well.)
data["spam"] = data["Category"].eq("spam").astype("int64")
data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same score); stratify keeps the ham/spam proportion
# identical in the training and test sets.
# NOTE: outputs recorded before this seed was added came from an unseeded split
# and may differ slightly after re-running.
X_train, X_test, y_train, y_test = train_test_split(
    data.Message,
    data.spam,
    test_size=0.2,
    random_state=42,
    stratify=data.spam,
)

In [40]:
# we have to use the technique count vectonizer, which we can use to identify the most common words in our Message feature and then converting it
# into the array of numbers

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train_trans = cv.fit_transform(X_train.values)
X_train_trans.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [41]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the word-count features.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_trans, y_train)
model

MultinomialNB()

In [42]:
# Evaluate on the held-out messages: encode them with the vectorizer fitted on
# the training set (transform only, no refit), then report the mean accuracy.
X_test_trans = cv.transform(X_test)
model.score(X=X_test_trans, y=y_test)

0.9910313901345291

In [47]:
# we would try to predict something
# 1 is spam and 0 is ham

test_mail = [
    "50% discount on all products",
    "love your channel dude"
]

test_mail_trans = cv.transform(test_mail)
model.predict(test_mail_trans)

array([1, 0], dtype=int64)

In [48]:
# A Pipeline chains the vectorizer and the classifier into a single estimator,
# so raw text can be passed straight to fit/predict without calling the
# vectorizer by hand each time.

from sklearn.pipeline import Pipeline

# NOTE(review): the step name 'nd' looks like a typo for 'nb'; it is left
# unchanged because step names are part of the pipeline's public interface.
classifier = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nd', MultinomialNB()),
])

In [49]:
# Fit on the raw training messages: the pipeline runs its vectorizer step
# internally, so no separate CountVectorizer call is needed here.
classifier.fit(X=X_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nd', MultinomialNB())])

In [50]:
# Predict directly from raw text (1 = spam, 0 = ham); the pipeline vectorizes it first.
classifier.predict(X=test_mail)

array([1, 0], dtype=int64)