In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [50]:
# Read the spam/ham message CSV into a DataFrame, decoding the file as Latin-1.
df = pd.read_csv(
    filepath_or_buffer='spamVsham.csv',
    encoding='latin-1',
)

# Data Preprocessing

In [51]:
# Preview the first five rows to see which columns the CSV produced.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [52]:
# Remove the three 'Unnamed' columns, which show only missing values in the preview.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError
# for columns that are already gone.
df = df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, errors='ignore')

In [53]:
# Sanity check on (rows, columns) after the drop; two columns (v1, v2) should remain.
df.shape

(5572, 2)

In [54]:
# Re-inspect the first five rows: only the class (v1) and message text (v2) should be left.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [55]:
# Encode the text class in v1 as an integer target: ham -> 0, spam -> 1.
# Any value outside this mapping would become NaN in the new column.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['v1'].map(label_codes)

In [56]:
# Spot-check the new label column against v1 on the first three rows.
df.head(n=3)

Unnamed: 0,v1,v2,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1


# ML Model / Classification via NLP 

In [57]:
# Features are the raw message text (v2); the target is the 0/1 label column.
X, y = df['v2'], df['label']

# Bag-of-words vectorizer with default settings; it is fitted in a later cell.
cv = CountVectorizer()

In [58]:
# Check column dtypes; an integer dtype for `label` indicates the ham/spam mapping
# left no missing values (NaN would force the column to float).
df.dtypes

v1       object
v2       object
label     int64
dtype: object

In [59]:
# Build the bag-of-words count matrix from the raw message text.
# Reading from df['v2'] (rather than transforming X onto itself) keeps this cell
# idempotent: re-running it does not feed the already-vectorized matrix back
# into the vectorizer.
# NOTE(review): the vocabulary is fitted on the full corpus before the train/test
# split, so test-set tokens inform the feature space; consider fitting on the
# training split only.
X = cv.fit_transform(df['v2'])

In [60]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
# Multinomial Naive Bayes with its default settings spelled out
# (these match the estimator repr shown when the model is fitted).
clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [62]:
# Train the classifier on the training split; the fitted estimator is the cell output.
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [63]:
# Accuracy on the held-out split. The test set is mostly ham (965 of 1115 rows),
# so read this alongside the per-class report rather than on its own.
clf.score(X=X_test, y=y_test)

0.97847533632287

In [64]:
# Predict labels for the held-out messages, then print per-class
# precision / recall / F1 (0 = ham, 1 = spam).
y_pred = clf.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       965
          1       0.91      0.93      0.92       150

avg / total       0.98      0.98      0.98      1115



In [66]:
# Persist the trained classifier to disk so it can be reused without retraining.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and fall back to the bundled
# copy only on older installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): only `clf` is saved here; the fitted CountVectorizer `cv` is not
# persisted in this cell, and the model needs the same vocabulary to score new text.
joblib.dump(clf, 'NB_spam_model.pkl')

['NB_spam_model.pkl']

In [67]:
# Reload the persisted classifier from disk. The context manager guarantees the
# file handle is closed once loading finishes.
# NOTE: joblib/pickle files can execute arbitrary code when loaded; only load
# model files that come from a trusted source.
with open('NB_spam_model.pkl', 'rb') as NB_spam_model:
    clf = joblib.load(NB_spam_model)