In [20]:
"""
In this python machine learning tutorial for beginners we will build 
email spam classifier using naive bayes algorithm. We will use sklearn CountVectorizer 
to convert email text into a matrix of numbers and then use sklearn MultinomialNB classifier 
to train our model. The model score with this approach comes out to be very high 
(around 98%). Sklearn pipeline allows us to handle pre processing transformations 
easily with its convenient api. In the end there is an exercise where you need to 
classify sklearn wine dataset using naive bayes.
"""

''

In [27]:
import pandas as pd

In [34]:
df = pd.read_csv('/Users/haleyk/Desktop/Codebasics/ML/14_naive_bayes/spam.csv')
df 

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [29]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [35]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
# ? df = pd.get_dummies(df.category)
# ? df_new = df.category.map({'spam':1, 'ham': 2})
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# Naive Bayes Classifier Algorithm
- CountVectorizer
- Multinomial

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()  # recall counting the number of repeated words 'this is the first doc', 'this doc is the second doc', and there is two 'this', three 'doc'
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:2]  # Return a dense ndarray representation of this matrix.

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [7]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count,y_train)

"""
The multinomial Naive Bayes classifier is suitable for classification with
discrete features (e.g., word counts for text classification). The
multinomial distribution normally requires integer feature counts. However,
in practice, fractional counts such as tf-idf may also work.

"""

'\nThe multinomial Naive Bayes classifier is suitable for classification with\ndiscrete features (e.g., word counts for text classification). The\nmultinomial distribution normally requires integer feature counts. However,\nin practice, fractional counts such as tf-idf may also work.\n\n'

In [8]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = v.transform(emails)
model.predict(emails_count)  # first one is not a spam, while second one is

array([0, 1])

In [9]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9892318736539842

In [None]:
# Sklearn Pipeline


In [10]:
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer', CountVectorizer()),=
    ('nb', MultinomialNB())
])

"""
>>> pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
>>> # The pipeline can be used as any other estimator
>>> # and avoids leaking the test set into the train set
>>> pipe.fit(X_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
"""

In [11]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [12]:
clf.score(X_test,y_test)

0.9892318736539842

In [13]:
clf.predict(emails) 

array([0, 1])