## Email Spam Detection

In [1]:
import pandas as pd

In [4]:
# Load the labelled messages. read_csv is called with its defaults, so the file is
# parsed as comma-separated text even though its extension is .txt.
df = pd.read_csv('spam.txt')
# Preview the first rows: a Category label column and a Message text column.
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
# Summarise Message per class (count / unique / top / freq) to check class balance
# and how many messages are duplicated within each class.
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [7]:
# Encode the label as a binary target: 1 where Category is 'spam', 0 otherwise.
# A vectorised comparison replaces the row-by-row Python lambda; the explicit
# 'int64' cast keeps the same integer dtype the lambda version produced.
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head(10)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


### Splitting Dataset

In [8]:
from sklearn.model_selection import train_test_split

In [45]:
# Hold out part of the data for evaluation. test_size=0.25 is scikit-learn's default,
# spelled out here for the reader; random_state=1 makes the shuffle reproducible.
# NOTE(review): the classes are imbalanced and the split is not stratified - consider
# whether stratify=df['spam'] is wanted.
x_train, x_test, y_train, y_test = train_test_split(
    df['Message'],
    df['spam'],
    test_size=0.25,
    random_state=1,
)

In [46]:
# Peek at the raw training messages; the index is no longer sequential because
# train_test_split shuffles rows before splitting.
x_train.head(10)

710     4mths half price Orange line rental & latest c...
3740                           Did you stitch his trouser
2711    Hope you enjoyed your new content. text stop t...
3155    Not heard from U4 a while. Call 4 rude chat pr...
3748    Ü neva tell me how i noe... I'm not at home in...
2389    wiskey Brandy Rum Gin Beer Vodka Scotch Shampa...
3464    i am seeking a lady in the street and a freak ...
772     Lol! U drunkard! Just doing my hair at d momen...
3667    I'm turning off my phone. My moms telling ever...
4955    U coming back 4 dinner rite? Dad ask me so i r...
Name: Message, dtype: object

#### The Message column is still raw text, so it must be converted to numeric features (word counts) before a model can be trained

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
# Bag-of-words vectorizer with default settings: each message becomes a vector of
# token counts over the learned vocabulary.
v = CountVectorizer()

In [49]:
# Learn the vocabulary from the training messages only, then encode them as a
# sparse document-term matrix (rows = messages, columns = vocabulary terms).
x_train_count = v.fit_transform(x_train.values)
x_train_count

<4179x7453 sparse matrix of type '<class 'numpy.int64'>'
	with 55159 stored elements in Compressed Sparse Row format>

In [50]:
# Show the first three encoded rows. Slice while the matrix is still sparse and
# densify only those rows; calling .toarray() first would materialise the entire
# dense matrix in memory just to display three rows of it.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Naive Bayes has three commonly used classifier variants

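1. Gaussian Naive Bayes - assumes continuous features that follow a normal (bell-curve) distribution
2. Bernoulli Naive Bayes - assumes features are binary (e.g. a word is present or absent)
3. Multinomial Naive Bayes - assumes features are discrete counts (e.g. how often each word occurs)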

## Model

* Multinomial NB

In [51]:
from sklearn.naive_bayes import MultinomialNB

In [52]:
# Multinomial Naive Bayes suits the word-count features produced by CountVectorizer.
model = MultinomialNB()
# Train on the sparse count matrix and the 0/1 spam labels.
model.fit(x_train_count, y_train)

MultinomialNB()

In [53]:
# Sanity check on two hand-written examples: the first reads like a normal message,
# the second like a promotional one.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
# Encode with the already-fitted vectorizer (transform, not fit_transform) so the
# columns line up with the vocabulary the model was trained on.
emails_count = v.transform(emails)
# Predicted labels: 1 = spam, 0 = ham.
model.predict(emails_count)

array([0, 1])

In [55]:
# Display the held-out messages (pandas abbreviates the repr of a long Series).
x_test

1078                         Yep, by the pretty sculpture
4028        Yes, princess. Are you going to make me moan?
958                            Welp apparently he retired
4642                                              Havent.
4674    I forgot 2 ask ü all smth.. There's a card on ...
                              ...                        
3207                                        At home also.
4655                     Hope you are having a great day.
1140    Message:some text missing* Sender:Name Missing...
1793    WIN: We have a winner! Mr. T. Foley won an iPo...
1710    U meet other fren dun wan meet me ah... Muz b ...
Name: Message, Length: 1393, dtype: object

In [60]:
# Encode the test messages with the vocabulary fitted on the training set; only
# transform is called, so nothing is re-learned from the test data.
x_test_count = v.transform(x_test)
x_test_count

<1393x7453 sparse matrix of type '<class 'numpy.int64'>'
	with 17584 stored elements in Compressed Sparse Row format>

In [63]:
# Predicted labels for the first ten test messages (1 = spam, 0 = ham).
model.predict(x_test_count[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [64]:
# True labels for the same ten messages, for a side-by-side comparison with the predictions.
y_test[:10]

1078    0
4028    0
958     0
4642    0
4674    0
5461    0
4210    0
4216    0
1603    0
1504    0
Name: spam, dtype: int64

In [65]:
# Mean accuracy on the held-out test set.
# NOTE(review): the describe() summary shows far more ham than spam, so accuracy
# alone may look optimistic - consider also checking precision/recall for spam.
model.score(x_test_count, y_test)

0.9877961234745154

## Using Sklearn Pipeline

In [66]:
from sklearn.pipeline import Pipeline

In [67]:
# Chain vectorisation and classification into a single estimator so that raw text
# can be handed directly to fit / predict / score.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [68]:
# Fit both steps on the raw training text: the vectorizer learns the vocabulary,
# then the classifier is trained on the resulting counts.
clf.fit(x_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [71]:
# The pipeline accepts raw text directly; no separate transform call is needed.
clf.predict(x_test[:10])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [72]:
# True labels for the same ten messages, to compare with the pipeline predictions.
y_test[:10]

1078    0
4028    0
958     0
4642    0
4674    0
5461    0
4210    0
4216    0
1603    0
1504    0
Name: spam, dtype: int64

In [73]:
# Mean accuracy of the pipeline on the raw test text; it should match the manual
# vectorizer + model workflow because both use the same components and training data.
clf.score(x_test, y_test)

0.9877961234745154

In [74]:
# Classify the two hand-written examples straight from their raw strings.
clf.predict(emails)

array([0, 1])