In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [3]:
email_data = pd.read_csv("sms_raw_NB.csv", encoding ='ISO-8859-1')
email_data

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [4]:
# cleaning the data
import re
stopwords = []



In [5]:
def cleaning_text(i):
    i = re.sub("[^A-Za-z" "]+", " ",i).lower()
    i = re.sub("[0-9" "]+", " ",i)

    w = []
    for word in i.split():
        if len(word)>=3:
            w.append(word)
    return(''.join(w))

In [6]:
# testing above function with sample text => removes punctuations, numbers
cleaning_text("Hope you are having a good week. Just checking in")
cleaning_text("hope i can understand your feelings 123121. 123 hi how .. are you?")
cleaning_text("Hi how are you, I am good")

'howareyougood'

In [7]:
def split_into_words(i):
    return [word for word in i.split(" ")]
split_into_words("Hi how are you, I am good")

['Hi', 'how', 'are', 'you,', 'I', 'am', 'good']

In [8]:
email_data.text = email_data.text.apply(cleaning_text)

In [9]:
# removing wmpty rows text data

email_data = email_data.loc[email_data.text != " ",:]



# countvectorizer

# convert a collection of text documents to a matrix of token counts

# split the data into train and test

In [10]:
from sklearn.model_selection import train_test_split

In [43]:
email_train, email_test = train_test_split(email_data, test_size = 0.2)
email_train.columns

Index(['type', 'text'], dtype='object')

In [12]:
# creating a matrix of tokens count for the entire text data
def split_words(i):
    return [word for word in i.split(" ")]

In [13]:
# Defining the preparation of email texts into word count matrix format - Bag of Words
emails_bow = CountVectorizer(analyzer = split_words).fit(email_data.text)

In [14]:
emails_bow

CountVectorizer(analyzer=<function split_words at 0x000002002486AA60>)

In [15]:
# defining BOW for all emails

all_emails_data = emails_bow.transform(email_data.text)

In [16]:
all_emails_data

<5559x5040 sparse matrix of type '<class 'numpy.int64'>'
	with 5559 stored elements in Compressed Sparse Row format>

In [17]:
# creating BOW for train and test data
train_bow = emails_bow.transform(email_train.text)

In [18]:
test_bow = emails_bow.transform(email_test.text)

In [19]:
# Learning Term weighting and normalizing on entire emails
tfidf_transformer  = TfidfTransformer().fit(all_emails_data)


In [20]:
train_tfidf = tfidf_transformer.transform(train_bow)
train_tfidf.shape

(4447, 5040)

In [21]:
test_tfidf = tfidf_transformer.transform(test_bow)
test_tfidf.shape

(1112, 5040)

In [22]:
from sklearn.naive_bayes import MultinomialNB as MB

In [23]:
classifier_MB = MB()

In [24]:
classifier_MB.fit(train_tfidf, email_train.type)
email_train.type

2632     ham
3181     ham
427      ham
5209     ham
3237     ham
        ... 
1692     ham
4920     ham
4055    spam
2751     ham
5317     ham
Name: type, Length: 4447, dtype: object

In [25]:
# Evaluation on Test Data
test_pred_m = classifier_MB.predict(test_tfidf) 
test_pred_m

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [26]:
accuracy_test = np.mean(test_pred_m == email_test.type)

In [27]:
accuracy_test

0.8615107913669064

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(test_pred_m, email_test.type) 



0.8615107913669064

In [29]:
pd.crosstab(test_pred_m, email_test.type)


type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,955,154
spam,0,3


In [30]:
train_pred = classifier_MB.predict(train_tfidf)

In [31]:
train_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [32]:
accuracy_train = np.mean(train_pred == email_train.type)

In [33]:
accuracy_train

0.8727231841691028

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
accuracy_score(train_pred, email_train.type)

0.8727231841691028

In [36]:
pd.crosstab(train_pred, email_train.type)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3857,566
spam,0,24


In [37]:
# Multinomial Naive Bayes changing default alpha for laplace smoothing
# if alpha = 0 then no smoothing is applied and the default alpha parameter is 1
# the smoothing process mainly solves the emergence of zero probability problem in the dataset.



In [38]:
from sklearn.naive_bayes import MultinomialNB as MB
classifier_mb  = MB(alpha = .2)

In [39]:
classifier_mb.fit(train_tfidf, email_train.type)
test_pred_lap = classifier_mb.predict(test_tfidf)
accuracy_score(test_pred_lap, email_test.type)


0.914568345323741

In [40]:
pd.crosstab(test_pred_lap, email_test.type)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,955,95
spam,0,62


In [41]:
classifier_mb.fit(train_tfidf, email_train.type)
train_pred_lap = classifier_mb.predict(train_tfidf)
accuracy_score(train_pred_lap, email_train.type)

0.9997751293006522

In [42]:
pd.crosstab(train_pred_lap, email_train.type)

type,ham,spam
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3857,1
spam,0,589
