In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import string

In [2]:
# Load the SMS dataset from spam.csv, decoding the file as latin_1.
df_sms = pd.read_csv('spam.csv',encoding = 'latin_1')
# Preview the first rows of the raw frame (columns v1, v2 and "Unnamed: 2..4").
df_sms.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Cleaning the data: drop the three "Unnamed" columns and give the two
# remaining columns descriptive names.
# errors="ignore" keeps this cell re-runnable: df_sms is reassigned from itself,
# so on a second execution the columns are already gone and a plain drop would
# raise KeyError. rename() already skips keys that are no longer present.
df_sms = (
    df_sms
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "mail"})
)
df_sms.head()

Unnamed: 0,label,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Replacing 'ham' with 0 and 'spam' with 1.
# The encoded column is assigned back to df_sms instead of calling
# replace(..., inplace=True) on df_sms['label']: that form mutates an
# intermediate Series, which pandas 2.x warns about and which no longer
# updates df_sms once Copy-on-Write is enabled.
# Any value other than 'ham'/'spam' (including 0/1 from a previous run of this
# cell) passes through unchanged, so the cell is safe to re-execute.
label_codes = {'ham': 0, 'spam': 1}
df_sms['label'] = df_sms['label'].map(lambda label: label_codes.get(label, label))
df_sms.head()

Unnamed: 0,label,mail
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Convert the cleaned frame (columns: label, mail) to a NumPy array, one row per message.
data = df_sms.to_numpy()

In [6]:
# Column 1 of `data` is the message text, column 0 is the 0/1 label.
X = data[:,1]
y = data[:,0]

In [7]:
# Tokenizing the sentence
from nltk.tokenize import RegexpTokenizer, word_tokenize
# Raw string so the backslash reaches the regex engine as written; in a plain
# string literal "\w" is an invalid escape sequence, which newer Python
# versions flag with a warning.
tk = RegexpTokenizer(r'\w+')
def word_tk(text):
    """Split ``text`` into word tokens.

    Tokens are the runs of word characters matched by the module-level
    ``RegexpTokenizer``; characters between matches (whitespace, punctuation)
    are not part of any token.

    Returns the tokens in order of appearance.
    """
    return tk.tokenize(text)

In [8]:
# Removing stopwords
from nltk.corpus import stopwords
# Build the English stop-word set once; remove_stopwords tests membership in it.
# On an environment where the NLTK 'stopwords' corpus has not been installed,
# the lookup raises LookupError; fetch the corpus and retry so the notebook
# still runs from a fresh setup.
try:
    sw = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    sw = set(stopwords.words('english'))
def remove_stopwords(tokens):
    """Return, in their original order, the tokens that are not in the stop-word set ``sw``."""
    return list(filter(lambda token: token not in sw, tokens))

In [9]:
# Stemming words
from nltk.stem import PorterStemmer
# Single PorterStemmer instance shared by every stem_words call.
ps = PorterStemmer()
def stem_words(tokens):
    """Return a list holding the stem of each token, in the same order as the input."""
    stems = []
    for token in tokens:
        stems.append(ps.stem(token))
    return stems

In [10]:
# Processing the text
def text_processing(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, tokenize it with ``word_tk``, drop
    stop words with ``remove_stopwords``, stem the survivors with
    ``stem_words`` and join the stems with single spaces.
    """
    tokens = word_tk(text.lower())
    stems = stem_words(remove_stopwords(tokens))
    return ' '.join(stems)

In [11]:
# Getting the cleaned document
def clean_doc(X):
    """Run ``text_processing`` on every item of ``X`` and return the results as a list."""
    return [text_processing(message) for message in X]

In [12]:
# Clean every message in X; `doc` is the list of processed strings.
doc = clean_doc(X)

In [13]:
# Show the first five cleaned messages as a sanity check.
doc[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [14]:
# Creating a vocabulary of words
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the cleaned messages and build the per-message
# word-count matrix in one step.
# NOTE(review): the vocabulary is fitted on every message, before the
# train/test split further down — consider fitting on the training part only.
cv = CountVectorizer()
bag_of_words = cv.fit_transform(doc)

In [15]:
# Rebind X from the raw message text to the array form of the count matrix.
# NOTE(review): presumably bag_of_words is a SciPy sparse matrix; if so,
# densifying it may be unnecessary for the steps below — confirm before changing.
X = bag_of_words.toarray()

In [16]:
# Cast the label column taken from `data` to a NumPy integer array.
y = np.asarray(y, dtype = "int")

In [17]:
# Splitting the data
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state = 0 pins the split so it
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [18]:
# Classifying the mail into spam or ham using Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model (default settings) on the training counts.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [19]:
# Score the fitted model on the held-out test rows.
classifier.score(X_test, y_test)

0.9847533632286996