#Importing the Libraries

In [1]:
import numpy as np
import pandas as pd

#Loading the Dataset

In [2]:
dataset = pd.read_csv('spam.csv', encoding='latin-1')

#Data Preparation and Preprocessing

In [3]:
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
dataset.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [5]:
dataset.duplicated().sum()

np.int64(415)

In [6]:
dataset.drop_duplicates(inplace=True)

In [7]:
label_map = {
    'spam' : 1,
    'ham' : 0
}

In [8]:
dataset['Category'] = dataset['Category'].replace(label_map).infer_objects(copy=False)

  dataset['Category'] = dataset['Category'].replace(label_map).infer_objects(copy=False)


In [9]:
dataset.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Creating a function to Remove special characters from the data
def rem_special_chars(text):
    new_text = ""
    for i in text:
        if i.isalnum() or i == " ":
            new_text += i
    return new_text.strip()

In [11]:
# Removing special characters from the data
dataset['Message'] = dataset['Message'].apply(rem_special_chars)

In [12]:
dataset.head()

Unnamed: 0,Category,Message
0,0,Go until jurong point crazy Available only in ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor U c already then say
4,0,Nah I dont think he goes to usf he lives aroun...


In [13]:
# Convert all to small alphabet letters
dataset['Message'] = dataset['Message'].apply(lambda x:x.lower())

In [14]:
dataset.head()

Unnamed: 0,Category,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [18]:
X = dataset['Message']
y = dataset['Category']

#Applying CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer(stop_words='english',max_features=10000)

In [21]:
X = cv.fit_transform(dataset['Message']).toarray()

In [22]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
y = dataset['Category']

In [26]:
y.values

array([0, 0, 1, ..., 0, 0, 0])

#Splitting the Dataset to Training an Test Set

In [27]:
# Splitting the data into training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [28]:
print(X_train.shape)
print(X_test.shape)
print(y_test.shape)
print(y_train.shape)

(4125, 9338)
(1032, 9338)
(1032,)
(4125,)


#Applying Naive Bayes Model

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
clf = MultinomialNB()

In [31]:
clf.fit(X_train, y_train)

In [32]:
y_pred = clf.predict(X_test)

#Computing the Accuracy

In [34]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)

In [35]:
accuracy

0.9767441860465116

In [37]:
# Length of the data that we have used for training our model
len(cv.get_feature_names_out())

9338

In [39]:
# Length of stopwords(eg: a, an, the, etc.)
len(cv.get_stop_words())

318

# Post Training

In [40]:
import pickle

In [42]:
pickle.dump(cv,open('cv.pkl','wb'))
pickle.dump(clf,open('clf.pkl','wb'))