In [1]:
## Import necessary libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
## Load the spam dataset from CSV, decoding the file as latin-1
df = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

In [3]:
## Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
## Give the two useful columns descriptive names: v1 -> class, v2 -> messages
df = df.rename(columns=dict(v1='class', v2='messages'))

In [5]:
## Drop the unnecessary columns
# Rebind instead of mutating in place, and tolerate already-missing columns,
# so this cell can be re-run without raising KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [6]:
## Confirm that only the class and messages columns remain
df.head(n=5)

Unnamed: 0,class,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
## Shape of the data as a (rows, columns) tuple
df.shape

(5572, 2)

In [8]:
## Encode the text class as a numeric label: ham -> 0, spam -> 1
df['label'] = df['class'].map(dict(ham=0, spam=1))

In [9]:
## Separate the raw message text (X) from the numeric target (y)
X, y = df['messages'], df['label']

In [10]:
## Extract features with CountVectorizer (constructed with no arguments, i.e. its default settings)
cv = CountVectorizer()

In [11]:
## Fit the vectorizer on the message text and build the count matrix.
# Read from df['messages'] rather than from X itself: the original
# `X = cv.fit_transform(X)` overwrote its own input, so re-running the cell
# fed the sparse matrix back into the vectorizer and failed.
# NOTE(review): the vocabulary is learned from every row before the
# train/test split further down, so held-out messages influence the feature
# space; consider fitting on the training messages only.
X = cv.fit_transform(df['messages'])

In [12]:
## Display the vectorized feature matrix
X

<5572x8672 sparse matrix of type '<class 'numpy.int64'>'
	with 73916 stored elements in Compressed Sparse Row format>

In [13]:
## Persist the fitted vectorizer so it can be reloaded later.
# The context manager guarantees the file is flushed and closed; the original
# passed a bare open() handle that was never closed.
with open('transform.pkl', 'wb') as transform_file:
    pickle.dump(cv, transform_file)

In [14]:
from sklearn.model_selection import train_test_split

## Hold out 25% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [15]:
## Modelling
from sklearn.metrics import confusion_matrix
def MultinomialNB_classifier(X_train,X_test,y_train,y_test):
    """Fit a MultinomialNB model and summarise its train and test scores.

    Parameters
    ----------
    X_train, y_train : features and labels the model is fitted on.
    X_test, y_test : features and labels used only for the test score.

    Returns
    -------
    str
        Two lines of text: ``Train Score:<value>`` and `` Test Score:<value>``,
        where each value is whatever the estimator's ``score`` method returns.
    """
    classifier_multinomialnb = MultinomialNB()
    classifier_multinomialnb.fit(X_train,y_train)
    # The original also computed predictions and a confusion matrix here but
    # never used them; that dead work has been removed.
    train_score = classifier_multinomialnb.score(X_train,y_train)
    test_score = classifier_multinomialnb.score(X_test,y_test)
    return f'Train Score:{train_score}\n Test Score:{test_score}'

In [16]:
def print_score(X_train, X_test, y_train, y_test):
    """Print a "MultinomialNB:" heading followed by the score summary string.

    The four arguments are forwarded unchanged to MultinomialNB_classifier,
    and whatever text it returns is printed. Nothing is returned.
    """
    print("MultinomialNB:\n")
    print(MultinomialNB_classifier(X_train, X_test, y_train, y_test))

In [17]:
## Report train and test scores for the Naive Bayes model
print_score(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

MultinomialNB:

Train Score:0.9930605407992342
 Test Score:0.9849246231155779


In [18]:
## Performance Metrics
# Fit the model at notebook scope so the estimator and its predictions are
# available to the cells that follow.
classifier_multinomialnb = MultinomialNB().fit(X_train, y_train)
y_pred = classifier_multinomialnb.predict(X_test)
# Confusion matrix of true labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1185,   11],
       [  10,  187]], dtype=int64)

In [19]:
## Same confusion table as a labelled cross-tabulation with row/column totals
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['True'],
    colnames=['Predicted'],
    margins=True,
)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1185,11,1196
1,10,187,197
All,1195,198,1393


In [20]:
## Classification Report (Accuracy, Precision, Recall and F1 Score)
from sklearn.metrics import roc_auc_score,roc_curve,classification_report

In [21]:
## Per-class precision, recall and F1 on the held-out set
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1196
           1       0.94      0.95      0.95       197

    accuracy                           0.98      1393
   macro avg       0.97      0.97      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [22]:
## Save the trained classifier to disk
filename = 'nlp_model.pkl'
# The context manager guarantees the file is flushed and closed; the original
# passed a bare open() handle that was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier_multinomialnb, model_file)