## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel('spam.xlsx', header=None, names=['class', 'sms'])

In [3]:
df.head()

Unnamed: 0,class,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['class'].value_counts()

ham     3622
spam     553
Name: class, dtype: int64

In [5]:
print('Percentage of Spam = ', round((553/3622)*100,2))

Percentage of Spam =  15.27


In [6]:
df['label'] = df['class'].map({'ham':0, 'spam':1})

In [7]:
df.head()

Unnamed: 0,class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
x = df['sms']
y = df['label']

In [9]:
print(x.shape)
print(y.shape)

(4175,)
(4175,)


### Splitting dataset into train and test 

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 1)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = 'english')

In [12]:
cv.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
cv.vocabulary_

{'congrats': 1598,
 'nokia': 3867,
 '3650': 365,
 'video': 5864,
 'camera': 1321,
 'phone': 4149,
 '09066382422': 183,
 'calls': 1315,
 'cost': 1645,
 '150ppm': 243,
 'ave': 931,
 '3mins': 378,
 'vary': 5843,
 'mobiles': 3669,
 '16': 249,
 'close': 1507,
 '300603': 347,
 'post': 4277,
 'bcm4284': 1016,
 'ldn': 3267,
 'wc1n3xx': 5971,
 'sorry': 5092,
 'uncle': 5750,
 'll': 3357,
 'touch': 5635,
 'right': 4645,
 'brah': 1187,
 'later': 3248,
 'sms': 5041,
 'ac': 633,
 'blind': 1118,
 'date': 1758,
 '4u': 435,
 'rodds1': 4671,
 '21': 283,
 'aberdeen': 618,
 'united': 5769,
 'kingdom': 3183,
 'check': 1431,
 'http': 2853,
 'img': 2918,
 'icmb3cktz8r7': 2894,
 'dates': 1759,
 'send': 4838,
 'hide': 2768,
 'urgent': 5800,
 'mobile': 3668,
 '077xxx': 28,
 'won': 6090,
 '000': 1,
 'bonus': 1145,
 'caller': 1309,
 'prize': 4352,
 '02': 7,
 '06': 17,
 '03': 12,
 '2nd': 331,
 'attempt': 906,
 'reach': 4488,
 '09066362206': 176,
 'asap': 873,
 'box97n7qp': 1181,
 'hi': 2766,
 'hope': 2818,
 'txt':

In [14]:
xtrt = cv.transform(x_train)
xtst = cv.transform(x_test)

In [15]:
print(type(xtrt))
print(xtrt)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 183)	1
  (0, 243)	1
  (0, 249)	1
  (0, 347)	1
  (0, 365)	1
  (0, 378)	1
  (0, 931)	1
  (0, 1016)	1
  (0, 1315)	1
  (0, 1321)	1
  (0, 1507)	1
  (0, 1598)	1
  (0, 1645)	1
  (0, 3267)	1
  (0, 3669)	1
  (0, 3867)	1
  (0, 4149)	1
  (0, 4277)	1
  (0, 5843)	1
  (0, 5864)	1
  (0, 5971)	1
  (1, 3357)	1
  (1, 5092)	1
  (1, 5635)	1
  (1, 5750)	1
  :	:
  (3128, 5266)	1
  (3129, 1271)	1
  (3129, 1713)	1
  (3129, 5226)	1
  (3129, 5474)	1
  (3129, 5969)	1
  (3129, 6105)	1
  (3130, 205)	1
  (3130, 252)	1
  (3130, 293)	1
  (3130, 709)	1
  (3130, 1758)	1
  (3130, 2339)	1
  (3130, 3091)	1
  (3130, 3121)	1
  (3130, 3575)	1
  (3130, 3727)	1
  (3130, 4114)	1
  (3130, 4514)	1
  (3130, 4587)	1
  (3130, 4740)	1
  (3130, 4868)	1
  (3130, 5474)	1
  (3130, 5517)	1
  (3130, 5586)	1


## Using Multinomial Naive Bayes Classifier Model

In [16]:
from sklearn.naive_bayes import MultinomialNB
m = MultinomialNB()

In [17]:
m.fit(xtrt, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
pred_class = m.predict(xtst)

In [19]:
pred_class

array([0, 0, 0, ..., 1, 0, 0])

In [20]:
pred_prob = m.predict_proba(xtst)

In [21]:
pred_prob

array([[9.99950591e-01, 4.94090549e-05],
       [9.99997452e-01, 2.54772581e-06],
       [9.99404553e-01, 5.95447172e-04],
       ...,
       [3.22395350e-01, 6.77604650e-01],
       [8.69370808e-01, 1.30629192e-01],
       [9.99991367e-01, 8.63252561e-06]])

### Checking for Accuracy Score

In [22]:
from sklearn import metrics

In [23]:
metrics.accuracy_score(y_test, pred_class)

0.9818007662835249

In [24]:
metrics.confusion_matrix(y_test, pred_class)

array([[891,   9],
       [ 10, 134]])

In [25]:
print("Precision",metrics.precision_score(y_test, pred_class))
print("PRECISION SCORE :",metrics.precision_score(y_test, pred_class))
print("RECALL SCORE :", metrics.recall_score(y_test, pred_class))
print("F1 SCORE :",metrics.f1_score(y_test, pred_class))

Precision 0.9370629370629371
PRECISION SCORE : 0.9370629370629371
RECALL SCORE : 0.9305555555555556
F1 SCORE : 0.9337979094076655


### Building a small prediction function

In [28]:
def prediction(s):
    tranforming = cv.transform(s)
    pred_class_p = m.predict(tranforming)
    pred_prob_p = m.predict_proba(tranforming)
    if pred_prob_p[0,0] < pred_prob_p[0,1]:
        return 'Spam Message'
    else:
        return 'Not Spam Message'
   # return pred_class_p,pred_prob_p[0,1]

### User input for classification

In [31]:
text = input("Enter your message : ").split('\n') 

Enter your message : happy birthday


In [32]:
prediction(text)

'Not Spam Message'