In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')
df.shape

(5572, 2)

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
df['Spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [6]:
df.head()

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Spam'], test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
y_train.shape

(4457,)

In [16]:
X_train.values

array(["Dont kick coco when he's down",
       "Carlos is taking his sweet time as usual so let me know when you and patty are done/want to smoke and I'll tell him to haul ass",
       "I've got it down to a tea. not sure which flavour", ...,
       'Me not waking up until 4 in the afternoon, sup',
       'Mm you ask him to come its enough :-)',
       'Hey! Congrats 2u2. id luv 2 but ive had 2 go home!'], dtype=object)

In [14]:
type(y_train)

pandas.core.series.Series

In [15]:
type(X_train.values)

numpy.ndarray

### Create bag of words representation using CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7708 sparse matrix of type '<class 'numpy.int64'>'
	with 59550 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [21]:
X_train_cv.shape

(4457, 7708)

In [24]:
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [25]:
v.get_feature_names_out()[1000:1050]

array(['applyed', 'applying', 'appointment', 'appointments', 'appreciate',
       'approaches', 'approaching', 'approve', 'approved', 'approx',
       'apps', 'appt', 'appy', 'april', 'aproach', 'apt', 'aptitude',
       'aquarius', 'ar', 'arab', 'arabian', 'arcade', 'archive', 'ard',
       'are', 'area', 'aren', 'arent', 'arestaurant', 'aretaking',
       'areyouunique', 'argentina', 'argh', 'argue', 'arguing',
       'argument', 'arguments', 'aries', 'arithmetic', 'arm', 'armand',
       'arms', 'arng', 'arngd', 'arnt', 'around', 'arr', 'arrange',
       'arranging', 'arrested'], dtype=object)

In [26]:
dir(v)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',
 '_white_spaces',
 '_word_ngrams',
 'analyzer',
 'binary',
 'build_analyzer',
 'build_preprocessor',
 'build_tokenizer',
 'decode',
 'decode_error',
 'dtype',
 'encoding',
 'fit',

In [27]:
v.vocabulary_

{'dont': 2380,
 'kick': 3891,
 'coco': 1855,
 'when': 7440,
 'he': 3343,
 'down': 2397,
 'carlos': 1639,
 'is': 3720,
 'taking': 6654,
 'his': 3413,
 'sweet': 6613,
 'time': 6862,
 'as': 1059,
 'usual': 7191,
 'so': 6246,
 'let': 4051,
 'me': 4371,
 'know': 3926,
 'you': 7669,
 'and': 942,
 'patty': 5076,
 'are': 1024,
 'done': 2378,
 'want': 7342,
 'to': 6895,
 'smoke': 6221,
 'll': 4128,
 'tell': 6714,
 'him': 3407,
 'haul': 3328,
 'ass': 1078,
 've': 7221,
 'got': 3183,
 'it': 3731,
 'tea': 6690,
 'not': 4789,
 'sure': 6585,
 'which': 7449,
 'flavour': 2870,
 'talk': 6656,
 'ever': 2637,
 'ok': 4880,
 'its': 3738,
 'my': 4629,
 'word': 7551,
 'meant': 4378,
 'an': 938,
 'apology': 991,
 'from': 3003,
 'for': 2922,
 'texting': 6755,
 'get': 3099,
 'drugs': 2434,
 'at': 1089,
 'lt': 4216,
 'gt': 3242,
 'night': 4739,
 'yeah': 7641,
 'right': 5735,
 'bring': 1481,
 'tape': 6670,
 'measure': 4380,
 'fri': 2982,
 'mm': 4495,
 'have': 3332,
 'some': 6259,
 'kanji': 3861,
 'eat': 2479,
 'a

In [28]:
v.get_feature_names_out()[1662]

'caught'

In [30]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
np.where(X_train_np[0] != 0)

(array([1855, 2380, 2397, 3343, 3891, 7440], dtype=int64),)

In [39]:
X_train_np[0][1855]

1

### Train the Naive Bayes Model

In [41]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv, y_train)

MultinomialNB()

In [42]:
X_test_cv = v.transform(X_test)

### Evaluate Performance

In [43]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       958
           1       0.97      0.90      0.93       157

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [44]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_cv = v.transform(emails)
model.predict(emails_cv)

array([0, 1], dtype=int64)

### Train the model using sklearn pipeline and reduce number of lines of code

In [45]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [46]:
y_pred = clf.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       958
           1       0.97      0.90      0.93       157

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

