In [6]:
import pandas as pd
messages = pd.read_csv('./sample_files/spam_classification/spam.csv', sep=',', names=['label', 'message'])

In [7]:
messages

Unnamed: 0,label,message
0,label,text
1,spam,Congratulations! You have been selected to rec...
2,spam,URGENT: Your account will be suspended unless ...
3,spam,Buy cheap prescription drugs online — no presc...
4,spam,You won a free cruise! Call 1-800-FREE-CRUISE ...
5,spam,Lowest mortgage rates available — refinance to...
6,spam,Work from home and earn $200/day — no experien...
7,spam,Get a refund of your taxes now. Provide your b...
8,spam,"This is not spam. Transfer $5,000 to confirm i..."
9,spam,Exclusive offer: 90% off designer handbags for...


In [8]:
## Data Cleaning and Preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/naveen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [10]:
print(corpus)

['text', 'congratul select receiv gift card click claim', 'urgent account suspend unless verifi inform within hour verifi', 'buy cheap prescript drug onlin prescript requir limit stock order today', 'free cruis call free cruis claim prize', 'lowest mortgag rate avail refin today save mo appli', 'work home earn day experi necessari sign instant access', 'get refund tax provid bank detail releas fund', 'spam transfer confirm ident receiv back', 'exclus offer design handbag hour shop', 'act fast invoic unpaid servic disconnect view invoic link', 'congratul devic infect click link download remov tool', 'limit time loan approv credit check pre approv minut', 'earn bitcoin complet short survey payout guarante join', 'hot singl area wait see view profil', 'free iphon giveaway respond survey limit winner enter', 'get access premium stream librari month unsubscrib anytim', 'unlock exclus crypto signal doubl portfolio day subscrib', 'student loan elig forgiv click see qualifi', 'privat video fou

In [46]:
## Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=250, binary=True)
X = cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'text': np.int64(209),
 'congratul': np.int64(39),
 'select': np.int64(181),
 'receiv': np.int64(160),
 'gift': np.int64(80),
 'card': np.int64(25),
 'click': np.int64(31),
 'claim': np.int64(30),
 'urgent': np.int64(222),
 'account': np.int64(1),
 'suspend': np.int64(205),
 'unless': np.int64(217),
 'verifi': np.int64(223),
 'inform': np.int64(97),
 'within': np.int64(231),
 'hour': np.int64(94),
 'buy': np.int64(22),
 'cheap': np.int64(28),
 'prescript': np.int64(148),
 'drug': np.int64(58),
 'onlin': np.int64(133),
 'requir': np.int64(170),
 'limit': np.int64(110),
 'stock': np.int64(198),
 'order': np.int64(134),
 'today': np.int64(212),
 'free': np.int64(74),
 'cruis': np.int64(43),
 'call': np.int64(24),
 'prize': np.int64(150),
 'lowest': np.int64(114),
 'mortgag': np.int64(125),
 'rate': np.int64(158),
 'avail': np.int64(14),
 'refin': np.int64(161),
 'save': np.int64(179),
 'mo': np.int64(122),
 'appli': np.int64(7),
 'work': np.int64(232),
 'home': np.int64(91),
 'earn': np.

In [47]:
X.shape
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(41, 234))

In [48]:
# N grams
cv = CountVectorizer(ngram_range=(1,2), max_features=500, binary=True)
Y = cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'text': np.int64(441),
 'congratul': np.int64(82),
 'select': np.int64(386),
 'receiv': np.int64(335),
 'gift': np.int64(169),
 'card': np.int64(49),
 'click': np.int64(61),
 'claim': np.int64(59),
 'congratul select': np.int64(85),
 'select receiv': np.int64(387),
 'receiv gift': np.int64(338),
 'gift card': np.int64(170),
 'card click': np.int64(50),
 'click claim': np.int64(62),
 'urgent': np.int64(467),
 'account': np.int64(2),
 'suspend': np.int64(434),
 'unless': np.int64(457),
 'verifi': np.int64(469),
 'inform': np.int64(204),
 'within': np.int64(487),
 'hour': np.int64(197),
 'urgent account': np.int64(468),
 'account suspend': np.int64(4),
 'suspend unless': np.int64(435),
 'unless verifi': np.int64(458),
 'verifi inform': np.int64(470),
 'inform within': np.int64(205),
 'within hour': np.int64(488),
 'hour verifi': np.int64(199),
 'buy': np.int64(42),
 'cheap': np.int64(55),
 'prescript': np.int64(313),
 'drug': np.int64(126),
 'onlin': np.int64(281),
 'requir': np.int64(36

In [49]:
Y.shape

(41, 493)

In [43]:
Y[1]

array([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [51]:
cv = CountVectorizer(ngram_range=(1,3), max_features=1000, binary=True)
trigram = cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'text': np.int64(641),
 'congratul': np.int64(117),
 'select': np.int64(561),
 'receiv': np.int64(490),
 'gift': np.int64(250),
 'card': np.int64(71),
 'click': np.int64(86),
 'claim': np.int64(84),
 'congratul select': np.int64(122),
 'select receiv': np.int64(562),
 'receiv gift': np.int64(494),
 'gift card': np.int64(251),
 'card click': np.int64(72),
 'click claim': np.int64(87),
 'congratul select receiv': np.int64(123),
 'select receiv gift': np.int64(563),
 'receiv gift card': np.int64(495),
 'gift card click': np.int64(252),
 'card click claim': np.int64(73),
 'urgent': np.int64(678),
 'account': np.int64(3),
 'suspend': np.int64(631),
 'unless': np.int64(664),
 'verifi': np.int64(681),
 'inform': np.int64(299),
 'within': np.int64(706),
 'hour': np.int64(290),
 'urgent account': np.int64(679),
 'account suspend': np.int64(6),
 'suspend unless': np.int64(632),
 'unless verifi': np.int64(665),
 'verifi inform': np.int64(682),
 'inform within': np.int64(300),
 'within hour': np.

In [52]:
trigram.shape

(41, 715)