In [1]:
import pandas as pd
messages = pd.read_csv('./sample_files/spam_classification/spam.csv', sep=',', names=['label', 'message'])

In [2]:
messages

Unnamed: 0,label,message
0,label,text
1,spam,Congratulations! You have been selected to rec...
2,spam,URGENT: Your account will be suspended unless ...
3,spam,Buy cheap prescription drugs online — no presc...
4,spam,You won a free cruise! Call 1-800-FREE-CRUISE ...
5,spam,Lowest mortgage rates available — refinance to...
6,spam,Work from home and earn $200/day — no experien...
7,spam,Get a refund of your taxes now. Provide your b...
8,spam,"This is not spam. Transfer $5,000 to confirm i..."
9,spam,Exclusive offer: 90% off designer handbags for...


In [3]:
## Data Cleaning and Preprocessing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordlematize = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/naveen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [wordlematize.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
print(corpus)

['text', 'congratulation selected receive gift card click claim', 'urgent account suspended unless verify information within hour verify', 'buy cheap prescription drug online prescription required limited stock order today', 'free cruise call free cruise claim prize', 'lowest mortgage rate available refinance today save mo apply', 'work home earn day experience necessary sign instant access', 'get refund tax provide bank detail release fund', 'spam transfer confirm identity receive back', 'exclusive offer designer handbag hour shop', 'act fast invoice unpaid service disconnected view invoice link', 'congratulation device infected click link download removal tool', 'limited time loan approval credit check pre approve minute', 'earn bitcoin completing short survey payout guaranteed join', 'hot single area waiting see viewed profile', 'free iphone giveaway respondent survey limited winner enter', 'get access premium streaming library month unsubscribe anytime', 'unlock exclusive crypto si

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf = TfidfVectorizer(max_features=250)
X = tfidf.fit_transform(corpus).toarray()

In [12]:
X

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.28, 0.  , ..., 0.31, 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]], shape=(41, 244))

In [13]:
#N grams
tfidf_ngram = TfidfVectorizer(ngram_range=(1,3), max_features=250)
x_ngram = tfidf_ngram.fit_transform(corpus).toarray()

In [15]:
tfidf_ngram.vocabulary_

{'congratulation': np.int64(9),
 'receive': np.int64(144),
 'card': np.int64(5),
 'click': np.int64(7),
 'claim': np.int64(6),
 'receive gift': np.int64(146),
 'receive gift card': np.int64(147),
 'account': np.int64(1),
 'suspended': np.int64(213),
 'verify': np.int64(224),
 'within': np.int64(243),
 'hour': np.int64(21),
 'suspended unless': np.int64(214),
 'verify information': np.int64(225),
 'within hour': np.int64(244),
 'urgent account suspended': np.int64(223),
 'suspended unless verify': np.int64(215),
 'verify information within': np.int64(226),
 'within hour verify': np.int64(245),
 'prescription': np.int64(112),
 'online': np.int64(65),
 'limited': np.int64(24),
 'stock': np.int64(189),
 'order': np.int64(68),
 'today': np.int64(220),
 'prescription drug': np.int64(113),
 'online prescription': np.int64(66),
 'prescription required': np.int64(115),
 'stock order': np.int64(190),
 'order today': np.int64(71),
 'prescription drug online': np.int64(114),
 'online prescription 

In [16]:
x_ngram

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.24, 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.3 , ..., 0.  , 0.  , 0.  ]], shape=(41, 250))

In [17]:
x_ngram[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.