In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [7]:
email_data = pd.read_csv("sms_raw_NB.csv", encoding = 'ISO-8859-1')
email_data

# loding the data

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or å£10,000..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [10]:
# cleaning the data

import re

def cleaning_text(i):
    i = re.sub("[^A-Za-z" "]+"," ",i).lower()
    i = re.sub("[0-9""]"," ",i)
    w = []
    for word in i.split(" "):
        if len(word)>3:
            w.append(word)
    return(" ".join(w))

In [12]:
# apply the function to the above dataset, text column

email_data.text = email_data.text.apply(cleaning_text)
email_data

Unnamed: 0,type,text
0,ham,hope having good week just checking
1,ham,give back thanks
2,ham,also doing only have
3,spam,complimentary star ibiza holiday cash needs yo...
4,spam,okmail dear dave this your final notice collec...
...,...,...
5554,ham,great role model giving much really wish each ...
5555,ham,awesome remember last time somebody high first...
5556,spam,your prize will another customer polo suite lo...
5557,spam,jsco energy high know where channel leadership...


In [13]:
# removing the empty rows

email_data = email_data.loc[email_data.text != " ",:]
email_data

Unnamed: 0,type,text
0,ham,hope having good week just checking
1,ham,give back thanks
2,ham,also doing only have
3,spam,complimentary star ibiza holiday cash needs yo...
4,spam,okmail dear dave this your final notice collec...
...,...,...
5554,ham,great role model giving much really wish each ...
5555,ham,awesome remember last time somebody high first...
5556,spam,your prize will another customer polo suite lo...
5557,spam,jsco energy high know where channel leadership...


In [19]:
# split the data into train and test data

# 80 % in train data, 20% in test data
from sklearn.model_selection import train_test_split

email_train, email_test = train_test_split(email_data, test_size = 0.2)
email_train.shape, email_test.shape

((4447, 2), (1112, 2))

In [21]:
# creating a matrix of token counts for the entire text document 


def split_into_words(i):
    return [word for word in i.split(" ")]

In [23]:
# Defining the preparation of email texts into word count matrix format - Bag of Words

emails_bow = CountVectorizer( analyzer = split_into_words).fit(email_data.text)

In [25]:
# all emails matrix 

all_email_matrix = emails_bow.transform(email_data.text)
all_email_matrix

<5559x6661 sparse matrix of type '<class 'numpy.int64'>'
	with 40974 stored elements in Compressed Sparse Row format>

In [39]:
# fro train messages

train_msg_matrix = emails_bow.transform(email_train.text)
train_msg_matrix

<4447x6661 sparse matrix of type '<class 'numpy.int64'>'
	with 32513 stored elements in Compressed Sparse Row format>

In [40]:
test_msg_matrix = emails_bow.transform(email_test.text)
test_msg_matrix

<1112x6661 sparse matrix of type '<class 'numpy.int64'>'
	with 8461 stored elements in Compressed Sparse Row format>

# prepare the tdidf vector for all


In [41]:
# Learning Term weighting and normalizing on entire emails

tfidf_transformer = TfidfTransformer().fit(all_email_matrix)
tfidf_transformer

TfidfTransformer()

In [42]:
# preparing tfidf for train data

train_tfidf = tfidf_transformer.transform(train_msg_matrix)
train_tfidf.shape


(4447, 6661)

In [43]:
# preparing tfidf for test data

test_tfidf = tfidf_transformer.transform(test_msg_matrix)
test_tfidf.shape

(1112, 6661)