# Read data

In [1]:
# Read the whole SMS Spam Collection file into one string.
# The context manager closes the file handle as soon as the read finishes,
# rather than leaving it open for the lifetime of the kernel.
with open("SMSSpamCollection") as sms_file:
    raw_data = sms_file.read()


# Print data
raw_data[0:500]

"ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\nham\tOk lar... Joking wif u oni...\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tU dun say so early hor... U c already then say...\nham\tNah I don't think he goes to usf, he lives around here though\nspam\tFreeMsg Hey there darling it's been 3 week's now and no word bac"

In [2]:
# Flatten the file into one list of fields: break it into lines first,
# then break every line on its tab separator.
parsedData = [
    field
    for line in raw_data.split("\n")
    for field in line.split("\t")
]

In [3]:
parsedData[0:5]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam']

In [4]:
# Labels occupy the even positions of the flattened field list
labelList = parsedData[::2]

# Message texts occupy the odd positions
textList = parsedData[1::2]

In [5]:
print(labelList[0:5])
print(textList[0:5])

['ham', 'ham', 'spam', 'ham', 'ham']
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'U dun say so early hor... U c already then say...', "Nah I don't think he goes to usf, he lives around here though"]


In [6]:
# Import the pandas package
import pandas as pd

In [7]:
# Create a data frame
# NOTE: this first attempt raises ValueError ("arrays must all be same length")
# because textList and labelList differ in length at this point; the cells
# that follow diagnose and correct the mismatch.
df = pd.DataFrame({
    'text':textList,
    'label':labelList
})

ValueError: arrays must all be same length

In [8]:
print(len(textList))
print(len(labelList))

5574
5575


**We observe that the label list has one more entry than the text list**

In [9]:
# print last 5 label data
print(labelList[-5:])

['ham', 'ham', 'ham', 'ham', '']


**We see a blank entry at the end of the label list. Let's remove it**

In [10]:
# Drop the trailing blank label (presumably produced by the file's final
# newline). The guard makes this cell safe to re-run: an unconditional
# `labelList[:-1]` would discard a real label on every extra execution.
if labelList and labelList[-1] == "":
    labelList = labelList[:-1]

# Both lists now have the same length, so the frame can be built
df = pd.DataFrame({
    'label': labelList,
    'text': textList,
})


In [11]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## We can do this part very easily using pandas

In [12]:
# Widen the column display so more of each message shows in previews
pd.set_option("display.max_colwidth", 100)

# The file is tab-separated and has no header row, so pandas numbers the columns.
# NOTE(review): the manual parse earlier produced 5574 texts, while this frame
# reports 5572 rows in the shape check further down. Possibly read_csv's default
# quote handling is merging lines; confirm, and consider quoting=csv.QUOTE_NONE.
dataset = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
)
dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [13]:
# Replace the positional column names (0, 1) with descriptive ones
dataset.columns = [
    "label",
    "body_text",
]
dataset.head()

Unnamed: 0,label,body_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


# Explore data

In [14]:
# Shape of data

print("Shape : ",dataset.shape)

Shape :  (5572, 2)


In [15]:
# Unique label values and how many messages carry each one
label_counts = dataset["label"].value_counts()
print(label_counts)

ham     4825
spam     747
Name: label, dtype: int64


In [16]:
# check any missing value
dataset.isnull().sum()

label        0
body_text    0
dtype: int64

# Implementing pipeline to clean text

## Remove Punctuation

In [17]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
# Function that removes punctuation
def remove_punctuation(text):
    """Return ``text`` with every punctuation character dropped.

    A character counts as punctuation when it appears in
    ``string.punctuation``; all other characters are kept in their
    original order.

    Parameters
    ----------
    text : str
        The message to clean.

    Returns
    -------
    str
        The message without punctuation characters.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)
      

In [24]:
# Add a column holding each message with its punctuation stripped
dataset["body_text_cleaned"] = dataset["body_text"].apply(remove_punctuation)
dataset.head()

Unnamed: 0,label,body_text,body_text_cleaned
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


## Tokenization

In [25]:
import re

In [26]:
# Function that tokenizes the text
def tokenize(text):
    """Split ``text`` into word tokens.

    The text is split on every run of one or more non-word characters
    (anything other than letters, digits and underscore). ``re.split``
    emits empty strings when the text starts or ends with a separator, or
    is empty; those are dropped so callers only receive real tokens.

    Parameters
    ----------
    text : str
        The message to tokenize. Its case is left untouched.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list when there are none.
    """
    # Raw string: "\W" in an ordinary literal is an invalid escape sequence,
    # which newer Python versions warn about.
    pieces = re.split(r"\W+", text)
    return [piece for piece in pieces if piece]

In [28]:
# Add a column with the lower-cased, tokenized form of each cleaned message
dataset["body_text_tokenize"] = dataset["body_text_cleaned"].apply(
    lambda message: tokenize(message.lower())
)
dataset.head()

Unnamed: 0,label,body_text,body_text_cleaned,body_text_tokenize
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


## Remove Stopwords

In [31]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\N M Shihab
[nltk_data]     Islam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [32]:
# English stop-word list taken from NLTK's stopwords corpus
stopwords = nltk.corpus.stopwords.words('english')

In [33]:
# Function that removes stopwords
def remove_stopwords(tokenize_list, stop_words=None):
    """Return the tokens of ``tokenize_list`` that are not stop words.

    Parameters
    ----------
    tokenize_list : list of str
        Tokens of one message, in order.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stopwords``
        list is used, so existing single-argument calls behave as before.

    Returns
    -------
    list of str
        The remaining tokens, order and duplicates preserved.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build a set once per call: membership tests become constant-time
    # instead of scanning the whole stop-word list for every token.
    blocked = frozenset(stop_words)
    return [word for word in tokenize_list if word not in blocked]


In [34]:
# Add a column holding each token list with the stop words filtered out
dataset["body_text_nostop"] = dataset["body_text_tokenize"].apply(remove_stopwords)
dataset.head()

Unnamed: 0,label,body_text,body_text_cleaned,body_text_tokenize,body_text_nostop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
