# Data Preprocessing

In [1]:
#importing all neccesary libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# import and preview datasets
data = pd.read_csv('spam.csv', encoding='latin-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# check dataset properties
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [4]:
# drop the last three axis, rename the first two columns
data = data.filter(["v1","v2"])
data.rename(columns={'v1':'Label','v2':'Message'},inplace=True)
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# download stopwords from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agbaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# replace symbols, convert all texts to lowercase, split the texts(tokenize) and stemmize the words(elder brother to lammetizing)
ps = PorterStemmer() 
content = []
for i in range(0, data.shape[0]):
    review = re.sub('[^a-zA-Z]', ' ', data['Message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    content.append(review)
    
    

In [7]:
# create two columns, one containing cleaned message content and the other cleaned content length
data['clean_msg']=np.empty((len(content),1))
for i in range(len(content)):
    data['clean_msg'][i]=content[i]
data['clean_msg_len']=data['clean_msg'].apply(len)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,Label,Message,clean_msg,clean_msg_len
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,76
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,99
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,35
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,36


In [8]:
# Perform one-hot encoding on label column(ham = 1 and spam = 0)
data=pd.concat([data, pd.get_dummies(data['Label'])], axis=1)
data.drop(['Label'],axis=1,inplace=True)
data.drop(['spam'],axis=1,inplace=True)
data.rename(columns={'ham':'label'},inplace=True)
data.head()

Unnamed: 0,Message,clean_msg,clean_msg_len,label
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...,76,1
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21,1
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...,99,0
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say,35,1
4,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though,36,1


In [9]:
# Vectorization
# Creating X and Y: Assign clean_msg column to X, label column to Y, perform countvectorization on both X and Y
X=data['clean_msg']
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(content).toarray()

Y = data['label']
Y = np.array(Y)

# preview countvectorization of X and Y
print('X = \n', X, '\nY = \n', Y)

X = 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 
Y = 
 [1 1 0 ... 1 1 1]
