In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.model_selection import train_test_split
import string

pd.set_option('display.max_colwidth', 100)
stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('spam.csv', encoding = 'latin-1')
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [2]:
# Checking the shape of the 'spam.csv' dataset
messages.shape

(5572, 5)

In [3]:
# Calculating how many blank values each column has
messages.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [4]:
# Dropping the columns 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4' as over 99% of the elemens are blank
messages = messages.drop(labels = ['Unnamed: 2','Unnamed: 3', 'Unnamed: 4'], axis = 1)

# Renaming the two remaining columns' headers 'v1' and 'v2' to 'label' and 'text'
messages.columns = ['label', 'text']

# Converting the labels from string 'spam'/'ham' to binary
messages['label'] = np.where(messages['label'] == 'spam', 1, 0)

# A function to clean data: converting all the characters to lower case, removing punctuations (such as '!', '^', ..),
# tokenizing each message and finally removing stopwords not adding much meaning to a sentence such as 'I', 'it', ..
def clean_text(text):
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text
    
# cleanning data using clean_text function
messages['cleanned_text'] = messages['text'].apply(lambda x: clean_text(x))


In [5]:
# Splitting data into train and test sets, 80% and 20%, respectively
x_train, x_test, y_train, y_test = train_test_split(messages['cleanned_text'], messages['label'], test_size = 0.2)

In [6]:
# Verifying to see how x_test as an example looks like
x_test[:]

1198    [al, moan, n, e, thin, goes, wrong, faultal, de, arguments, r, faultfed, himso, bother, hav, 2go...
1617                                                                              [u, download, fring, app]
501                                                                                               [ì, come]
618                                                  [come, n, pick, ì, come, immediately, aft, ur, lesson]
258     [tried, contact, reply, offer, video, handset, 750, anytime, networks, mins, unlimited, text, ca...
                                                       ...                                                 
4845                                                      [pls, help, tell, ashley, cant, find, number, oh]
5152    [idk, im, sitting, stop, shop, parking, lot, right, bawling, eyes, feel, like, im, failure, ever...
890                                                                                         [ask, princess]
1226    [reply, name, addres

In [7]:
# Saving each of x_train, x_test, y_train, y_test as separate csv files in order to use in different models
# Ignoring the index and specifying there are headers in the files so the model does not consider the indices and headers as part of the datasets
x_train.to_csv('x_train.csv', index = False, header = True)
x_test.to_csv('x_test.csv', index = False, header = True)
y_train.to_csv('y_train.csv', index = False, header = True)
y_test.to_csv('y_test.csv', index = False, header = True)