# Compare NLP Techniques: Prep The Data For Modeling

### Read In & Clean Text

In [4]:
# Read in and clean data
import re
import string
from pathlib import Path

import gensim
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw spam dataset, discard the three "Unnamed" columns and give the
# two remaining columns readable names.
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1').drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
)
messages.columns = ["label", "text"]

# Encode the target as an integer: 1 for 'spam', 0 for every other label.
messages['label'] = np.where(messages['label'] == 'spam', 1, 0)

# Tokenize each message with gensim's simple_preprocess, which lowercases the
# text and drops tokens that are too short or too long.
# Note: stop words are NOT removed in this step.
messages['clean_text'] = messages['text'].apply(gensim.utils.simple_preprocess)
messages.head()

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,0,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [5]:
# Split the data into a training set (80%) and a test set (20%).
# shuffle=False makes the split positional: the first rows become the training
# set and the last rows the test set. With shuffling disabled, random_state
# has no effect on the result.
X_train, X_test, y_train, y_test = train_test_split(
    messages['clean_text'],
    messages['label'],
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

In [None]:
# What do the first ten messages in the training set look like?

In [None]:
# What do the labels look like?

In [6]:
# Save the training and test sets so that every model notebook works with
# exactly the same data.
# Note: each token list in X_train / X_test is written to CSV as its string
# representation, so a reader gets strings back and must parse them into lists.
output_dir = Path('data')
# to_csv does not create missing folders; make sure the target directory
# exists so the cell also works on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)

splits_to_save = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for file_name, split in splits_to_save.items():
    split.to_csv(output_dir / file_name, index=False, header=True)

In [7]:
# Training features: one list of tokens per message
X_train

0       [go, until, jurong, point, crazy, available, o...
1                             [ok, lar, joking, wif, oni]
2       [free, entry, in, wkly, comp, to, win, fa, cup...
3          [dun, say, so, early, hor, already, then, say]
4       [nah, don, think, he, goes, to, usf, he, lives...
                              ...                        
4452                             [or, guess, lt, gt, min]
4453                  [home, ard, wat, time, will, reach]
4454    [storming, msg, wen, lift, phne, say, hello, d...
4455    [if, you, want, to, mapquest, it, or, somethin...
4456    [aight, should, just, plan, to, come, up, late...
Name: clean_text, Length: 4457, dtype: object

In [8]:
# Test features: one list of tokens per message
X_test

4457    [die, accidentally, deleted, msg, suppose, put...
4458    [welcome, to, uk, mobile, date, this, msg, is,...
4459    [this, is, wishing, you, great, day, moji, tol...
4460    [thanks, again, for, your, reply, today, when,...
4461    [sorry, flaked, last, night, shit, seriously, ...
                              ...                        
5567    [this, is, the, nd, time, we, have, tried, con...
5568           [will, ì_, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, so, any, othe...
5570    [the, guy, did, some, bitching, but, acted, li...
5571                     [rofl, its, true, to, its, name]
Name: clean_text, Length: 1115, dtype: object

In [9]:
# Training labels (1 = spam, 0 = not spam)
y_train

0       0
1       0
2       1
3       0
4       0
       ..
4452    0
4453    0
4454    0
4455    0
4456    0
Name: label, Length: 4457, dtype: int32

In [10]:
# Test labels (1 = spam, 0 = not spam)
y_test

4457    0
4458    1
4459    0
4460    0
4461    0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 1115, dtype: int32