# Compare NLP Techniques: Prep The Data For Modeling

### Read In & Clean Text

In [2]:
%pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp311-cp311-macosx_10_9_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from nltk)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.5.15-cp311-cp311-macosx_10_9_x86_64.whl (281 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.7/281.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tq

In [3]:
# Read in and clean data
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import string

stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages['label'] = np.where(messages['label']=='spam', 1, 0)

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

messages['clean_text'] = messages['text'].apply(lambda x: clean_text(x))
messages.head()



Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


In [4]:
# Split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(messages['clean_text'],
                                                    messages['label'], test_size=0.2)

In [5]:
# What do the first ten messages in the training set look like?
X_train[:]

5467    [get, garden, ready, summer, free, selection, ...
5285    [urgent, 1, week, free, membership, å, 100000,...
4205                                      [get, door, im]
1317    [win, newest, ûïharry, potter, order, phoenix,...
2874    [idk, keep, saying, youre, since, moved, keep,...
                              ...                        
1137                                          [lol, busy]
2468    [lol, well, dont, without, could, big, sale, t...
5223                               [die, want, u, stuffs]
1727                              [went, project, centre]
1059    [eastenders, tv, quiz, flower, dot, compare, v...
Name: clean_text, Length: 4457, dtype: object

In [6]:
# What do the labels look like?
y_train[:]

5467    1
5285    1
4205    0
1317    1
2874    0
       ..
1137    0
2468    0
5223    0
1727    0
1059    1
Name: label, Length: 4457, dtype: int64

In [9]:
# Let's save the training and test sets to ensure we are using the same data for each model
X_train.to_csv('../../../data/X_train.csv', index=False, header=True)
X_test.to_csv('../../../data/X_test.csv', index=False, header=True)
y_train.to_csv('../../../data/y_train.csv', index=False, header=True)
y_test.to_csv('../../../data/y_test.csv', index=False, header=True)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

print(X_train)

                                             clean_text
0     ['get', 'garden', 'ready', 'summer', 'free', '...
1     ['urgent', '1', 'week', 'free', 'membership', ...
2                                 ['get', 'door', 'im']
3     ['win', 'newest', 'ûïharry', 'potter', 'order'...
4     ['idk', 'keep', 'saying', 'youre', 'since', 'm...
...                                                 ...
4452                                    ['lol', 'busy']
4453  ['lol', 'well', 'dont', 'without', 'could', 'b...
4454                     ['die', 'want', 'u', 'stuffs']
4455                      ['went', 'project', 'centre']
4456  ['eastenders', 'tv', 'quiz', 'flower', 'dot', ...

[4457 rows x 1 columns]


In [15]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text']) # DO NOT USE UNTIL WE TRAIN FINISH OUR TRAINING DATA

In [16]:
tfidf_vect.vocabulary_

{'get': 3311,
 'garden': 3268,
 'ready': 5970,
 'summer': 7014,
 'free': 3167,
 'selection': 6380,
 'bulbs': 1659,
 'seeds': 6365,
 'worth': 8058,
 '3350': 487,
 'scotsman': 6330,
 'saturday': 6296,
 'stop': 6909,
 'go2': 3353,
 'notxtcouk': 5177,
 'urgent': 7643,
 'week': 7880,
 'membership': 4732,
 '100000': 251,
 'prize': 5793,
 'jackpot': 4029,
 'txt': 7538,
 'word': 8034,
 'claim': 1969,
 '81010': 713,
 'tc': 7147,
 'wwwdbuknet': 8101,
 'lccltd': 4316,
 'pobox': 5633,
 '4403ldnw1a7rw18': 545,
 'door': 2589,
 'im': 3853,
 'win': 7967,
 'newest': 5075,
 'ûïharry': 8273,
 'potter': 5704,
 'order': 5345,
 'phoenix': 5548,
 'book': 1526,
 'reply': 6088,
 'harry': 3549,
 'answer': 1074,
 'questions': 5903,
 'chance': 1853,
 'first': 3057,
 'among': 1035,
 'readers': 5968,
 'idk': 3837,
 'keep': 4160,
 'saying': 6307,
 'youre': 8227,
 'since': 6558,
 'moved': 4916,
 'butting': 1682,
 'heads': 3570,
 'freedom': 3169,
 'vs': 7772,
 'responsibility': 6123,
 'tired': 7339,
 'much': 4949,
 's

In [17]:
X_test_vect[0]

<1x8276 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [18]:
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [20]:
y_pred = rf_model.predict(X_test_vect)

In [21]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

# Precision: All the spam messages the model say is spam is actually spam.
# Recall: Out of all the spam message in the data how many was it able to label as spam.
# Accuracy: predicted spam or ham it was accurate this amount of time.

Precision: 1.0 / Recall: 0.807 / Accuracy: 0.974
