In [17]:
import pathlib
import pandas as pd
import random

# Filesystem layout. BASE_DIR is the parent of the current working directory
# (presumably the notebook lives one level below the project root — confirm).
BASE_DIR = pathlib.Path(".").resolve().parent
DATASET_DIR = BASE_DIR.joinpath("datasets")
EXPORT_DIR = DATASET_DIR.joinpath("exports")
# Create the export directory and any missing parents; no error if it already exists.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)
SPAM_DATASET_PATH = EXPORT_DIR.joinpath("spam-dataset.csv")

In [18]:
# Load the exported spam dataset CSV into a DataFrame.
df = pd.read_csv(SPAM_DATASET_PATH)
# Preview the first rows. The "Unnamed: 0" column in the output is presumably
# the index written out by the exporter (index_col=0 would absorb it) — TODO confirm.
df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam
3,ham,U dun say so early hor... U c already then say...,sms-spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms-spam


In [19]:
# Pull the label and text columns out as plain Python lists; both keep the
# DataFrame's row order, so position i in one matches position i in the other.
labels, texts = (df[column].tolist() for column in ("label", "text"))

In [20]:
# Spot-check one sample: show the label and text stored at the same position.
labels[120], texts[120]

('spam',
 'PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires')

In [21]:
# Label name -> integer class id.
label_legend = dict(ham=0, spam=1)
# Reverse lookup, keyed by the *stringified* id ("0" / "1") rather than the int.
label_legend_inverted = {str(class_id): name for name, class_id in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [22]:
# Encode every string label as its integer class id, preserving order.
# A label missing from label_legend raises KeyError.
labels_as_int = list(map(label_legend.__getitem__, labels))
labels_as_int[120]

1

In [23]:
# Round-trip check: integer id -> string key -> label name for the same sample.
label_legend_inverted[str(labels_as_int[120])]

'spam'

In [30]:
# Sanity check on one randomly chosen row: the derived lists must still line up
# with the DataFrame they were built from.
# randrange(n) draws from [0, n). The previous randint(0, n) also included n
# itself, one past the last valid index, so the cell could intermittently fail
# with IndexError.
random_idx = random.randrange(len(labels))
sample_row = df.iloc[random_idx]

assert texts[random_idx] == sample_row.text

assert labels[random_idx] == sample_row.label

# The integer encoding must decode back to the original label name.
assert label_legend_inverted[str(labels_as_int[random_idx])] == sample_row.label

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [32]:
# Passed as `num_words` to the Keras Tokenizer below; per the Keras docs this
# caps how many of the most frequent words are kept when encoding texts.
MAX_NUM_WORDS = 280

In [38]:
# Fit the tokenizer on all texts, then encode each text as a list of
# integer word ids.
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# sequences  # uncomment to inspect the encoded texts

In [39]:
# Vocabulary learned by the fitted tokenizer (word -> integer index per the Keras docs).
word_index = tokenizer.word_index
# word_index  # uncomment to inspect the vocabulary

In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [37]:
# Passed as `maxlen` to pad_sequences: the fixed length of every encoded sequence.
MAX_SEQ_LENGTH = 300

In [40]:
# Bring every sequence to MAX_SEQ_LENGTH. The displayed result is a
# (7528, 300) int32 array with zeros filled in at the front of short rows.
X = pad_sequences(sequences,maxlen=MAX_SEQ_LENGTH)

In [41]:
# Display the padded feature matrix.
X

array([[  0,   0,   0, ...,  77,  68, 187],
       [  0,   0,   0, ...,   0,  64,   8],
       [  0,   0,   0, ...,   2, 110, 104],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   8,  24,  21],
       [  0,   0,   0, ...,   9,   5, 156]],
      shape=(7528, 300), dtype=int32)

In [42]:
from tensorflow.keras.utils import to_categorical
import numpy as np

In [43]:
# Convert the list of integer class ids into a NumPy array.
labels_as_int_array = np.asarray(labels_as_int)
labels_as_int_array

array([0, 0, 1, ..., 0, 0, 0], shape=(7528,))

In [44]:
# One-hot encode the integer labels; in the displayed output id 0 becomes
# [1., 0.] and id 1 becomes [0., 1.].
y = to_categorical(labels_as_int_array)

In [45]:
# Display the one-hot encoded target matrix.
y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], shape=(7528, 2))

In [46]:
# Scratch illustration of the label encoding. These are bare literals, so only
# the last expression is shown as the cell output.
# Integer class ids for three samples:
[0, 0, 1]
# The label names those ids stand for:
['ham','ham', 'spam']
# Class order (index 0 = ham, index 1 = spam):
['ham','spam']

# One-hot rows for the three ids above:
[[1,0],[1,0],[0,1]]

[[1, 0], [1, 0], [0, 1]]