In [None]:
import pathlib
import pickle
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Directory layout, anchored at the parent of the current working directory.
BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR.joinpath("data_local")
EXPORT_DIR = DATASET_DIR.joinpath("exports")

# Input read by this notebook.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath("spam-dataset.csv")

# Artifacts written by this notebook.
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath("spam-metadata.pkl")
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath("spam-tokenizer.json")

In [None]:
# Load the exported spam dataset and preview its first rows.
model_df = pd.read_csv(filepath_or_buffer=SPAM_DATASET_PATH)
model_df.head(5)

In [None]:
# Pull the two columns out as plain Python lists (row order is preserved).
labels = model_df.loc[:, "label"].to_list()
texts = model_df.loc[:, "text"].to_list()

In [None]:
# Peek at one (label, text) pair to eyeball the raw data.
(labels[120], texts[120])

In [None]:
# Class name -> integer id, plus the reverse lookup keyed by the id as a string.
label_legend = dict(ham=0, spam=1)
label_legend_inverted = {str(class_id): name for name, class_id in label_legend.items()}

# Encode every label as its integer id.
labels_as_int = [label_legend[label] for label in labels]

# Round-trip the same sample row back to its class name as a sanity check.
label_legend_inverted[str(labels_as_int[120])]

In [10]:
# Spot-check one random row: the plain lists and the integer encoding must
# still line up with the DataFrame they were derived from.
#
# random.randint includes BOTH endpoints, so randint(0, len(labels)) could
# return len(labels) and make this cell fail intermittently with IndexError.
# randrange excludes the upper bound, so every drawn index is valid.
random_idx = random.randrange(len(labels))
random_row = model_df.iloc[random_idx]

assert texts[random_idx] == random_row["text"]
assert labels[random_idx] == random_row["label"]
assert label_legend_inverted[str(labels_as_int[random_idx])] == random_row["label"]

In [11]:
# Passed to the Tokenizer as `num_words`.
MAX_NUM_WORDS = 280
# Passed to pad_sequences as `maxlen`.
MAX_SEQ_LENGTH = 300

In [12]:
# Fit the vocabulary on the full corpus, then turn each text into a list of
# integer word ids.
# NOTE(review): the recorded vocabulary below lists '\ufeff' as a token
# (index 19), so the raw texts appear to contain byte-order-mark characters;
# consider whether they should be stripped before fitting.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts=texts)
sequences = tokenizer.texts_to_sequences(texts=texts)

In [13]:
# Word -> integer id mapping learned by the tokenizer; displayed for inspection.
word_index = tokenizer.word_index
word_index

{'i': 1,
 'to': 2,
 'you': 3,
 'a': 4,
 'the': 5,
 'and': 6,
 'my': 7,
 'u': 8,
 'is': 9,
 'in': 10,
 'this': 11,
 'me': 12,
 'it': 13,
 'for': 14,
 'of': 15,
 'on': 16,
 'out': 17,
 'your': 18,
 '\ufeff': 19,
 'have': 20,
 'so': 21,
 'that': 22,
 'check': 23,
 'are': 24,
 '2': 25,
 'call': 26,
 'if': 27,
 'but': 28,
 'can': 29,
 'just': 30,
 'now': 31,
 'not': 32,
 'be': 33,
 'at': 34,
 'will': 35,
 'do': 36,
 'or': 37,
 'like': 38,
 'get': 39,
 'with': 40,
 'up': 41,
 "i'm": 42,
 'we': 43,
 'no': 44,
 'love': 45,
 'ur': 46,
 'from': 47,
 'please': 48,
 'all': 49,
 'com': 50,
 'lt': 51,
 'gt': 52,
 'how': 53,
 'when': 54,
 'go': 55,
 '4': 56,
 'video': 57,
 'know': 58,
 'free': 59,
 'am': 60,
 'what': 61,
 'good': 62,
 'was': 63,
 'ok': 64,
 'time': 65,
 'only': 66,
 'then': 67,
 'got': 68,
 'its': 69,
 'song': 70,
 'come': 71,
 '39': 72,
 'youtube': 73,
 'new': 74,
 'br': 75,
 'as': 76,
 'there': 77,
 'day': 78,
 'want': 79,
 'he': 80,
 'one': 81,
 'www': 82,
 'by': 83,
 'amp': 84,
 

In [14]:
# Bring every sequence to exactly MAX_SEQ_LENGTH entries.
x = pad_sequences(sequences=sequences, maxlen=MAX_SEQ_LENGTH)

In [15]:
# Integer labels as a NumPy array, ready for one-hot encoding.
labels_as_int_array = np.array(labels_as_int)
labels_as_int_array

array([0, 0, 1, ..., 0, 0, 0])

In [16]:
# One-hot encode the integer labels.
# The number of classes is pinned to the legend size: left to be inferred,
# it is derived from the largest label present, so a dataset missing the
# highest class would silently produce a narrower target matrix.
y = to_categorical(labels_as_int_array, num_classes=len(label_legend))

In [17]:
# Display the one-hot encoded targets.
y

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [18]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [19]:
# Bundle the split arrays together with the preprocessing settings and label
# mappings so they can be exported as a single object.
training_data = {
    "x_train": x_train,
    "x_test": x_test,
    "y_train": y_train,
    "y_test": y_test,
    "max_words": MAX_NUM_WORDS,
    "max_seq_length": MAX_SEQ_LENGTH,
    "legend": label_legend,
    # Misspelled key kept so existing readers of the exported metadata keep
    # working; new code should use "label_legend_inverted".
    "label_legend_invereted": label_legend_inverted,
    "label_legend_inverted": label_legend_inverted,
}

# Export the fitted tokenizer as JSON. The encoding is explicit so the file
# does not depend on the platform's default locale. The cell still ends with
# the write_text call, so the character count is displayed as before.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json, encoding="utf-8")

1090335

In [20]:
# Serialize the bundled arrays and metadata to the export location.
with METADATA_EXPORT_PATH.open(mode="wb") as metadata_file:
    pickle.dump(training_data, metadata_file)