In [139]:
import tensorflow as tf
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import seaborn as sns

In [140]:
# Load the SMS spam CSV into a polars DataFrame; later cells read its
# 'Category' and 'Message' columns.
csv_path = 'SPAM text message 20170820 - Data.csv'
data = pl.read_csv(csv_path)

In [141]:
# Show the freshly loaded frame as the cell output for a first look at the data.
data

In [142]:
# Class balance: number of messages per 'Category' value.
# `Series.value_counts()` is used instead of `DataFrame.groupby(...)` because
# polars renamed that method to `group_by` and dropped the old spelling in 1.0,
# whereas `value_counts()` is available across versions.
data['Category'].value_counts()

In [143]:
# Separate the corpus by label so the two classes can be balanced afterwards.
is_ham = pl.col('Category') == 'ham'
is_spam = pl.col('Category') == 'spam'
ham_msg = data.filter(is_ham)
spam_msg = data.filter(is_spam)

In [144]:
# Down-sample the ham messages to the same row count as spam_msg so both
# classes are equally represented. The fixed seed makes the sample (and
# everything trained on it) reproducible across kernel restarts; it matches
# the random_state used for the train/test split.
ham_msg = ham_msg.sample(n=len(spam_msg), seed=434)

In [145]:
# Print the (rows, columns) shape of each class subset side by side.
print(f"{ham_msg.shape} {spam_msg.shape}")

In [146]:
# Display the original frame again; the cells above only read from `data`
# (filter/aggregate), they never reassign it.
data

In [147]:
# Stack both class subsets into a single frame and replace the textual label
# with a numeric one: 1 where Category == 'spam', 0 otherwise.
spam_flag = pl.when(pl.col("Category") == 'spam').then(1).otherwise(0)
balanced_data = ham_msg.vstack(spam_msg).with_columns(spam_flag.alias('Category'))

In [148]:
# Inspect the numeric label column produced by the encoding step.
balanced_data.get_column('Category')

In [149]:
# Peek at the first five rows of the balanced frame.
balanced_data.head(5)

In [150]:
# Hold out 20% of the balanced messages for testing; the fixed random_state
# keeps the partition identical between runs.
messages = balanced_data['Message']
labels = balanced_data['Category']
train_msg, test_msg, train_labels, test_labels = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=434,
)

In [151]:
# Text-preprocessing hyperparameters.
#   vocab_size - passed to the Tokenizer as num_words and to the Embedding
#                layer as its input dimension
#   max_len    - sequence length in tokens (the padding cells pad/truncate to 50)
#   oov_tok    - token the Tokenizer uses for out-of-vocabulary words
vocab_size, max_len = 500, 50
oov_tok = '<OOV>'

In [152]:
# Build the word -> integer-index vocabulary from the training messages only,
# so the held-out messages do not influence the vocabulary.
token = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
token.fit_on_texts(train_msg)

In [153]:
# Full word -> index mapping learned by the tokenizer.
word_index = token.word_index
# Preview only the first 20 entries: echoing the whole dictionary floods the
# notebook output with one line per distinct word in the training messages.
dict(list(word_index.items())[:20])

In [154]:
# Padding/truncation policy shared by every pad_sequences call in the notebook:
# pad short messages at the end and cut long messages at the end.
padding_type = 'post'
truncate_type = 'post'
# Convert each training message to a list of word indices, then force every
# sequence to exactly max_len tokens. Using the max_len constant (instead of a
# repeated literal 50) keeps the padding length in one place.
Trainning_seq = token.texts_to_sequences(train_msg)
Trainning_pad = pad_sequences(Trainning_seq, maxlen=max_len, padding=padding_type, truncating=truncate_type)

In [155]:
# Encode and pad the held-out messages exactly like the training messages:
# same fitted tokenizer, same padding/truncation policy, and the shared
# max_len constant rather than a repeated literal 50.
Testing_seq = token.texts_to_sequences(test_msg)
Testing_pad = pad_sequences(Testing_seq, maxlen=max_len, padding=padding_type, truncating=truncate_type)

In [156]:
# Model: embed each token, average the embeddings over the sequence, then a
# small dense head that outputs a single sigmoid probability.
# The input length is declared with an explicit Input layer driven by max_len;
# the Embedding `input_length` argument is deprecated in Keras 3 and the
# literal 50 duplicated the max_len constant.
model = tf.keras.models.Sequential(
    [
        tf.keras.Input(shape=(max_len,)),
        tf.keras.layers.Embedding(vocab_size, 16),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ]
)


In [157]:
# The final Dense layer already applies a sigmoid, so the model emits
# probabilities, not logits. from_logits must therefore be False; with True the
# loss would apply a second sigmoid to the outputs and distort the gradients.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy'],
    optimizer='adam'
)

In [158]:
# Upper bound on training epochs; early stopping usually ends training sooner.
epoch = 30
# Stop once the validation loss has failed to improve for 3 epochs in a row.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# The padded integer matrix is passed to fit() directly. Wrapping it in
# tf.dtypes.DType(...) was a bug: DType describes an element type and cannot be
# constructed from an array. Labels are converted with np.asarray so Keras
# receives plain NumPy arrays instead of polars Series.
# NOTE(review): validation uses the test split, so early stopping is tuned on
# the same data that is evaluated later - consider a separate validation split.
history = model.fit(
    Trainning_pad,
    np.asarray(train_labels),
    validation_data=(Testing_pad, np.asarray(test_labels)),
    epochs=epoch,
    callbacks=[early_stop],
    verbose=2
)

In [None]:
# Report loss and accuracy on the held-out messages. The labels are converted
# to a NumPy array so Keras does not have to interpret a polars Series.
model.evaluate(
    Testing_pad,
    np.asarray(test_labels)
)

In [None]:
# Training vs. validation accuracy per epoch. Each curve is labelled and a
# legend is drawn; without it the two lines cannot be told apart.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()

In [None]:
# Hand-picked example messages used to try out the trained classifier.
predict_msg = list((
    "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...",
    "Ok lar... Joking wif u oni...",
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
))

In [None]:
def predict_spam(predict_msg):
    """Score messages with the trained model.

    Args:
        predict_msg: A list of message strings. A single string is also
            accepted and is treated as a one-message list.

    Returns:
        The output of ``model.predict`` on the padded sequences: one sigmoid
        output per message (labels were encoded as 1 for spam, 0 otherwise).
    """
    # A bare string would be iterated character by character by the tokenizer,
    # producing one "message" per character; wrap it so it is scored as one text.
    if isinstance(predict_msg, str):
        predict_msg = [predict_msg]
    new_seq = token.texts_to_sequences(predict_msg)
    # Reuse the shared length and padding/truncation settings so inference
    # inputs are shaped exactly like the training inputs.
    padded = pad_sequences(new_seq, maxlen=max_len,
                           padding=padding_type,
                           truncating=truncate_type)
    return model.predict(padded)


predict_spam(predict_msg)