In [12]:
import numpy as np
import tensorflow as tf
import random
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [13]:
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

In [None]:
dataset = pd.read_csv('data/spam.csv')
dataset.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
dataset.shape

(5572, 2)

In [16]:
count = dataset['Category'].value_counts()
print(count)

Category
ham     4825
spam     747
Name: count, dtype: int64


In [17]:
ham_samples = dataset[dataset['Category'] == 'ham'].sample(n=747, random_state=SEED)
spam_samples = dataset[dataset['Category'] == 'spam']

dataset = pd.concat([ham_samples, spam_samples]).sample(frac=1, random_state=SEED).reset_index(drop=True)
dataset.head()
dataset.shape

(1494, 2)

In [18]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(dataset['Category'])

In [19]:
messages = dataset['Message'].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(messages, y, test_size=0.3, random_state=43)

In [20]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [21]:
X_train = pad_sequences(X_train, padding='post', maxlen=500)
X_test = pad_sequences(X_test, padding='post', maxlen=500)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Dropout

In [23]:
model = Sequential([
    Input(shape=(500,)),
    Embedding(input_dim=len(token.word_index), output_dim=50),
    Flatten(),
    Dense(units=10, activation='relu'),
    Dropout(0.1),
    Dense(units=1, activation='sigmoid')
])

2025-12-11 21:45:53.007206: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [24]:
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [25]:
model.fit(X_train, y_train, epochs=20, batch_size=10, verbose=True, validation_data=(X_test, y_test))

Epoch 1/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6230 - loss: 0.2284 - val_accuracy: 0.8040 - val_loss: 0.1858
Epoch 2/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8498 - loss: 0.1619 - val_accuracy: 0.8998 - val_loss: 0.1408
Epoch 3/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9158 - loss: 0.1321 - val_accuracy: 0.9465 - val_loss: 0.1222
Epoch 4/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9187 - loss: 0.1222 - val_accuracy: 0.9688 - val_loss: 0.1106
Epoch 5/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9234 - loss: 0.1122 - val_accuracy: 0.9555 - val_loss: 0.1014
Epoch 6/20
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9388 - loss: 0.0840 - val_accuracy: 0.9599 - val_loss: 0.0408
Epoch 7/20
[1m105/105[0m 

<keras.src.callbacks.history.History at 0x7294e8ea5c10>

In [16]:
loss, accuracy = model.evaluate(X_test, y_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9465 - loss: 0.0393 


In [17]:
predictions = model.predict(X_test)

predictions = (predictions > 0.5).astype(int)

print(predictions[0:5])

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[[0]
 [0]
 [1]
 [0]
 [0]]


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [19]:
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

In [20]:
print(f'Accuracy: {accuracy}\nPrecision: {precision}\nRecall:{recall}\nF1-score:{f1}')

Accuracy: 0.9465478841870824
Precision: 0.9471785625905894
Recall:0.9465478841870824
F1-score:0.9465627355330651
