In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [2]:
spam = pd.read_csv("spam.csv")
spam.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
spam.shape

(5572, 2)

In [6]:
label_encoder = LabelEncoder()
categories = label_encoder.fit_transform(spam["Category"])

In [7]:
messages = spam["Message"].values
X_train, X_test, y_train, y_test = train_test_split(messages, categories, test_size=0.3)

In [8]:
token = Tokenizer(num_words=1000)
token.fit_on_texts(X_train)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

In [10]:
X_train = pad_sequences(X_train, padding="post", maxlen=500)
X_test = pad_sequences(X_test, padding="post", maxlen=500)

In [11]:
model = Sequential()

model.add(Embedding(input_dim=len(token.word_index),
                    output_dim=50,
                    input_length=500))
model.add(Flatten())
model.add(Dense(units=10, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation="sigmoid"))

In [12]:
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["accuracy"])

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 50)           368950    
                                                                 
 flatten (Flatten)           (None, 25000)             0         
                                                                 
 dense (Dense)               (None, 10)                250010    
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 618971 (2.36 MB)
Trainable params: 618971 (2.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
model.fit(X_train,
          y_train,
          epochs=500,
          batch_size=10,
          verbose=True,
          validation_data=(X_test, y_test))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x796500282650>

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"loss = {loss} accuracy = {accuracy}")

loss = 0.01421177014708519 accuracy = 0.9850478172302246


In [16]:
prediction = model.predict(X_test)
print(prediction)

[[0.]
 [0.]
 [0.]
 ...
 [1.]
 [0.]
 [0.]]


In [17]:
predict = (prediction > 0.5)
print(predict)

[[False]
 [False]
 [False]
 ...
 [ True]
 [False]
 [False]]


In [19]:
cm = confusion_matrix(y_test, predict)
print(cm)

[[1408    7]
 [  18  239]]
