In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [21]:
# 1. Data Loading and Exploration
dataset_path = "SMSSpamCollection"
df = pd.read_csv(dataset_path, sep='\t', header=None, names=['label', 'message'], encoding='utf-8')
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# 2. Text Preprocessing
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['message'])
sequences = tokenizer.texts_to_sequences(df['message'])
max_length = max([len(x) for x in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['label'], test_size=0.2, random_state=42)

# 3. Build the RNN Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16, input_length=max_length),
    tf.keras.layers.SimpleRNN(64, return_sequences=True),
    tf.keras.layers.SimpleRNN(32),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 4. Train the Model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# 5. Evaluate the Model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 6. Make Prediction
def predict_sms(message):
    seq = tokenizer.texts_to_sequences([message])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict(padded)[0][0]
    return "Spam" if prediction > 0.5 else "Ham"

# Example usage
print(predict_sms("Congratulations! You've won a free gift."))

Epoch 1/5




[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 122ms/step - accuracy: 0.8332 - loss: 0.4491 - val_accuracy: 0.8664 - val_loss: 0.3932
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 119ms/step - accuracy: 0.8628 - loss: 0.4034 - val_accuracy: 0.8664 - val_loss: 0.4023
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 119ms/step - accuracy: 0.8711 - loss: 0.3916 - val_accuracy: 0.8664 - val_loss: 0.3968
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 119ms/step - accuracy: 0.8680 - loss: 0.3929 - val_accuracy: 0.8664 - val_loss: 0.3965
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 120ms/step - accuracy: 0.8591 - loss: 0.4094 - val_accuracy: 0.8664 - val_loss: 0.3890
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step
Accuracy: 0.8663677130044843
              precision    recall  f1-score   support

           0       0.87      1.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step
Ham


In [12]:
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
print(predict_sms("Cool, what time you think you can get here?"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
Ham
