# Import libraries

In [154]:
import tensorflow as tf
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import re
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences


# load the data

In [155]:
data = pd.read_csv('spam.csv', encoding='latin1')[['v1', 'v2']]
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## daownsampling  ham count to match spam count 

In [156]:
ham_df = data[data['v1'] == 'ham']
spam_df = data[data['v1'] == 'spam']

ham_downsampled = ham_df.sample(len(spam_df), random_state=42)

# Combine
balanced_df = pd.concat([ham_downsampled, spam_df]).sample(frac=1, random_state=42).reset_index(drop=True)


In [157]:
balanced_df['v1'].value_counts()

v1
spam    747
ham     747
Name: count, dtype: int64

#### Text cleaning 

In [158]:
# import re
# import string

def clean_text(text):
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # 3. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # 4. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


In [159]:
balanced_df['clean_text'] = balanced_df['v2'].apply(clean_text)
balanced_df.head()

Unnamed: 0,v1,v2,clean_text
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD...",urgent important information for o user today ...
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. M...,panasonic bluetoothhdset free nokia free motor...
2,spam,Do you want a new Video handset? 750 any time ...,do you want a new video handset any time any n...
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty ...,hi if ur lookin saucy daytime fun wiv busty ma...
4,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...,urgent your mobile no xxxxxxxxx won a å£ bonus...


#### encoding and padding 

In [160]:
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")  
tokenizer.fit_on_texts(balanced_df['clean_text'])  

sequences = tokenizer.texts_to_sequences(balanced_df['clean_text'])

# padding
padded = pad_sequences(sequences, maxlen=100, padding='post')


## Final DataSet

In [163]:
balanced_df['label_encoded'] = balanced_df['v1'].map({'ham': 0, 'spam': 1})

In [164]:
balanced_df.head()

Unnamed: 0,v1,v2,clean_text,label_encoded
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD...",urgent important information for o user today ...,1
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. M...,panasonic bluetoothhdset free nokia free motor...,1
2,spam,Do you want a new Video handset? 750 any time ...,do you want a new video handset any time any n...,1
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty ...,hi if ur lookin saucy daytime fun wiv busty ma...,1
4,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...,urgent your mobile no xxxxxxxxx won a å£ bonus...,1


## BUILD MODEL

In [165]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=16, input_length=100),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units = 24, activation = 'relu'),
    tf.keras.layers.Dense(units = 12, activation = 'relu'), 
    tf.keras.layers.Dense(units = 1, activation = 'sigmoid')
])



#### compile

In [166]:
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3),
    loss = tf.keras.losses.BinaryCrossentropy(from_logits = False),
    metrics=['accuracy']
)

#### Fit

In [167]:
# from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded, balanced_df['label_encoded'], test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=25, validation_data=(X_test, y_test))

Epoch 1/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5133 - loss: 0.6928 - val_accuracy: 0.8662 - val_loss: 0.6888
Epoch 2/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5835 - loss: 0.6870 - val_accuracy: 0.5552 - val_loss: 0.6778
Epoch 3/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6158 - loss: 0.6761 - val_accuracy: 0.8428 - val_loss: 0.6579
Epoch 4/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7926 - loss: 0.6494 - val_accuracy: 0.9097 - val_loss: 0.6120
Epoch 5/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8907 - loss: 0.5822 - val_accuracy: 0.8763 - val_loss: 0.5050
Epoch 6/25
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9222 - loss: 0.4630 - val_accuracy: 0.9231 - val_loss: 0.3797
Epoch 7/25
[1m38/38[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1694fe1cda0>

### Accuracy

In [173]:
loss, accuracy = model.evaluate(padded, balanced_df['label_encoded'])
print(f"Accuracy: {accuracy*100:.2f}%")

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9768 - loss: 0.0698 
Accuracy: 97.46%


## Accuracy: 97.46%

## using the model

In [144]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_user_input():
    text = input("📩 Enter a message to classify: ")

    # Convert to sequence
    sequence = tokenizer.texts_to_sequences([text])

    # Pad it to match training size
    padded_input = pad_sequences(sequence, maxlen=100, padding='post')

    # Predict
    prediction = model.predict(padded_input)[0][0]

    # Show result
    if prediction > 0.5:
        print("🚨 Prediction: SPAM")
    else:
        print("✅ Prediction: HAM (Not Spam)")


In [171]:
predict_user_input()

📩 Enter a message to classify:  "URGENT! Your account has been suspended. Login now to verify info: scamlink.com"


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
🚨 Prediction: SPAM


In [172]:
predict_user_input()

📩 Enter a message to classify:  "Don’t forget to bring your notes tomorrow!"


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
✅ Prediction: HAM (Not Spam)
