In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('/kaggle/input/email-classification-nlp/SMS_train.csv',
                encoding = 'latin1')
df.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [78]:
df.loc[4,'Message_body']

'This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'

In [4]:
df.shape

(957, 3)

In [5]:
df['Label'].value_counts()

Label
Non-Spam    835
Spam        122
Name: count, dtype: int64

In [7]:
import re
import string

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras import models, layers
from sklearn.model_selection import train_test_split

**Text cleaning function**
- Lowercase text
- remove punctuations and numbers
- remove stopwords
- keep only meaningful words

In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [9]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [27]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]','',text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)

In [28]:
df1 = df.copy()

In [29]:
df1.head()

Unnamed: 0,S. No.,Message_body,Label
0,1,Rofl. Its true to its name,Non-Spam
1,2,The guy did some bitching but I acted like i'd...,Non-Spam
2,3,"Pity, * was in mood for that. So...any other s...",Non-Spam
3,4,Will ü b going to esplanade fr home?,Non-Spam
4,5,This is the 2nd time we have tried 2 contact u...,Spam


In [30]:
df1 = df1.drop(columns = 'S. No.')

In [32]:
df1['Message_body'] = df1['Message_body'].apply(clean_text)

In [33]:
df1.head()

Unnamed: 0,Message_body,Label
0,rofl true name,Non-Spam
1,guy bitching acted like id interested buying s...,Non-Spam
2,pity mood soany suggestions,Non-Spam
3,b going esplanade fr home,Non-Spam
4,nd time tried contact u u pound prize claim ea...,Spam


In [35]:
df1['Label'] = df1['Label'].map({'Non-Spam':0 , 'Spam':1})

In [36]:
df1.head()

Unnamed: 0,Message_body,Label
0,rofl true name,0
1,guy bitching acted like id interested buying s...,0
2,pity mood soany suggestions,0
3,b going esplanade fr home,0
4,nd time tried contact u u pound prize claim ea...,1


**Tokenize & pad sequences**

In [39]:
texts = df1['Message_body'].tolist()
labels = df1['Label'].values

In [40]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)    #builds vocabulary

In [42]:
print(len(tokenizer.word_index))

2925


In [43]:
sequences = tokenizer.texts_to_sequences(texts)
print(sequences)

[[1071, 243, 151], [202, 1072, 647, 11, 176, 477, 361, 126, 203, 85, 34, 362, 69, 7], [1073, 1074, 1075, 1076], [86, 31, 648, 1077, 24], [244, 14, 299, 87, 1, 1, 649, 70, 45, 204, 2, 92, 205, 478, 650], [1078, 4, 363, 7, 2, 651, 206, 46, 479, 32, 50, 39, 300, 151, 127, 1079], [301, 364], [9, 152, 302, 365, 480, 17, 4], [366, 11, 153, 35], [8, 35, 481, 1080, 1081, 61, 1082, 111, 245, 367, 246, 1083], [4, 1084, 1085, 482, 303, 20, 1086, 1087], [62, 35, 652, 69, 35], [1088, 85, 10, 483, 1089, 1090, 247], [51, 22, 2, 23], [1091, 1092, 10, 1093, 10, 154, 80, 1094, 207, 1095, 1096, 208, 303, 155, 1097, 154, 484, 1098, 653], [63, 52, 1, 485, 1099, 81], [1100, 654, 102, 368, 1101, 248, 1102, 6, 304, 655, 93, 1103, 1104, 6, 57, 305, 486, 93, 36, 369, 156, 156], [40, 71, 370, 112, 12, 209, 306, 25, 656, 249, 1105, 1106, 1107, 64], [657, 658, 65], [1108, 249, 659, 157, 660, 93, 371, 661, 1109], [372, 4, 1110, 487, 52, 46], [94, 14, 53, 4], [13, 488, 128, 662, 1111, 1112, 1113, 1114, 13, 33, 1115]

In [47]:
#list((len(x) for x in sequences))
max_len  = max(len(x) for x in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(padded_sequences)

[[   0    0    0 ... 1071  243  151]
 [   0    0    0 ...  362   69    7]
 [   0    0    0 ... 1074 1075 1076]
 ...
 [   0    0    0 ...   84 2925  305]
 [   0    0    0 ...   73   14  173]
 [   0    0    0 ...    0  625  623]]


In [48]:
vocab_size = len(tokenizer.word_index) + 1
print("Vocab size:", vocab_size)

Vocab size: 2926


**Train test split**

In [51]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences,
                                                   labels, test_size=0.2,
                                                   stratify=labels)

**Build neural network model**

In [52]:
embedding_dim = 8

model = models.Sequential()                                #Define a model for neural network to learn from text

model.add(layers.Embedding(input_dim = vocab_size, output_dim = embedding_dim))     #Layer to create word embeddings

model.add(layers.Flatten())                               #Flattening the embedded vectors to feed into neural networks

model.add(layers.Dense(8))                                #Hidden layers with 5 neurons

model.add(layers.Dense(1, activation = 'sigmoid'))         #Output layer with 1 neuron as we need to predict 1 or 0

model.summary()

In [53]:
model.compile(optimizer='adam' , loss = 'binary_crossentropy' , metrics = ['auc'])

2025-10-18 16:16:45.095263: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


**Train the model**

In [54]:
model.fit(X_train, y_train,
         epochs = 30,
         validation_data = (X_test, y_test))

Epoch 1/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - auc: 0.5020 - loss: 0.6071 - val_auc: 0.8079 - val_loss: 0.3704
Epoch 2/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - auc: 0.8180 - loss: 0.3719 - val_auc: 0.8366 - val_loss: 0.3381
Epoch 3/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - auc: 0.8848 - loss: 0.2939 - val_auc: 0.8700 - val_loss: 0.3078
Epoch 4/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - auc: 0.9554 - loss: 0.2716 - val_auc: 0.8987 - val_loss: 0.2592
Epoch 5/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - auc: 0.9776 - loss: 0.1898 - val_auc: 0.9275 - val_loss: 0.2178
Epoch 6/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - auc: 0.9786 - loss: 0.1634 - val_auc: 0.9365 - val_loss: 0.1878
Epoch 7/30
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - auc: 0.9935 -

<keras.src.callbacks.history.History at 0x7ca2ed1bd610>

**Evaluate on test data**

In [55]:
model.evaluate(X_train, y_train)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - auc: 1.0000 - loss: 0.0013 


[0.0016054328298196197, 1.0]

In [56]:
model.evaluate(X_test, y_test)

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - auc: 0.9951 - loss: 0.0577 


[0.08388721197843552, 0.9905753135681152]

**Calculate confusion matrix and classification report of model**

In [72]:
X_train.shape

(765, 64)

In [71]:
ytrain_pred = [int(pred > 0.5) for pred in model.predict(X_train)]
ytest_pred = [int(pred > 0.5) for pred in model.predict(X_test)]

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


  ytrain_pred = [int(pred > 0.5) for pred in model.predict(X_train)]
  ytest_pred = [int(pred > 0.5) for pred in model.predict(X_test)]


In [74]:
from sklearn.metrics import classification_report

In [75]:
print(classification_report(y_train , ytrain_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       667
           1       1.00      1.00      1.00        98

    accuracy                           1.00       765
   macro avg       1.00      1.00      1.00       765
weighted avg       1.00      1.00      1.00       765



In [76]:
print(classification_report(y_test , ytest_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       168
           1       0.86      0.79      0.83        24

    accuracy                           0.96       192
   macro avg       0.92      0.89      0.90       192
weighted avg       0.96      0.96      0.96       192



**Test Predictions**

In [79]:
new_texts = [
    "Congratulation, You've won a free ticket",
    "Hey, are we meeting tommorow?",
    "This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."
]

#preprocess
new_texts_cleaned = [clean_text(t) for t in new_texts]
new_seq = tokenizer.texts_to_sequences(new_texts_cleaned)
new_pad = pad_sequences(new_seq , maxlen=max_len, padding='pre')

#predictions
pred = model.predict(new_pad)
pred_labels = ['Spam' if p > 0.5 else 'Not-Spam' for p in pred]


for t,l in zip(new_texts, pred_labels):
    print(f"{t} -> {l}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Congratulation, You've won a free ticket -> Not-Spam
Hey, are we meeting tommorow? -> Not-Spam
This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate. -> Spam
