# Spam Ham Detection Using CNN
Here we will try to implement Spam ham Detection model Using Convolutional Neural Network

In [15]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
df = pd.read_csv('Data\\smsspamcollection.tsv', sep='\t')

In [17]:
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [18]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [19]:
df['label'] = df['label'].map({'ham':0, 'spam':1})

In [20]:
df.head()

Unnamed: 0,label,message,length,punct
0,0,"Go until jurong point, crazy.. Available only ...",111,9
1,0,Ok lar... Joking wif u oni...,29,6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,0,U dun say so early hor... U c already then say...,49,6
4,0,"Nah I don't think he goes to usf, he lives aro...",61,2


In [21]:
df.drop(['length', 'punct'], axis=1, inplace=True)

In [22]:
X = df['message']
y = df['label']

In [23]:
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [26]:
MAX_VOCAB_SIZE = 20000
token = Tokenizer(num_words=MAX_VOCAB_SIZE)
token.fit_on_texts(X_train)
seq_train = token.texts_to_sequences(X_train)
seq_test = token.texts_to_sequences(X_test)

In [27]:
# get word -> index mapping
word2idx = token.word_index
V = len(word2idx)
V

7253

In [28]:
# Pad sequence so that we can get a N x T matrix
X_train = pad_sequences(seq_train)
T = X_train.shape[1]
X_train.shape

(3733, 162)

In [30]:
X_test = pad_sequences(seq_test, maxlen=T)
X_test.shape

(1839, 162)

In [31]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model

In [32]:
# Embedding dimension
D = 20

i = Input(shape=(T,))
x = Embedding(V+1, D)(i)
x = Conv1D(32, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)

In [33]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [36]:
model.summary()

In [35]:
r = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8290 - loss: 0.4605 - val_accuracy: 0.8624 - val_loss: 0.2412
Epoch 2/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9379 - loss: 0.1514 - val_accuracy: 0.9723 - val_loss: 0.0993
Epoch 3/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9887 - loss: 0.0310 - val_accuracy: 0.9782 - val_loss: 0.0825
Epoch 4/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9981 - loss: 0.0078 - val_accuracy: 0.9837 - val_loss: 0.0488
Epoch 5/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9995 - loss: 0.0046 - val_accuracy: 0.9848 - val_loss: 0.0677
Epoch 6/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9999 - loss: 0.0015 - val_accuracy: 0.9869 - val_loss: 0.0631
Epoch 7/10
[1m117/117[0m 

# Evaluation

In [40]:
pred = model.predict(X_test)

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [41]:
pred

array([[1.7062448e-04],
       [1.9704383e-07],
       [1.8894083e-06],
       ...,
       [9.5105133e-07],
       [9.9990255e-01],
       [3.5539214e-02]], dtype=float32)

In [42]:
threshold = 0.5
pred = (pred >= threshold).astype(int)

In [43]:
pred

array([[0],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])

In [44]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [45]:
confusion_matrix(y_test, pred)

array([[1582,    4],
       [  20,  233]], dtype=int64)

In [46]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1586
           1       0.98      0.92      0.95       253

    accuracy                           0.99      1839
   macro avg       0.99      0.96      0.97      1839
weighted avg       0.99      0.99      0.99      1839



In [48]:
accuracy_score(y_test, pred)

0.9869494290375204

# Pipeline

In [149]:
from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

class TextPreprocessor(TransformerMixin):
    def __init__(self, max_vocab_size=20000, max_len=None):
        self.max_vocab_size = max_vocab_size
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=self.max_vocab_size)
    
    def fit(self, X, y=None):
        self.tokenizer.fit_on_texts(X)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index)
        if self.max_len is None:
            self.max_len = max(len(seq) for seq in self.tokenizer.texts_to_sequences(X))
        return self
    
    def transform(self, X, y=None):
        seqs = self.tokenizer.texts_to_sequences(X)
        X_padded = pad_sequences(seqs, maxlen=self.max_len)
        return X_padded

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)


In [150]:
class KerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y):
        # Next time I will do training of model here
        return self 

    def label(self, predictions):
        ans = ["Spam" if pred_i==1 else "Ham" for pred_i in predictions]
        return ans
    
    def predict(self, X):
        predictions = self.model.predict(X)
        pred = (predictions >= 0.5).astype(int)
        return self.label(pred)

In [151]:
from sklearn.pipeline import Pipeline

model_pipeline_spHam = Pipeline([
    ('preprocessor', TextPreprocessor(max_vocab_size=20000, max_len=T)),
    # ('classifier', model),
    ('Predictor', KerasClassifier(model))
])



# Manual Test

In [152]:
model_pipeline_spHam.predict(['Hey this is manas bisht, how are you??'])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step


['Ham']

# Save Model

In [153]:
from joblib import Parallel, delayed 
import joblib 

In [154]:
joblib.dump(model_pipeline_spHam, 'spam-ham-pipe.pkl') 

['spam-ham-pipe.pkl']

In [155]:
from joblib import load

m = load('spam-ham-pipe.pkl')

In [156]:
m.predict(['What is this messageg??'])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


['Ham']

# Final Pipeline test

In [157]:
pred1 = m.predict(df['message'])

[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [158]:
set(pred1)

{'Ham'}