# Spam Ham Detection Using CNN
Here we will try to implement Spam ham Detection model Using Convolutional Neural Network

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df = pd.read_csv('Data\\smsspamcollection.tsv', sep='\t')

In [3]:
label = df['label']

In [4]:
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [5]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [6]:
df['label'] = df['label'].map({'ham':0, 'spam':1})

In [7]:
df.head()

Unnamed: 0,label,message,length,punct
0,0,"Go until jurong point, crazy.. Available only ...",111,9
1,0,Ok lar... Joking wif u oni...,29,6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,0,U dun say so early hor... U c already then say...,49,6
4,0,"Nah I don't think he goes to usf, he lives aro...",61,2


In [8]:
df.drop(['length', 'punct'], axis=1, inplace=True)

In [9]:
X = df['message']
y = df['label']

In [10]:
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

In [12]:
MAX_VOCAB_SIZE = 20000
token = Tokenizer(num_words=MAX_VOCAB_SIZE)
token.fit_on_texts(X_train)
seq_train = token.texts_to_sequences(X_train)
seq_test = token.texts_to_sequences(X_test)

In [13]:
# get word -> index mapping
word2idx = token.word_index
V = len(word2idx)
V

7245

In [14]:
# Pad sequence so that we can get a N x T matrix
X_train = pad_sequences(seq_train)
T = X_train.shape[1]
X_train.shape

(3733, 189)

In [15]:
X_test = pad_sequences(seq_test, maxlen=T)
X_test.shape

(1839, 189)

In [16]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model

In [17]:
# Embedding dimension
D = 20

i = Input(shape=(T,))
x = Embedding(V+1, D)(i)
x = Conv1D(32, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)

In [18]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [19]:
model.summary()

In [20]:
r = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8401 - loss: 0.4712 - val_accuracy: 0.8912 - val_loss: 0.2107
Epoch 2/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9384 - loss: 0.1538 - val_accuracy: 0.9782 - val_loss: 0.0772
Epoch 3/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9832 - loss: 0.0505 - val_accuracy: 0.9859 - val_loss: 0.0546
Epoch 4/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9923 - loss: 0.0202 - val_accuracy: 0.9897 - val_loss: 0.0451
Epoch 5/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9979 - loss: 0.0079 - val_accuracy: 0.9902 - val_loss: 0.0508
Epoch 6/10
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9983 - loss: 0.0052 - val_accuracy: 0.9875 - val_loss: 0.0653
Epoch 7/10
[1m117/117[0m 

# Evaluation

In [21]:
pred = model.predict(X_test)

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [22]:
pred

array([[2.8676483e-10],
       [6.8308509e-08],
       [5.9631639e-06],
       ...,
       [7.5932854e-05],
       [1.0297034e-05],
       [7.9202443e-11]], dtype=float32)

In [23]:
threshold = 0.5
pred = (pred >= threshold).astype(int)

In [24]:
pred

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [25]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [26]:
confusion_matrix(y_test, pred)

array([[1595,    6],
       [  10,  228]], dtype=int64)

In [27]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1601
           1       0.97      0.96      0.97       238

    accuracy                           0.99      1839
   macro avg       0.98      0.98      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [28]:
accuracy_score(y_test, pred)

0.9912996193583469

# Pipeline

In [29]:
from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

class TextPreprocessor(TransformerMixin):
    def __init__(self, max_vocab_size=20000, max_len=None):
        self.max_vocab_size = max_vocab_size
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=self.max_vocab_size)
    
    def fit(self, X, y=None):
        self.tokenizer.fit_on_texts(X)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index)
        if self.max_len is None:
            self.max_len = max(len(seq) for seq in self.tokenizer.texts_to_sequences(X))
        return self
    
    def transform(self, X, y=None):
        seqs = self.tokenizer.texts_to_sequences(X)
        X_padded = pad_sequences(seqs, maxlen=self.max_len)
        return X_padded

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)


In [30]:
class KerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y):
        # Next time I will do training of model here
        return self 

    def label(self, predictions):
        ans = ["Spam" if pred_i==1 else "Ham" for pred_i in predictions]
        return ans
    
    def predict(self, X):
        predictions = self.model.predict(X)
        pred = (predictions >= 0.5).astype(int)
        return self.label(pred)

In [31]:
from sklearn.pipeline import Pipeline

model_pipeline_spHam = Pipeline([
    ('preprocessor', TextPreprocessor(max_vocab_size=20000, max_len=T)),
    # ('classifier', model),
    ('Predictor', KerasClassifier(model))
])



# Manual Test

In [32]:
model_pipeline_spHam.predict(['free!!!! you have won a lottery of 2000 dollars!'])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


['Ham']

# Save Model

In [33]:
from joblib import Parallel, delayed 
import joblib 

In [34]:
joblib.dump(model_pipeline_spHam, 'spam-ham-pipe.pkl') 

['spam-ham-pipe.pkl']

In [35]:
from joblib import load

m = load('spam-ham-pipe.pkl')

In [36]:
m.predict(['What is this messageg??'])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step


['Ham']

# Final Pipeline test

In [37]:
pred1 = m.predict(df['message'])

[1m175/175[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [38]:
set(pred1)

{'Ham'}