# Spam Ham Detection Using CNN
Here we will try to implement Spam ham Detection model Using Convolutional Neural Network

In [60]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [61]:
df = pd.read_csv('Data\\smsspamcollection.tsv', sep='\t')

In [62]:
label = df['label']

In [63]:
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [64]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [65]:
df['label'] = df['label'].map({'ham':0, 'spam':1})

In [66]:
df.head()

Unnamed: 0,label,message,length,punct
0,0,"Go until jurong point, crazy.. Available only ...",111,9
1,0,Ok lar... Joking wif u oni...,29,6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,0,U dun say so early hor... U c already then say...,49,6
4,0,"Nah I don't think he goes to usf, he lives aro...",61,2


In [67]:
df.drop(['length', 'punct'], axis=1, inplace=True)

In [68]:
X = df['message']
y = df['label']

In [69]:
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

In [71]:
from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, max_vocab_size=20000, max_len=None):
        self.max_vocab_size = max_vocab_size
        self.max_len = None
        self.tokenizer = Tokenizer(num_words=self.max_vocab_size)
    
    def fit(self, X, y=None):
        print("Text: ", X)
        self.tokenizer.fit_on_texts(X)
        self.word_index = self.tokenizer.word_index
        self.max_len = len(self.word_index)
        self.vocab_size = len(self.word_index)
        if self.max_len is None:
            self.max_len = max(len(seq) for seq in self.tokenizer.texts_to_sequences(X))
        return self
    
    def transform(self, X, y=None):
        print("Inside tranform function")
        seqs = self.tokenizer.texts_to_sequences(X)
        X_padded = pad_sequences(seqs, maxlen=self.max_len)
        print("X_padded: ", X_padded)
        return X_padded



In [72]:
from sklearn.pipeline import Pipeline

In [73]:
preprocess_pipe = Pipeline([
    ('preprocess' , TextPreprocessor())
])

In [74]:
X_train

2632    URGENT! Your mobile No 077xxx WON a £2,000 Bon...
5435                    I'm wif him now buying tix lar...
1904    Free entry in 2 a weekly comp for a chance to ...
4345    Hi:)did you asked to waheeda fathima about leave?
2339                            Alright, see you in a bit
                              ...                        
2589    My superior telling that friday is leave for a...
5158    I will come with karnan car. Please wait till ...
5054                          Lmao you know me so well...
1169    Ok . . now i am in bus. . If i come soon i wil...
945     I sent my scores to sophas and i had to do sec...
Name: message, Length: 3733, dtype: object

In [75]:
processor = preprocess_pipe.fit(X_train)

Text:  2632    URGENT! Your mobile No 077xxx WON a £2,000 Bon...
5435                    I'm wif him now buying tix lar...
1904    Free entry in 2 a weekly comp for a chance to ...
4345    Hi:)did you asked to waheeda fathima about leave?
2339                            Alright, see you in a bit
                              ...                        
2589    My superior telling that friday is leave for a...
5158    I will come with karnan car. Please wait till ...
5054                          Lmao you know me so well...
1169    Ok . . now i am in bus. . If i come soon i wil...
945     I sent my scores to sophas and i had to do sec...
Name: message, Length: 3733, dtype: object


In [76]:
processor

In [77]:
from joblib import dump

dump(processor, 'Preprocessing.joblib')

['Preprocessing.joblib']

In [78]:
from joblib import load

process_new_pipe = load('Preprocessing.joblib')

In [79]:
X_train = process_new_pipe.transform(X_train)

Inside tranform function
X_padded:  [[   0    0    0 ... 1000 1819  374]
 [   0    0    0 ...  813 3420  300]
 [   0    0    0 ...   12  452  242]
 ...
 [   0    0    0 ...   10   23  120]
 [   0    0    0 ...   59 1444  150]
 [   0    0    0 ...  975 1403 1137]]


In [80]:
X_test = process_new_pipe.transform(X_test)

Inside tranform function
X_padded:  [[   0    0    0 ... 3138  586  325]
 [   0    0    0 ...   14 1333  128]
 [   0    0    0 ...   26  588  166]
 ...
 [   0    0    0 ...   47 5212 1140]
 [   0    0    0 ...  888   98   89]
 [   0    0    0 ...    3 3108  112]]


In [81]:
X_train

array([[   0,    0,    0, ..., 1000, 1819,  374],
       [   0,    0,    0, ...,  813, 3420,  300],
       [   0,    0,    0, ...,   12,  452,  242],
       ...,
       [   0,    0,    0, ...,   10,   23,  120],
       [   0,    0,    0, ...,   59, 1444,  150],
       [   0,    0,    0, ...,  975, 1403, 1137]])

In [82]:
X_test

array([[   0,    0,    0, ..., 3138,  586,  325],
       [   0,    0,    0, ...,   14, 1333,  128],
       [   0,    0,    0, ...,   26,  588,  166],
       ...,
       [   0,    0,    0, ...,   47, 5212, 1140],
       [   0,    0,    0, ...,  888,   98,   89],
       [   0,    0,    0, ...,    3, 3108,  112]])

In [83]:
V = X_train.shape[1]
V

7345

In [84]:
print(f"X_train: ({X_train.shape}), X_test: {X_test.shape}), y_train: ({y_train.shape}), y_test: ({y_test.shape})")

X_train: ((3733, 7345)), X_test: (1839, 7345)), y_train: ((3733,)), y_test: ((1839,))


In [85]:
T = V
T

7345

In [86]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model

In [87]:
# Embedding dimension
D = 20

i = Input(shape=(T,))
x = Embedding(V+1, D)(i)
x = Conv1D(32, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(64, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(1, activation='sigmoid')(x)

model = Model(i, x)

In [88]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [89]:
model.summary()

In [90]:
r = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Epoch 1/3
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 222ms/step - accuracy: 0.8769 - loss: 0.4620 - val_accuracy: 0.8635 - val_loss: 0.2956
Epoch 2/3
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 213ms/step - accuracy: 0.9014 - loss: 0.2086 - val_accuracy: 0.9755 - val_loss: 0.0850
Epoch 3/3
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 216ms/step - accuracy: 0.9901 - loss: 0.0382 - val_accuracy: 0.9804 - val_loss: 0.0861


# Evaluation

In [91]:
print(X_test)

[[   0    0    0 ... 3138  586  325]
 [   0    0    0 ...   14 1333  128]
 [   0    0    0 ...   26  588  166]
 ...
 [   0    0    0 ...   47 5212 1140]
 [   0    0    0 ...  888   98   89]
 [   0    0    0 ...    3 3108  112]]


In [92]:
pred = model.predict(X_test)

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step


In [93]:
pred

array([[3.3903876e-04],
       [3.7712365e-07],
       [9.9983692e-01],
       ...,
       [3.3207959e-04],
       [6.9624530e-03],
       [4.7470683e-05]], dtype=float32)

In [94]:
threshold = 0.5
pred = (pred >= threshold).astype(int)

In [95]:
pred

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [96]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [97]:
confusion_matrix(y_test, pred)

array([[1586,    2],
       [  34,  217]], dtype=int64)

In [98]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1588
           1       0.99      0.86      0.92       251

    accuracy                           0.98      1839
   macro avg       0.98      0.93      0.96      1839
weighted avg       0.98      0.98      0.98      1839



In [99]:
accuracy_score(y_test, pred)

0.9804241435562806

# Pipeline

In [100]:
class KerasClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y):
        print("Inside Keras Clasifier fit")
        # Next time I will do training of model here
        return self 

    def label(self, predictions):
        # ans = ["Spam" if pred_i[0]==1 else "Ham" for pred_i in predictions]
        ans = []
        print("Inside label")
        for pred_i in predictions:
            print(pred_i)
            if pred_i == 1:
                ans.append("Spam")
            else:
                ans.append("Ham")
                
        return ans
    
    def predict(self, X):
        print(X)
        predictions = self.model.predict(X)
        pred = (predictions >= 0.5).astype(int)
        print("Predictions: ", predictions)
        return self.label(pred)

In [101]:
from sklearn.pipeline import Pipeline

model_pipeline_spHam = Pipeline([
    ('Predictor', KerasClassifier(model))
])



# Manual Test

In [102]:
padd = process_new_pipe.transform(['free!!!! you have won a lottery of 2000 dollars!'])
padd

Inside tranform function
X_padded:  [[   0    0    0 ...   15 2703 1647]]


array([[   0,    0,    0, ...,   15, 2703, 1647]])

In [103]:
model_pipeline_spHam.predict(padd)

[[   0    0    0 ...   15 2703 1647]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Predictions:  [[0.70377225]]
Inside label
[1]


['Spam']

# Save Model

In [104]:
from joblib import Parallel, delayed 
import joblib 

In [105]:
joblib.dump(model_pipeline_spHam, 'spam-ham-pipe.joblib') 

['spam-ham-pipe.joblib']