# Spam SMS Filtering
------------------------------
### *using LSTM and Convolution layers*

-----------------

**Necessary imports**

In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Data preparation

In [113]:
# Read the raw dataset: one SMS per line, formatted as "<label>\t<message>".
# The context manager guarantees the file handle is closed, and the explicit
# encoding avoids platform-dependent decoding (e.g. "£" showing up as "Â£").
with open('SMSSpamCollection.txt', 'r', encoding='utf-8') as file:
    datalines = file.readlines()

# Build two parallel lists: the raw sms text and its label ('spam' or 'ham')
text = []
label = []
for line in datalines:
    fields = line.rstrip('\n').split('\t')
    if len(fields) < 2:
        # blank or malformed line: skip it instead of failing with IndexError
        continue
    label.append(fields[0])
    text.append(fields[1])

# Print the first few sms texts with their labels (fewer if the file is short)
for sms_label, sms_text in list(zip(label, text))[:3]:
    print(sms_label)
    print(sms_text + '\n')

ham
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

ham
Ok lar... Joking wif u oni...

spam
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's



In [114]:
# Combine the parallel label/text lists into a DataFrame to get a better sense of the data
df = pd.DataFrame({'label': label, 'text': text})

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly: wrapping it in print() emits a stray "None" line.
df.info()
print('\nLabels counts:')
print(df.label.value_counts())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5574 non-null   object
 1   text    5574 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None

Labels counts:
ham     4827
spam     747
Name: label, dtype: int64


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


> 13.4% of the data is spam

We split our data into train and test sets, with 30% held out for testing. Because spam labels make up only a small proportion of the data, we stratify the split so that both partitions keep the same proportion of spam labels.

In [115]:
from sklearn.model_selection import train_test_split

# Encode the labels numerically: spam -> 1, ham -> 0
y = (df.label == 'spam').astype('int64').values

# Stratified 70/30 split: stratifying on y keeps the spam ratio equal in both partitions
text_train, text_test, y_train, y_test = train_test_split(
    text,
    y,
    test_size=0.3,
    stratify=y,
    random_state=12,
)

In [116]:
# Report the fraction of spam labels in each partition (they should match after stratification)
for caption, labels in (('%spam in train data:', y_train),
                        ('%spam in test data:', y_test)):
    print(caption, labels.sum()/len(labels))

%spam in train data: 0.1340681876441938
%spam in test data: 0.13389121338912133


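> Since about 86.6% of the messages are ham, a model that always predicts "ham" already reaches roughly 87% accuracy. A model whose test accuracy is at or below 87% is therefore not useful.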

# 2. Preprocessing text

Text must be tokenized and converted to sequences of numbers before it can be fed into a machine learning model.

In [117]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Create the tokenizer and fit it on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train) # builds the dictionary of tokens and their indexes (tokenizer.word_index) from text_train

# Transform each sms text in text_train into a sequence of token indexes
X_train = tokenizer.texts_to_sequences(text_train)

In [118]:
# Length of the longest tokenized training message; used below as the padding length
maxlen = np.max(list(map(len, X_train)))
maxlen

189

In [119]:
# in order to have the same sequence length in X_train, we pad these sequences with zeros to reach maxlen
X_train = pad_sequences(X_train, maxlen=maxlen)
# vocabulary size = number of distinct tokens learned from text_train, plus 1
# because index 0 is the padding value and is not assigned to any word.
# NOTE: no oov_token is configured on the tokenizer, so words unseen during
# fitting appear to be dropped from the sequences rather than mapped to index 0.
vocab_size = len(tokenizer.word_index)+1

In [120]:
# Apply the tokenizer fitted on the training texts to the test texts,
# then pad the resulting sequences to the same length as the training data
X_test = pad_sequences(tokenizer.texts_to_sequences(text_test), maxlen=maxlen)

# 3. Create a Neural Network model

## The model's architecture

In [124]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Conv1D, MaxPooling1D

# Build the network by handing the ordered list of layers to Sequential
model = Sequential([
    # "Embedding" comes first to give a dense representation of words:
    # every word index is mapped to a trainable vector of length 128.
    Embedding(input_dim=vocab_size,
              output_dim=128,
              trainable=True,
              input_length=maxlen),

    # The convolution extracts local features from windows of 3 consecutive
    # word vectors; max-pooling then halves the sequence length.
    Conv1D(filters=64, kernel_size=3, padding='same'),
    MaxPooling1D(pool_size=2),

    # The LSTM operates over the sequence of feature vectors, which can be
    # understood as interpreting words in their context. The two dropout
    # rates (10% each) apply to the inputs and to the recurrent state
    # respectively, to limit overfitting.
    LSTM(64, dropout=0.1, recurrent_dropout=0.1),

    Dense(32, activation='relu'),

    # Output layer: a single sigmoid unit giving the probability of spam
    Dense(1, activation='sigmoid'),
])

# Compile for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model's architecture
model.summary()

Model: "sequential_43"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_34 (Embedding)     (None, 189, 128)          951680    
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 189, 64)           24640     
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 94, 64)            0         
_________________________________________________________________
lstm_45 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_58 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_59 (Dense)             (None, 1)                 33        
Total params: 1,011,457
Trainable params: 1,011,457
Non-trainable params: 0
___________________________________________

## Generating and evaluating predictions 

For convenience, we create a *fit_evaluate* function that fits the model on the training data and evaluates it on the test data.

In [125]:
from sklearn.metrics import classification_report, confusion_matrix

def fit_evaluate(model, X_train, X_test, y_train, y_test, epochs=1):
    """Fit ``model`` on the training data and report its performance on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y, epochs=...)`` and ``predict(X)``
        ``predict`` is expected to return one spam probability per sample,
        shaped either ``(n,)`` or ``(n, 1)`` (a single output unit).
    X_train, y_train : training inputs and their 0/1 labels.
    X_test, y_test : test inputs and their 0/1 labels.
    epochs : int, default 1
        Number of training epochs passed to ``model.fit``.

    Returns
    -------
    list of int
        Predicted class for each test sample: 1 (spam) when the predicted
        probability is >= 0.5, otherwise 0 (not spam).

    Side effects: trains ``model`` in place and prints the confusion matrix
    and the classification report for the test set.
    """
    # fitting
    model.fit(X_train, y_train, epochs=epochs)

    # generating predictions i.e. a probability of being spam
    predictions_proba = model.predict(X_test)

    # generating class predictions 0 (not spam) or 1 (spam):
    # flatten to one probability per sample, threshold all of them at once,
    # and return a plain list of Python ints as before
    flat_proba = np.asarray(predictions_proba).ravel()
    predictions_class = (flat_proba >= 0.5).astype(int).tolist()

    # printing metrics about the classification
    print('\n-------------------------------------------------------')
    print('\n=============== Confusion Matrix ======================')
    print(confusion_matrix(y_test, predictions_class))
    print('\n============ Classification report ====================')
    print(classification_report(y_test, predictions_class))
    print('---------------------------------------------------------')

    return predictions_class

In [126]:
# Train the model for 2 epochs, print the test-set metrics, and keep the
# predicted classes (a list of 0/1 values) for the error analysis below
y_pred = fit_evaluate(model, X_train, X_test, y_train, y_test, epochs=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/2
Epoch 2/2

-------------------------------------------------------

[[1446    3]
 [  12  212]]

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1449
           1       0.99      0.95      0.97       224

    accuracy                           0.99      1673
   macro avg       0.99      0.97      0.98      1673
weighted avg       0.99      0.99      0.99      1673

---------------------------------------------------------


> The model performed very well: recall for the 'spam' class is 95%, which means that the model detects 95% of the actual spam SMSs. It also flagged 3 SMSs as spam that are actually ham.

# 4. Investigating wrong predictions

In [127]:
# Positions in the test set where the predicted class differs from the true label
er = np.where(y_test != np.asarray(y_pred))

# Raw sms texts of the misclassified messages
original_text = np.array(text_test)[er]

# The same messages as the model saw them: token sequences mapped back to words
tokenizer_processed_text = tokenizer.sequences_to_texts(X_test[er])

# Recap table of the wrong predictions: raw text, processed text and both labels
review = pd.DataFrame(
    {
        'original_text': original_text,
        'processed_text': tokenizer_processed_text,
        'actual_label': y_test[er],
        'pred_label': np.asarray(y_pred)[er],
    }
)

review.head()

Unnamed: 0,original_text,processed_text,actual_label,pred_label
0,Check Out Choose Your Babe Videos @ sms.shsex....,check out choose your babe sms,1,0
1,Sorry I missed your call let's talk when you h...,sorry i missed your call let's talk when you h...,1,0
2,They have a thread on the wishlist section of ...,they have a thread on the of the forums where ...,0,1
3,TBS/PERSOLVO. been chasing us since Sept forÂ£...,been chasing us since sept definitely not payi...,1,0
4,Hello darling how are you today? I would love ...,hello darling how are you today i would love t...,1,0


Below we can see in detail which SMS texts our model got wrong, along with their actual and predicted labels.

In [128]:
# Print every misclassified message in full, one field per paragraph
for position, (_, row) in enumerate(review.iterrows(), start=1):
    print('===============================',position,'===============================')
    for col in review.columns:
        print(col)
        print(row[col])
        print('\n')

original_text
Check Out Choose Your Babe Videos @ sms.shsex.netUN fgkslpoPW fgkslpo


processed_text
check out choose your babe sms


actual_label
1


pred_label
0


original_text
Sorry I missed your call let's talk when you have the time. I'm on 07090201529


processed_text
sorry i missed your call let's talk when you have the time i'm on


actual_label
1


pred_label
0


original_text
They have a thread on the wishlist section of the forums where ppl post nitro requests. Start from the last page and collect from the bottom up.


processed_text
they have a thread on the of the forums where ppl post requests start from the last page and collect from the bottom up


actual_label
0


pred_label
1


original_text
TBS/PERSOLVO. been chasing us since Sept forÂ£38 definitely not paying now thanks to your information. We will ignore them. Kath. Manchester.


processed_text
been chasing us since sept definitely not paying now thanks to your information we will ignore them


actual_label
1


pr