In [3]:
import tensorflow as tf
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Dense, LSTM, GlobalAveragePooling1D, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [5]:
# Load spam.csv into a DataFrame, decoding the file as ISO-8859-1.
df = pd.read_csv(
    filepath_or_buffer='spam.csv',
    encoding='ISO-8859-1',
)

In [6]:
# Show the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [7]:
# List the column names to see which ones the CSV actually provides.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [8]:
# Discard the three 'Unnamed' columns, keeping only v1 and v2.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [9]:
# Check the frame after the 'Unnamed' columns were dropped.
df.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Give the two remaining columns descriptive names.
# An explicit old->new mapping is used instead of positional assignment so the
# result does not depend on the column order, and re-running the cell is harmless.
df = df.rename(columns={'v1': 'Labels', 'v2': 'Data'})

In [11]:
# Check the frame under its new column names.
df.head(n=5)

Unnamed: 0,Labels,Data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# One-hot encode the ham/spam label and drop the first category, leaving a
# single indicator column (1 = spam, 0 = ham).
# dtype=int pins the indicator to integers: pandas >= 2.0 would otherwise
# return booleans, which is not what the downstream 0/1 labels expect.
labels = pd.get_dummies(df['Labels'], drop_first=True, dtype=int)
print(labels.shape)

(5572, 1)


In [None]:
# Alternative to the get_dummies/concat approach (kept for reference, not executed):
# df['b_labels'] = df['Labels'].map({'ham':0, 'spam':1})
# y = df['b_labels'].values

In [13]:
# Remove the string label column from the frame.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: executing it again does not raise a KeyError.
df = df.drop(columns=['Labels'], errors='ignore')
df.head()

Unnamed: 0,Data
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Place the encoded label column next to the message text (column-wise concat).
df = pd.concat(objs=[df, labels], axis='columns')

In [15]:
# Check the frame after the encoded label column was attached.
df.head(n=5)

Unnamed: 0,Data,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [18]:
# Name the two columns, in order: message text, then encoded label.
df.columns = pd.Index(['Data', 'Labels'])

In [19]:
# Separate the message text (features) from the encoded label (target).
X, y = df['Data'], df['Labels']
print("X SHAPE: {} Y SHAPE {}".format(*(series.shape for series in (X, y))))

X SHAPE: (5572,) Y SHAPE (5572,)


In [21]:
# Hold out 30% of the messages for testing.
# random_state fixes the shuffle so the split -- and everything fitted on it,
# e.g. the tokenizer vocabulary -- is reproducible across kernel restarts.
# stratify=y keeps the spam/ham ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [23]:
# Upper bound on the vocabulary size passed to the tokenizer.
MAX_VOCAB_SIZE = 20000

# Fit the word index on the training texts only, then encode both splits with it.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [24]:
# word -> integer index mapping built by the tokenizer; V is its size.
word2idx = tokenizer.word_index
print("Found {} unique tokens".format(len(word2idx)))
V = len(word2idx)

Found 7360 unique tokens


In [26]:
# Pad the training sequences to a common length; T is that length
# (second axis of the resulting array).
train_data = pad_sequences(sequences=sequences_train)
T = train_data.shape[1]
print("Training Data shape {}".format(train_data.shape))

Training Data shape (3900, 189)


In [28]:
# Pad/truncate the test sequences to the same length T as the training data.
test_data = pad_sequences(sequences=sequences_test, maxlen=T)
print("Test Data Shape {}".format(test_data.shape))

Test Data Shape (1672, 189)


In [32]:
# Hyperparameters.
D = 30  # embedding dimensionality
M = 20  # LSTM hidden units

# Functional-API model: Embedding -> LSTM (full sequence) -> average over time
# -> single sigmoid unit.
# The embedding table has V + 1 rows (one more than the tokenizer's word count).
# NOTE(review): mask_zero is not set on the Embedding, so padded timesteps appear
# to be fed to the LSTM and included in the average -- confirm this is intended.
i = Input(shape=(T,))
x = i
for layer in (
    Embedding(input_dim=V + 1, output_dim=D),
    LSTM(units=M, return_sequences=True),
    GlobalAveragePooling1D(),
    Dense(units=1, activation='sigmoid'),
):
    x = layer(x)

model = Model(inputs=i, outputs=x)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 189)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 189, 30)           220830    
_________________________________________________________________
lstm_1 (LSTM)                (None, 189, 20)           4080      
_________________________________________________________________
global_average_pooling1d_1 ( (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 21        
Total params: 224,931
Trainable params: 224,931
Non-trainable params: 0
_________________________________________________________________
