In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
import time
from keras import metrics
from sklearn import metrics as me
print('import done')

import done


In [2]:
df = pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])
print(df.head())

tags = df.v1
texts = df.v2

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
num_max = 1000
# preprocess
le = LabelEncoder()
tags = le.fit_transform(tags)
tok = Tokenizer(num_words=num_max)
tok.fit_on_texts(texts)
mat_texts = tok.texts_to_matrix(texts,mode='count')
print(tags[:5])
print(mat_texts[:5])
print(tags.shape,mat_texts.shape)

[0 0 1 0 0]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 3. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]]
(5572,) (5572, 1000)


In [4]:
# try a simple model first

def get_simple_model():
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(num_max,)))
    model.add(Dropout(0.2))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc',metrics.binary_accuracy])
    print('compile done')
    return model

In [5]:
# for cnn preproces
max_len = 100
cnn_texts_seq = tok.texts_to_sequences(texts)
print(cnn_texts_seq[0])
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_len)
print(cnn_texts_mat[0])
print(cnn_texts_mat.shape)

[50, 469, 841, 751, 657, 64, 8, 89, 121, 349, 147, 67, 58, 144]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0  50 469 841 751
 657  64   8  89 121 349 147  67  58 144]
(5572, 100)


In [6]:
def get_cnn_model_v1(max_len=100):   
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    # 1000 is num_max
    model.add(Embedding(1000,
                        20,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Conv1D(64,
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc', metrics.binary_accuracy])
    return model

In [7]:
def get_cnn_model_v2(max_len=100): # added embed   
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    # 1000 is num_max
    model.add(Embedding(1000,
                        50, #!!!!!!!!!!!!!!!!!!!!!!!
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Conv1D(64,
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc',metrics.binary_accuracy])
    return model

In [8]:
def get_cnn_model_v3(max_len=100):    # added filter
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    # 1000 is num_max
    model.add(Embedding(1000,
                        20,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Conv1D(256, #!!!!!!!!!!!!!!!!!!!
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    return model

In [9]:
from sklearn.model_selection import train_test_split
# prepare the dataset
def get_data(train, test):
    # generate dataset
    X, y = train, test
    trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.3, random_state=43)
    return trainX, trainy, testX, testy

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_metrics(test, pred):
    print("Metrics Measures")
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(test, pred)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(test, pred)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(test, pred)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(test, pred)
    print('F1 score: %f' % f1)

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
def check_model(model, x, y):
    callback = ModelCheckpoint(filepath='best_model.hdf5', monitor='loss', mode='min')
    model.fit(x, y, batch_size=32, epochs=20, verbose=0, callbacks=callback)
    

trainX, trainy, testX, testy = get_data(mat_texts, tags)
m = get_simple_model()
check_model(m, trainX, trainy)
pred = m.predict_classes(testX)
pred = pred[:, 0]

get_metrics(testy, pred)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               512512    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 644,097
Trainable params: 644,097
Non-trainable params: 0
_________________________________________________________________
compile done
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model

In [12]:
#CNN 1
trainX, trainy, testX, testy = get_data(cnn_texts_mat, tags)
m = get_cnn_model_v1()
check_model(m, trainX, trainy)
pred = m.predict_classes(testX)
pred = pred[:, 0]

get_metrics(testy, pred)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 20)           20000     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 20)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 98, 64)            3904      
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               16640     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation (Activation)      (None, 256)              

In [13]:
#CNN 2
trainX, trainy, testX, testy = get_data(cnn_texts_mat, tags)
m = get_cnn_model_v2()
check_model(m, trainX, trainy)
pred = m.predict_classes(testX)
pred = pred[:, 0]

get_metrics(testy, pred)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           50000     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 50)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 64)            9664      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               16640     
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation_2 (Activation)    (None, 256)              

In [14]:
#CNN 3
trainX, trainy, testX, testy = get_data(cnn_texts_mat, tags)
m = get_cnn_model_v3()
check_model(m, trainX, trainy)
pred = m.predict_classes(testX)
pred = pred[:, 0]

get_metrics(testy, pred)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 20)           20000     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100, 20)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 98, 256)           15616     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
activation_4 (Activation)    (None, 256)              

With more finetuning, i would get better f1-scores