In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
%matplotlib inline
import seaborn as sns
sns.set(style = "whitegrid", 
        color_codes = True,
        font_scale = 1.5)
from keras.models import load_model, Model
from keras.layers import SimpleRNN, Dense, Input, Dropout, LSTM, Activation, Embedding, Bidirectional
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('punkt')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# NOTE(review): bs4 is already imported by the first cell, so on a fresh kernel
# this install only helps if it runs before that import — confirm the ordering.
!pip install beautifulsoup4



In [0]:
# Mount Google Drive at /content/drive; the CSV and GloVe files read by later
# cells live under '/content/drive/My Drive/Colab Notebooks/data/'.
# Mounting is interactive: it asks for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Load the labelled training set and the evaluation set from Drive. The email
# body of each is lower-cased right away as the first text-processing step.
original_training_data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/train.csv'
).assign(email=lambda frame: frame['email'].str.lower())

evaluation = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/data/eval.csv'
).assign(email=lambda frame: frame['email'].str.lower())

In [0]:
# View the CSV
original_training_data.head()

Unnamed: 0,id,subject,email,spam
0,0,Subject: A&L Daily to be auctioned in bankrupt...,url: http://boingboing.net/#85534171\n date: n...,0
1,1,"Subject: Wired: ""Stronger ties between ISPs an...",url: http://scriptingnews.userland.com/backiss...,0
2,2,Subject: It's just too small ...,<html>\n <head>\n </head>\n <body>\n <font siz...,1
3,3,Subject: liberal defnitions\n,depends on how much over spending vs. how much...,0
4,4,Subject: RE: [ILUG] Newbie seeks advice - Suse...,hehe sorry but if you hit caps lock twice the ...,0


In [0]:
# Hold out 10% of the labelled data; the fixed random_state keeps the split reproducible.
train, test = train_test_split(
    original_training_data,
    test_size=0.1,
    random_state=42,
)

In [0]:
# English stopword list from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(x):
  """Return ``x`` with English stopwords removed.

  ``x`` is split with NLTK's ``word_tokenize``; every token that is not an
  exact (case-sensitive) member of ``stop_words`` is kept, and the kept
  tokens are joined back together with single spaces.
  """
  return ' '.join(token for token in word_tokenize(x) if token not in stop_words)

In [0]:
def clean_data(data, parser="html.parser"):
  """Return a cleaned copy of ``data`` ready for tokenization.

  The input frame itself is not modified. On the copy:

  * ``subject``: missing values become "" and the literal text "Subject:"
    is removed.
  * ``email``: missing values become "", HTML markup is stripped with
    BeautifulSoup, surrounding whitespace is trimmed, and English stopwords
    are dropped via ``remove_stopwords``.

  Args:
    data: DataFrame with ``subject`` and ``email`` columns.
    parser: Parser name handed to BeautifulSoup. It is passed explicitly so
      the extracted text does not depend on which optional parsers happen to
      be installed, and so bs4 does not warn about guessing a parser for
      every email. "html.parser" ships with Python; pass "lxml" to use that
      parser instead when it is installed.

  Returns:
    A new DataFrame with the same columns as ``data``.
  """
  df = data.copy()
  df['subject'] = (
      df['subject']
      .fillna("")
      .str.replace("Subject:", "", regex=False)
  )
  df['email'] = (
      df['email']
      # A missing body would otherwise make BeautifulSoup raise on a float NaN.
      .fillna("")
      .apply(lambda text: BeautifulSoup(text, parser).get_text().strip())
      .apply(remove_stopwords)
  )
  return df


In [0]:
clean_train = clean_data(train)
clean_test = clean_data(test)

In [0]:
clean_train.email.map(len).max()

299735

In [0]:
# Pull the cleaned email text (model input) and the spam label (target) out of
# each split as plain NumPy arrays.
training_sentences, training_labels = (
    clean_train[column].to_numpy() for column in ('email', 'spam')
)
testing_sentences, testing_labels = (
    clean_test[column].to_numpy() for column in ('email', 'spam')
)

In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenizer / model hyper-parameters.
vocab_size = 500      # passed to Tokenizer as num_words (cap on the word indices kept)
embedding_dim = 100   # width of each word vector
max_length = 500      # every sequence is padded / truncated to this many tokens
trunc_type = 'post'   # truncation mode handed to pad_sequences
oov_tok = '<OOV>'     # placeholder token for words outside the kept vocabulary

# Build the vocabulary from the training emails only.
tokenizer = Tokenizer(oov_token = oov_tok, num_words = vocab_size)
tokenizer.fit_on_texts(training_sentences)

# Dictionary mapping every word seen during fitting to its integer index.
word_index = tokenizer.word_index

# Encode both splits with the same fitted tokenizer.
sequences = tokenizer.texts_to_sequences(training_sentences)
test_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Bring every sequence to exactly max_length entries.
padded = pad_sequences(sequences, truncating = trunc_type, maxlen = max_length)
testing_padded = pad_sequences(test_sequences, truncating = trunc_type, maxlen = max_length)

print('Found %s unique tokens.' % len(word_index))

Found 102506 unique tokens.


Let's examine the words that the tokenizer found.
Unfortunately, because of the way the Keras Tokenizer works, word_index contains an entry for every unique word, regardless of vocab_size. The limit is only applied later, when the tokenizer converts text to sequences: only words whose index is below vocab_size (500) are kept, and every other word is mapped to the <OOV> token.
So for now, as a quick solution, we can filter the words by index to get the most frequent ones.

In [0]:
# Tokenizer(num_words = vocab_size) only keeps indices strictly below
# vocab_size when it encodes text (anything higher is replaced by <OOV>), so
# the words the model actually sees are indices 1 .. vocab_size - 1.
for word, index in word_index.items():
  if index < vocab_size:
    print(word, index)

<OOV> 1
e 2
n 3
'' 4
r 5
1 6
0 7
http 8
com 9
c 10
's 11
09 12
l 13
f 14
h 15
20 16
p 17
2 18
w 19
www 20
b 21
n't 22
g 23
list 24
net 25
2002 26
3 27
one 28
get 29
u 30
email 31
5 32
' 33
linux 34
free 35
new 36
4 37
mail 38
time 39
use 40
lists 41
would 42
people 43
like 44
000 45
click 46
3d 47
html 48
v 49
content 50
us 51
font 52
3e 53
6 54
listinfo 55
information 56
users 57
7 58
d 59
00 60
message 61
8 62
please 63
10 64
also 65
text 66
make 67
mailing 68
k 69
color 70
business 71
spamassassin 72
x 73
9 74
ie 75
web 76
mailman 77
0d 78
org 79
money 80
z 81
want 82
could 83
see 84
work 85
wrote 86
first 87
may 88
exmh 89
software 90
way 91
address 92
internet 93
'm 94
rpm 95
world 96
've 97
know 98
size 99
sourceforge 100
even 101
send 102
type 103
need 104
said 105
system 106
home 107
date 108
news 109
name 110
spam 111
much 112
22 113
using 114
find 115
razor 116
go 117
many 118
well 119
right 120
company 121
3c 122
think 123
url 124
s 125
'll 126
're 127
good 128
file 129
grou

In [0]:
# Parse the pre-trained GloVe file into a {word: vector} lookup.
# Each non-empty line is "<word> <v1> <v2> ... <vN>", separated by whitespace.
embeddings_index = {}
# `with` closes the file even if a line fails to parse; the explicit encoding
# keeps decoding independent of the platform's default locale.
with open('/content/drive/My Drive/Colab Notebooks/data/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [0]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i. The tokenizer numbers words from 1 (<OOV> itself is index 1), so the
# "+ 1" adds a row for index 0, which is never assigned to a word (it is what
# pad_sequences pads with by default).
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [0]:
# Embedding lookup initialised from embedding_matrix and frozen
# (trainable = False), so its weights are not updated during training.
embedding_layer = Embedding(
    input_dim = len(word_index) + 1,
    output_dim = embedding_dim,
    weights = [embedding_matrix],
    input_length = max_length,
    trainable = False,
)

In [0]:
# Functional-API graph: integer token ids -> embedding_layer -> two stacked
# bidirectional LSTMs -> dense ReLU layer -> single sigmoid output unit.
tokenized_mail = Input(shape = (max_length,), dtype = 'int32')
embedded_sequences = embedding_layer(tokenized_mail)

# Layers are applied in list order. The first LSTM returns its full output
# sequence so the second LSTM has a sequence to consume.
# Dropout(rate = 0.5) after the LSTM / hidden Dense layers is left disabled.
layer_stack = [
    Bidirectional(LSTM(64, return_sequences = True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
]
x = embedded_sequences
for layer in layer_stack:
    x = layer(x)

model = Model(inputs = tokenized_mail, outputs = x)

In [0]:
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 500, 100)          10250700  
_________________________________________________________________
bidirectional_7 (Bidirection (None, 500, 128)          84480     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 10,380,621
Trainable params: 129,921
Non-trainable params: 10,250,700
_________________________________________

In [0]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# Train for 5 epochs on the padded training sequences; the padded held-out
# split is passed as validation_data for per-epoch validation.
model.fit(
    x = padded,
    y = training_labels,
    batch_size = 256,
    epochs = 5,
    validation_data = (testing_padded, testing_labels),
)

Train on 7513 samples, validate on 835 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe024386710>

In [0]:
model.save('my_model.h5')

In [0]:
from google.colab import files

files.download('my_model.h5')

In [0]:
# Run the evaluation emails through the same cleaning -> tokenizing -> padding
# steps as the training data, reusing the tokenizer fitted on the training set.
clean_eval = clean_data(evaluation)
eval_sentences = clean_eval['email'].to_numpy()
eval_sequences = tokenizer.texts_to_sequences(eval_sentences)
eval_padded = pad_sequences(
    eval_sequences,
    truncating = trunc_type,
    maxlen = max_length,
)

In [0]:
clean_eval.head()

Unnamed: 0,id,subject,email
0,0,CERT Advisory CA-2002-21 Vulnerability in PHP\n,-- -- -begin pgp signed message -- -- - cert a...
1,1,ADV: Affordable Life Insurance ddbfk\n,low-cost term-life insurance ! save 70 % term ...
2,2,CAREER OPPORTUNITY. WORK FROM HOME\n,-- -- -- =_nextpart_000_00a0_03e30a1a.b1804b54...
3,3,Marriage makes both sexes happy\n,"url : http : //www.newsisfree.com/click/-3,848..."
4,4,Re: [SAtalk] SA very slow (hangs?) on this me...,thursday 29 august 2002 16:39 cet mike burger ...


In [0]:
predicted_labels = model.predict(eval_padded)

In [0]:
predicted_labels[:5]

array([[0.00117591],
       [0.99175394],
       [0.94528234],
       [0.04643938],
       [0.00165203]], dtype=float32)

In [0]:
evaluation_predictions = np.where(predicted_labels >= 0.5, 1, 0)

In [0]:
evaluation_predictions.shape

(1000, 1)

In [0]:
evaluation_predictions = np.squeeze(evaluation_predictions)

In [0]:
evaluation_predictions.shape

(1000,)

In [0]:
# Sanity checks on the predictions before they are written out.

# They must be held in a NumPy array ...
assert isinstance(evaluation_predictions, np.ndarray)

# ... contain only hard 0 / 1 labels (not probabilities) ...
assert np.isin(evaluation_predictions, (0, 1)).all()

# ... and be a flat vector with exactly 1000 entries.
assert evaluation_predictions.shape == (1000,)

In [0]:
from datetime import datetime

# Writes the evaluation-set predictions to a timestamped CSV with the columns
# Id and Class. The predictions are expected in a 1-dimensional array called
# evaluation_predictions; the cell may be modified as long as the CSV keeps
# this format.

# Validate the predictions first: an ndarray ...
assert isinstance(evaluation_predictions, np.ndarray)
# ... of binary labels (0 or 1), not probabilities ...
assert np.isin(evaluation_predictions, (0, 1)).all()
# ... with the right number of entries.
assert evaluation_predictions.shape == (1000,)

# Construct the submission frame.
submission_df = pd.DataFrame(
    {"Id": evaluation['id'], "Class": evaluation_predictions},
    columns=['Id', 'Class'],
)

# Second-resolution ISO timestamp, so repeated runs get distinct file names.
timestamp = datetime.now().isoformat(timespec="seconds")
submission_file_name = "submission_{}.csv".format(timestamp)
submission_df.to_csv(submission_file_name, index=False)

print('Created a CSV file: {}.'.format(submission_file_name))
print('You may now upload this CSV file to Kaggle for scoring.')

Created a CSV file: submission_2020-03-12T22:09:36.csv.
You may now upload this CSV file to Kaggle for scoring.


In [0]:
from google.colab import files

files.download(submission_file_name)