# Binary Classification with Convolutional Neural Network and Word Embedding

In [1]:
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
import os
import numpy

# Dataset locations and labels.
# Directories holding the training documents of class A (ham) and class B
# (spam); every file inside them is loaded as one document.
classA_path = './enron/enron_train/ham'
classB_path = './enron/enron_train/spam'
# File the filtered vocabulary is saved to, and file the trained model is
# saved to.
vocabulary_path = './enron/enron_train/vocab'
model_path = './enron/enron_train/model.h5'
# Class names used in printed reports and returned by the classifier helper.
classA = 'HAM'
classB = 'SPAM'

# Tokens occurring fewer than this many times in the dataset are left out of
# the vocabulary.
min_occurrence = 5

def load_doc(filename):
    """Return the full text of ``filename``, or ``''`` if it cannot be decoded.

    A file whose bytes are not valid in the default text encoding is deleted
    from disk, so it is not picked up again on later passes over the dataset
    directory, and an empty string is returned in its place.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file contents as a string, or ``''`` when the file was undecodable
        (and has therefore been removed).

    Raises:
        OSError: If the file cannot be opened, read or removed.
    """
    try:
        # The context manager closes the handle on every path, so the file is
        # no longer open by the time it is removed below.
        with open(filename, 'r') as file:
            return file.read()
    except UnicodeDecodeError:
        # Only decoding failures trigger the deletion; anything else (for
        # example an interrupt) must not destroy dataset files.
        os.remove(filename)
        return ''

def clean_doc(doc):
    """Turn raw document text into a list of filtered word tokens.

    The text is split on whitespace and punctuation characters are stripped
    from every piece. A piece is kept only if it is purely alphabetic, is not
    an English stop word, and is between 2 and 14 characters long.

    Args:
        doc: Raw document text.

    Returns:
        The list of surviving tokens, in document order.
    """
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    english_stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = punctuation.sub('', raw)
        # drop anything that is not purely alphabetic
        if not word.isalpha():
            continue
        # drop stop words
        if word in english_stop_words:
            continue
        # drop single characters and overly long tokens
        if 1 < len(word) < 15:
            cleaned.append(word)
    return cleaned

# Sanity check: load a single spam message and print its cleaned tokens
filename = './enron/enron_train/spam/0002.2001-05-25.SA_and_HP.spam.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['Subject', 'fw', 'solution', 'mentioned', 'lsc', 'oo', 'thank', 'email', 'address', 'obtained', 'purchased', 'list', 'reference', 'mid', 'wish', 'unsubscribe', 'list', 'please', 'click', 'enter', 'name', 'remove', 'box', 'previously', 'unsubscribed', 'still', 'receiving', 'message', 'may', 'email', 'abuse', 'control', 'center', 'call', 'write', 'us', 'nospam', 'coral', 'way', 'miami', 'fl', 'web', 'credit', 'inc', 'rights', 'reserved']


In [2]:
def add_doc_to_vocab(filename, vocab):
    """Add the cleaned tokens of one file to ``vocab`` (updated in place).

    Args:
        filename: Path of the document to load.
        vocab: Counter-like object whose ``update`` receives the token list.
    """
    vocab.update(clean_doc(load_doc(filename)))

def process_docs(directory, vocab):
    """Feed every file found in ``directory`` into the ``vocab`` counter.

    Args:
        directory: Directory whose entries are each loaded as one document.
        vocab: Counter updated in place with the tokens of every document.
    """
    for entry in listdir(directory):
        add_doc_to_vocab(directory + '/' + entry, vocab)

# Token counters: one per class, plus the combined one computed further down
vocab = Counter()
vocab_A = Counter()
vocab_B = Counter()
# Fill each class counter from all the documents of that class
process_docs(classA_path, vocab_A)
# 50 most common class A tokens as a string. This has to read vocab_A: the
# combined `vocab` counter is still empty here, so reading it always gave '[]'.
frec_classA = str(vocab_A.most_common(50))
process_docs(classB_path, vocab_B)
# Most frequent words and vocabulary size for each class
print(f'* {classA} with vocabulary length {len(vocab_A)} as (word, frecuency): \n {vocab_A.most_common(75)} \n')
print(f'* {classB} with vocabulary length {len(vocab_B)} as (word, frecuency): \n {vocab_B.most_common(75)} \n')
# Combined vocabulary: per-token counts of both classes added together
vocab = vocab_A + vocab_B
print(f'* dataset with vocabulary length {len(vocab)} as (word, frecuency): \n {vocab.most_common(75)}')

* HAM with vocabulary length 41681 as (word, frecuency): 
 [('enron', 47467), ('ect', 23914), ('Subject', 12184), ('hou', 11377), ('company', 10885), ('said', 10342), ('please', 8646), ('gas', 8372), ('would', 8364), ('com', 8123), ('new', 8119), ('subject', 7847), ('energy', 7806), ('deal', 6346), ('pm', 6144), ('corp', 5820), ('power', 5772), ('business', 5416), ('cc', 5375), ('trading', 5358), ('may', 4911), ('dynegy', 4835), ('know', 4743), ('also', 4689), ('thanks', 4657), ('message', 4602), ('time', 4595), ('sent', 4478), ('year', 4478), ('one', 4453), ('market', 4385), ('billion', 4284), ('stock', 4255), ('houston', 4227), ('financial', 4132), ('need', 4100), ('week', 3931), ('million', 3884), ('credit', 3778), ('information', 3718), ('original', 3704), ('price', 3682), ('inc', 3637), ('us', 3631), ('louise', 3604), ('day', 3559), ('last', 3559), ('get', 3519), ('jones', 3458), ('could', 3442), ('let', 3330), ('call', 3300), ('dow', 3198), ('questions', 3195), ('data', 3166), ('

In [3]:
# Keep only the tokens seen at least `min_occurrence` times in the dataset
tokens = [word for word, count in vocab.items() if count >= min_occurrence]
# Report how many words the threshold removed
print(f' From {len(vocab)} to {len(tokens)}: {len(vocab)-len(tokens)} words eliminated')

 From 85561 to 25479: 60082 words eliminated


In [4]:
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    The items are joined with newline characters, so no trailing newline is
    written. An existing file is overwritten.

    Args:
        lines: Iterable of strings to store.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # merge the lines into a single block of text
    data = '\n'.join(lines)
    # The context manager closes the file even if the write fails.
    with open(filename, 'w') as file:
        file.write(data)
    
# Persist the filtered tokens to the vocabulary file
save_list(tokens, vocabulary_path)

## Word embedding layer

In [5]:
from keras.models import load_model
import string
import re
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


In [6]:
def load_doc(filename):
    """Return the full text of ``filename``, or ``''`` if it cannot be decoded.

    A file whose bytes are not valid in the default text encoding is deleted
    from disk, so it is not picked up again on later passes over the dataset
    directory, and an empty string is returned in its place.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file contents as a string, or ``''`` when the file was undecodable
        (and has therefore been removed).

    Raises:
        OSError: If the file cannot be opened, read or removed.
    """
    try:
        # The context manager closes the handle on every path, so the file is
        # no longer open by the time it is removed below.
        with open(filename, 'r') as file:
            return file.read()
    except UnicodeDecodeError:
        # Only decoding failures trigger the deletion; anything else (for
        # example an interrupt) must not destroy dataset files.
        os.remove(filename)
        return ''

In [7]:
def clean_doc(doc, vocab):
    """Reduce ``doc`` to its in-vocabulary words, joined by single spaces.

    The text is split on whitespace, punctuation characters are removed from
    each piece, and only pieces that are members of ``vocab`` are kept.

    Args:
        doc: Raw document text.
        vocab: Container of accepted words (membership is tested with ``in``).

    Returns:
        A single string with the kept words in document order.
    """
    strip_punctuation = re.compile('[%s]' % re.escape(string.punctuation)).sub
    words = (strip_punctuation('', piece) for piece in doc.split())
    return ' '.join(word for word in words if word in vocab)

In [8]:
def process_docs(directory, vocab, is_train):
    """Load and clean every document found in ``directory``.

    Documents whose raw text has 2000 or more whitespace-separated tokens are
    left out. ``is_train`` is accepted but not used by this function.

    Args:
        directory: Directory whose entries are each loaded as one document.
        vocab: Vocabulary passed on to ``clean_doc``.
        is_train: Unused.

    Returns:
        A list with one cleaned, space-joined string per kept document.
    """
    kept = []
    for entry in listdir(directory):
        raw = load_doc(directory + '/' + entry)
        cleaned = clean_doc(raw, vocab)
        # skip documents that are too long (2000 raw tokens or more)
        if len(raw.split()) >= 2000:
            continue
        kept.append(cleaned)
    return kept

In [9]:
def load_clean_dataset(vocab, is_train):
    """Load the documents of both classes and build the matching labels.

    Args:
        vocab: Vocabulary passed on to ``process_docs``.
        is_train: Passed on to ``process_docs`` unchanged.

    Returns:
        A ``(docs, labels)`` pair. ``docs`` lists the class A documents
        followed by the class B documents; ``labels`` is a NumPy array holding
        0 for every class A document and 1 for every class B document, in the
        same order.
    """
    docs_a = process_docs(classA_path, vocab, is_train)
    docs_b = process_docs(classB_path, vocab, is_train)
    # labels follow the order in which the documents are concatenated
    labels = array([0] * len(docs_a) + [1] * len(docs_b))
    return docs_a + docs_b, labels

**Tokenizer()**: This class vectorizes a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector whose coefficient for each token can be binary, based on word count, based on tf-idf, etc. (https://keras.io/preprocessing/text/).

Tokenizer().**fit_on_texts**(texts): `texts` is the list of texts to train on.

In [10]:
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` fitted on the texts in ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [11]:
def encode_docs(tokenizer, max_length, docs):
    """Encode ``docs`` as integer sequences padded to ``max_length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every sequence is brought to.
        docs: List of texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

# Model

In [12]:
def define_model(vocab_size, max_length):
    """Build and compile the embedding + 1D convolution classifier.

    The network stacks an embedding (100 dimensions), a convolution with 32
    filters of width 8, max pooling, flattening, a 10-unit dense layer and a
    single sigmoid output. A summary is printed and a diagram is written to
    ``model.png``.

    Args:
        vocab_size: Number of rows of the embedding layer.
        max_length: Length of the input sequences.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    stack = (
        Embedding(vocab_size, 100, input_length=max_length),
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    )
    for layer in stack:
        model.add(layer)
    # binary classification: cross-entropy loss, accuracy reported
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # textual summary plus a diagram on disk
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [13]:
# Read the saved vocabulary file and split it into a set of words
vocab = load_doc(vocabulary_path)
vocab = set(vocab.split())
print(vocab)



In [14]:
# Load the training data: class A samples come first, then class B samples
train_docs, ytrain = load_clean_dataset(vocab, True)
# First sample (class A) together with its label
print(f'1st doc sample: {train_docs[0]} --> {ytrain[0]} \n')
# Last sample (class B) together with its label
print(f'Last doc sample: {train_docs[len(train_docs)-1]} --> {ytrain[len(train_docs)-1]}')

1st doc sample: Subject ena sales hpl update project status based new report scott mills ran sitara come following counterparties ones ena selling gas hpl pipe transaction gulf gas utilities company brazoria city panther pipeline inc central illinois light company praxair inc central power light company reliant energy entex ces equistar chemicals lp reliant energy hl corpus christi gas marketing lp southern union company gas company inc texas utilities fuel company duke energy field services inc txu gas distribution entex gas marketing company union carbide corporation equistar chemicals lp unit gas transmission company inc since sure exactly gets entered sitara pat clynes suggested check daren farmer make sure missing something waiting response mary smith begin gathering contractual volumes contracts forwarded cheryl dudley hou ect cheryl king pm sent cheryl dudley daren farmer hou ect ect mary smith hou ect ect cc subject ena sales hpl working project brenda herod wondering one could

In [15]:
# Fit the Keras tokenizer on the cleaned training documents
tokenizer = create_tokenizer(train_docs)

**Example** illustrating the tokenizer methods used above:

In [16]:
# Toy example: limit the encoded output to the `num_words` most frequent words
num_words = 3
# The +1 is needed so that indices 1..num_words are all retained
tk = Tokenizer(num_words=num_words+1)
texts = ["my name is Guzman", "my name is Charls","your name is Chin Yao"]
# Build the word index from the sample texts
tk.fit_on_texts(texts)
# word_index lists every word seen, regardless of the limit
print(tk.word_index)
# The encoded sequences only contain the retained indices (1 to 3 here)
print(tk.texts_to_sequences(texts))

{'name': 1, 'is': 2, 'my': 3, 'guzman': 4, 'charls': 5, 'your': 6, 'chin': 7, 'yao': 8}
[[3, 1, 2], [3, 1, 2], [1, 2]]


In [17]:
# Vocabulary size: number of words indexed by the tokenizer, plus one
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
# Length, in tokens, of the longest cleaned training document
max_length = max(len(doc.split()) for doc in train_docs)
print('Max length: %d' % max_length)

Vocabulary size: 24570
Max length: 1026


In [18]:
# Encode and pad the training documents
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# Define the model
model = define_model(vocab_size, max_length)
# Fit the network on the training data
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# Save the trained model to disk
model.save(model_path)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1026, 100)         2457000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1019, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 509, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16288)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                162890    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 2,645,533
Trainable params: 2,645,533
Non-trainable params: 0
_________________________________________________________________


# Evaluation

In [20]:
import string
import re
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

# Directories holding the test documents of class A (ham) and class B (spam)
test_classA_path = './enron/enron_test/ham'
test_classB_path = './enron/enron_test/spam'

def load_doc(filename):
    """Return the full text of ``filename``, or ``''`` if it cannot be decoded.

    A file whose bytes are not valid in the default text encoding is deleted
    from disk, so it is not picked up again on later passes over the dataset
    directory, and an empty string is returned in its place.

    Args:
        filename: Path of the text file to read.

    Returns:
        The file contents as a string, or ``''`` when the file was undecodable
        (and has therefore been removed).

    Raises:
        OSError: If the file cannot be opened, read or removed.
    """
    try:
        # The context manager closes the handle on every path, so the file is
        # no longer open by the time it is removed below.
        with open(filename, 'r') as file:
            return file.read()
    except UnicodeDecodeError:
        # Only decoding failures trigger the deletion; anything else (for
        # example an interrupt) must not destroy dataset files.
        os.remove(filename)
        return ''

def clean_doc(doc, vocab):
    """Reduce ``doc`` to its in-vocabulary words, joined by single spaces.

    The text is split on whitespace, punctuation characters are removed from
    each piece, and only pieces that are members of ``vocab`` are kept.

    Args:
        doc: Raw document text.
        vocab: Container of accepted words (membership is tested with ``in``).

    Returns:
        A single string with the kept words in document order.
    """
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    kept = []
    for piece in doc.split():
        word = punctuation.sub('', piece)
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)

def process_docs(directory, vocab, is_train):
    """Load and clean every document found in ``directory``.

    Documents whose raw text has 2000 or more whitespace-separated tokens are
    left out. ``is_train`` is accepted but not used by this function.

    Args:
        directory: Directory whose entries are each loaded as one document.
        vocab: Vocabulary passed on to ``clean_doc``.
        is_train: Unused.

    Returns:
        A list with one cleaned, space-joined string per kept document.
    """
    kept = []
    for entry in listdir(directory):
        raw = load_doc(directory + '/' + entry)
        cleaned = clean_doc(raw, vocab)
        # skip documents that are too long (2000 raw tokens or more)
        if len(raw.split()) >= 2000:
            continue
        kept.append(cleaned)
    return kept

def load_clean_dataset(vocab, is_train):
    """Load the training or test documents of both classes with their labels.

    Args:
        vocab: Vocabulary passed on to ``process_docs``.
        is_train: When true the training directories are read, otherwise the
            test directories.

    Returns:
        A ``(docs, labels)`` pair. ``docs`` lists the class A documents
        followed by the class B documents; ``labels`` is a NumPy array holding
        0 for every class A document and 1 for every class B document, in the
        same order.
    """
    if is_train:
        path_a, path_b = classA_path, classB_path
    else:
        path_a, path_b = test_classA_path, test_classB_path
    docs_a = process_docs(path_a, vocab, is_train)
    docs_b = process_docs(path_b, vocab, is_train)
    # labels follow the order in which the documents are concatenated
    labels = array([0] * len(docs_a) + [1] * len(docs_b))
    return docs_a + docs_b, labels

def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` fitted on the texts in ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def encode_docs(tokenizer, max_length, docs):
    """Encode ``docs`` as integer sequences padded to ``max_length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every sequence is brought to.
        docs: List of texts to encode.

    Returns:
        The result of ``pad_sequences`` with padding added at the end
        (``padding='post'``).
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')

def predict_class(review, vocab, tokenizer, max_length, model):
    """Classify one raw text as class A or class B.

    Args:
        review: Raw text to classify.
        vocab: Vocabulary used to clean the text.
        tokenizer: Fitted tokenizer used to encode the text.
        max_length: Length the encoded text is padded to.
        model: Model whose ``predict`` output is read at position ``[0, 0]``.

    Returns:
        A ``(score, label)`` pair. When the model output rounds to 0 the label
        is ``classA`` and the score is one minus the output; otherwise the
        label is ``classB`` and the score is the output itself.
    """
    cleaned = clean_doc(review, vocab)
    # a batch holding just this one document
    batch = encode_docs(tokenizer, max_length, [cleaned])
    score = model.predict(batch, verbose=0)[0, 0]
    if round(score) != 0:
        return score, classB
    return (1-score), classA

# Load the vocabulary file saved earlier and split it into a set of words
vocab_filename = vocabulary_path
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

# Load all documents: True selects the training dataset, False the testing one
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

# Fit the Keras tokenizer on the training documents only
tokenizer = create_tokenizer(train_docs)
# Vocabulary size: number of words indexed by the tokenizer, plus one
vocab_size = len(tokenizer.word_index) + 1
print('* Vocabulary size: %d' % vocab_size)

# Maximum length, in tokens, over the training documents. It MUST be the same
# value that was used when the model was fitted, because every document is
# padded to this length before being fed to the model.
max_length = max([len(s.split()) for s in train_docs])
print('* Max length: %d' % max_length)

# Encode and pad both datasets with the same tokenizer and length
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)

# Load the trained model from disk
model = load_model(model_path)

# Evaluate the model on the training dataset (documents and their labels)
_, acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('* Accuray of the prediction with the training dataset: %.2f' % (acc*100))
# Evaluate the model on the testing dataset (documents and expected labels)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('* Accuray of the prediction with the testing dataset: %.2f' % (acc*100))

# Trial classification of a single class A (ham) test document
text_A = load_doc('./enron/enron_test/ham/0005.1999-12-12.kaminski.ham.txt')
percent, classification = predict_class(text_A, vocab, tokenizer, max_length, model)
print(f'* {classA} sample:\n{text_A}* --> {classification} with {percent*100} scoring')

# Trial classification of a single class B (spam) test document
text_B = load_doc('./enron/enron_test/spam/0008.2003-12-18.GP.spam.txt')
percent, classification = predict_class(text_B, vocab, tokenizer, max_length, model)
print(f'* {classB} sample:\n{text_B}* --> {classification} with {percent*100} scoring')

* Vocabulary size: 24570
* Max length: 1026
* Accuray of the prediction with the training dataset: 100.00
* Accuray of the prediction with the testing dataset: 97.63
* HAM sample:
Subject: christmas baskets
the christmas baskets have been ordered .
we have ordered several baskets .
individual earth - sat freeze - notis
smith barney group baskets
rodney keys matt rodgers charlie
notis jon davis move
team
phillip randle chris hyde
harvey
freese
faclities
iain russell darren
prager
telephone services
mary
martinez
( robert knights dept . )
trina
williams
daniel hornbuckle
todd butler
pamela ford
ozarka -
maryam golnaraghi
special baskets
greg whalley
richard weeks
any questions please contact kevin moore
other request contact kevin moore
price information contact kevin moore
please also if you need any assistance with your christmas cards let me know .
thanks kevin moore* --> HAM with 100.0 scoring
* SPAM sample:
Subject: when sp @ m doesn ' t annoy you eternal
benson sibilant char