In [3]:
import logging
import re
import nltk
import keras
import numpy as np
import sys


# Product: LSTM recurrent neural net using one-hot encoding

# Fetch the NLTK resources used by this notebook (tokenizer models,
# part-of-speech tagger, stop-word lists), in the same order as before.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_package)

# Named logger used to report failures while loading the data.
logger = logging.getLogger('FraudEmails')

# NOTE(review): not referenced by any other cell visible in this notebook.
frequency = 100

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jseme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jseme\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jseme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read the whole e-mail corpus into a single string.
#
# The `with` statement closes the file on exit (including on error), so no
# explicit close() is needed. The previous `finally: file.close()` raised a
# NameError whenever open() itself failed, hiding the real cause.
try:
    with open('../data/fraudulent_emails.txt','r') as file:
        text = file.read()
except Exception as e:
    logger.error('Process failed with error: '+repr(e))
    # Re-raise: every later cell needs `text`, so continuing would only
    # fail further down with a less helpful error.
    raise

In [5]:
# Remove every header span: the shortest match that starts at "From" and ends
# at "Status:O" / "Status: O". DOTALL lets '.' run across line breaks.
header_pattern = re.compile('From.*?Status: ?O', flags=re.DOTALL)
formatEmails = header_pattern.sub('', text)


In [6]:
# Fold the text to lower case so capitalisation does not create extra symbols.
formatEmails = str.lower(formatEmails)

In [7]:
# Split the text into word tokens (runs of \w characters), drop English stop
# words, and join the survivors back into one space-separated string.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(formatEmails)

# Load the stop-word list once, as a set. The previous lambda called
# stopwords.words('english') again for every single token and then did a
# linear scan of the resulting list.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
words = [tok for tok in tokens if tok not in english_stopwords]

processed_emails = ' '.join(words)

In [8]:
# Character vocabulary: every distinct character of the corpus in sorted
# order, plus a lookup from character to its position in that order.
characters = sorted(set(processed_emails))
numb_chars = {char: idx for idx, char in enumerate(characters)}

In [9]:
# Length of the cleaned corpus. Despite the name, this counts characters:
# processed_emails is a single string, so len() is its character count.
numb_words = len(processed_emails)
print('Number of characters:',numb_words)

Number of characters: 3052533


In [10]:
# Vocabulary size: the number of distinct characters that have an index in
# numb_chars. Later cells divide the integer codes by this value.
numb_vocab = len(numb_chars)
print('Number of different types of characters:',numb_vocab)

Number of different types of characters: 53


In [11]:
#Length of an input sequence mapping
sequence_length = 100
x = []
y = []

In [12]:

# Build the training pairs. Each input is a window of `sequence_length`
# consecutive characters (as integer codes); its target is the code of the
# character immediately after the window.

# Encode the corpus once instead of re-encoding each overlapping window.
encoded_text = [numb_chars[char] for char in processed_emails]
window_count = numb_words - sequence_length

# x and y are rebuilt from scratch rather than appended to, so re-running
# this cell cannot duplicate every pattern. Local names also no longer
# shadow the builtin input().
x = [encoded_text[i:i + sequence_length] for i in range(window_count)]
y = [encoded_text[i + sequence_length] for i in range(window_count)]

In [13]:
# Number of (input window, target character) training pairs held in x.
patterns = len(x)
print('Number of patterns:',patterns)

Number of patterns: 3052433


In [14]:
# Shape the inputs as (patterns, sequence_length, 1) and scale the integer
# character codes by the vocabulary size, which puts them in [0, 1).
# No sigmoid is applied here; this is plain rescaling.
sig_x = np.asarray(x).reshape(patterns, sequence_length, 1) / float(numb_vocab)

In [15]:
#One-hot encode the target character codes in y (one row per training pattern).
# No class count is passed, so to_categorical chooses the row width itself;
# the model's output layer is later sized from encode.shape[1].

encode = keras.utils.to_categorical(y)

In [16]:
# Sequential model: three stacked LSTM layers, each followed by 20% dropout,
# ending in a softmax layer with one unit per one-hot column of `encode`.
time_steps, feature_count = sig_x.shape[1], sig_x.shape[2]

model = keras.models.Sequential([
    # The first two LSTMs return full sequences so the next LSTM gets one
    # vector per time step.
    keras.layers.LSTM(256, input_shape=(time_steps, feature_count), return_sequences=True),
    keras.layers.Dropout(0.2),
    keras.layers.LSTM(256, return_sequences=True),
    keras.layers.Dropout(0.2),
    # The last LSTM returns only its final output.
    keras.layers.LSTM(128),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(encode.shape[1], activation='softmax'),
])

In [17]:
# Targets are one-hot vectors, hence categorical cross-entropy; Adam optimizer.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [18]:
# Checkpoint callback: monitors the training loss and writes to the file only
# when the loss improves (save_best_only with mode='min').
checkpoint = keras.callbacks.ModelCheckpoint(
    '../data/model_weights.hdf5',
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callback = [checkpoint]

In [None]:
# Train the model for 4 epochs in batches of 256, checkpointing via the
# callback list. More epochs give the loss more chances to drop but cost more
# time; the recorded run showed an ETA of roughly 15 hours for epoch 1.
model.fit(
    x=sig_x,
    y=encode,
    batch_size=256,
    epochs=4,
    callbacks=desired_callback,
)

Epoch 1/4
   79/11924 [..............................] - ETA: 14:54:54 - loss: 3.3152

In [None]:
# Load the weights stored at the checkpoint path and compile the model again
# with the same loss and optimizer used for training.
model.load_weights('../data/model_weights.hdf5')
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Reverse lookup: integer code -> character, the inverse of numb_chars.
num_to_char = dict(enumerate(characters))

In [None]:
# Choose a random training window to seed text generation.
# np.random.randint excludes its upper bound, so passing len(x) makes every
# index, including the last pattern, selectable.
start = np.random.randint(0, len(x))
# Take a copy: the generation loop modifies `pattern`, and working on the
# list stored in x would silently alter the training data.
pattern = list(x[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

In [None]:
#https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

# Generate 1000 characters. Each step feeds the current window to the model,
# writes the most probable next character, and slides the window forward by
# one position.
for i in range(1000):
    # Same preparation as the training inputs: (1, window length, 1),
    # scaled by the vocabulary size.
    x_input = np.reshape(pattern, (1, len(pattern), 1))
    x_input = x_input / float(numb_vocab)
    prediction = model.predict(x_input, verbose=0)
    # Greedy decoding: take the index with the highest predicted probability.
    index = int(np.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a NEW window (drop the oldest code, add the prediction) instead of
    # appending in place: `pattern` may be the very list stored in x, and an
    # in-place append would corrupt that training pattern.
    pattern = list(pattern[1:]) + [index]