In [1]:
import sys
import os
import json
import pandas as pd
import numpy as np
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from pprint import pprint
import tensorflow as tf

In [None]:
# Colab-only step: attach Google Drive so files under /content/drive are
# reachable from this runtime.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [None]:
datapath = "/content/drive/MyDrive/Colab Notebooks/Phishing Classification Model/data.csv"

# Fixed seed so the shuffle -- and therefore the train/test split below -- is
# identical on every Restart & Run All.
SEED = 42

# The CSV is read without a header row; column 0 is used as the input text and
# column 1 as the target label.
data_frame = pd.read_csv(datapath, engine='python', quotechar='"', header=None)
dataset = data_frame.sample(frac=1, random_state=SEED).values

feature = dataset[:, 0]
Y = dataset[:, 1]

# Character-level tokenizer: every character becomes a token; only tab and
# newline are filtered out.
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(feature)

# +1 because the tokenizer assigns indices starting at 1; index 0 is reserved
# (it is the value pad_sequences fills with).
num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(feature)

# NOTE(review): 2083 is presumably a maximum URL length -- confirm the source
# of this constant.
max_log_length = 2083
train_size = int(len(dataset) * .8)

# Pad/truncate every sequence to exactly max_log_length tokens.
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

# First 80% of the shuffled rows for training, remaining 20% held out.
X_train, X_test = X_processed[:train_size], X_processed[train_size:]
Y_train, Y_test = Y[:train_size], Y[train_size:]

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; use an explicit
# fixed-width integer dtype instead.
X_train = np.asarray(X_train).astype(np.int64)
Y_train = np.asarray(Y_train).astype(np.int64)
X_test = np.asarray(X_test).astype(np.int64)
Y_test = np.asarray(Y_test).astype(np.int64)

# Sanity checks: the split must cover every row, with features and labels aligned.
assert len(X_train) + len(X_test) == len(dataset)
assert len(X_train) == len(Y_train) and len(X_test) == len(Y_test)

In [None]:
# Binary classifier over character-id sequences:
#   Embedding (32-dim vector per token id) -> Dropout -> LSTM(64) -> Dropout
#   -> single sigmoid unit.
# All layers come from `tf.keras.layers` so they match the `tf.keras.Sequential`
# container; mixing layers from the standalone `keras` package into a
# `tf.keras` model is not supported across versions.
# NOTE(review): per the Keras LSTM docs, a non-zero `recurrent_dropout`
# prevents use of the fused cuDNN kernel -- confirm whether that trade-off is
# intended.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(num_words, 32, input_length=max_log_length),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.LSTM(64, recurrent_dropout=0.5),
  tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy']
)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a trailing "None").
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 2083, 32)          2112      
_________________________________________________________________
dropout_6 (Dropout)          (None, 2083, 32)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 27,009
Trainable params: 27,009
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train for 10 epochs in batches of 500, holding back 25% of the training
# rows for validation. The call is the cell's last expression, so the returned
# History object is displayed.
model.fit(
  X_train,
  Y_train,
  batch_size=500,
  epochs=10,
  validation_split=0.25,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4832656588>

In [None]:
# Persist the trained model to Google Drive.
model.save(
  filepath="/content/drive/MyDrive/Colab Notebooks/Phishing Classification Model/model",
)

In [None]:
# Score the held-out test split, then report accuracy as a percentage.
loss, acc = model.evaluate(
  x=X_test,
  y=Y_test,
  verbose=2,
)
print('accuracy: {:5.2f}%'.format(acc * 100))

391/391 - 194s - loss: 0.0864 - accuracy: 0.9729
accuracy: 97.29%
