In [1]:
# build character map function for encoding URL string

import string
# Alphabet layout (index 0 is left free for padding):
#   letters 1..52, digits 53..62, punctuation 63..94
ascii_letters = string.ascii_letters
digits = string.digits
punctuation = string.punctuation
total_char = "".join((ascii_letters, digits, punctuation))

# Known characters are numbered consecutively, starting at 1.
charmap = dict(zip(total_char, range(1, len(total_char) + 1)))

# One extra index right after the alphabet stands for any unmapped character.
UNKNOWN_CHAR = len(total_char) + 1
# Number of distinct codes, counting the padding value 0.
TOTAL_FEATURES = UNKNOWN_CHAR + 1


def encodeChar(c):
    """Return the integer code of ``c``, or UNKNOWN_CHAR when ``c`` is not in charmap."""
    if c in charmap:
        return charmap[c]
    return UNKNOWN_CHAR


# Quick check: a late lowercase letter, the first letter, and an unmapped string.
tuple(map(encodeChar, ("x", "a", "æˆ‘")))

(24, 1, 95)

In [2]:
# load dataset

import pandas
import statistics
# Read the URL table from the working directory.
df = pandas.read_csv("all_urls.csv")

# Keep each URL's character count in a "len" column for the length analysis.
df["len"] = df["url"].map(len)

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the URL lengths.
df["len"].describe()

count    420464.000000
mean         48.342005
std          35.021279
min           1.000000
25%          29.000000
50%          41.000000
75%          58.000000
max        2307.000000
Name: len, dtype: float64

In [4]:
# Choose a sequence length that covers almost every sample URL.
# For each candidate threshold, print the percentage of URLs that are LONGER
# than it, i.e. the share that would not fit (coverage = 100% - printed value).
# NOTE(review): in the recorded run only ~0.07% of URLs exceed 400 characters,
# which is above 99.9% coverage rather than the ~98% originally noted here.
url_count = len(df.len)
for t in [200, 300, 400, 500, 600, 700, 800, 900, 1000]:
    # Vectorized comparison; .sum() counts the True entries of the boolean mask,
    # avoiding a Python-level call per row for every threshold.
    longer_than_t = (df.len > t).sum()
    print("x={} {:.5f}%".format(t, 100 * longer_than_t / url_count))

x=200 0.60172%
x=300 0.12034%
x=400 0.07420%
x=500 0.06469%
x=600 0.04614%
x=700 0.02545%
x=800 0.01784%
x=900 0.00880%
x=1000 0.00476%


In [1]:
# sampling train/test dataset

from sklearn.model_selection import train_test_split
from keras.utils import np_utils

# Set aside 20% of all rows (preserved_df) before any model development.
sub_df, preserved_df = train_test_split(df, test_size=0.2, random_state=1)
print(len(sub_df), len(preserved_df), len(df))
# For a quick smoke run a small sample can be used instead: sub_df = df.sample(1000)

# Convert the label column into the categorical matrix built by to_categorical.
categorical_label = np_utils.to_categorical(sub_df.label)

# Step 1: split off the test set (20% of sub_df).
url_train, url_test, y_train, y_test = train_test_split(
    sub_df.url,
    categorical_label,
    test_size=0.2,
    random_state=1,
)

# Step 2: take 25% of what remains as validation (0.25 x 0.8 = 0.2 of sub_df).
url_train, url_val, y_train, y_val = train_test_split(
    url_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

print('Loading data...')
for split_urls, caption in (
    (url_train, 'train sequences'),
    (url_test, 'test sequences'),
    (url_val, 'val sequences'),
):
    print(len(split_urls), caption)

ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`

In [2]:
# Hyper-parameters shared by the preprocessing, model and training cells.

# --- Input encoding / embedding ---
# Vocabulary size handed to the Embedding layer (taken from TOTAL_FEATURES).
max_features = TOTAL_FEATURES
# Fixed sequence length used when padding the encoded URLs.
# Original note: ~98% coverage, paper uses 96% coverage.
# NOTE(review): the length statistics recorded in this notebook suggest the
# coverage at 400 is above 99.9% -- confirm which figure is intended.
maxlen = 400
embedding_size = 128

# --- Convolution + pooling ---
filters = 64
kernel_size = 5
pool_size = 2

# --- Recurrent layer and regularisation ---
lstm_output_size = 70
Dropout_ratio = 0.25

# --- Optimisation (both values follow the paper, per the original notes) ---
batch_size = 64
epochs = 20

NameError: name 'TOTAL_FEATURES' is not defined

In [7]:
# encode each URL as a sequence of integer character indices (not one-hot vectors) and pad every sequence to maxlen with padding='pre'

from keras.preprocessing.sequence import pad_sequences
import numpy

def encode_urls(urls):
    """Turn a pandas Series of URL strings into a padded integer matrix.

    Each URL becomes a numpy array of per-character codes from ``encodeChar``;
    ``pad_sequences`` then brings every array to length ``maxlen`` using
    ``padding='pre'`` (padding is added in front of the sequence).

    NOTE(review): pad_sequences also cuts sequences longer than ``maxlen``;
    with its default truncating setting this presumably drops the leading
    characters of long URLs -- confirm that this is intended.

    Args:
        urls: pandas Series of URL strings.

    Returns:
        The array produced by ``pad_sequences``, one row per URL.
    """
    char_codes = urls.apply(lambda url: numpy.array([encodeChar(c) for c in url]))
    return pad_sequences(char_codes, maxlen=maxlen, padding='pre')


print('Pad sequences (samples x time)')

# The same encoding is applied to every split so the model sees identical input.
x_train = encode_urls(url_train)
x_test = encode_urls(url_test)
x_val = encode_urls(url_val)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('x_val shape:', x_val.shape)

Using TensorFlow backend.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\MYDESK\anaconda3\envs\hi\lib\site-packages\IPython\core\interactiveshell.py", line 3417, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-56cc78ac56e4>", line 3, in <module>
    from keras.preprocessing.sequence import pad_sequences
  File "C:\Users\MYDESK\anaconda3\envs\hi\lib\site-packages\keras\__init__.py", line 3, in <module>
    from . import utils
  File "C:\Users\MYDESK\anaconda3\envs\hi\lib\site-packages\keras\utils\__init__.py", line 6, in <module>
    from . import conv_utils
  File "C:\Users\MYDESK\anaconda3\envs\hi\lib\site-packages\keras\utils\conv_utils.py", line 9, in <module>
    from .. import backend as K
  File "C:\Users\MYDESK\anaconda3\envs\hi\lib\site-packages\keras\backend\__init__.py", line 1, in <module>
    from .load_backend import epsilon
  File "C:\Users\MYDESK\anaconda3\envs\hi\lib\site-packages\keras\backend\load_backend.py", line 90, in <module>
    from .tenso

TypeError: object of type 'NoneType' has no len()

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D
from keras.optimizers import SGD

print('Build model...')

# Character-level CNN-LSTM classifier:
#   Embedding -> Conv1D -> MaxPooling1D -> LSTM -> Dropout -> Dense(softmax)
#
# The model name avoids spaces: TensorFlow name scopes only accept letters,
# digits and a few symbols (_ . - / >), and a name containing spaces can raise
# "ValueError: ... is not a valid scope name" when tf.keras calls the model
# during training.
model = Sequential(name="cnn_lstm_phishing_detection")

# Learn a dense vector for each of the max_features character codes.
model.add(Embedding(max_features, embedding_size, input_length=maxlen, trainable=True))

# Local character n-gram detectors over the embedded sequence.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))

# Downsample the convolution output along the sequence axis before the LSTM.
model.add(MaxPooling1D(pool_size=pool_size))

# Summarise the pooled feature sequence into a single vector.
model.add(LSTM(lstm_output_size))
model.add(Dropout(Dropout_ratio))

# Two softmax outputs, matching the categorical (to_categorical) labels.
# NOTE(review): assumes exactly two label classes -- confirm against the data.
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
print('Train...')

# Fit on the training split; the validation split is scored after every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
)

# Final loss and accuracy on the held-out test split.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

In [None]:
# Loss and accuracy on the training split, for comparison with the other splits.
model.evaluate(x=x_train, y=y_train, batch_size=batch_size)

In [None]:
# Loss and accuracy on the validation split.
model.evaluate(x=x_val, y=y_val, batch_size=batch_size)

In [None]:
# Score the model on preserved_df, the rows set aside before the train/val/test split.
preserved_codes = preserved_df.url.apply(
    lambda url: numpy.array([encodeChar(ch) for ch in url])
)
preserved_x = pad_sequences(preserved_codes, maxlen=maxlen, padding='pre')

# Labels go through the same categorical conversion as the training labels.
preserved_y = np_utils.to_categorical(preserved_df.label)

model.evaluate(preserved_x, preserved_y, batch_size=batch_size)