In [1]:
import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection

import tensorflow as tf
from keras.models import Sequential, Model, model_from_json, load_model
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import *
from keras.preprocessing import sequence
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils
from keras import backend as K

from pathlib import Path
import json

import warnings
warnings.filterwarnings("ignore")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Data Preparations

In [2]:
DATA = 'data/URL.csv'
df = pd.read_csv(DATA)
df.head()

Unnamed: 0,url,isMalicious
0,songlyrics.com/news/riffd-the-shins-heartworms,0
1,imaging-resource.com/PRODS/olympus-e-m1-ii/oly...,0
2,gosugamers.net/lol/streams,0
3,thingiverse.com/corkyzett/collections/intlwome...,0
4,bausch.com/our-products/contact-lens-care/spec...,0


In [3]:
df.shape

(194798, 2)

In [4]:
df.describe()

Unnamed: 0,isMalicious
count,194798.0
mean,0.5
std,0.500001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194798 entries, 0 to 194797
Data columns (total 2 columns):
url            194798 non-null object
isMalicious    194798 non-null int64
dtypes: int64(1), object(1)
memory usage: 3.0+ MB


In [6]:
max_len=75
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable] for url in df.url]
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)
target = np.array(df.isMalicious)
print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (194798, 75) Vector dimension of target:  (194798,)


In [7]:
X_train, X_test, target_train, target_test = model_selection.train_test_split(X, target, test_size=0.2, random_state=42)

# Model Preparation

In [8]:
def print_layers_dims(model):
    l_layers = model.layers
    for i in range(len(l_layers)):
        print(l_layers[i])
        print('Input Shape: ', l_layers[i].input_shape, 'Output Shape: ', l_layers[i].output_shape)


def save_model(fileModelJSON,fileWeights):
    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    json_string = model.to_json()
    with open(fileModelJSON,'w' ) as f:
        json.dump(json_string, f)
    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)
    
def load_model(fileModelJSON,fileWeights):
    with open(fileModelJSON, 'r') as f:
         model_json = json.load(f)
         model = model_from_json(model_json)
    
    model.load_weights(fileWeights)
    return model

### LSTM

In [9]:
def simple_lstm(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                dropout=0.2, W_regularizer=W_reg)(main_input) 

    lstm = LSTM(lstm_output_size)(emb)
    lstm = Dropout(0.5)(lstm)
    
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [10]:
epochs = 2
batch_size = 5

model = simple_lstm()
model.fit(X_train, target_train, epochs, batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Final Cross-Validation Accuracy 0.8829568788501027 

<keras.engine.topology.InputLayer object at 0x7f3273718da0>
Input Shape:  (None, 75) Output Shape:  (None, 75)
<keras.layers.embeddings.Embedding object at 0x7f32736bb0f0>
Input Shape:  (None, 75) Output Shape:  (None, 75, 32)
<keras.layers.recurrent.LSTM object at 0x7f3272685518>
Input Shape:  (None, 75, 32) Output Shape:  (None, 32)
<keras.layers.core.Dropout object at 0x7f32705c0dd8>
Input Shape:  (None, 32) Output Shape:  (None, 32)
<keras.layers.core.Dense object at 0x7f327055b3c8>
Input Shape:  (None, 32) Output Shape:  (None, 1)


In [11]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 75)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 75, 32)        3200        main_input[0][0]                 
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 32)            8320        embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 32)            0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [14]:
DATA_HOME = 'data'
model_name = "LSTM"
save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")

### Convolutional LSTM

In [28]:
def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                W_regularizer=W_reg)(main_input) 
    emb = Dropout(0.25)(emb)

    conv = Convolution1D(kernel_size=5, filters=256, \
                     border_mode='same')(emb)
    conv = ELU()(conv)

    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)
    
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 75)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 75, 32)        3200        main_input[0][0]                 
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 32)            8320        embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 32)            0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [None]:
epochs = 2
batch_size = 5

model = lstm_conv()
model.fit(X_train, target_train, epochs, batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

In [27]:
DATA_HOME = 'data'
model_name = "1DConvLSTM"
save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")

### Convolutional Fully Connected

In [29]:
def conv_fully(max_len=75, emb_dim=32, max_vocab_len=100, W_reg=regularizers.l2(1e-4)):
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                W_regularizer=W_reg)(main_input) 
    emb = Dropout(0.25)(emb)

    
    def sum_1d(X):
        return K.sum(X, axis=1)
    
    def get_conv_layer(emb, kernel_size=5, filters=256):
        conv = Convolution1D(kernel_size=kernel_size, filters=filters, \
                     border_mode='same')(emb)
        conv = ELU()(conv)

        conv = Lambda(sum_1d, output_shape=(filters,))(conv)
        conv = Dropout(0.5)(conv)
        return conv
        
 
    conv1 = get_conv_layer(emb, kernel_size=2, filters=256)
    conv2 = get_conv_layer(emb, kernel_size=3, filters=256)
    conv3 = get_conv_layer(emb, kernel_size=4, filters=256)
    conv4 = get_conv_layer(emb, kernel_size=5, filters=256)

    merged = concatenate([conv1,conv2,conv3,conv4], axis=1)

    hidden1 = Dense(1024)(merged)
    hidden1 = ELU()(hidden1)
    hidden1 = BatchNormalization(mode=0)(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024)(hidden1)
    hidden2 = ELU()(hidden2)
    hidden2 = BatchNormalization(mode=0)(hidden2)
    hidden2 = Dropout(0.5)(hidden2)
    
    output = Dense(1, activation='sigmoid', name='output')(hidden2)

    model = Model(input=[main_input], output=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 75)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 75, 32)        3200        main_input[0][0]                 
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 32)            8320        embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 32)            0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [None]:
epochs = 2
batch_size = 5

model = conv_fully()
model.fit(X_train, target_train, epochs, batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
print_layers_dims(model)

In [None]:
target_proba = model.predict(X_test, batch_size=1)

In [None]:
target_proba[0:10]

In [26]:
DATA_HOME = 'data'
model_name = "1DConv"
save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")

# Predictions

In [22]:
l_layers = model.layers
weights = l_layers[1].get_weights()
weights[0].shape

(100, 32)

In [23]:
test_url_mal = "mydrivers.com/1/524/524241.htm"
test_url_benign = "ubuntulinux.org/server/hyperscale"
url = test_url_benign

In [24]:
max_len=75
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable]]
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

In [25]:
target_proba = model.predict(X, batch_size=1)
def print_result(proba):
    if proba > 0.5:
        return "malicious"
    else:
        return "benign"
print("Test URL:", url, "is", print_result(target_proba[0]))

Test URL: ubuntulinux.org/server/hyperscale is benign
