This notebook trains a neural network with supervised learning, using a convolutional neural network architecture.

The network classifies a URL as benign or malicious based solely on the URL string.

In [54]:
# Package Imports
import pandas as pd
import numpy as np
from string import printable
from sklearn import model_selection
from keras.preprocessing import sequence
from keras.layers.core import Dropout, Dense
from keras import regularizers
from keras.layers import Input, Embedding, Convolution1D, ELU, MaxPooling1D, LSTM
from keras.models import Model
from keras.optimizers import Adam

In [5]:
# Load the labelled URL dataset from CSV into a pandas DataFrame
dataset = pd.read_csv('./dataset/urlString.csv')

In [6]:
# Preview the first rows of the dataset
dataset.head(20)

Unnamed: 0,url,isMalicious
0,songlyrics.com/news/riffd-the-shins-heartworms,0
1,imaging-resource.com/PRODS/olympus-e-m1-ii/oly...,0
2,gosugamers.net/lol/streams,0
3,thingiverse.com/corkyzett/collections/intlwome...,0
4,bausch.com/our-products/contact-lens-care/spec...,0
5,w88mobile.org/game/5151.html,0
6,datacenterdynamics.com/awards/latin-america-aw...,0
7,123people.com/s/marc+pageau,0
8,nownews.com/n/2017/03/21/2449327,0
9,wikipedia.org/wiki/Plague_Park,0


In [7]:
# Number of rows in the dataset
len(dataset)

194798

In [10]:
# string.printable: the character set used as the alphabet when encoding URLs
printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [14]:
# dataset preparation
# encode the printable characters in the url string as integers
# Lookup table: printable character -> 1-based position in string.printable.
# Built once so that encoding a URL does not rescan `printable` for every character.
# Token 0 is never produced, which keeps it free for the zero padding applied later.
_PRINTABLE_TO_TOKEN = {char: position for position, char in enumerate(printable, start=1)}


def urlPrep(url):
    """Encode a URL string as a list of integer tokens.

    Each character found in ``string.printable`` is mapped to its 1-based
    position in that string; characters outside ``string.printable`` are
    dropped.

    Parameters
    ----------
    url : str
        The URL to encode.

    Returns
    -------
    list of int
        One token per retained character, in the original order. An empty
        string yields an empty list.
    """
    return [_PRINTABLE_TO_TOKEN[char] for char in url if char in _PRINTABLE_TO_TOKEN]

In [15]:
# Tokenise every URL in the dataset with urlPrep (one token list per row)
url_tokens = [urlPrep(raw_url) for raw_url in dataset.url]

In [21]:
# Standardize every token sequence to a fixed length of 75:
# - sequences longer than 75 are truncated from the front, i.e. the LAST 75 tokens are kept
# - shorter sequences are padded with zeros at the front
# 'pre' is the pad_sequences default for both options; it is spelled out here so the
# behaviour is visible (pass truncating='post' instead to keep the first 75 tokens).
max_length = 75
url_data = sequence.pad_sequences(url_tokens, maxlen=max_length, padding='pre', truncating='pre')

In [23]:
# Labels as a NumPy array, taken from the isMalicious column
target_label = np.array(dataset['isMalicious'])

In [24]:
# Shape of the encoded url string array: (number of urls, max_length)
url_data.shape

(194798, 75)

In [25]:
# Shape of the target labels array: one label per url
target_label.shape

(194798,)

In [27]:
# Hold out 25 percent of the urls for testing; the fixed random_state keeps the split repeatable
url_train, url_test, target_train, target_test = model_selection.train_test_split(
    url_data,
    target_label,
    test_size=0.25,
    random_state=30,
)

In [38]:
def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    """Build and compile the 1D-convolution + LSTM URL classifier.

    Architecture: embedding -> dropout -> 1D convolution with ELU ->
    max-pooling -> dropout -> LSTM -> dropout -> single sigmoid unit.

    Parameters
    ----------
    max_len : int
        Length of each integer-encoded URL sequence fed to the network.
    emb_dim : int
        Size of the embedding vector learned for each token.
    max_vocab_len : int
        ``input_dim`` of the embedding layer, i.e. the number of distinct
        token ids it can index (valid ids are 0 .. max_vocab_len - 1).
    lstm_output_size : int
        Number of units in the LSTM layer.
    W_reg : keras regularizer
        Regularizer applied to the embedding matrix.

    Returns
    -------
    keras.models.Model
        Model compiled with Adam (lr=1e-4), binary cross-entropy loss and
        the accuracy metric.

    Notes
    -----
    NOTE(review): urlPrep emits ids 1 .. len(string.printable) (= 100), so
    with the default ``max_vocab_len=100`` the id of the last printable
    character (form feed) falls outside the embedding range. Passing
    ``max_vocab_len=101`` covers it, but changes the embedding shape and is
    therefore incompatible with weights saved from the default model.
    """
    # Input: one fixed-length sequence of integer tokens per URL
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding layer (Keras 2 keyword: embeddings_regularizer, formerly W_regularizer)
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Conv layer (Keras 2 keyword: padding, formerly border_mode)
    conv = Convolution1D(kernel_size=5, filters=256, padding='same')(emb)
    conv = ELU()(conv)

    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # LSTM layer
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer (Keras 2 keywords: inputs/outputs, formerly input/output)
    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [60]:
# Training hyper-parameters
epochs = 7
batch_size = 30

# Build a fresh model, fit it on the training split, then score it on the held-out test split
model = lstm_conv()
model.fit(url_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(url_test, target_test, verbose=1)

# NOTE(review): the label below says "Cross-Validation", but the value printed is the accuracy
# returned by model.evaluate on the single hold-out split (url_test / target_test); no k-fold
# cross-validation is performed in this cell.
print('\nFinal Cross-Validation Accuracy', accuracy, '\n')

  
  # This is added back by InteractiveShellApp.init_path()


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7

Final Cross-Validation Accuracy 0.9164681724699126 



In [61]:
# Persist the trained weights to an HDF5 file
model.save_weights('model_convolutional.h5')

In [62]:
# Restore the weights from model_convolutional.h5 into the current model
model.load_weights('model_convolutional.h5')

In [87]:
# Sample URLs for a manual check of the trained model.
# NOTE(review): judging by their names, test_url_mal / test_url_benign are meant as a malicious
# and a benign example; neither is referenced in the cells shown here.
test_url_mal = "naureen.net/etisalat.ae/index2.php"
test_url_benign = "sixt.com/php/reservation?language=en_US"

# URL that is encoded and scored in the following cells
url = "dashboard.heroku.com/apps/glacial-ridge-51682"

In [88]:
# Step 1: Encode the URL with the same tokenizer used for the training data
url_int_tokens = [urlPrep(url)]

# Step 2: Cut the token sequence at max_len or pad with zeros if shorter
max_len = 75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

In [89]:
# Model output for the encoded URL (value of the sigmoid output unit)
target_proba = model.predict(X, batch_size=1)
def print_result(proba):
    """Map a predicted probability to a class label.

    Returns "malicious" when ``proba`` is strictly greater than 0.5,
    otherwise "benign". Despite its name, the function returns the label
    rather than printing it.
    """
    return "malicious" if proba > 0.5 else "benign"
# Report the label for the tested URL (outputs above 0.5 are reported as "malicious")
print("Test URL:", url, "is", print_result(target_proba[0]))

Test URL: dashboard.heroku.com/apps/glacial-ridge-51682 is benign
