In [0]:
from keras.layers import Dense, Dropout, LSTM, Embedding, Flatten, TimeDistributed, Bidirectional, Lambda, Input, CuDNNLSTM, BatchNormalization, CuDNNGRU
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from sklearn import metrics, preprocessing, feature_selection
import matplotlib.pyplot as plt
from collections import defaultdict
import tensorflow as tf
import hyperas as hp
import pandas as pd
import numpy as np
import keras
import io
import random

In [0]:
random.seed(42)

In [0]:
!ls

sample_data


In [0]:
from google.colab import files
uploaded = files.upload()

Saving data1.csv to data1.csv


In [0]:
inputs = pd.read_csv(io.StringIO(uploaded['data1.csv'].decode('utf-8')))
# inputs = pd.read_csv('data1.csv')


In [0]:
inputs = inputs[["task_id", "task_duration", "panel_out", "event_type", "original", "label"]]

In [0]:
inputs.head()

Unnamed: 0,task_id,task_duration,panel_out,event_type,original,label
0,1,0,0,0,0,1
1,1,6,0,0,0,1
2,1,6,0,1,1,1
3,1,36,0,2,1,1
4,1,62,0,3,1,1


In [0]:

nb_events = len(set(inputs.event_type))
nb_urls = len(set(inputs.original))

inputs = inputs[inputs["event_type"] != 4]


event_mapping = dict(zip((i for i in range(nb_events)), (np.eye(nb_events)[i] for i in range(nb_events))))
inputs["event_binarized"] = inputs.event_type.map(event_mapping)

url_mapping = dict(zip((i for i in range(nb_urls)), (np.eye(nb_urls)[i] for i in range(nb_urls))))
inputs["url_binarized"] = inputs.original.map(url_mapping)

In [0]:
inputs = inputs[['task_id', "task_duration","panel_out", "event_binarized", "url_binarized", "label"]]

In [0]:
inputs['dwell_time'] = inputs['task_duration'].diff()
inputs.dwell_time = inputs.dwell_time.clip(lower=0)
total_time = sum(inputs.dwell_time.fillna(0))

In [0]:
float_array = inputs['dwell_time'].fillna(0).values.astype(float).reshape(-1,1)
min_max_scaler = preprocessing.MinMaxScaler()
scaled_array = min_max_scaler.fit_transform(float_array)
inputs.dwell_time = scaled_array

In [0]:
inputs['change'] = inputs['task_id'].diff()

In [0]:
average_length = len(inputs)/max(inputs.task_id)
average_duration = float(total_time)/max(inputs.task_id)
average_length, average_duration

(31.317862689271387, 590.846560760968)

In [0]:
tasks = defaultdict(list)

for key, value in inputs.task_id.value_counts().items():
        
        tasks[value].append(key)

In [0]:
def process_column_values(values, dim1, dim2, dim3):
    concat_values = np.concatenate(values)
    reshaped_values = concat_values.reshape(dim1, dim2, dim3)
    
    return reshaped_values


def process_column_values_50(values, dim1, dim2, dim3):
    concat_values = np.concatenate(values)
    reshaped_values = concat_values.reshape(dim1, dim2, dim3)
    
    if dim2 < 100:
        fill = 100 - dim2
        padding = np.zeros((dim1, fill, dim3))
        final_values = np.concatenate((padding, reshaped_values), axis = 1)
    else:
        final_values = reshaped_values[:,-100:, :]

    return final_values


def get_domain_values(values, number, length):
    domain_values = []
    for i in range(1, number + 1):
        domain_values.append(values.iloc[length * i - 1])
        
    return np.array(domain_values)


batches = []
jump = []

for length in range(10, 1001):
    
    num_length = len(tasks[length])
    
    if num_length > 0:

        dataset = inputs[pd.Series.isin(inputs["task_id"], tasks[length])] 


        durations = get_domain_values(dataset.task_duration, num_length, length)
        lengths = np.array([length] * num_length)
        targets = get_domain_values(dataset.label, num_length, length)

        
        urls = process_column_values(dataset.url_binarized.values, num_length, length, nb_urls)
        events = process_column_values(dataset.event_binarized.values, num_length, length, nb_events)
        dwell = process_column_values([dataset.dwell_time.values], num_length, length, 1)
        panel = process_column_values([dataset.panel_out.values], num_length, length, 1)

        batches.append((events, urls, panel, dwell, durations, lengths, targets))


In [0]:
np.random.shuffle(batches)
train_batches = batches[:int(0.8*len(batches))]
valid_batches = batches[int(0.8*len(batches)):int(0.9*len(batches))]
test_batches = batches[int(0.8*len(batches)):]

In [0]:
max_epochs = 10 
dense_size = 150 
rnn_size = 200
dropout = 0.3
learning_rate = 0.001

event_input = Input(shape=(None, 17))
url_input = Input(shape=(None, 167))
panel_input = Input(shape=(None, 1))
dwell_input = Input(shape=(None, 1))
duration_input = Input(shape=(1,))
length_input = Input(shape=(1, ))

x1 = (event_input)
x2 = (url_input)

x = keras.layers.concatenate([x1, x2])
x = CuDNNGRU(rnn_size, return_sequences=False)(x)
x = Dense(dense_size)(x)
x = Dropout(dropout)(x)

predictions = Dense(1, activation='sigmoid')(x)


model2 = Model(inputs=[event_input, url_input, panel_input, dwell_input, duration_input, length_input], outputs=predictions)

opt = keras.optimizers.Adam(lr=learning_rate)

model2.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

class_weight = {0: 1.,
                1: 7}

previous_roc = 0

for epoch in range(max_epochs):
    valid_predict = []
    valid_true = []
    count = 0 
    length = 0
    np.random.shuffle(train_batches)
    for batch in train_batches:
        model2.fit([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]], batch[6], class_weight = class_weight, verbose=0)  # starts training 
    for batch in valid_batches:
        predictors = model2.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        valid_predict.append(np.around(predictors))
        valid_true.append(batch[6])

    valid_predict = np.concatenate(valid_predict)    
    valid_true = np.concatenate(valid_true)
    roc = metrics.f1_score(valid_true, valid_predict, average=None)[1]
    map = metrics.average_precision_score(valid_true, valid_predict)
    previous_roc = roc
    print("epoch: {0}  F1_1: {1:.3f}  ROC: {2:.3f}  MAP: {3: .3f}".format(epoch+1, roc, metrics.roc_auc_score(valid_true, valid_predict), map))

In [0]:

max_epochs = 10 
dense_size = 200 
dense_size_2 = 100 
rnn_size = 200 
dropout = 0.3 
learning_rate = 0.001 

event_input = Input(shape=(None, 17))
url_input = Input(shape=(None, 167))
panel_input = Input(shape=(None, 1))
dwell_input = Input(shape=(None, 1))
duration_input = Input(shape=(1,))
length_input = Input(shape=(1, ))

x1 = Dense(14)(event_input)
x2 = Dense(140)(url_input)

x = keras.layers.concatenate([x1, x2])
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))(x)

x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=False))(x)


x = Dense(dense_size)(x)

x = Dropout(dropout)(x)

x = Dense(dense_size_2)(x)

x = Dropout(dropout)(x)
predictions = Dense(1, activation='sigmoid')(x)


model3 = Model(inputs=[event_input, url_input, panel_input, dwell_input, duration_input, length_input], outputs=predictions)

opt = keras.optimizers.Adam(lr=learning_rate)

model3.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

class_weight = {0: 1.,
                1: 7.}

previous_roc = 0

for epoch in range(max_epochs):
    valid_predict = []
    valid_true = []
    count = 0 
    length = 0
    np.random.shuffle(train_batches)
    for batch in train_batches:
        model3.fit([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]], batch[6], class_weight = class_weight, verbose=0)  # starts training 
    for batch in valid_batches:
        predictors = model3.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        valid_predict.append(np.around(predictors))
        valid_true.append(batch[6])

    valid_predict = np.concatenate(valid_predict)    
    valid_true = np.concatenate(valid_true)
    roc = metrics.f1_score(valid_true, valid_predict, average=None)[1]
    map = metrics.average_precision_score(valid_true, valid_predict)
    previous_roc = roc
    print("epoch: {0}  F1_1: {1:.3f}  ROC: {2:.3f}  MAP: {3: .3f}".format(epoch+1, roc, metrics.roc_auc_score(valid_true, valid_predict), map))

In [0]:

max_epochs = 10 
dense_size = 200 
dense_size_2 = 100 
rnn_size = 200 
dropout = 0.3 
learning_rate = 0.001 

event_input = Input(shape=(None, 17))
url_input = Input(shape=(None, 167))
panel_input = Input(shape=(None, 1))
dwell_input = Input(shape=(None, 1))
duration_input = Input(shape=(1,))
length_input = Input(shape=(1, ))

x1 = Dense(14)(event_input)
x2 = Dense(140)(url_input)

x1 = Bidirectional(CuDNNGRU(rnn_size, return_sequences=False))(x1)

x2 = Bidirectional(CuDNNGRU(rnn_size, return_sequences=False))(x2)

x = keras.layers.concatenate([x1, x2])


x = Dense(dense_size)(x)

x = Dropout(dropout)(x)

x = Dense(dense_size_2)(x)

x = Dropout(dropout)(x)

predictions = Dense(1, activation='sigmoid')(x)


model4 = Model(inputs=[event_input, url_input, panel_input, dwell_input, duration_input, length_input], outputs=predictions)

opt = keras.optimizers.RMSprop(lr=learning_rate)

model4.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

class_weight = {0: 1.,
                1: 7.}


for epoch in range(max_epochs):
    valid_predict = []
    valid_true = []
    count = 0 
    length = 0
    np.random.shuffle(train_batches)
    for batch in train_batches:
        model4.fit([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]], batch[6], class_weight = class_weight, verbose=0)  # starts training 
    for batch in valid_batches:
        predictors = model4.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        valid_predict.append(np.around(predictors))
        valid_true.append(batch[6])

    valid_predict = np.concatenate(valid_predict)    
    valid_true = np.concatenate(valid_true)
    roc = metrics.f1_score(valid_true, valid_predict, average=None)[1]
    map = metrics.average_precision_score(valid_true, valid_predict)
    previous_roc = roc
    print("epoch: {0}  F1_1: {1:.3f}  ROC: {2:.3f}  MAP: {3: .3f}".format(epoch+1, roc, metrics.roc_auc_score(valid_true, valid_predict), map))
    


In [0]:

max_epochs = 10 
dense_size = 200 
dense_size_2 = 100 

rnn_size = 200 
dropout = 0.3
learning_rate = 0.001

event_input = Input(shape=(None, 17))
url_input = Input(shape=(None, 167))
panel_input = Input(shape=(None, 1))
dwell_input = Input(shape=(None, 1))
duration_input = Input(shape=(1,))
length_input = Input(shape=(1, ))

x1 = Dense(14)(event_input)
x2 = Dense(150)(url_input)

x = keras.layers.concatenate([x1, x2, panel_input, dwell_input])
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=False))(x)

x = keras.layers.concatenate([x, duration_input, length_input])

x = Dense(dense_size)(x)

x = Dropout(dropout)(x)

x = Dense(dense_size_2)(x)

x = Dropout(dropout)(x)

predictions = Dense(1, activation='sigmoid')(x)


model5 = Model(inputs=[event_input, url_input, panel_input, dwell_input, duration_input, length_input], outputs=predictions)

opt = keras.optimizers.Adam(lr=learning_rate)

model5.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

class_weight = {0: 1.,
                1: 7.}



for epoch in range(max_epochs):
    valid_predict = []
    valid_true = []
    count = 0 
    length = 0
    np.random.shuffle(train_batches)
    for batch in train_batches:
        model5.fit([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]], batch[6], class_weight = class_weight, verbose=0)  # starts training 
    for batch in valid_batches:
        predictors = model5.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        valid_predict.append(np.around(predictors))
        valid_true.append(batch[6])

    valid_predict = np.concatenate(valid_predict)    
    valid_true = np.concatenate(valid_true)
    roc = metrics.f1_score(valid_true, valid_predict, average=None)[1]
    map = metrics.average_precision_score(valid_true, valid_predict)
    previous_roc = roc
    print("epoch: {0}  F1_1: {1:.3f}  ROC: {2:.3f}  MAP: {3: .3f}".format(epoch+1, roc, metrics.roc_auc_score(valid_true, valid_predict), map))


In [0]:
models = [model2, model3, model4, model5]
for model in models:
    test_predict = []  
    test_true = []

    for batch in test_batches:
        predictors = model.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        test_predict.append(np.around(predictors))
        test_true.append(batch[6])

    test_predict = np.concatenate(test_predict)    
    test_true = np.concatenate(test_true)
    print(metrics.average_precision_score(test_true, test_predict))
    print(metrics.classification_report(test_true, test_predict))
    print(metrics.roc_auc_score(test_true, test_predict))

In [0]:
def process_column_values(values, dim1, dim2, dim3):
    concat_values = np.concatenate(values)
    reshaped_values = concat_values.reshape(dim1, dim2, dim3)
    
    return reshaped_values


def process_column_values_50(values, dim1, dim2, dim3):
    concat_values = np.concatenate(values)
    reshaped_values = concat_values.reshape(dim1, dim2, dim3)
    
    if dim2 < 100:
        fill = 100 - dim2
        padding = np.zeros((dim1, fill, dim3))
        final_values = np.concatenate((padding, reshaped_values), axis = 1)
    else:
        final_values = reshaped_values[:,-100:, :]

    return final_values


def get_domain_values(values, number, length):
    domain_values = []
    for i in range(1, number + 1):
        domain_values.append(values.iloc[length * i - 1])
        
    return np.array(domain_values)


batches = []

for length in range(10, 1001):
    
    num_length = len(tasks[length])
    
    if num_length > 0:

        dataset = inputs[pd.Series.isin(inputs["task_id"], tasks[length])] 


        durations = get_domain_values(dataset.task_duration, num_length, length)
        lengths = np.array([length] * num_length)
        targets = get_domain_values(dataset.label, num_length, length)

        
        urls_50 = process_column_values_50(dataset.url_binarized.values, num_length, length, nb_urls)
        events_50 = process_column_values_50(dataset.event_binarized.values, num_length, length, nb_events)
        dwell_50 = process_column_values_50([dataset.dwell_time.values], num_length, length, 1)
        panel_50 = process_column_values_50([dataset.panel_out.values], num_length, length, 1)
        batches_50.append((events_50, urls_50, panel_50, dwell_50, durations, lengths, targets))
        

NameError: ignored

In [0]:
np.random.shuffle(batches_50)
train_batches = batches_50[:int(0.8*len(batches))]
valid_batches = batches_50[int(0.8*len(batches)):int(0.9*len(batches))]
test_batches = batches_50[int(0.8*len(batches)):]

In [0]:

max_epochs = 10 
dense_size = 150 
rnn_size = 200 
dropout = 0.3
learning_rate = 0.001

event_input = Input(shape=(None, 17))
url_input = Input(shape=(None, 167))
panel_input = Input(shape=(None, 1))
dwell_input = Input(shape=(None, 1))
duration_input = Input(shape=(1,))
length_input = Input(shape=(1, ))

x1 = Dense(14)(event_input)
x2 = Dense(150)(url_input)

x = keras.layers.concatenate([x1, x2, panel_input, dwell_input])
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))(x)
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=False))(x)

# x = keras.layers.concatenate([x, duration_input, length_input])

x = Dense(dense_size)(x)

x = Dropout(dropout)(x)

predictions = Dense(1, activation='sigmoid')(x)


model6 = Model(inputs=[event_input, url_input, panel_input, dwell_input, duration_input, length_input], outputs=predictions)

opt = keras.optimizers.RMSprop(lr=learning_rate)

model6.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

class_weight = {0: 1.,
                1: 7.}



for epoch in range(max_epochs):
    valid_predict = []
    valid_true = []
    count = 0 
    length = 0
    np.random.shuffle(train_batches)
    for batch in train_batches:
        model6.fit([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]], batch[6], class_weight = class_weight, verbose=0)  # starts training 
    for batch in valid_batches:
        predictors = model6.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        valid_predict.append(np.around(predictors))
        valid_true.append(batch[6])

    valid_predict = np.concatenate(valid_predict)    
    valid_true = np.concatenate(valid_true)
    roc = metrics.f1_score(valid_true, valid_predict, average=None)[1]
    previous_roc = roc
    print("epoch: {0}  F1_1: {1:.3f}  ROC: {2:.3f}".format(epoch+1, roc, metrics.roc_auc_score(valid_true, valid_predict)))

    model4.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'

###Run GRU vs LSTM

This can be done by running the following cells with CuDNNGRU or CuDNNLSTM in the RNN. The amount of data exposed to the model can be changed in the first cell after this one.

In [0]:
np.random.shuffle(batches)
train_batches = batches[:int(0.8*len(batches))]#0.3, 0.4, 0.5, 0.6, 0.7, 0.8
valid_batches = batches[int(0.8*len(batches)):int(0.9*len(batches))]
test_batches = batches[int(0.8*len(batches)):]

In [0]:

max_epochs = 10 
dense_size = 150 
rnn_size = 200 
dropout = 0.3
learning_rate = 0.001

event_input = Input(shape=(None, 17))
url_input = Input(shape=(None, 167))
panel_input = Input(shape=(None, 1))
dwell_input = Input(shape=(None, 1))
duration_input = Input(shape=(1,))
length_input = Input(shape=(1, ))

x1 = Dense(14)(event_input)
x2 = Dense(150)(url_input)

x = keras.layers.concatenate([x1, x2, panel_input, dwell_input])
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=True))(x) # CuDNNLSTM
x = Bidirectional(CuDNNGRU(rnn_size, return_sequences=False))(x) #CuDNNLSTM

# x = keras.layers.concatenate([x, duration_input, length_input])

x = Dense(dense_size)(x)

x = Dropout(dropout)(x)

predictions = Dense(1, activation='sigmoid')(x)


model6 = Model(inputs=[event_input, url_input, panel_input, dwell_input, duration_input, length_input], outputs=predictions)

opt = keras.optimizers.RMSprop(lr=learning_rate)

model6.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

class_weight = {0: 1.,
                1: 7.}



for epoch in range(max_epochs):
    valid_predict = []
    valid_true = []
    count = 0 
    length = 0
    np.random.shuffle(train_batches)
    for batch in train_batches:
        model6.fit([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]], batch[6], class_weight = class_weight, verbose=0)  # starts training 
    for batch in valid_batches:
        predictors = model6.predict_on_batch([batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]])

        valid_predict.append(np.around(predictors))
        valid_true.append(batch[6])

    valid_predict = np.concatenate(valid_predict)    
    valid_true = np.concatenate(valid_true)
    roc = metrics.f1_score(valid_true, valid_predict, average=None)[1]
    previous_roc = roc
    print("epoch: {0}  F1_1: {1:.3f}  ROC: {2:.3f}".format(epoch+1, roc, metrics.roc_auc_score(valid_true, valid_predict)))

    model4.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'