In [41]:
import numpy as np
import pandas as pd
import pickle
import json
from keras.models import load_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler

MODEL_PATH = 'models_2019_06_01_21_21/'
prefix = '../'

## Load config

In [4]:
with open(MODEL_PATH + 'config.json') as f:
    config = json.load(f)

In [5]:
config

{'IsolationForest': {'n_estimators': 500,
  'random_state': 101,
  'verbose': 1,
  'n_jobs': -1,
  'contamination': 0.01,
  'behaviour': 'new'},
 'Autoencoder': {'optimizer': 'adam',
  'loss': 'mean_squared_error',
  'epochs': 30,
  'batch_size': 256,
  'shuffle': True},
 'Data': 'data/final/final_dataset_30T.csv'}

## Load data

In [7]:
data = pd.read_csv(prefix + config['Data'], sep='\t', index_col=[0])
data = StandardScaler().fit_transform(data)
data.head()

Unnamed: 0,"('avg_Length', 'avg_2019-02-01 09:30:00')","('avg_Length', 'avg_2019-02-01 10:00:00')","('avg_Length', 'avg_2019-02-01 10:30:00')","('avg_Length', 'avg_2019-02-01 11:00:00')","('avg_Length', 'avg_2019-02-01 11:30:00')","('avg_Length', 'avg_2019-02-01 12:00:00')","('avg_Length', 'avg_2019-02-01 12:30:00')","('avg_Length', 'avg_2019-02-01 13:00:00')","('avg_Length', 'avg_2019-02-01 13:30:00')","('avg_Length', 'avg_2019-02-01 14:00:00')",...,"('count_Port2_last_1', 'count_2019-02-01 22:00:00')",host0,host1,host2,host3,host4,host5,host6,host7,is_src
1.0.171.245source,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1,1,1,1,0,1,0,1,1
1.0.255.54source,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.219512,0.0,...,0.0,0,0,1,1,0,1,1,0,1
1.1.158.248source,0.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1,1,1,1,1,0,0,0,1
1.1.217.60source,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,1,1,1,1,0,0,1
1.1.230.15source,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,...,0.0,0,0,0,0,1,1,1,1,1


## Load models

In [11]:
autoencoder = load_model(MODEL_PATH + 'autoencoder.h5')
with open(MODEL_PATH + 'IsolationForest.pkl', 'rb') as f:
    forest = pickle.load(f)

In [17]:
reconstructed_data = autoencoder.predict(data)
forest_anomalies = forest.predict(data)

In [60]:
mse = np.mean(np.power(data - reconstructed_data, 2), axis=1)
error = pd.DataFrame({'Recostruction_Error': mse, 'is_anomaly':0})
error.describe()

Unnamed: 0,Recostruction_Error,is_anomaly
count,46713.0,46713.0
mean,0.907569,0.0
std,36.001205,0.0
min,0.011125,0.0
25%,0.016692,0.0
50%,0.018707,0.0
75%,0.028167,0.0
max,6278.611243,0.0


In [62]:
np.unique(forest_anomalies, return_counts=True)
a = error.Recostruction_Error.nlargest(sum(forest_anomalies == -1))
error.is_anomaly.loc[a.index] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [66]:
data[(error.is_anomaly == -1) & (forest_anomalies == -1)].shape

(368, 1101)

In [55]:
a

31433    1.000000
39541    0.466964
1176     0.373981
34135    0.162414
16662    0.147459
37217    0.136991
5689     0.129379
5688     0.107599
5690     0.105414
39540    0.087607
5687     0.083833
5691     0.064587
5694     0.061462
5698     0.057994
5695     0.057282
5696     0.055492
5700     0.054557
5692     0.051366
5697     0.050193
5699     0.046857
15689    0.046592
31440    0.044549
1185     0.044484
5987     0.043671
5693     0.040334
34291    0.039782
33950    0.038167
12039    0.036261
12064    0.033040
33949    0.031728
           ...   
33964    0.001516
33939    0.001507
1354     0.001505
11915    0.001500
33931    0.001486
33925    0.001484
33861    0.001484
44583    0.001476
37033    0.001474
42761    0.001462
33871    0.001459
37214    0.001457
33973    0.001457
37126    0.001456
33934    0.001454
33938    0.001449
12033    0.001448
33944    0.001448
37132    0.001442
45352    0.001437
41943    0.001433
33866    0.001431
35604    0.001429
33917    0.001429
33933    0

In [72]:
sum((error.is_anomaly == -1) & (forest_anomalies == -1))

368

In [73]:
forest_anomalies = forest.predict(data)
mse = np.mean(np.power(data - autoencoder.predict(data), 2), axis=1)
error = pd.DataFrame({'Recostruction_Error': mse,
                      'ae_anomaly': 0,
                      'is_anomaly': 0})
idx = error.Recostruction_Error.nlargest(sum(forest_anomalies == -1)).index
error['ae_anomaly'][idx] = -1
idx = (error.ae_anomaly == -1) & (forest_anomalies == -1)
error['is_anomaly'][idx] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [78]:
error = pd.DataFrame({'Recostruction_Error': mse,
                      'ae_anomaly': 0,
                      'is_anomaly': 0})
idx = error.Recostruction_Error.nlargest(sum(forest_anomalies == -1)).index
error['ae_anomaly'][idx] = -1
idx = (error.ae_anomaly == -1) & (forest_anomalies == -1)
error['is_anomaly'][idx] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [80]:
error['is_anomaly'].value_counts()

 0    46345
-1      368
Name: is_anomaly, dtype: int64