In [1]:
import codecs
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from model import my_aggclustering3
from tokenizer import clean_commands

  import pandas.util.testing as tm


In [127]:
# Load the session dump, drop sessions whose command sequence is an exact
# duplicate of an earlier one, and order the remaining sessions by 'FirstSeen'.
# NOTE(review): absolute local path — consider a configurable data directory.
msdat_dir = '/home/hpms/Microsoft.IoT-Dump1.json'
# 'utf-8-sig' strips a leading byte-order mark if the file has one.
with open(msdat_dir, 'r', encoding='utf-8-sig') as f:
    msdat = pd.DataFrame(json.load(f))

# Lists are unhashable, so the commands are turned into tuples for the
# duplicate check and back into lists afterwards.
msdat['Commands'] = msdat['Commands'].map(tuple)
msdat = msdat.drop_duplicates(subset='Commands').reset_index(drop=True)
msdat['Commands'] = msdat['Commands'].map(list)

# Number of commands in each session.
msdat['Commands Length'] = msdat['Commands'].map(len)
msdat = msdat.sort_values(by='FirstSeen').reset_index(drop=True)

In [110]:
# Keep only the sessions that contain more than 12 commands.
is_long_session = msdat['Commands Length'].gt(12)
msdat = msdat[is_long_session].reset_index(drop=True)

In [3]:
# Run the project helper `clean_commands` on the sessions; it returns two values,
# bound here to `sessions_token_list` and `dictionary`.
# NOTE(review): presumably the tokenized sessions and a token vocabulary; the
# meaning of the second argument (2) is not visible here — confirm against
# tokenizer.clean_commands.
sessions_token_list, dictionary = clean_commands(msdat, 2)
# Overwrite the raw command lists with the helper's per-session output.
msdat['Commands'] = sessions_token_list

In [126]:
# Train one my_aggclustering3 model per sequence length (3, 6, 9, 12), pickle
# each model under table/, and print its summed expected predictive
# probabilities per depth.
alpha_cluster = 0.0001
alpha_label = 0.0001
cluster_threshold = -0.3
# Number of fit() calls per model, and the number of depths reported below.
# The recorded output shows one "Depth k" line per fit() call.
n_depths = 6

# Create the output directory up front: the fits take minutes to hours (see
# the recorded timings), and a missing folder would otherwise only surface
# when the finished model is about to be saved.
os.makedirs('table', exist_ok=True)

for seq_len in range(3, 13, 3):
    print('cluster threshold: {:.1f}, seq length: {}.'.format(cluster_threshold, seq_len))
    my_agg = my_aggclustering3(msdat, alpha_cluster, cluster_threshold, alpha_label, seq_len)
    for depth in range(n_depths):
        my_agg.fit()
    with open('table/aggclustering_{:.1f}_{}.pkl'.format(cluster_threshold, seq_len), 'wb') as f:
        pickle.dump(my_agg, f, protocol=4)
    # One summed probability per depth.
    depth_totals = [np.sum(my_agg.expected_predictive_probs[i]) for i in range(n_depths)]
    print('Expected probabilities against depth are {}'.format(depth_totals))

cluster threshold: -0.3, seq length: 3.
Depth 1: 11 clusters exist. Time spent 80.15s.
Depth 2: 41 clusters exist. Time spent 296.87s.
Depth 3: 66 clusters exist. Time spent 718.42s.
Depth 4: 76 clusters exist. Time spent 960.80s.
Depth 5: 80 clusters exist. Time spent 965.76s.
Depth 6: 89 clusters exist. Time spent 1237.28s.
Expected probabilities against depth are [0.47639908815884674, 0.5241080024995824, 0.5393412951374248, 0.5473188987457906, 0.5517750826386454, 0.5550260001550098]
cluster threshold: -0.3, seq length: 6.
Depth 1: 14 clusters exist. Time spent 101.92s.
Depth 2: 48 clusters exist. Time spent 332.25s.
Depth 3: 71 clusters exist. Time spent 844.00s.
Depth 4: 81 clusters exist. Time spent 1070.58s.
Depth 5: 94 clusters exist. Time spent 1005.21s.
Depth 6: 104 clusters exist. Time spent 1387.45s.
Expected probabilities against depth are [0.46012436286352865, 0.5081396597666602, 0.5243176276644581, 0.5325811863634167, 0.5369250027850014, 0.5463127655037823]
cluster thresh

In [5]:
# Disabled cell: loads a previously saved model from
# 'table/aggclustering_1000.pkl' and unpacks its attributes into notebook
# variables. Kept for reference only; nothing here is executed.
# with open('table/aggclustering_1000.pkl', 'rb') as f:
#     my_agg = pickle.load(f)
# max_depth = my_agg.depth
# init_commands_clusters = my_agg.init_commands_clusters
# posteriors = my_agg.posteriors
# expected_predictive_probs = my_agg.expected_predictive_probs
# weights = my_agg.weights

### Baseline

In [89]:
# Baseline settings. They are not read in this cell (the model is restored
# from disk), but they remain set in the kernel afterwards.
alpha_cluster, alpha_label = 0.0001, 0.0001
cluster_threshold = -np.inf
seq_len = 12

# Resume the saved baseline, run four more fit() calls (the recorded output
# shows depths 3-6), and overwrite the checkpoint with the deeper model.
baseline_pkl = 'table/aggclustering_baseline.pkl'
with open(baseline_pkl, 'rb') as f:
    my_agg = pickle.load(f)
for depth in range(3, 7):
    my_agg.fit()
with open(baseline_pkl, 'wb') as f:
    pickle.dump(my_agg, f, protocol=4)

Depth 3: 14237 clusters exist. Time spent 3183.02s.
Depth 4: 19537 clusters exist. Time spent 5534.97s.
Depth 5: 19931 clusters exist. Time spent 5678.35s.
Depth 6: 28116 clusters exist. Time spent 10700.26s.
0.5172550176759918


In [112]:
# Restore the saved baseline model and expose its attributes as notebook
# variables for the inspection cells that follow.
with open('table/aggclustering_baseline.pkl', 'rb') as f:
    my_agg = pickle.load(f)

(max_depth,
 init_commands_clusters,
 posteriors,
 expected_predictive_probs,
 weights) = (my_agg.depth,
             my_agg.init_commands_clusters,
             my_agg.posteriors,
             my_agg.expected_predictive_probs,
             my_agg.weights)

In [113]:
# Expected predictive probability, summed over each entry of
# expected_predictive_probs (one total per entry).
list(map(np.sum, expected_predictive_probs))

[0.4542414953747525,
 0.5172550176759918,
 0.5495890918454058,
 0.5684372298204223,
 0.574061221867031,
 0.6037521323163807]

In [114]:
# Number of clusters: the length of each entry of expected_predictive_probs.
list(map(len, expected_predictive_probs))

[1796, 7380, 14237, 19537, 19931, 28116]

In [121]:
# Values of the last entry of `weights`, largest first.
# NOTE(review): this prints the whole list; consider slicing (e.g. [:20]) to
# keep the notebook output short.
sorted(weights[-1], reverse=True)

[21785,
 10664,
 4299,
 2620,
 1469,
 1269,
 1124,
 1056,
 1013,
 938,
 896,
 825,
 644,
 634,
 627,
 490,
 486,
 464,
 443,
 438,
 385,
 366,
 322,
 279,
 277,
 265,
 232,
 177,
 175,
 125,
 115,
 102,
 100,
 93,
 81,
 80,
 76,
 72,
 65,
 57,
 52,
 47,
 39,
 39,
 38,
 37,
 35,
 33,
 33,
 31,
 31,
 28,
 28,
 27,
 27,
 27,
 26,
 25,
 25,
 22,
 22,
 19,
 19,
 18,
 17,
 16,
 16,
 15,
 14,
 14,
 14,
 14,
 13,
 13,
 13,
 13,
 13,
 12,
 12,
 12,
 12,
 12,
 12,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7

In [133]:
# Settings for the one-cluster baseline. They are not read in this cell (the
# model is restored from disk), but they remain set in the kernel afterwards.
alpha_cluster, alpha_label = 0.0001, 0.0001
cluster_threshold = np.inf
seq_len = 12

# Restore the saved model, run a single fit() call, and overwrite the
# checkpoint with the result.
onecluster_pkl = 'table/aggclustering_baseline_onecluster.pkl'
with open(onecluster_pkl, 'rb') as f:
    my_agg = pickle.load(f)
my_agg.fit()
with open(onecluster_pkl, 'wb') as f:
    pickle.dump(my_agg, f, protocol=4)

In [83]:
# Restore the saved one-cluster baseline and expose its attributes as notebook
# variables for the inspection cells that follow.
with open('table/aggclustering_baseline_onecluster.pkl', 'rb') as f:
    my_agg = pickle.load(f)

(max_depth,
 init_commands_clusters,
 posteriors,
 expected_predictive_probs,
 weights) = (my_agg.depth,
             my_agg.init_commands_clusters,
             my_agg.posteriors,
             my_agg.expected_predictive_probs,
             my_agg.weights)

In [None]:
# Preview the first 30 items of posteriors[0][0].
# NOTE(review): presumably the first depth / first cluster — confirm the
# layout of `posteriors` against the model class.
posteriors[0][0][:30]

In [84]:
# First value of the first entry of expected_predictive_probs for the model
# currently bound to `my_agg`.
my_agg.expected_predictive_probs[0][0]

0.43236842592452174

### Evaluation

In [136]:
# Restore the model saved for cluster threshold -0.3 and sequence length 12,
# and keep a handle on its init_commands_clusters attribute.
eval_model_pkl = 'table/aggclustering_-0.3_12.pkl'
with open(eval_model_pkl, 'rb') as f:
    my_agg = pickle.load(f)
init_commands_clusters = my_agg.init_commands_clusters

In [143]:
# Rebind `weights` to the currently loaded model's weights, replacing any
# value left over from a previously loaded model.
weights = my_agg.weights

In [147]:
# Pair each item of the last `weights` entry with the matching item of the
# last `init_commands_clusters` entry, order the pairs from largest to
# smallest (weight first, then the cluster itself on ties), and keep only
# the clusters.
ranked_pairs = sorted(zip(weights[-1], init_commands_clusters[-1]), reverse=True)
xs = [cluster for _weight, cluster in ranked_pairs]

In [151]:
# First three items of the third-ranked entry of `xs` (index 2).
xs[2][:3]

(('sh',
  'linuxshell',
  'bah',
  'hostname Ex0_1115',
  'bin busybox Ex0',
  'bin busybox ps'),
 ('sh',
  'linuxshell',
  'bah',
  'hostname Ex0_8572',
  'bin busybox Ex0',
  'bin busybox ps'),
 ('sh',
  'linuxshell',
  'bah',
  'hostname Ex0_0447',
  'bin busybox Ex0',
  'bin busybox ps'))

In [142]:
# Display the last entry of init_commands_clusters.
# NOTE(review): this prints the whole structure; consider slicing it to keep
# the notebook output short.
init_commands_clusters[-1]

[(('shell',
   'sh',
   'tmp .botnettest cd tmp',
   'var .botnettest cd var',
   'dev .botnettest cd dev',
   'mnt .botnettest cd mnt'),
  ('cat rarecommand',
   'tmp rarecommand cd tmp',
   'var rarecommand cd var',
   'dev rarecommand cd dev',
   'mnt rarecommand cd mnt',
   'var run rarecommand cd var run'),
  ('shell',
   'sh',
   'tmp rarecommand cd tmp',
   'var rarecommand cd var',
   'dev rarecommand cd dev',
   'mnt rarecommand cd mnt'),
  ('enable',
   'system',
   'shell',
   'sh',
   'tmp rarecommand cd tmp',
   'var rarecommand cd var'),
  ('sh',
   'linuxshell',
   'bash',
   'tmp .wellnope cd tmp',
   'var .wellnope cd var',
   'dev .wellnope cd dev'),
  ('shell',
   'sh',
   'linuxshell',
   'bash',
   'tmp .wellnope cd tmp',
   'var .wellnope cd var'),
  ('enable', 'system', 'shell', 'sh', 'linuxshell', 'bash'),
  ('enable', 'system', 'shell', 'sh', 'linuxshell', 'tmp .ptmx cd tmp'),
  ('enable', 'system', 'shell', 'sh', 'linuxshell', 'bah'),
  ('linuxshell', 'sh', 's

In [128]:
# Collect, for every saved model, its expected predictive probabilities for
# the first n_depths depths. Order: the four sequence-length models
# (3, 6, 9, 12), then the baseline, then the one-cluster baseline.
n_depths = 6
prob_lists = []

for seq_len in range(3, 13, 3):
    with open('table/aggclustering_-0.3_{}.pkl'.format(seq_len), 'rb') as f:
        my_agg = pickle.load(f)
    prob_lists.append(my_agg.expected_predictive_probs[:n_depths])

with open('table/aggclustering_baseline.pkl', 'rb') as f:
    my_agg = pickle.load(f)
prob_lists.append(my_agg.expected_predictive_probs[:n_depths])

with open('table/aggclustering_baseline_onecluster.pkl', 'rb') as f:
    my_agg = pickle.load(f)
# The one-cluster baseline may hold fewer than n_depths depths. Pad a copy
# with empty lists so every model exposes n_depths entries. The loaded model
# itself is left unmodified, and the padding adapts to however many depths
# the pickle currently contains.
onecluster_probs = list(my_agg.expected_predictive_probs[:n_depths])
onecluster_probs += [[] for _ in range(n_depths - len(onecluster_probs))]
prob_lists.append(onecluster_probs)

In [131]:
# Build two summary tables with one row per model and one column per depth:
# prob_df holds the summed expected predictive probabilities and cluster_df
# the number of clusters. Both are exported as LaTeX.
row_labels = ['$l_c=3$', '$l_c=6$', '$l_c=9$', '$l_c=12$', 'Baseline 1', 'Baseline 2']
depth_labels = ['Depth {}'.format(depth) for depth in range(1, 7)]

prob_df = pd.DataFrame(
    {label: [np.sum(prob_list[k]) for prob_list in prob_lists]
     for k, label in enumerate(depth_labels)},
    index=row_labels,
)
cluster_df = pd.DataFrame(
    {label: [len(prob_list[k]) for prob_list in prob_lists]
     for k, label in enumerate(depth_labels)},
    index=row_labels,
)

# An empty per-depth list gives np.sum == 0.0 and len == 0, so depths that a
# model does not have appear as 0 and are shown as 'NA'.
# NOTE(review): this also turns any genuine zero into 'NA' and makes the
# affected columns object-typed.
prob_df = prob_df.replace({0: 'NA'})
cluster_df = cluster_df.replace({0: 'NA'})

prob_df.to_latex('table/prob_df.tex')
cluster_df.to_latex('table/cluster_df.tex')

In [132]:
# Display the table of summed expected predictive probabilities per model and depth.
prob_df

Unnamed: 0,Depth 1,Depth 2,Depth 3,Depth 4,Depth 5,Depth 6
$l_c=3$,0.476399,0.524108,0.539341,0.547319,0.551775,0.555026
$l_c=6$,0.460124,0.50814,0.524318,0.532581,0.536925,0.546313
$l_c=9$,0.453345,0.501228,0.527811,0.536087,0.541361,0.55031
$l_c=12$,0.450155,0.49781,0.530307,0.538657,0.544359,0.55324
Baseline 1,0.454241,0.517255,0.549589,0.568437,0.574061,0.603752
Baseline 2,0.432368,,,,,
