In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from sklearn import random_projection
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import kneighbors_graph

In [2]:
# Directory holding the pickled feature tables. Kept in one constant so the
# notebook can be pointed at another checkout by editing a single line.
DATASET_DIR = "/home/luca/ml-malware-concept-drift/data/dataset"

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files produced by a trusted pipeline.
# reset_index() moves the stored index into ordinary columns -- presumably the
# frames are indexed by sample_hash, which the merge below needs as a column.
with open(DATASET_DIR + "/dataset.pickle", "rb") as f:
    df_1 = pickle.load(f).reset_index()
# Report the shape only after the file handle has been released.
print(df_1.shape)

with open(DATASET_DIR + "/dataset_opcodes.pickle", "rb") as f:
    df_opcodes = pickle.load(f).reset_index()
print(df_opcodes.shape)

# Join the two feature tables on the sample hash (pd.merge defaults to an
# inner join), then drop two columns that are not used as features.
df = pd.merge(left=df_1, right=df_opcodes, on="sample_hash")
df = df.drop(["ms_elapsed", "pesectionProcessed_entrypointSection_name"], axis=1)
df.head()

(67000, 42232)
(67000, 2501)


Unnamed: 0,sample_hash,generic_fileSize,generic_fileEntropy,header_SizeOfHeaders,header_AddressOfEntryPoint,header_ImageBase,header_SizeOfImage,header_SizeOfCode,header_SizeOfInitializedData,header_SizeOfUninitializedData,...,opcode_movzx movzx add,opcode_stosd stosd stosd,opcode_sub je push,opcode_je nop,opcode_add adc mov,opcode_mov mov nop,opcode_mov adc mov,opcode_adc mov mov,opcode_cmove,opcode_cmovne
0,cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302...,1666137,7.996651,1024,5358807,4194304,5365760,310272,56832,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00ebebc75f61527282cee19ab7aed80693b63fbb969e71...,10543104,4.821304,4096,4512,4194304,118784,94208,20480,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e69...,57368,4.979988,4096,9759,4194304,57344,8192,45056,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d694a203bb211751669f8742db877e8ebd5eff5b126abc...,7922688,7.950081,1024,155508,4194304,7950336,570880,7350784,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f...,159744,5.911521,4096,86524,4194304,159744,86016,69632,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Key every feature row by its sample hash so later joins can align on the index.
X = df.set_index(keys="sample_hash", drop=True)
X.head(n=5)

Unnamed: 0_level_0,generic_fileSize,generic_fileEntropy,header_SizeOfHeaders,header_AddressOfEntryPoint,header_ImageBase,header_SizeOfImage,header_SizeOfCode,header_SizeOfInitializedData,header_SizeOfUninitializedData,header_BaseOfCode,...,opcode_movzx movzx add,opcode_stosd stosd stosd,opcode_sub je push,opcode_je nop,opcode_add adc mov,opcode_mov mov nop,opcode_mov adc mov,opcode_adc mov mov,opcode_cmove,opcode_cmovne
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,1666137,7.996651,1024,5358807,4194304,5365760,310272,56832,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,10543104,4.821304,4096,4512,4194304,118784,94208,20480,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,57368,4.979988,4096,9759,4194304,57344,8192,45056,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,7922688,7.950081,1024,155508,4194304,7950336,570880,7350784,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,159744,5.911521,4096,86524,4194304,159744,86016,69632,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Load the per-sample family table, discard the `benign` column, and index the
# rows by hash under the same `sample_hash` name the feature matrix uses.
df_families = (
    pd.read_csv("/home/luca/ml-malware-concept-drift/src/vt_reports/merge.csv")
    .drop(columns="benign")
    .rename(columns={"sha256": "sample_hash"})
    .set_index("sample_hash")
)
df_families.head()

Unnamed: 0_level_0,family,first_submission_date
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,buzy,2018-09-30 14:37:53
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,buzy,2022-03-02 12:20:37
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,buzy,2013-03-05 06:28:07
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,buzy,2018-03-13 21:48:59
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,buzy,2022-03-02 16:52:12


In [5]:
# Attach the family columns to the feature rows by matching the two indexes;
# the join is an inner one, so only hashes present in both frames are kept.
df_sha_fam = X.merge(right=df_families, how="inner", left_index=True, right_index=True)
df_sha_fam.head()

Unnamed: 0_level_0,generic_fileSize,generic_fileEntropy,header_SizeOfHeaders,header_AddressOfEntryPoint,header_ImageBase,header_SizeOfImage,header_SizeOfCode,header_SizeOfInitializedData,header_SizeOfUninitializedData,header_BaseOfCode,...,opcode_sub je push,opcode_je nop,opcode_add adc mov,opcode_mov mov nop,opcode_mov adc mov,opcode_adc mov mov,opcode_cmove,opcode_cmovne,family,first_submission_date
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,1666137,7.996651,1024,5358807,4194304,5365760,310272,56832,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,buzy,2018-09-30 14:37:53
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,10543104,4.821304,4096,4512,4194304,118784,94208,20480,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,buzy,2022-03-02 12:20:37
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,57368,4.979988,4096,9759,4194304,57344,8192,45056,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,buzy,2013-03-05 06:28:07
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,7922688,7.950081,1024,155508,4194304,7950336,570880,7350784,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,buzy,2018-03-13 21:48:59
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,159744,5.911521,4096,86524,4194304,159744,86016,69632,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,buzy,2022-03-02 16:52:12


In [6]:
# Split the merged frame back into the target (family label) and the feature
# matrix; the two non-feature columns are removed from X.
non_feature_columns = ["family", "first_submission_date"]
families = df_sha_fam.loc[:, "family"]
X = df_sha_fam.drop(columns=non_feature_columns)
X.head()

Unnamed: 0_level_0,generic_fileSize,generic_fileEntropy,header_SizeOfHeaders,header_AddressOfEntryPoint,header_ImageBase,header_SizeOfImage,header_SizeOfCode,header_SizeOfInitializedData,header_SizeOfUninitializedData,header_BaseOfCode,...,opcode_movzx movzx add,opcode_stosd stosd stosd,opcode_sub je push,opcode_je nop,opcode_add adc mov,opcode_mov mov nop,opcode_mov adc mov,opcode_adc mov mov,opcode_cmove,opcode_cmovne
sample_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cfa2d5eeac7de9f134cb2e36d54a5c4053f62fdf9d5302994557da1287719414,1666137,7.996651,1024,5358807,4194304,5365760,310272,56832,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00ebebc75f61527282cee19ab7aed80693b63fbb969e7154d17838f9dd08db2f,10543104,4.821304,4096,4512,4194304,118784,94208,20480,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f45e6ca3a2bbbaa1c514d81abe9daaa47f7d41da500e6933f61adf29f5f39835,57368,4.979988,4096,9759,4194304,57344,8192,45056,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d694a203bb211751669f8742db877e8ebd5eff5b126abc4c3e59cdb81f9dbb56,7922688,7.950081,1024,155508,4194304,7950336,570880,7350784,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15c9feb810c48ffef1c7e32cbe58a11037da94e958fb1f34e8feca1643dd80eb,159744,5.911521,4096,86524,4194304,159744,86016,69632,0,4096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Reduce dimensionality with a Gaussian random projection. The projection
# matrix is drawn at random, so the seed is pinned (same value the other
# estimators in this notebook use); without it X_new -- and every model fitted
# on it -- would differ on each kernel restart.
transformer = random_projection.GaussianRandomProjection(random_state=64)
X_new = transformer.fit_transform(X)
X_new.shape

(67000, 9524)

In [8]:
from sklearn.cluster import MiniBatchKMeans
# Cluster the projected samples into 670 groups. fit_predict() fits the model
# and returns the training labels in one call, so a separate .fit() beforehand
# only runs the whole clustering a second time for the same result.
kmeans = MiniBatchKMeans(n_clusters=670, random_state=64)
kmeans_labels = kmeans.fit_predict(X_new)

In [9]:
# Number of distinct cluster ids that were actually assigned to samples.
len(np.unique(kmeans_labels))

670

In [10]:
# For every cluster id, print how many distinct families its members carry.
for cluster_id in set(kmeans_labels):
    in_cluster = kmeans_labels == cluster_id  # boolean mask over the samples
    distinct_families = set(families[in_cluster])
    print(len(distinct_families))

19
10
4
19
30
8
43
16
27
45
23
21
10
30
17
11
37
4
15
3
10
21
2
3
8
3
4
2
29
44
6
11
15
2
84
22
15
13
14
42
14
2
6
25
64
24
10
2
10
9
15
2
16
43
14
34
6
20
33
20
4
50
16
28
3
9
15
16
3
2
3
44
3
34
12
11
4
12
26
39
8
20
13
2
16
15
7
1
11
19
5
47
8
15
17
34
7
19
17
2
5
7
56
12
23
45
17
9
19
17
15
49
64
11
9
64
14
70
22
39
17
19
4
9
6
1
9
46
12
23
27
21
3
3
8
53
18
64
5
41
8
48
11
12
31
25
8
7
5
10
51
6
5
67
9
11
101
16
10
18
40
55
1
2
9
9
31
28
28
45
3
25
13
9
10
15
41
7
1
1
12
57
3
57
10
42
52
33
30
38
18
12
2
10
10
32
12
41
13
4
27
17
33
18
13
30
5
11
9
25
73
16
13
6
27
72
16
28
17
7
4
12
8
15
5
12
11
30
71
62
16
27
29
33
43
3
13
21
40
10
5
1
4
2
69
48
30
17
42
23
63
24
8
28
39
62
49
28
13
28
14
2
24
15
1
2
44
7
37
8
15
11
9
5
17
13
17
38
26
7
8
7
10
2
20
26
15
17
6
38
24
68
66
17
30
5
28
42
11
13
43
41
20
21
6
15
3
18
26
29
34
26
17
6
25
10
8
5
6
21
35
5
7
14
14
20
16
22
12
2
16
36
27
22
24
4
37
1
25
3
93
40
51
6
16
19
15
44
10
29
22
34
32
31
18
3
80
11
54
6
7
32
27
14
26
8
17
23
23
4

In [11]:
from sklearn.ensemble import RandomForestClassifier
# Train a random forest to predict the family from the projected features.
# The forest is fitted on every sample, so predictions on X_new measure how
# well it memorised the data, not how well it generalises. oob_score=True
# additionally scores each sample using only the trees whose bootstrap draw
# left that sample out, giving a held-out style estimate without a second fit.
clf = RandomForestClassifier(random_state=64, n_jobs=-1, oob_score=True)
clf.fit(X_new, families)

# In-sample predictions, kept for the training-accuracy cell that follows.
y_families = clf.predict(X_new)
print("Out-of-bag accuracy estimate: {0:.2f}%".format(clf.oob_score_ * 100))

In [12]:
from sklearn.metrics import accuracy_score
# Share of samples whose predicted family matches the recorded one, shown as
# a percentage with two decimals.
accuracy_train = accuracy_score(y_true=families, y_pred=y_families)
report_template = "The accuracy on the full dataset is {0:.2f}%"
print(report_template.format(accuracy_train * 100))

The accuracy on the full dataset is 99.72%
