In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import ast
!pip install seaborn
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

Defaulting to user installation because normal site-packages is not writeable


In [42]:
# Read the pipe-delimited tokenization file and preview its first rows.
tokens_df = pd.read_csv(
    './tokenizations_samp_1000.csv',
    sep='|',
)
tokens_df.head()

Unnamed: 0,text,labelname,label,input_ids,attention_mask
0,10.237.1.238 - - [08/Feb/2022:07:16:35 +0000] ...,foothold:service_scan,20,"[101, 2184, 1012, 23297, 1012, 1015, 1012, 220...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2022-01-23 12:10:00 192.168.96.3:42036 Validat...,attacker_vpn:foothold,11,"[101, 16798, 2475, 1011, 5890, 1011, 2603, 226...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Jan 17 11:58:17 intranet-server su[20749]: + /...,attacker_change_user:escalate:escalated_comman...,4,"[101, 5553, 2459, 2340, 1024, 5388, 1024, 2459...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Jan 17 11:15:09 dnsmasq[14919]: forwarded 216....,foothold:traceroute,21,"[101, 5553, 2459, 2340, 1024, 2321, 1024, 5641...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,10.182.193.78 - - [07/Feb/2022:11:20:08 +0000]...,attacker_http:foothold:wpscan,9,"[101, 2184, 1012, 17691, 1012, 19984, 1012, 62...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [43]:
# One row per distinct (labelname, label) combination, ordered by `label`.
labels_df = (
    tokens_df[['labelname', 'label']]
    .drop_duplicates()
    .sort_values(by='label')
)
labels_df.head(30)

Unnamed: 0,labelname,label
28,attacker:dnsteal:dnsteal-dropped,0
34,attacker:dnsteal:dnsteal-received,1
24,attacker:dnsteal:exfiltration-service,2
30,attacker_change_user:escalate,3
2,attacker_change_user:escalate:escalated_comman...,4
15,attacker_http:dirb:foothold,5
9,attacker_http:foothold:service_scan,6
8,attacker_http:foothold:webshell_cmd,7
52,attacker_http:foothold:webshell_upload,8
4,attacker_http:foothold:wpscan,9


In [44]:
def convert_str_arr_to_float(X_numpy_str):
    """Parse stringified Python literals into a single NumPy array.

    Every element of ``X_numpy_str`` is evaluated with ``ast.literal_eval``
    and wrapped in ``np.array``; the per-row arrays are then combined by one
    outer ``np.array`` call.

    Parameters
    ----------
    X_numpy_str : iterable of str
        Strings holding literals such as ``"[101, 2184, 1012]"``.

    Returns
    -------
    numpy.ndarray
        One entry per input string. No explicit dtype cast is applied, so
        the dtype follows whatever the parsed literals contain.
    """
    return np.array([np.array(ast.literal_eval(text)) for text in X_numpy_str])

In [45]:
labels = labels_df['labelname'].values

# Parse each label's `input_ids` strings exactly once. Doing this inside the
# pair loop would re-run ast.literal_eval on the same rows for every pair the
# label takes part in.
vectors_by_label = {
    name: convert_str_arr_to_float(
        tokens_df.loc[tokens_df['labelname'] == name, 'input_ids'].values
    )
    for name in labels
}

# Mean pairwise cosine similarity for every unordered pair of distinct labels.
# Keys have the form '<label1>___<label2>', where label1 precedes label2 in
# `labels`; each pair is therefore stored once.
sim_results = {}

for i, label1 in enumerate(tqdm(labels)):
    # Only look at labels after position i: earlier ones already produced
    # the mirrored pair.
    for label2 in labels[i + 1:]:
        if label1 == label2:
            continue  # never compare a label with itself
        sim = cosine_similarity(vectors_by_label[label1], vectors_by_label[label2])
        sim_results[f'{label1}___{label2}'] = sim.mean()

  0%|          | 0/23 [00:00<?, ?it/s]

In [41]:
# Rank the label pairs from highest to lowest average cosine similarity.
ranked_pairs = sorted(sim_results.items(), key=lambda kv: kv[1], reverse=True)
sorted_results = dict(ranked_pairs)

# Print one line per pair, most similar first.
for pair_key, avg_sim in ranked_pairs:
    print(f'LABEL: {pair_key}, AVG COSINE SIM: {avg_sim}')


LABEL: dns_scan:foothold___foothold:wpscan, AVG COSINE SIM: 0.546057386172228
LABEL: attacker_vpn:escalate___attacker_vpn:foothold, AVG COSINE SIM: 0.5432065124358064
LABEL: attacker_change_user:escalate:escalated_command:escalated_sudo_command___escalate:escalated_command:escalated_sudo_command:escalated_sudo_session, AVG COSINE SIM: 0.5331215449853508
LABEL: benign___foothold:wpscan, AVG COSINE SIM: 0.5288532298250666
LABEL: foothold:service_scan___foothold:wpscan, AVG COSINE SIM: 0.5153892334585052
LABEL: benign___dns_scan:foothold, AVG COSINE SIM: 0.5112141320259939
LABEL: dns_scan:foothold___foothold:service_scan, AVG COSINE SIM: 0.5068145759719326
LABEL: dirb:foothold___foothold:service_scan, AVG COSINE SIM: 0.5022741248110886
LABEL: foothold:network_scan___foothold:wpscan, AVG COSINE SIM: 0.501478159052835
LABEL: attacker_http:dirb:foothold___attacker_http:foothold:webshell_upload, AVG COSINE SIM: 0.4994519282204817
LABEL: benign___foothold:service_scan, AVG COSINE SIM: 0.499207

In [52]:
def parse_label(label):
    """Split a ``'<label1>___<label2>'`` pair key into its two label names.

    The split happens at the first occurrence of ``'___'``.

    Parameters
    ----------
    label : str
        Pair key such as ``'benign___foothold:wpscan'``.

    Returns
    -------
    tuple of (str, str)
        The text before and after the first ``'___'``.

    Raises
    ------
    ValueError
        If ``label`` contains no ``'___'`` separator. Slicing around a
        failed ``find`` (index -1) would otherwise return two meaningless
        fragments without any error.
    """
    first, separator, second = label.partition('___')
    if not separator:
        raise ValueError(f"expected '<label1>___<label2>', got {label!r}")
    return first, second

In [59]:
# Greedy grouping of labels by similarity.
#
# Label pairs are visited in the order of `sorted_results` (most similar
# first):
#   * both labels still unassigned -> the two of them start a new group;
#   * exactly one label unassigned -> it joins the group that already holds
#     its partner;
#   * neither label unassigned     -> the pair is skipped.

labels_to_add = list(labels)
list_results = list(sorted_results.items())
group_num = 1
groups = {}
label_to_group = {}  # label -> the member list inside `groups` that holds it

# Iterating (instead of popping from the front until `labels_to_add` is
# empty) means running out of pairs ends the loop rather than raising
# IndexError.
for pair_key, _ in list_results:
    if not labels_to_add:
        break  # every label has been placed

    label1, label2 = parse_label(pair_key)
    needs1 = label1 in labels_to_add
    needs2 = label2 in labels_to_add

    if needs1 and needs2:
        members = [label1, label2]
        groups[f'group_{group_num}'] = members
        group_num += 1
        newly_placed = (label1, label2)
    elif needs1 or needs2:
        newcomer, partner = (label1, label2) if needs1 else (label2, label1)
        members = label_to_group.get(partner)
        if members is None:
            # The partner is not in any group, so there is nothing to join.
            continue
        members.append(newcomer)
        newly_placed = (newcomer,)
    else:
        continue

    for placed in newly_placed:
        label_to_group[placed] = members
        labels_to_add.remove(placed)

# Make leftovers visible instead of silently dropping them.
if labels_to_add:
    print(f'Labels left ungrouped (no usable similarity pair): {labels_to_add}')


In [60]:
# Display the mapping of group name -> list of member labels.
groups

{'group_1': ['dns_scan:foothold',
  'foothold:wpscan',
  'benign',
  'foothold:service_scan',
  'dirb:foothold',
  'foothold:network_scan',
  'foothold:traceroute',
  'escalate:webshell_cmd'],
 'group_2': ['attacker_vpn:escalate', 'attacker_vpn:foothold'],
 'group_3': ['attacker_change_user:escalate:escalated_command:escalated_sudo_command',
  'escalate:escalated_command:escalated_sudo_command:escalated_sudo_session',
  'attacker_change_user:escalate'],
 'group_4': ['attacker_http:dirb:foothold',
  'attacker_http:foothold:webshell_upload',
  'attacker_http:foothold:service_scan'],
 'group_5': ['attacker_http:foothold:webshell_cmd',
  'attacker_http:foothold:wpscan'],
 'group_6': ['attacker:dnsteal:exfiltration-service',
  'escalate:escalated_command:escalated_sudo_command'],
 'group_7': ['attacker:dnsteal:dnsteal-dropped',
  'attacker:dnsteal:dnsteal-received',
  'crack_passwords:escalate']}