# How to vectorize the MITRE ATT&CK knowledge base

In [51]:
import json

In [52]:
# Peek at one Enterprise record: grab only the first line of the file,
# then parse that line as a JSON document
with open('/media/basic/CYBERTRON/data/MITRE/enterprise_attack/attack-pattern.json') as f:
    first_line = f.readline()
data = json.loads(first_line)
print(data)

{'type': 'attack-pattern', 'id': 'attack-pattern--3a32740a-11b0-4bcf-b0a9-3abd0f6d3cd5', 'created_by_ref': 'identity--c78cb6e5-0c4b-4611-8297-d1b8b55e40b5', 'created': '2024-03-29T18:07:04.743Z', 'modified': '2024-04-10T16:05:22.456Z', 'name': 'AutoHotKey & AutoIT', 'description': "Adversaries may execute commands and perform malicious tasks using AutoIT and AutoHotKey automation scripts. AutoIT and AutoHotkey (AHK) are scripting languages that enable users to automate Windows tasks. These automation scripts can be used to perform a wide variety of actions, such as clicking on buttons, entering text, and opening and closing programs.(Citation: AutoIT)(Citation: AutoHotKey)\n\nAdversaries may use AHK (`.ahk`) and AutoIT (`.au3`) scripts to execute malicious code on a victim's system. For example, adversaries have used for AHK to execute payloads and other modular malware such as keyloggers. Adversaries have also used custom AHK files containing embedded malware as [Phishing](https://att

Interesting fields:
- type (str)
- id (str)
- name (str)
- description (str)
- x_mitre_platforms (list)

In [53]:
# The file stores one JSON document per line; parse every line into a list
data = []
with open('/media/basic/CYBERTRON/data/MITRE/enterprise_attack/attack-pattern.json') as f:
    data.extend(json.loads(line) for line in f)
print(len(data))

780


In [54]:
# Turn the parsed records into a DataFrame and keep only the columns of interest
import pandas as pd
interesting_fields = ['type', 'id', 'name', 'description', 'x_mitre_platforms']

df = pd.DataFrame(data).loc[:, interesting_fields]
df.head()

Unnamed: 0,type,id,name,description,x_mitre_platforms
0,attack-pattern,attack-pattern--3a32740a-11b0-4bcf-b0a9-3abd0f...,AutoHotKey & AutoIT,Adversaries may execute commands and perform m...,[Windows]
1,attack-pattern,attack-pattern--09b008a9-b4eb-462a-a751-a0eb58...,File/Path Exclusions,Adversaries may attempt to hide their file-bas...,"[Linux, macOS, Windows]"
2,attack-pattern,attack-pattern--0d91b3c0-5e50-47c3-949a-2a796f...,Encrypted/Encoded File,Adversaries may encrypt or encode files to obf...,"[Linux, macOS, Windows]"
3,attack-pattern,attack-pattern--356662f7-e315-4759-86c9-6214e2...,AppDomainManager,Adversaries may execute their own malicious pa...,[Windows]
4,attack-pattern,attack-pattern--149b477f-f364-4824-b1b5-aa1d56...,Network Devices,Adversaries may compromise third-party network...,[PRE]


In [55]:
# let's do that for all jsons recursively in MITRE folder
import os
import warnings
from tqdm import tqdm

def read_json_file(file_path):
    """Yield the JSON document stored on each line of a line-delimited JSON file.

    Args:
        file_path: Path of the file to read. The file is decoded as UTF-8.

    Yields:
        The object parsed from each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # holds no document and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)
    
def read_all_json_files(folder_path):
    """Collect the JSON documents found in every file under ``folder_path``.

    The directory tree is walked with ``os.walk`` and each file is handed to
    ``read_json_file``. Reading is best-effort: a file that raises while being
    read is skipped, but documents already parsed from it are kept. Skipped
    files are reported in a single warning instead of being dropped silently.

    Args:
        folder_path: Root folder to walk recursively.

    Returns:
        A list with every document parsed, in the order the files were visited.
    """
    data = []
    skipped = []  # "path: error" entries for files that could not be read
    for root, _dirs, files in os.walk(folder_path):
        print(f'Processing {root}')
        for file in tqdm(files):
            file_path = os.path.join(root, file)
            try:
                for json_obj in read_json_file(file_path):
                    data.append(json_obj)
            except Exception as e:
                # Every file in the tree is attempted, so non-JSON files are
                # expected here; remember them rather than hiding the failure.
                skipped.append(f'{file_path}: {e}')
    if skipped:
        # One summary warning after the walk keeps the progress bars readable.
        warnings.warn(
            f'Skipped {len(skipped)} file(s) that could not be fully read as JSON lines: '
            + '; '.join(skipped),
            stacklevel=2,
        )
    return data

In [56]:
# Walks the whole MITRE data folder: the enterprise_attack, ics_attack and
# mobile_attack sub-folders are all visited, so despite its name
# `enterprise_data` is not limited to Enterprise records.
enterprise_data = read_all_json_files('/media/basic/CYBERTRON/data/MITRE/')

Processing /media/basic/CYBERTRON/data/MITRE/


100%|██████████| 4/4 [00:00<00:00, 4902.75it/s]


Processing /media/basic/CYBERTRON/data/MITRE/enterprise_attack


100%|██████████| 6/6 [00:00<00:00, 144.15it/s]


Processing /media/basic/CYBERTRON/data/MITRE/ics_attack


100%|██████████| 6/6 [00:00<00:00, 758.05it/s]


Processing /media/basic/CYBERTRON/data/MITRE/mobile_attack


100%|██████████| 6/6 [00:00<00:00, 718.65it/s]


In [57]:
# Build the DataFrame again, this time from every record gathered under the
# MITRE folder, keeping the same columns of interest
import pandas as pd
df = pd.DataFrame(enterprise_data).loc[:, interesting_fields]
df.head()

Unnamed: 0,type,id,name,description,x_mitre_platforms
0,attack-pattern,attack-pattern--3a32740a-11b0-4bcf-b0a9-3abd0f...,AutoHotKey & AutoIT,Adversaries may execute commands and perform m...,[Windows]
1,attack-pattern,attack-pattern--09b008a9-b4eb-462a-a751-a0eb58...,File/Path Exclusions,Adversaries may attempt to hide their file-bas...,"[Linux, macOS, Windows]"
2,attack-pattern,attack-pattern--0d91b3c0-5e50-47c3-949a-2a796f...,Encrypted/Encoded File,Adversaries may encrypt or encode files to obf...,"[Linux, macOS, Windows]"
3,attack-pattern,attack-pattern--356662f7-e315-4759-86c9-6214e2...,AppDomainManager,Adversaries may execute their own malicious pa...,[Windows]
4,attack-pattern,attack-pattern--149b477f-f364-4824-b1b5-aa1d56...,Network Devices,Adversaries may compromise third-party network...,[PRE]


In [58]:
# drop NaNs and duplicates
# x_mitre_platforms holds lists, which cannot be hashed, so duplicates are
# detected on the scalar columns only.
interesting_fields_minus_list = ['type', 'id', 'name', 'description']
# Incomplete rows go first so that, among duplicates, a complete row survives.
# drop_duplicates returns a new frame, so its result must be assigned.
df = df.dropna().drop_duplicates(subset=interesting_fields_minus_list)
df.head()

Unnamed: 0,type,id,name,description,x_mitre_platforms
0,attack-pattern,attack-pattern--3a32740a-11b0-4bcf-b0a9-3abd0f...,AutoHotKey & AutoIT,Adversaries may execute commands and perform m...,[Windows]
1,attack-pattern,attack-pattern--09b008a9-b4eb-462a-a751-a0eb58...,File/Path Exclusions,Adversaries may attempt to hide their file-bas...,"[Linux, macOS, Windows]"
2,attack-pattern,attack-pattern--0d91b3c0-5e50-47c3-949a-2a796f...,Encrypted/Encoded File,Adversaries may encrypt or encode files to obf...,"[Linux, macOS, Windows]"
3,attack-pattern,attack-pattern--356662f7-e315-4759-86c9-6214e2...,AppDomainManager,Adversaries may execute their own malicious pa...,[Windows]
4,attack-pattern,attack-pattern--149b477f-f364-4824-b1b5-aa1d56...,Network Devices,Adversaries may compromise third-party network...,[PRE]


In [59]:
# Handle the list fields: flatten each platform list into a space-separated string.
# Values that are already strings are left untouched, so re-running this cell
# does not join an already-flattened value character by character.
df['x_mitre_platforms'] = df['x_mitre_platforms'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)
# concat the columns in a single string
df['text'] = df[interesting_fields].apply(lambda x: ' '.join(x), axis=1)
df.head()

Unnamed: 0,type,id,name,description,x_mitre_platforms,text
0,attack-pattern,attack-pattern--3a32740a-11b0-4bcf-b0a9-3abd0f...,AutoHotKey & AutoIT,Adversaries may execute commands and perform m...,Windows,attack-pattern attack-pattern--3a32740a-11b0-4...
1,attack-pattern,attack-pattern--09b008a9-b4eb-462a-a751-a0eb58...,File/Path Exclusions,Adversaries may attempt to hide their file-bas...,Linux macOS Windows,attack-pattern attack-pattern--09b008a9-b4eb-4...
2,attack-pattern,attack-pattern--0d91b3c0-5e50-47c3-949a-2a796f...,Encrypted/Encoded File,Adversaries may encrypt or encode files to obf...,Linux macOS Windows,attack-pattern attack-pattern--0d91b3c0-5e50-4...
3,attack-pattern,attack-pattern--356662f7-e315-4759-86c9-6214e2...,AppDomainManager,Adversaries may execute their own malicious pa...,Windows,attack-pattern attack-pattern--356662f7-e315-4...
4,attack-pattern,attack-pattern--149b477f-f364-4824-b1b5-aa1d56...,Network Devices,Adversaries may compromise third-party network...,PRE,attack-pattern attack-pattern--149b477f-f364-4...


In [60]:
# Inspect the concatenated text of the first row
df['text'].iloc[0]

"attack-pattern attack-pattern--3a32740a-11b0-4bcf-b0a9-3abd0f6d3cd5 AutoHotKey & AutoIT Adversaries may execute commands and perform malicious tasks using AutoIT and AutoHotKey automation scripts. AutoIT and AutoHotkey (AHK) are scripting languages that enable users to automate Windows tasks. These automation scripts can be used to perform a wide variety of actions, such as clicking on buttons, entering text, and opening and closing programs.(Citation: AutoIT)(Citation: AutoHotKey)\n\nAdversaries may use AHK (`.ahk`) and AutoIT (`.au3`) scripts to execute malicious code on a victim's system. For example, adversaries have used for AHK to execute payloads and other modular malware such as keyloggers. Adversaries have also used custom AHK files containing embedded malware as [Phishing](https://attack.mitre.org/techniques/T1566) payloads.(Citation: Splunk DarkGate)\n\nThese scripts may also be compiled into self-contained exectuable payloads (`.exe`).(Citation: AutoIT)(Citation: AutoHotKe

In [62]:
# save the dataframe (index=False leaves the row index out of the CSV)
df.to_csv('/media/basic/CYBERTRON/data/MITRE/mitre.csv', index=False)

In [63]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss

# Load pre-trained model; cache_folder is pointed at cache_dir
cache_dir = '/media/basic/CYBERTRON/models'
sentenceTransformerModel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=cache_dir)

# Reload the frame that was written to mitre.csv earlier in this notebook.
# NOTE: this rebinds `data`, which earlier cells used for the list of parsed records.
data = pd.read_csv('/media/basic/CYBERTRON/data/MITRE/mitre.csv')

# Vectorize the concatenated strings in the 'text' column
embeddings = sentenceTransformerModel.encode(data['text'].tolist(), convert_to_numpy=True)

# Create FAISS index sized to the embedding dimension and add all vectors.
# The query cell maps result positions back to rows with data.iloc, so the
# vectors must stay in the same order as the rows of `data`.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Save the FAISS index
faiss.write_index(index, '/media/basic/CYBERTRON/data/MITRE/mitre.index')

  from tqdm.autonotebook import tqdm, trange


In [72]:
# Read back the index that was written to disk
index = faiss.read_index('/media/basic/CYBERTRON/data/MITRE/mitre.index')

# Use the text of the first record as the query and embed it as a single-row matrix
query = data['text'].iloc[0]
query_embedding = sentenceTransformerModel.encode(query, convert_to_numpy=True).reshape(1, -1)

# Retrieve the k closest entries
k = 5
distances, indices = index.search(query_embedding, k)

# Report the hits in rank order, mapping each returned position back to a row of `data`
for rank, (row_pos, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f'Rank {rank}: {data.iloc[row_pos]["name"]} (distance: {dist})')

# Show which technique served as the query
print(f"\nQuery: {data['name'].iloc[0]}")

Rank 1: AutoHotKey & AutoIT (distance: 0.0)
Rank 2: Scripting (distance: 0.6743029356002808)
Rank 3: Mshta (distance: 0.6994346380233765)
Rank 4: Shortcut Modification (distance: 0.7084759473800659)
Rank 5: Accessibility Features (distance: 0.7108378410339355)

Query: AutoHotKey & AutoIT
