### Technique

In [1]:
# crawl MITRE ATT&CK
#  - crawl Technique texts
#  - crawl mitigation code and link

import os
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.request import Request, urlopen

# Maps an ATT&CK technique code to the record scraped from its page.
# Layout of the mapping that is written to techniques.json:
# {
#     'code': {
#         'name': str,
#         'link': str,
#         'desc': str,
#         'platforms': [str],
#         'miti': [str],
#         'miti_link': [str],
#     }
# }
tech_dict = defaultdict(dict)

# Output directory. Set MITRE_ATTACK_SAVE_DIR to redirect the output; when it is
# unset (or empty) the original location is used.
save_dir = os.environ.get('MITRE_ATTACK_SAVE_DIR') or '/home/zxx5113/IBM/save/mitre-attack'
os.makedirs(save_dir, exist_ok=True)

# NOTE: we only consider enterprise tech
root = 'https://attack.mitre.org/techniques/enterprise/'
# A timeout stops a stalled connection from hanging the cell forever, and
# raise_for_status() fails loudly instead of parsing an HTTP error page
# (which would otherwise produce an empty crawl that still prints "Done").
response = requests.get(root, timeout=30)
response.raise_for_status()
html = BeautifulSoup(response.content, 'html.parser')

# Hrefs found on the site are joined onto this prefix to form full URLs.
link_prefix = 'https://attack.mitre.org'

# Visit every technique row of the index table and scrape its detail page.
for tr in tqdm(html.find_all('tr', attrs={'class':'technique'})):
    anchor = tr.find('a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # Row has no link to follow, so there is nothing to scrape.
        continue
    link = link_prefix + href

    # A link whose last path segment starts with 'T' is keyed by that segment;
    # otherwise the last two segments are joined with '.' (parent id + suffix).
    # Trailing slashes are ignored so they cannot produce an empty segment.
    segments = link.rstrip('/').split('/')
    if segments[-1].startswith('T'):
        code = segments[-1]
    else:
        code = '.'.join(segments[-2:])

    # Without a timeout a single stalled connection would hang the whole crawl.
    tech_html = BeautifulSoup(requests.get(link, timeout=30).content, 'html.parser')

    # Collapse all whitespace runs (including newlines) in the title.
    heading = tech_html.find('h1')
    name = ' '.join(heading.text.split()) if heading is not None else ''

    desc_block = tech_html.find('div', attrs={'class':'description-body'})
    if desc_block is not None:
        desc = desc_block.text.strip()
    else:
        # Report pages without a description and store an empty one.
        print(code, name, link)
        desc = ''

    # Platforms are read from the first matching block that mentions them.
    # A list with an explicit duplicate check keeps the page order, so the
    # saved JSON is identical between runs (a set has no stable order).
    platforms = []
    for _block in tech_html.find_all('div', attrs={'class':'col-md-11 pl-0'}):
        if 'Platforms' in _block.text:
            # Everything after the first ':' is the comma-separated list.
            _, _, listed = _block.text.partition(':')
            for item in listed.split(','):
                item = item.strip()
                if item and item not in platforms:
                    platforms.append(item)
            break

    # Collect each distinct mitigation referenced anywhere on the page.
    miti_codes = []
    miti_links = []
    for a in tech_html.find_all('a'):
        url = a.get('href')
        if url is None or 'mitigations' not in url:
            continue
        m_code = url.rstrip('/').split('/')[-1]
        if m_code.startswith('M') and m_code not in miti_codes:
            miti_codes.append(m_code)
            miti_links.append(link_prefix + url)

    tech_dict[code] = {
        'name': name,
        'link': link,
        'desc': desc,
        'platforms': platforms,
        'miti': miti_codes,
        'miti_link': miti_links,
    }
    
# Persist the collected technique records as one JSON document.
techniques_path = os.path.join(save_dir, 'techniques.json')
with open(techniques_path, 'w') as out_file:
    json.dump(tech_dict, out_file)
print('Done')

100%|██████████| 576/576 [04:51<00:00,  1.98it/s]

Done





### Mitigation

In [2]:
import os
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.request import Request, urlopen

# Maps a mitigation code to the record scraped from its page.
# Layout of the mapping that is written to mitigations.json:
# {
#     'code': {
#         'name': str,
#         'link': str,
#         'desc': str,   # first paragraph of the page's description block
#     }
# }
# NOTE: a separate 'desc_brief' field was planned but is not collected.
miti_dict = defaultdict(dict)

# Output directory. Set MITRE_ATTACK_SAVE_DIR to redirect the output; when it is
# unset (or empty) the original location is used.
save_dir = os.environ.get('MITRE_ATTACK_SAVE_DIR') or '/home/zxx5113/IBM/save/mitre-attack'
os.makedirs(save_dir, exist_ok=True)

# NOTE: we only consider enterprise mitigations
root = 'https://attack.mitre.org/mitigations/enterprise/'
# A timeout stops a stalled connection from hanging the cell forever, and
# raise_for_status() fails loudly instead of parsing an HTTP error page
# (which would otherwise produce an empty crawl that still prints "Done").
response = requests.get(root, timeout=30)
response.raise_for_status()
html = BeautifulSoup(response.content, 'html.parser')

# Base URL that each mitigation code is appended to.
link_prefix = 'https://attack.mitre.org/mitigations/'

# Gather the distinct mitigation codes linked from the index page, keeping
# the order in which they first appear.
miti_codes = []
for anchor in html.find_all('a'):
    href = anchor.get('href') if anchor.has_attr('href') else None
    if href is None or not href.startswith('/mitigations/M'):
        continue
    candidate = href.split('/')[-1]
    if not candidate.startswith('M'):
        continue
    if candidate not in miti_codes:
        miti_codes.append(candidate)

# Fetch every mitigation page and record its title and lead description.
for m_code in miti_codes:
    # URLs always use '/', so build the link by concatenation rather than
    # os.path.join, which follows the host operating system's path rules.
    m_link = link_prefix.rstrip('/') + '/' + m_code
    # Without a timeout a single stalled connection would hang the whole crawl.
    m_html = BeautifulSoup(requests.get(m_link, timeout=30).content, 'html.parser')

    m_heading = m_html.find('h1')
    m_name = m_heading.text.strip() if m_heading is not None else ''

    # 'desc' is the first paragraph inside the description block. find()
    # returns None when an element is absent, so check before using .text.
    m_desc_block = m_html.find('div', attrs={'class':'description-body'})
    m_first_par = m_desc_block.find('p') if m_desc_block is not None else None
    if m_first_par is not None:
        m_desc = m_first_par.text.strip()
    else:
        # Report pages without a description paragraph and store an empty one,
        # mirroring how the technique crawl handles a missing description.
        print(m_code, m_name, m_link)
        m_desc = ''

    miti_dict[m_code] = {
        'name': m_name,
        'link': m_link,
        'desc': m_desc,
    }
# Persist the collected mitigation records as one JSON document.
mitigations_path = os.path.join(save_dir, 'mitigations.json')
with open(mitigations_path, 'w') as out_file:
    json.dump(miti_dict, out_file)
print('Done')

Done
