## Fetch REST API Operation/Agent Data

In [37]:
# imports
import sklearn 
import requests
import pandas
import sys
import base64
import re

Create REST API calls to server to fetch operational data and current system conditions, store in df.
Agent data is for simulation of live conditions of agent for calculating link probabilities.

In [39]:
# fetch past operational data
op_url = 'http://localhost:8888/api/v2/operations'
headers = {'Accept': 'application/json', 'KEY' :'ADMIN123'}

op_response = requests.get(op_url, headers=headers)
op_data = pandas.DataFrame(op_response.json())
op_data = op_data.reset_index()  # make sure indexes pair with number of rows

# fetch current system conditions of active agent
# NOTE: using first trusted agent by default, replace with valid agent(s) for operation + alive and trusted
agents_url = 'http://localhost:8888/api/v2/agents'

agents_response = requests.get(agents_url, headers=headers)
agents_list = agents_response.json()
# select trusted agent
agent_selected = None
for agent in agents_list:
    if agent["trusted"] == True:
        # TODO: insert check for whether agent is alive
        agent_selected = agent

if agent_selected == None:
    print("FAILURE TO FIND AGENT")
else: 
    print("Operation + Agent Data Fetched")

Operation + Agent Data Fetched


## Conduct Feature Analysis

In [8]:
# print all traits of link facts
sources_url = 'http://localhost:8888/api/v2/sources'
headers = {'Accept': 'application/json', 'KEY' :'ADMIN123'}

source = requests.get(sources_url, headers=headers).json()
sources = str(requests.get(sources_url, headers=headers).json())

# fact traits
fact_traits = set()

x = re.findall("(?<=\'trait': )'.*?\'", sources)
for match in x:
    fact_traits.add(match.replace("'", ""))

fact_traits

{'domain.user.name',
 'file.last.accessed',
 'file.last.modified',
 'file.sensitive.content',
 'file.sensitive.extension',
 'host.archive.password',
 'host.dir.compress',
 'host.dir.staged',
 'host.file.path',
 'host.user.name',
 'linux.excluded.directories',
 'linux.included.directories',
 'linux.included.extensions',
 'linux.staging.location',
 'pseudo.data.identifier',
 'remote.host.ip',
 'remote.ssh.ip',
 'safe.mode.enabled',
 'server.malicious.url',
 'windows.excluded.directories',
 'windows.excluded.extensions',
 'windows.included.directories',
 'windows.included.extensions',
 'windows.staging.location'}

Useful\Non-User unique Fact Types:

file.sensitive.content
file.sensitive.extension
linux.excluded.directories
linux.included.directories
linux.included.extensions
linux.staging.location
pseudo.data.identifier
safe.mode.enabled
server.malicious.url
windows.excluded.directories
windows.excluded.extensions
windows.included.directories
windows.included.extensions
windows.staging.location

Non-Useful\User-Unique Fact Types:

domain.user.name
file.last.accessed
file.last.modified
host.archive.password
host.dir.compress
host.dir.staged
host.file.path
host.user.name
remote.host.ip
remote.ssh.ip

In [9]:
# operation object cotains variety of interesting features
# chain contains all operation data/success (majority interest)
# print(thief_win_op["chain"][0])
# print(thief_win_op["adversary"])
# print(base64.b64decode(thief_win_op["chain"][0]["command"]))


In [10]:
# facts used in existing local operations
used_fact_traits =set()

# for each operation
for index, cur_op in op_data.iterrows():  
    # run through link/ability chain within operation
    for cur_link in cur_op["chain"]:
        # print some interesting link info
        print("LINK")
        print(base64.b64decode(cur_link["command"]))
        print(base64.b64decode(bytes(cur_link["command"], 'utf-8')))
        print(cur_link["ability"]["ability_id"])
        print(cur_link["executor"]["command"])
        if(len(cur_link["used"])> 0):
            for used_fact in cur_link["used"]:
                print("~~~~~~~~~~~  fact (val,trait):")
                print(used_fact["value"])
                print(used_fact["trait"])
                used_fact_traits.add(used_fact["trait"])
        print("************************")
print(used_fact_traits)

LINK
b'whoami'
b'whoami'
c0da588f-79f0-4263-8998-7496b1a40596
whoami
************************
LINK
b"cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'"
b"cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'"
c1cd6388-3ced-48c7-a511-0434c6ba8f48
cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'
************************
LINK
b'ps aux | grep gdm'
b'ps aux | grep gdm'
3b5db901-2cb8-4df7-8043-c4628a6a5d5a
ps aux | grep #{host.user.name}
~~~~~~~~~~~  fact (val,trait):
gdm
host.user.name
************************
LINK
b'whoami'
b'whoami'
c0da588f-79f0-4263-8998-7496b1a40596
whoami
************************
LINK
b"cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'"
b"cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'"
c1cd6388-3ced-48c7-a511-0434c6ba8f48
cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'
************************
LINK
b'ps aux | grep gdm'
b'ps aux | grep gdm'
3b5db901-2cb8-4df7-8043-c4628a6a5d5a
ps aux | grep #{host.user.name}
~~~~~~~~~~~  fact (val,trait):
gdm
host.user.na

## Build DF of Past Links from Operations

In [11]:
# DEPRECATED: df version


# store link info in lists, where each item corresponds to link at index
# same index in each list gives all relevant info on link
# later convert to df, for efficiency
statuses = []
ability_ids = []
usable_facts = [] # contains lists of fact dicts with 0 or more items
planners = []
agent_protocols = []           
agent_trusted_statuses = []
agent_architectures = []
agent_privileges = []
obfuscators = []
adversary_ids = []
adversary_names = []
commands = []
num_facts_used = []
visibility_scores = []
executor_platforms = []  # platform on which agent executes it
executor_names = [] # name of terminal on which agent running
# NOTE: see useful_features.odt for analysis of useful components.

# for each operation
for index, cur_op in op_data.iterrows():
    
    # save info about agents into dict for later matching
    agents_dict = {} # key: paw, value: [contact, trusted, privilege, architecture]
    
    # iterate through agents, filling dict with agent/host connection info
    for agent in cur_op["host_group"]:
        agent_paw = agent["paw"]
        contact_type = agent["contact"]
        trusted_status = agent["trusted"]
        privilege = agent["privilege"]
        architecture = agent["architecture"]
        agents_dict[agent_paw] = [contact_type, trusted_status, privilege, architecture]
    
    
    # run through each link chain within operation
    for cur_link in cur_op["chain"]:
    
        # save relevant global op info
        planners.append(cur_op["planner"]["name"])
        obfuscators.append(cur_op["obfuscator"])
        adversary_ids.append(cur_op["adversary"]["adversary_id"])
        adversary_names.append(cur_op["adversary"]["name"])
        
        # save relevant link info
        ability_ids.append(cur_link["ability"]["ability_id"])
        statuses.append(cur_link["status"])
        command_str = str(base64.b64decode(cur_link["command"]))
        command_str = command_str[2:len(command_str)-1] # correctly format
        commands.append(command_str)
        num_facts_used.append(len(cur_link["used"]))
        visibility_scores.append(cur_link["visibility"]["score"])
        executor_platforms.append(cur_link["executor"]["platform"])
        executor_names.append(cur_link["executor"]["name"])
        
        # save relevant agent related info
        agent_paw = cur_link["paw"]
        # if agent is in current operation report
        if agent_paw in agents_dict.keys():
            # save relevant agent/host data
            contact_type, trusted_status, privilege, architecture = agents_dict[agent_paw]
            agent_protocols.append(contact_type)
            agent_trusted_statuses.append(trusted_status)
            agent_privileges.append(privilege)
            agent_architectures.append(architecture)
        else: # if agent is not in current agents report (currently, 5/733 links)
            # insert None for nonexistant agent data
            agent_protocols.append(None)
            agent_trusted_statuses.append(None)
            agent_privileges.append(None)
            agent_architectures.append(None)
            
        
        cur_used_global_facts = {} # key: trait, val: value    
        
        # used facts of link
        if(len(cur_link["used"]) > 0):
            
            # iterate through facts
            for used_fact in cur_link["used"]:
                useful_fact = True
                # check if fact unique to host through excluding unique fact types
                if used_fact["trait"].startswith("host."):
                    useful_fact = False
                if used_fact["trait"].startswith("remote."):
                    useful_fact = False
                if used_fact["trait"].startswith("file.last."):
                    useful_fact = False
                if used_fact["trait"].startswith("domain.user."):
                    useful_fact = False
                
                if useful_fact:
                    # save fact
                    cur_used_global_facts[str(used_fact["trait"])] = str(used_fact["value"])

        # save current usable facts
        usable_facts.append(cur_used_global_facts)        
        

# create link success df from lists of data
data_link_success = {
    "Status" : statuses,
    "Ability_ID" : ability_ids, 
    "Link_Facts" : usable_facts, 
    "Planner" : planners,
    "Obfuscator" : obfuscators,
    "Adversary_ID" : adversary_ids,
    "Adversary_Name" :  adversary_names,
    "Command" : commands,
    "Number_Facts" : num_facts_used,
    "Visibility_Score" : visibility_scores,
    "Executor_Platform" : executor_platforms,
    "Executor_Name" : executor_names,
    "Agent_Protocol" : agent_protocols,
    "Trusted_Status" : agent_trusted_statuses,
    "Agent_Privilege": agent_privileges,
    "Host_Architecture": agent_architectures
}

link_success_df = pandas.DataFrame(data_link_success)

# statuses.insert(0, "Status")
# ability_ids.insert(0, "Ability_ID")
# usable_facts.insert(0, "Link_Facts")
# planners.insert(0, "Planner")
# obfuscators.insert(0, "Obfuscator")
# adversary_ids.insert(0, "Adversary_ID")
# adversary_names.insert(0, "Adversary_Name")
# commands.insert(0, "Command")
# num_facts_used.insert(0, "Number Facts")
# visibility_scores.insert(0, "Visibility_Score")
# executor_platforms.insert(0, "Executor_Platform")
# executor_names.insert(0, "Executor_Name")
# agent_protocols.insert(0, "Agent_Protocol")
# agent_trusted_statuses.insert(0, "Trusted_Status")
# agent_privileges.insert(0, "Agent_Privilige")
# agent_architectures.insert(0, "Host_Architecture")

# link_success_matrix = [statuses, ability_ids, usable_facts, planners, obfuscators, adversary_ids, adversary_names, commands,
#         num_facts_used, visibility_scores, executor_platforms, executor_names, agent_protocols, agent_trusted_statuses, agent_privileges, agent_architectures]
# pretty_print_link_obj(link_success_matrix)


In [40]:
def pretty_print_link_matrix(matrix_links):
    for colval in matrix_links:
        print ('{:4}'.format(str(colval)))

In [41]:
# matrix that stores all past links
# each row is 16 features defining the link
link_success_matrix = []
# first row are column names
link_success_matrix.append([
    "Status",
    "Ability_ID", 
    "Link_Facts", 
    "Planner",
    "Obfuscator",
    "Adversary_ID",
    "Adversary_Name",
    "Command",
    "Number_Facts",
    "Visibility_Score",
    "Executor_Platform",
    "Executor_Name",
    "Agent_Protocol",
    "Trusted_Status",
    "Agent_Privilege",
    "Host_Architecture"
])


# for each operation
for index, cur_op in op_data.iterrows():
    
    # save info about agents into dict for later matching
    agents_dict = {} # key: paw, value: [contact, trusted, privilege, architecture]
    
    # iterate through agents, filling dict with agent/host connection info
    for agent in cur_op["host_group"]:
        agent_paw = agent["paw"]
        contact_type = agent["contact"]
        trusted_status = agent["trusted"]
        privilege = agent["privilege"]
        architecture = agent["architecture"]
        agents_dict[agent_paw] = [contact_type, trusted_status, privilege, architecture]
    
    
    # run through each link chain within operation
    for cur_link in cur_op["chain"]:
    
        # key: link feature, value: link value
        cur_link_dict = {}

        # save relevant global op info
        cur_link_dict["Planner"] = cur_op["planner"]["name"]
        cur_link_dict["Obfuscator"] = cur_op["obfuscator"]
        cur_link_dict["Adversary_ID"] = cur_op["adversary"]["adversary_id"]
        cur_link_dict["Adversary_Name"] = cur_op["adversary"]["name"]
        
        # save relevant link info
        cur_link_dict["Ability_ID"] = cur_link["ability"]["ability_id"]
        cur_link_dict["Status"] = cur_link["status"]
        command_str = str(base64.b64decode(cur_link["command"]))
        command_str = command_str[2:len(command_str)-1] # correctly format
        cur_link_dict["Command"] = command_str
        cur_link_dict["Number_Facts"] = len(cur_link["used"])
        cur_link_dict["Visibility_Score"] = cur_link["visibility"]["score"]
        cur_link_dict["Executor_Platform"] = cur_link["executor"]["platform"]
        cur_link_dict["Executor_Name"] = cur_link["executor"]["name"]
        
        # save relevant agent related info
        agent_paw = cur_link["paw"]
        # if agent is in current operation report
        if agent_paw in agents_dict.keys():
            # save relevant agent/host data
            contact_type, trusted_status, privilege, architecture = agents_dict[agent_paw]
            cur_link_dict["Agent_Protocol"] = contact_type
            cur_link_dict["Trusted_Status"] = trusted_status
            cur_link_dict["Agent_Privilege"] = privilege
            cur_link_dict["Host_Architecture"] = architecture
        else: # if agent is not in current agents report (currently, 5/733 links)
            # insert None for nonexistant agent data
            cur_link_dict["Agent_Protocol"] = None
            cur_link_dict["Trusted_Status"] = None
            cur_link_dict["Agent_Privilege"] = None
            cur_link_dict["Host_Architecture"] = None
            
        
        cur_used_global_facts = {} # key: trait, val: value    
        
        # used facts of link
        if(len(cur_link["used"]) > 0):
            
            # iterate through facts
            for used_fact in cur_link["used"]:
                useful_fact = True
                # check if fact unique to host through excluding unique fact types
                if used_fact["trait"].startswith("host."):
                    useful_fact = False
                if used_fact["trait"].startswith("remote."):
                    useful_fact = False
                if used_fact["trait"].startswith("file.last."):
                    useful_fact = False
                if used_fact["trait"].startswith("domain.user."):
                    useful_fact = False
                
                if useful_fact:
                    # save fact
                    cur_used_global_facts[str(used_fact["trait"])] = str(used_fact["value"])

        # save current usable facts
        cur_link_dict["Link_Facts"] = cur_used_global_facts        

        # create link list from dict 
        cur_link_list = [
            cur_link_dict["Status"],
            cur_link_dict["Ability_ID"], 
            cur_link_dict["Link_Facts"], 
            cur_link_dict["Planner"],
            cur_link_dict["Obfuscator"],
            cur_link_dict["Adversary_ID"],
            cur_link_dict["Adversary_Name"],
            cur_link_dict["Command"],
            cur_link_dict["Number_Facts"],
            cur_link_dict["Visibility_Score"],
            cur_link_dict["Executor_Platform"],
            cur_link_dict["Executor_Name"],
            cur_link_dict["Agent_Protocol"],
            cur_link_dict["Trusted_Status"],
            cur_link_dict["Agent_Privilege"],
            cur_link_dict["Host_Architecture"]            
        ]
        link_success_matrix.append(cur_link_list)

# print(link_success_matrix)
pretty_print_link_matrix(link_success_matrix)
# link_success_matrix[93]

['Status', 'Ability_ID', 'Link_Facts', 'Planner', 'Obfuscator', 'Adversary_ID', 'Adversary_Name', 'Command', 'Number_Facts', 'Visibility_Score', 'Executor_Platform', 'Executor_Name', 'Agent_Protocol', 'Trusted_Status', 'Agent_Privilege', 'Host_Architecture']
[0, 'c0da588f-79f0-4263-8998-7496b1a40596', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Discovery', 'whoami', 0, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']
[0, 'c1cd6388-3ced-48c7-a511-0434c6ba8f48', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Discovery', "cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'", 0, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']
[0, '3b5db901-2cb8-4df7-8043-c4628a6a5d5a', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Discovery', 'ps aux | grep gdm', 1, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']
[0, 'c0da588f-79f0-4263-8998-7496b1a40596', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Disco

In [34]:
# link_success_df['Status'].value_counts(normalize=True)

 0      0.946237
 124    0.032258
-3      0.021505
Name: Status, dtype: float64

## Probabilities for future Links

### Manual NB Probability

In [44]:
pretty_print_link_matix(link_success_matrix)

['Status', 'Ability_ID', 'Link_Facts', 'Planner', 'Obfuscator', 'Adversary_ID', 'Adversary_Name', 'Command', 'Number_Facts', 'Visibility_Score', 'Executor_Platform', 'Executor_Name', 'Agent_Protocol', 'Trusted_Status', 'Agent_Privilege', 'Host_Architecture']
[0, 'c0da588f-79f0-4263-8998-7496b1a40596', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Discovery', 'whoami', 0, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']
[0, 'c1cd6388-3ced-48c7-a511-0434c6ba8f48', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Discovery', "cut -d: -f1 /etc/passwd | grep -v '_' | grep -v '#'", 0, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']
[0, '3b5db901-2cb8-4df7-8043-c4628a6a5d5a', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Discovery', 'ps aux | grep gdm', 1, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']
[0, 'c0da588f-79f0-4263-8998-7496b1a40596', {}, 'atomic', 'plain-text', '0f4c3c67-845e-49a0-927e-90ed33c044e0', 'Disco

In [16]:
# query param1 link matrix according to features in param2 dict
# used by probability functions to return relevant portions of matrix
def query_link_matrix(cur_link_success_matrix, feature_query_dict):
    # creat dict - maps matrix column name to matrix column index
    col_name_to_index = {}
    for index in range(len(cur_link_success_matrix[0])):
        col_name_to_index[cur_link_success_matrix[0][index]] = index

    # output - queried link obj (matrix) with feature labels (columns)
    queried_link_matrix = [cur_link_success_matrix[0]]

    # iterate through matrix of links
    for row_index in range(1, len(cur_link_success_matrix)):
        # get link from matrix
        cur_link = cur_link_success_matrix[row_index]
        # passed conditions bool
        pass_conditions = True
        # query by each features, value in feature_query_dict
        for feat_name, feat_value in feature_query_dict.items():
            feat_index = col_name_to_index[feat_name]
            if feat_name == "Link_Facts":
                cur_facts_dict = cur_link[feat_index]
                # query by link_facts (stored in dict)
                for req_fact_type, req_fact_val in feature_query_dict["Link_Facts"].items():
                    # check that current link contains required fact type and required fact value
                    if req_fact_type not in cur_facts_dict or str(cur_facts_dict[req_fact_type]) != str(req_fact_val):
                        pass_conditions = False
            else:
                if str(cur_link[feat_index]) != str(feat_value):
                    pass_conditions = False
        if pass_conditions:
            queried_link_matrix.append(cur_link)

    return queried_link_matrix

# pretty_print_link_matrix( query_link_matrix(link_success_matrix, {"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux"}) )
pretty_print_link_matrix( query_link_matrix(link_success_matrix, {"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux", "Planner": 'naive bayes', 'Obfuscator': 'plain-text', "Adversary_Name": 'Ransack', "Number_Facts": 1, "Visibility_Score": 50, 'Trusted_Status': False}) )

['Status', 'Ability_ID', 'Link_Facts', 'Planner', 'Obfuscator', 'Adversary_ID', 'Adversary_Name', 'Command', 'Number_Facts', 'Visibility_Score', 'Executor_Platform', 'Executor_Name', 'Agent_Protocol', 'Trusted_Status', 'Agent_Privilege', 'Host_Architecture']
[0, '90c2efaa-8205-480d-8bb6-61d90dbaf81b', {'file.sensitive.extension': 'wav'}, 'naive bayes', 'plain-text', 'de07f52d-9928-4071-9142-cb1d3bd851e8', 'Ransack', "find / -name '*.wav' -type f -not -path '*/\\\\.*' -size -500k 2>/dev/null | head -5", 1, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']


In [17]:
# test query link object
pretty_print_link_matrix( query_link_matrix(link_success_matrix, {"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux", "Planner": 'naive bayes', 'Obfuscator': 'plain-text', "Adversary_Name": 'Ransack', "Number_Facts": 1, "Visibility_Score": 50, 'Trusted_Status': False}) )

['Status', 'Ability_ID', 'Link_Facts', 'Planner', 'Obfuscator', 'Adversary_ID', 'Adversary_Name', 'Command', 'Number_Facts', 'Visibility_Score', 'Executor_Platform', 'Executor_Name', 'Agent_Protocol', 'Trusted_Status', 'Agent_Privilege', 'Host_Architecture']
[0, '90c2efaa-8205-480d-8bb6-61d90dbaf81b', {'file.sensitive.extension': 'wav'}, 'naive bayes', 'plain-text', 'de07f52d-9928-4071-9142-cb1d3bd851e8', 'Ransack', "find / -name '*.wav' -type f -not -path '*/\\\\.*' -size -500k 2>/dev/null | head -5", 1, 50, 'linux', 'sh', 'HTTP', False, 'User', 'amd64']


In [37]:
# query param1 df according to features in param2 dict
# used by probability functions to return relevant portions of df
def query_link_df(cur_link_success_df, feature_query_dict):  
        # dict of features types, for querying
    dataTypeDict = dict(cur_link_success_df.dtypes)
    # df which will be repeatedly queried
    query_df = cur_link_success_df.copy()
    # for each feature and value
    for feat_name, feat_value in feature_query_dict.items():
        if query_df.empty:
            return query_df
        if feat_name != "Link_Facts" and dataTypeDict[feat_name]=='object':
            # query by features that are strings
            query_df = query_df.query(feat_name + " == '" + str(feat_value) + "'")
        
        elif feat_name != "Link_Facts" and dataTypeDict[feat_name]=='int64':
            # query by features that are numbers
            query_df = query_df.query(feat_name + " == " + str(feat_value) + "")
        else:
            # query by link_facts (stored in dict)
            for req_fact_type, req_fact_val in feature_query_dict["Link_Facts"].items():
                # query df for links containing required fact type and required fact value
                query_df = query_df[query_df['Link_Facts'].apply(lambda x : req_fact_type in x and req_fact_val in x.values())]

    return query_df

In [18]:
# test query link df
query_link_df(link_success_df, {"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux"})

NameError: name 'query_link_df' is not defined

In [26]:
# Basic Success Probability function, returns % of links with features from feature_query_dict that are succesful
def BaseSuccessProb(feature_query_dict):
        query_matrix = query_link_matrix(link_success_matrix, feature_query_dict) # query matrix for features
        # if there is no such features
        if len(query_matrix) <= 1:
                return 0.0
        # otherwise return percentage with status == 0:
        number_success = 0.0
        for rowIndex in range(1, len(query_matrix)):
                if query_matrix[rowIndex][0] == 0:
                        number_success +=1.0
        return (100.0 * (number_success / (len(query_matrix)-1))) 

# DEPRECATED: for DF data structure
# Basic Success Probability function, returns % of links with features from feature_query_dict that are succesful
# def BaseSuccessProb(feature_query_dict):
#         query_df = query_link_df(link_success_df, feature_query_dict) # query dataframe for features
#         return (100 * query_df['Status'].value_counts(normalize=True)[0]) # return percentage with Status=0

BaseSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux"})

100.0

Create link probability function (NB formula)

Param:

Dictionary, where (key, value) pairs are (Feature_Name, Feature_Value) from link_success_df. Dictionary can have from 0 to number of features key, value pairs.

Returns:

Probability of success using NB formula for calculating, or an exception if there is too little data to compare.

P(A|B) = $\frac{P(B|A)*P(A)}{P(B)}$

example probability equation:
calculate P(status=0 | ability=x, fact=z, platform=linux)

=$\frac{P(ability=x, fact=z, platform=linux| status=0) * P(status=0)}{P(ability=x, fact=z, platform=linux)}$

Method call: 

NBLinkSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "windows"})

NOTE:
If all of param link facts are among the df row's link facts, then is relevant link.

In [36]:
# NB Link Success Probability
# Calculates Prob(Status=0 | features in feature_query_dict)
def NBLinkSuccessProb(feature_query_dict, min_link_data):    
    num_total_past_links = len(link_success_matrix)-1
    # return None if there is 0 past link data
    if num_total_past_links == 0:
        return None

    # P(A)    Probability Status == 0
    status_0_matrix = query_link_matrix(link_success_matrix, {"Status" : 0})
    status_0_past_links = len(status_0_matrix)-1
    prob_a = status_0_past_links/num_total_past_links 
                                
    # P(B)    Probability of current features
    current_feature_matrix = query_link_matrix(link_success_matrix, feature_query_dict)
    num_current_feature_links = len(current_feature_matrix)-1
    # if less items than user required params then return None
    if num_current_feature_links < min_link_data:
        return None

    prob_b = num_current_feature_links/num_total_past_links 
    
    # P(B|A)    Probability of current features in Status == 0 DF
    current_feature_status_0_matrix = query_link_matrix(status_0_matrix, feature_query_dict)
    current_feature_status_0_links = len(current_feature_status_0_matrix)-1
    prob_b_given_a = current_feature_status_0_links / status_0_past_links
    
    # NB Formula
    # P(A|B) = (P(B|A)*P(A))/P(B)
    return ((prob_b_given_a * prob_a)/prob_b)


print(NBLinkSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux"},1))    
print(NBLinkSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "linux"},2))   
print(NBLinkSuccessProb({"Ability_ID": "b18e8767-b7ea-41a3-8e80-baf65a5ddef5", "Executor_Platform": "linux"},2))

1.0000000000000002
None
1.0


### SciKit Model Probabilities

In [11]:
import matplotlib.pyplot as plt
import joblib
import graphviz
import pydotplus
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import random
import time
import numpy as np

Train model with Scikit (NB, RF, KNN)

In [12]:
# split data into random 75/25 training:testing split
random_state_seed = random.randint(1, 500)
train_link_success, test_link_success = train_test_split(link_success_df, test_size=0.1, random_state=random_state_seed)
train_link_success

Unnamed: 0,Status,Ability_ID,Link_Facts,Planner,Obfuscator,Adversary_ID,Adversary_Name,Command,Number_Facts,Visibility_Score,Executor_Platform,Executor_Name,Agent_Protocol,Trusted_Status,Agent_Privilege,Host_Architecture
666,1,300157e5-f4ad-4569-b533-9d1fa0e74d74,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,rm /home/kusubyan/dev/staged.tar.gz,0,50,linux,sh,tcp,True,User,amd64
111,0,3b5db901-2cb8-4df7-8043-c4628a6a5d5a,{},atomic,plain-text,0f4c3c67-845e-49a0-927e-90ed33c044e0,Discovery,ps aux | grep news,1,50,linux,sh,HTTP,False,User,amd64
608,0,4e97e699-93d7-4040-b5a3-2e906a58199e,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,"cp ""/usr/share/sounds/sound-icons/cembalo-10.w...",2,50,linux,sh,tcp,True,User,amd64
204,0,3b5db901-2cb8-4df7-8043-c4628a6a5d5a,{},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,ps aux | grep hplip,1,50,linux,sh,tcp,False,User,amd64
99,0,3b5db901-2cb8-4df7-8043-c4628a6a5d5a,{},atomic,plain-text,0f4c3c67-845e-49a0-927e-90ed33c044e0,Discovery,ps aux | grep systemd-timesync,1,50,linux,sh,HTTP,False,User,amd64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0,3b5db901-2cb8-4df7-8043-c4628a6a5d5a,{},atomic,plain-text,0f4c3c67-845e-49a0-927e-90ed33c044e0,Discovery,ps aux | grep avahi,1,50,linux,sh,HTTP,False,User,amd64
65,-3,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'wav'},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,find / -name '*.wav' -type f -not -path '*/\\....,1,50,linux,sh,tcp,False,User,amd64
423,0,6469befa-748a-4b9c-a96d-f191fde47d89,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,rm -rf staged,0,50,linux,sh,HTTP,False,User,amd64
698,0,6469befa-748a-4b9c-a96d-f191fde47d89,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,"New-Item -Path ""."" -Name ""staged"" -ItemType ""d...",0,50,windows,psh,HTTP,False,User,amd64


In [13]:
# convert data to dummies (one-hot-encoding to numbers) 
# in order to train algos like Random Forest

pandas.get_dummies(link_success_df)

TypeError: unhashable type: 'dict'

In [None]:
# Splitting train and test sets to create y numpy arrays
# create y arrays from label column, remove label from x vals
y_train_link = np.array(train_link_success.pop('Status'))
X_train_link = train_link_success.values

y_test_link = np.array(test_link_success.pop('Status')) 
X_test_link = test_link_success.values


print(X_train_link.shape)
print(y_train_link.shape)
print(X_test_link.shape)
print(y_test_link.shape)

In [None]:
def scikit_model_fit(curModel):
    startTime = time.time()
    curModel.fit(X_train_link, y_train_link)
    print("Training Time: " + str(time.time() - startTime) + " seconds")

In [None]:
def scikit_model_predict(curModel):
    # time and run predictions on test data
    startTime = time.time()
    y_predicted_link = curModel.predict(X_test_link)
    print("\nPrediction Time: " + str(time.time() - startTime) + " seconds")  
    # plot confusion matrix for predictions
    conf_matrix = confusion_matrix(y_test_link, y_predicted_link, labels=curModel.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=curModel.classes_)
    disp.plot()
    plt.show()
    # score predictions and print
    print("\n\nScore:"+ str(accuracy_score(y_test_link, y_predicted_link))+"%")

In [None]:
# Random Forest classifier 
RF_classifier =  RandomForestClassifier()
# fit data
scikit_model_fit(RF_classifier)
# test data
scikit_model_predict(RF_classifier)

In [None]:
# Support Vector Classifier
SVM_classifier = LinearSVC(max_iter=3000)

# fit data
scikit_model_fit(SVM_classifier)
# test data
scikit_model_predict(SVM_classifier)

In [None]:
## NOTE: to allow other implementations of NB, possibly do MinMaxScale to data in order to remove negatives.
# Naive Bayes classifier
NB_classifier = BernoulliNB()

# fit data
scikit_model_fit(NB_classifier)
# test data
scikit_model_predict(NB_classifier)

In [None]:
KNN_classifier = KNeighborsClassifier(n_neighbors=3)

# fit data
scikit_model_fit(KNN_classifier)
# test data
scikit_model_predict(KNN_classifier)

Create link probability function (Scikit Model)