## Fetch REST API Operation/Agent Data

In [1]:
# imports
import sklearn 
import requests
import pandas
import sys
import base64
import re

Create REST API calls to server to fetch operational data and current system conditions, store in df.
Agent data is for simulation of live conditions of agent for calculating link probabilities.

In [2]:
# fetch past operational data
op_url = 'http://localhost:8888/api/v2/operations'
headers = {'Accept': 'application/json', 'KEY' :'ADMIN123'}

op_response = requests.get(op_url, headers=headers)
op_data = pandas.DataFrame(op_response.json())
op_data = op_data.reset_index()  # make sure indexes pair with number of rows

# fetch current system conditions of active agent
# NOTE: using first trusted agent by default, replace with valid agent(s) for operation + alive and trusted
agents_url = 'http://localhost:8888/api/v2/agents'

agents_response = requests.get(agents_url, headers=headers)
agents_list = agents_response.json()
# select trusted agent
agent_selected = None
for agent in agents_list:
    if agent["trusted"] == True:
        # TODO: insert check for whether agent is alive
        agent_selected = agent

if agent_selected == None:
    print("FAILURE TO FIND AGENT")
else: 
    print("Operation + Agent Data Fetched")

FAILURE TO FIND AGENT


## Conduct Feature Analysis

In [3]:
# print all traits of link facts
sources_url = 'http://localhost:8888/api/v2/sources'
headers = {'Accept': 'application/json', 'KEY' :'ADMIN123'}

source = requests.get(sources_url, headers=headers).json()
sources = str(requests.get(sources_url, headers=headers).json())

# fact traits
fact_traits = set()

x = re.findall("(?<=\'trait': )'.*?\'", sources)
for match in x:
    fact_traits.add(match.replace("'", ""))

fact_traits

{'',
 'domain.user.name',
 'file.last.accessed',
 'file.last.modified',
 'file.sensitive.content',
 'file.sensitive.extension',
 'host.archive.password',
 'host.dir.compress',
 'host.dir.staged',
 'host.file.path',
 'host.user.name',
 'linux.excluded.directories',
 'linux.included.directories',
 'linux.included.extensions',
 'linux.staging.location',
 'pseudo.data.identifier',
 'remote.host.ip',
 'remote.ssh.ip',
 'safe.mode.enabled',
 'server.malicious.url',
 'windows.excluded.directories',
 'windows.excluded.extensions',
 'windows.included.directories',
 'windows.included.extensions',
 'windows.staging.location'}

Useful\Non-User unique Fact Types:

file.sensitive.content
file.sensitive.extension
linux.excluded.directories
linux.included.directories
linux.included.extensions
linux.staging.location
pseudo.data.identifier
safe.mode.enabled
server.malicious.url
windows.excluded.directories
windows.excluded.extensions
windows.included.directories
windows.included.extensions
windows.staging.location

Non-Useful\User-Unique Fact Types:

domain.user.name
file.last.accessed
file.last.modified
host.archive.password
host.dir.compress
host.dir.staged
host.file.path
host.user.name
remote.host.ip
remote.ssh.ip

In [4]:
# operation object cotains variety of interesting features
# chain contains all operation data/success (majority interest)
# print(thief_win_op["chain"][0])
# print(thief_win_op["adversary"])
# print(base64.b64decode(thief_win_op["chain"][0]["command"]))


In [5]:
# facts used in existing local operations
used_fact_traits =set()

# for each operation
for index, cur_op in op_data.iterrows():  
    # run through link/ability chain within operation
    for cur_link in cur_op["chain"]:
        # print some interesting link info
        print("LINK")
        print(base64.b64decode(cur_link["command"]))
        print(base64.b64decode(bytes(cur_link["command"], 'utf-8')))
        print(cur_link["ability"]["ability_id"])
        print(cur_link["executor"]["command"])
        if(len(cur_link["used"])> 0):
            for used_fact in cur_link["used"]:
                print("~~~~~~~~~~~  fact (val,trait):")
                print(used_fact["value"])
                print(used_fact["trait"])
                used_fact_traits.add(used_fact["trait"])
        print("************************")
print(used_fact_traits)

LINK
b"find / -name '*.png' -type f -not -path '*/\\.*' -size -500k 2>/dev/null | head -5"
b"find / -name '*.png' -type f -not -path '*/\\.*' -size -500k 2>/dev/null | head -5"
90c2efaa-8205-480d-8bb6-61d90dbaf81b
find / -name '*.#{file.sensitive.extension}' -type f -not -path '*/\.*' -size -500k 2>/dev/null | head -5
~~~~~~~~~~~  fact (val,trait):
png
file.sensitive.extension
************************
LINK
b"find / -name '*.yml' -type f -not -path '*/\\.*' -size -500k 2>/dev/null | head -5"
b"find / -name '*.yml' -type f -not -path '*/\\.*' -size -500k 2>/dev/null | head -5"
90c2efaa-8205-480d-8bb6-61d90dbaf81b
find / -name '*.#{file.sensitive.extension}' -type f -not -path '*/\.*' -size -500k 2>/dev/null | head -5
~~~~~~~~~~~  fact (val,trait):
yml
file.sensitive.extension
************************
LINK
b"find / -name '*.wav' -type f -not -path '*/\\.*' -size -500k 2>/dev/null | head -5"
b"find / -name '*.wav' -type f -not -path '*/\\.*' -size -500k 2>/dev/null | head -5"
90c2efaa-8205

## Build DF of Past Links from Operations

In [157]:
# store link info in lists, where each item corresponds to link at index
# same index in each list gives all relevant info on link
# later convert to df, for efficiency
statuses = []
ability_ids = []
usable_facts = [] # contains lists of fact dicts with 0 or more items
planners = []
agent_protocols = []           
agent_trusted_statuses = []
agent_architectures = []
agent_privileges = []
obfuscators = []
adversary_ids = []
adversary_names = []
commands = []
num_facts_used = []
visibility_scores = []
executor_platforms = []  # platform on which agent executes it
executor_names = [] # name of terminal on which agent running
# NOTE: see useful_features.odt for analysis of useful components.

# for each operation
for index, cur_op in op_data.iterrows():
    
    # save info about agents into dict for later matching
    agents_dict = {} # key: paw, value: [contact, trusted, privilege, architecture]
    
    # iterate through agents, filling dict with agent/host connection info
    for agent in cur_op["host_group"]:
        agent_paw = agent["paw"]
        contact_type = agent["contact"]
        trusted_status = agent["trusted"]
        privilege = agent["privilege"]
        architecture = agent["architecture"]
        agents_dict[agent_paw] = [contact_type, trusted_status, privilege, architecture]
    
    
    # run through each link chain within operation
    for cur_link in cur_op["chain"]:
    
        # save relevant global op info
        planners.append(cur_op["planner"]["name"])
        obfuscators.append(cur_op["obfuscator"])
        adversary_ids.append(cur_op["adversary"]["adversary_id"])
        adversary_names.append(cur_op["adversary"]["name"])
        
        # save relevant link info
        ability_ids.append(cur_link["ability"]["ability_id"])
        statuses.append(cur_link["status"])
        command_str = str(base64.b64decode(cur_link["command"]))
        command_str = command_str[2:len(command_str)-1] # correctly format
        commands.append(command_str)
        num_facts_used.append(len(cur_link["used"]))
        visibility_scores.append(cur_link["visibility"]["score"])
        executor_platforms.append(cur_link["executor"]["platform"])
        executor_names.append(cur_link["executor"]["name"])
        
        # save relevant agent related info
        agent_paw = cur_link["paw"]
        # if agent is in current operation report
        if agent_paw in agents_dict.keys():
            # save relevant agent/host data
            contact_type, trusted_status, privilege, architecture = agents_dict[agent_paw]
            agent_protocols.append(contact_type)
            agent_trusted_statuses.append(trusted_status)
            agent_privileges.append(privilege)
            agent_architectures.append(architecture)
        else: # if agent is not in current agents report (currently, 5/733 links)
            # insert None for nonexistant agent data
            agent_protocols.append(None)
            agent_trusted_statuses.append(None)
            agent_privileges.append(None)
            agent_architectures.append(None)
            
        
        cur_used_global_facts = {} # key: trait, val: value    
        
        # used facts of link
        if(len(cur_link["used"]) > 0):
            
            # iterate through facts
            for used_fact in cur_link["used"]:
                useful_fact = True
                # check if fact unique to host through excluding unique fact types
                if used_fact["trait"].startswith("host."):
                    useful_fact = False
                if used_fact["trait"].startswith("remote."):
                    useful_fact = False
                if used_fact["trait"].startswith("file.last."):
                    useful_fact = False
                if used_fact["trait"].startswith("domain.user."):
                    useful_fact = False
                
                if useful_fact:
                    # save fact
                    cur_used_global_facts[str(used_fact["trait"])] = str(used_fact["value"])

        # save current usable facts
        usable_facts.append(cur_used_global_facts)        
        

# create link success df from lists of data
data_link_success = {
    "Status" : statuses,
    "Ability_ID" : ability_ids, 
    "Link_Facts" : usable_facts, 
    "Planner" : planners,
    "Obfuscator" : obfuscators,
    "Adversary_ID" : adversary_ids,
    "Adversary_Name" :  adversary_names,
    "Command" : commands,
    "Number_Facts" : num_facts_used,
    "Visibility_Score" : visibility_scores,
    "Executor_Platform" : executor_platforms,
    "Executor_Name" : executor_names,
    "Agent_Protocol" : agent_protocols,
    "Trusted_Status" : agent_trusted_statuses,
    "Agent_Privilege": agent_privileges,
    "Host_Architecture": agent_architectures
}

link_success_df = pandas.DataFrame(data_link_success)
link_success_df


Unnamed: 0,Status,Ability_ID,Link_Facts,Planner,Obfuscator,Adversary_ID,Adversary_Name,Command,Number_Facts,Visibility_Score,Executor_Platform,Executor_Name,Agent_Protocol,Trusted_Status,Agent_Privilege,Host_Architecture
0,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'png'},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,find / -name '*.png' -type f -not -path '*/\\....,1,50,linux,sh,HTTP,False,User,amd64
1,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'yml'},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,find / -name '*.yml' -type f -not -path '*/\\....,1,50,linux,sh,HTTP,False,User,amd64
2,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'wav'},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,find / -name '*.wav' -type f -not -path '*/\\....,1,50,linux,sh,HTTP,False,User,amd64
3,0,c0da588f-79f0-4263-8998-7496b1a40596,{},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,whoami,0,50,linux,sh,HTTP,False,User,amd64
4,0,c1cd6388-3ced-48c7-a511-0434c6ba8f48,{},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,cut -d: -f1 /etc/passwd | grep -v '_' | grep -...,0,50,linux,sh,HTTP,False,User,amd64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'yml'},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,Get-ChildItem C:\\Users -Recurse -Include *.ym...,1,50,windows,psh,tcp,True,User,amd64
729,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'png'},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,Get-ChildItem C:\\Users -Recurse -Include *.pn...,1,50,windows,psh,tcp,True,User,amd64
730,1,300157e5-f4ad-4569-b533-9d1fa0e74d74,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,Compress-Archive -Path C:\\Users\\KUSUBYAN\\st...,1,50,windows,psh,tcp,True,User,amd64
731,1,300157e5-f4ad-4569-b533-9d1fa0e74d74,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,rm C:\\Users\\KUSUBYAN\\staged.zip,0,50,windows,psh,tcp,True,User,amd64


In [158]:
link_success_df['Status'].value_counts(normalize=True)

 0      0.908595
 1      0.066849
-3      0.021828
 124    0.002729
Name: Status, dtype: float64

## Probabilities for future Links

### Manual NB Probability

In [159]:
# query param1 df according to features in param2 dict
# used by probability functions to return relevant portions of df
def query_link_df(cur_link_success_df, feature_query_dict):
    # dict of features types, for querying
    dataTypeDict = dict(cur_link_success_df.dtypes)
    # df which will be repeatedly queried
    query_df = cur_link_success_df
    # for each feature and value
    for feat_name, feat_value in feature_query_dict.items():
        if feat_name != "Link_Facts" and dataTypeDict[feat_name]=='object':
            # query by features that are strings
            query_df = query_df.query(feat_name + " == '" + str(feat_value) + "'")
        
        elif feat_name != "Link_Facts" and dataTypeDict[feat_name]=='int64':
            # query by features that are numbers
            query_df = query_df.query(feat_name + " == " + str(feat_value) + "")
        else:
            # query by link_facts (stored in dict)
            print(feat_name)
            for req_fact_type, req_fact_val in feature_query_dict["Link_Facts"].items():
                # query df for links containing required fact type and required fact value
                query_df = query_df[query_df['Link_Facts'].apply(lambda x : req_fact_type in x and req_fact_val in x.values())]

    return query_df

In [160]:
# Basic Success Probability function, returns % of links with features from feature_query_dict that are succesful
def BaseSuccessProb(feature_query_dict):
        query_df = query_link_df(link_success_df, feature_query_dict) # query dataframe for features
        return (100 * query_df['Status'].value_counts(normalize=True)[0]) # return percentage with Status=0

BaseSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "windows"})

Link_Facts


93.75

Create link probability function (NB formula)

Param:

Dictionary, where (key, value) pairs are (Feature_Name, Feature_Value) from link_success_df. Dictionary can have from 0 to number of features key, value pairs.

Returns:

Probability of success using NB formula for calculating, or an exception if there is too little data to compare.

P(A|B) = $\frac{P(B|A)*P(A)}{P(B)}$

example probability equation:
calculate P(status=0 | ability=x, fact=z, platform=linux)

=$\frac{P(ability=x, fact=z, platform=linux| status=0) * P(status=0)}{P(ability=x, fact=z, platform=linux)}$

Method call: 

NBLinkSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "windows"})

NOTE:
If all of param link facts are among the df row's link facts, then is relevant link.

In [161]:
# NB Link Success Probability
# Calculates Prob(Status=0 | features in feature_query_dict)
def NBLinkSuccessProb(feature_query_dict):    
    num_total_past_links = link_success_df.shape[0]
  
    # P(A)    Probability Status == 0
    status_0_df = query_link_df(link_success_df, {"Status" : 0})
    status_0_past_links = status_0_df.shape[0]
    prob_a = status_0_past_links/num_total_past_links 
                                 
    # P(B)    Probability of current features
    current_feature_df = query_link_df(link_success_df, feature_query_dict)
    current_feature_links = current_feature_df.shape[0]
    # TODO: INSERT FLAG RELATED EXCEPTION IF TOO FEW DATAPOINTS HERE
    prob_b = current_feature_links/num_total_past_links 
    
    # P(B|A)    Probability of current features in Status == 0 DF
    current_feature_status_0_df = query_link_df(status_0_df, feature_query_dict)
    current_feature_status_0_links = current_feature_status_0_df.shape[0]
    prob_b_given_a = current_feature_status_0_links / status_0_past_links
    
    # NB Formula
    # P(A|B) = (P(B|A)*P(A))/P(B)
    return ((prob_b_given_a * prob_a)/prob_b)
    
print(NBLinkSuccessProb({"Ability_ID": "90c2efaa-8205-480d-8bb6-61d90dbaf81b", "Link_Facts":{'file.sensitive.extension': 'wav'}, "Executor_Platform": "windows"}))

Link_Facts
Link_Facts
0.9374999999999999


### SciKit Model Probabilities

In [167]:
import matplotlib.pyplot as plt
import joblib
import graphviz
import pydotplus
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import random
import time
import numpy as np

Train model with Scikit (NB, RF, KNN)

In [171]:
# split data into random 75/25 training:testing split
random_state_seed = random.randint(1, 500)
train_link_success, test_link_success = train_test_split(link_success_df, test_size=0.1, random_state=random_state_seed)
train_link_success

Unnamed: 0,Status,Ability_ID,Link_Facts,Planner,Obfuscator,Adversary_ID,Adversary_Name,Command,Number_Facts,Visibility_Score,Executor_Platform,Executor_Name,Agent_Protocol,Trusted_Status,Agent_Privilege,Host_Architecture
685,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'wav'},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,Get-ChildItem C:\\Users -Recurse -Include *.wa...,1,50,windows,psh,HTTP,False,User,amd64
553,0,6469befa-748a-4b9c-a96d-f191fde47d89,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,rm -rf staged,0,50,linux,sh,HTTP,True,User,amd64
494,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'wav'},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,Get-ChildItem C:\\Users -Recurse -Include *.wa...,1,50,windows,psh,tcp,False,User,amd64
309,0,3b5db901-2cb8-4df7-8043-c4628a6a5d5a,{},atomic,base64,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,"eval ""$(echo cHMgYXV4IHwgZ3JlcCBhdmFoaS1hdXRva...",1,50,linux,sh,HTTP,False,User,amd64
686,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'yml'},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,Get-ChildItem C:\\Users -Recurse -Include *.ym...,1,50,windows,psh,HTTP,False,User,amd64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,0,4e97e699-93d7-4040-b5a3-2e906a58199e,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,"cp ""/usr/share/sounds/sound-icons/cembalo-12.w...",2,50,linux,sh,HTTP,False,User,amd64
615,0,90c2efaa-8205-480d-8bb6-61d90dbaf81b,{'file.sensitive.extension': 'yml'},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,find / -name '*.yml' -type f -not -path '*/\\....,1,50,linux,sh,tcp,True,User,amd64
25,0,3b5db901-2cb8-4df7-8043-c4628a6a5d5a,{},atomic,plain-text,de07f52d-9928-4071-9142-cb1d3bd851e8,Ransack,ps aux | grep uuidd,1,50,linux,sh,HTTP,False,User,amd64
561,0,300157e5-f4ad-4569-b533-9d1fa0e74d74,{},atomic,plain-text,1a98b8e6-18ce-4617-8cc5-e65a1a9d490e,Thief,tar -P -zcf /home/kusubyan/dev/staged.tar.gz /...,1,50,linux,sh,HTTP,True,User,amd64


In [177]:
# convert data to dummies (one-hot-encoding to numbers) 
# in order to train algos like Random Forest

pandas.get_dummies(link_success_df)

TypeError: unhashable type: 'dict'

In [172]:
# Splitting train and test sets to create y numpy arrays
# create y arrays from label column, remove label from x vals
y_train_link = np.array(train_link_success.pop('Status'))
X_train_link = train_link_success.values

y_test_link = np.array(test_link_success.pop('Status')) 
X_test_link = test_link_success.values


print(X_train_link.shape)
print(y_train_link.shape)
print(X_test_link.shape)
print(y_test_link.shape)

(659, 15)
(659,)
(74, 15)
(74,)


In [173]:
def scikit_model_fit(curModel):
    startTime = time.time()
    curModel.fit(X_train_link, y_train_link)
    print("Training Time: " + str(time.time() - startTime) + " seconds")

In [174]:
def scikit_model_predict(curModel):
    # time and run predictions on test data
    startTime = time.time()
    y_predicted_link = curModel.predict(X_test_link)
    print("\nPrediction Time: " + str(time.time() - startTime) + " seconds")  
    # plot confusion matrix for predictions
    conf_matrix = confusion_matrix(y_test_link, y_predicted_link, labels=curModel.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=curModel.classes_)
    disp.plot()
    plt.show()
    # score predictions and print
    print("\n\nScore:"+ str(accuracy_score(y_test_link, y_predicted_link))+"%")

In [175]:
# Random Forest classifier 
RF_classifier =  RandomForestClassifier()
# fit data
scikit_model_fit(RF_classifier)
# test data
scikit_model_predict(RF_classifier)

ValueError: could not convert string to float: '90c2efaa-8205-480d-8bb6-61d90dbaf81b'

In [None]:
# Support Vector Classifier
SVM_classifier = LinearSVC(max_iter=3000)

# fit data
scikit_model_fit(SVM_classifier)
# test data
scikit_model_predict(SVM_classifier)

In [None]:
## NOTE: to allow other implementations of NB, possibly do MinMaxScale to data in order to remove negatives.
# Naive Bayes classifier
NB_classifier = BernoulliNB()

# fit data
scikit_model_fit(NB_classifier)
# test data
scikit_model_predict(NB_classifier)

In [None]:
KNN_classifier = KNeighborsClassifier(n_neighbors=3)

# fit data
scikit_model_fit(KNN_classifier)
# test data
scikit_model_predict(KNN_classifier)

Create link probability function (Scikit Model)