# Supicious Process Scoring

In [1]:
from pyclient.stix_shifter_dataframe import StixShifterDataFrame
from dateutil import parser
import re
import pandas as pd
from collections import Counter
import math

In [2]:
carbon_black_stix_bundle_1 = 'https://raw.githubusercontent.com/opencybersecurityalliance/stix-shifter/master/data/cybox/carbon_black/carbon_black_observable.json'
sb_config_1 = {
    'translation_module': 'stix_bundle',
    'transmission_module': 'stix_bundle',
    'connection': {
        "host": carbon_black_stix_bundle_1,
        "port": 443
    },
    'configuration': {
        "auth": {
            "username": None,
            "password": None
        }
    },
    'data_source': '{"type": "identity", "id": "identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3", "name": "stix_boundle", "identity_class": "events"}'
}

In [3]:
carbon_black_stix_bundle_2 = 'https://raw.githubusercontent.com/opencybersecurityalliance/stix-shifter/develop/data/cybox/qradar/qradar_custom_process_observable.json'
sb_config_2 = {
    'translation_module': 'stix_bundle',
    'transmission_module': 'stix_bundle',
    'connection': {
        "host": carbon_black_stix_bundle_2,
        "port": 443
    },
    'configuration': {
        "auth": {
            "username": None,
            "password": None
        }
    },
    'data_source': '{"type": "identity", "id": "identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3", "name": "stix_boundle", "identity_class": "events"}'
}

# Fetch process data that are spawn by cmd

In [4]:
ssdf = StixShifterDataFrame()
ssdf.add_config('cb_stix_bundle_1', sb_config_1)
ssdf.add_config('cb_stix_bundle_2', sb_config_2)


# stix-shifter uses STIX patterning as its query language
# See http://docs.oasis-open.org/cti/stix/v2.0/cs01/part5-stix-patterning/stix-v2.0-cs01-part5-stix-patterning.html
cmd_query = "[process:name = 'powershell.exe']"
df = ssdf.search_df(query=cmd_query, config_names=['cb_stix_bundle_2'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat(dfs) if dfs else pd.DataFrame()


In [5]:
df['process:binary_ref.parent_directory_ref.path']

0    C:\Windows\System32\WindowsPowerShell\v1.0\pow...
0    C:\Windows\System32\WindowsPowerShell\v1.0\pow...
0    C:\Windows\System32\WindowsPowerShell\v1.0\pow...
0    C:\Windows\System32\WindowsPowerShell\v1.0\pow...
0    C:\Windows\System32\WindowsPowerShell\v1.0\pow...
0    C:\Windows\System32\WindowsPowerShell\v1.0\pow...
Name: process:binary_ref.parent_directory_ref.path, dtype: object

In [6]:
df.columns

Index(['artifact:payload_bin', 'artifact:type', 'created', 'created_by_ref',
       'first_observed', 'id', 'last_observed', 'modified',
       'network-traffic:dst_port',
       'network-traffic:dst_ref.resolves_to_refs[0].type',
       'network-traffic:dst_ref.resolves_to_refs[0].value',
       'network-traffic:dst_ref.type', 'network-traffic:dst_ref.value',
       'network-traffic:protocols[0]', 'network-traffic:src_port',
       'network-traffic:src_ref.resolves_to_refs[0].type',
       'network-traffic:src_ref.resolves_to_refs[0].value',
       'network-traffic:src_ref.type', 'network-traffic:src_ref.value',
       'network-traffic:type', 'number_observed', 'process:binary_ref.name',
       'process:binary_ref.parent_directory_ref.path',
       'process:binary_ref.parent_directory_ref.type',
       'process:binary_ref.type', 'process:command_line', 'process:name',
       'process:pid', 'process:type', 'type', 'user-account:type',
       'user-account:user_id', 'x_com_ibm_ariel.cat

In [7]:


def entropy(s):
    """Compute the Shannon entropy of string s"""
    counter = Counter(s)
    nchars = float(len(s))
    return -sum(count / nchars * math.log(count / nchars, 2) for count in counter.values())


df['binary_ref.name'] = df['process:binary_ref.name']
df['binary_ref.type'] = df['process:binary_ref.type']
df['command_line'] = df['process:command_line']
df['name'] = df['process:name']
df['pid'] = df['process:pid']
df['type'] = df['process:type']
df['binary_ref.parent_directory_ref.path'] = df['process:binary_ref.parent_directory_ref.path']
df['binary_ref.parent_directory_ref.type'] = df['process:binary_ref.parent_directory_ref.type']

    
df = df.drop(columns=['artifact:payload_bin',
    'artifact:type',
    'created_by_ref',
    'modified',
    'network-traffic:dst_port',
    'network-traffic:dst_ref.resolves_to_refs[0].type',
    'network-traffic:dst_ref.resolves_to_refs[0].value',
    'network-traffic:dst_ref.type',
    'network-traffic:dst_ref.value',
    'network-traffic:protocols[0]',
    'network-traffic:src_port',
    'network-traffic:src_ref.resolves_to_refs[0].type',
    'network-traffic:src_ref.resolves_to_refs[0].value',
    'network-traffic:src_ref.type',
    'network-traffic:src_ref.value',
    'network-traffic:type',
    'user-account:type',
    'user-account:user_id',
    'x_com_ibm_ariel.category_id',
    'x_com_ibm_ariel.category_name',
    'x_com_ibm_ariel.devicetype',
    'x_com_ibm_ariel.identity_ip',
    'x_com_ibm_ariel.log_source_id',
    'x_com_ibm_ariel.log_source_name',
    'x_com_ibm_ariel.magnitude',
    'x_com_ibm_ariel.qid',
    'x_com_ibm_ariel.qid_name',
    'x_com_ibm_ariel.utf8_payload',
    'data_source',
    'process:binary_ref.parent_directory_ref.path',
    'process:binary_ref.parent_directory_ref.type',
    'process:binary_ref.name', 
    'process:binary_ref.type',
    'process:command_line', 
    'process:name', 
    'process:pid', 
    'process:type'
])

df['binary_ref.hashes.MD5'] = ''
df['binary_ref.id'] = ''
df['binary_ref.parent_directory_ref.id'] = ''

df['creator_user_ref.id'] = ''
df['creator_user_ref.type'] = ''
df['creator_user_ref.user_id'] = ''
df['opened_connection_refs'] = ''
df['opened_connection_refs_count'] = ''
df['parent_ref.binary_ref.id'] = ''
df['parent_ref.binary_ref.name'] = ''
df['parent_ref.binary_ref.parent_directory_ref.id'] = ''
df['parent_ref.binary_ref.parent_directory_ref.path'] = ''
df['parent_ref.binary_ref.parent_directory_ref.type'] = ''
df['parent_ref.binary_ref.type'] = ''
df['parent_ref.command_line'] = ''
df['parent_ref.id'] = ''
df['parent_ref.name'] = ''
df['parent_ref.pid'] = ''
df['parent_ref.type'] = ''
df['x_contained_by_ref'] = ''

df['suspicion_score'] = 0

df = df.fillna('')
df['command_line_entropy'] = df.apply(lambda x: entropy(x['command_line']), axis=1)
df['command_line_len'] = df.apply(lambda x: len(x['command_line']), axis=1)

In [8]:
TRAIN_COL = ['type', 'name', 'id', 'created', 'binary_ref.type', 'binary_ref.name',
   'binary_ref.parent_directory_ref.type',
   'binary_ref.parent_directory_ref.path',
   'binary_ref.parent_directory_ref.id', 'binary_ref.id', 'pid',
   'opened_connection_refs_count', 'creator_user_ref.type',
   'creator_user_ref.user_id', 'creator_user_ref.id', 'x_contained_by_ref',
   'first_observed', 'last_observed', 'number_observed',
   'binary_ref.hashes.MD5', 'command_line', 'parent_ref.type',
   'parent_ref.name', 'parent_ref.binary_ref.type',
   'parent_ref.binary_ref.name',
   'parent_ref.binary_ref.parent_directory_ref.type',
   'parent_ref.binary_ref.parent_directory_ref.path',
   'parent_ref.binary_ref.parent_directory_ref.id',
   'parent_ref.binary_ref.id', 'parent_ref.pid', 'parent_ref.command_line',
   'parent_ref.id', 'command_line_entropy', 'command_line_len',
   'suspicion_score', 'opened_connection_refs'
]



In [10]:
len(df.columns), len(TRAIN_COL)

(36, 36)

In [16]:
import pandas as pd

empire_df = pd.read_csv('udxstx_empire_invoke_runas.csv')

In [27]:
small = empire_df[empire_df.suspicion_score > 0].reset_index()

In [29]:
small = small.fillna('')

In [33]:
df['binary_ref.parent_directory_ref.path'].values

array(['C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
       'C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
       'C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
       'C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
       'C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe',
       'C:\\Windows\\System32\\WindowsPowerShell\\v1.0\\powershell.exe'],
      dtype=object)

In [31]:
small['binary_ref.parent_directory_ref.path']

0                            C:\Windows\System32
1                            C:\Windows\System32
2     C:\Windows\System32\WindowsPowerShell\v1.0
3     C:\Windows\System32\WindowsPowerShell\v1.0
4                            C:\Windows\System32
5                            C:\Windows\System32
6                       C:\Windows\System32\wbem
7     C:\Windows\System32\WindowsPowerShell\v1.0
8                            C:\Windows\System32
9     C:\Windows\System32\WindowsPowerShell\v1.0
10    C:\Windows\System32\WindowsPowerShell\v1.0
11                           C:\Windows\System32
12    C:\Windows\System32\WindowsPowerShell\v1.0
13                           C:\Windows\System32
14                           C:\Windows\System32
15                           C:\Windows\System32
Name: binary_ref.parent_directory_ref.path, dtype: object

In [34]:
small = small.drop(columns=['index'])

In [45]:
very_smalle = small[small.index.isin([1, 2, 3])]

In [46]:
very_smalle

Unnamed: 0,type,name,id,created,binary_ref.type,binary_ref.name,binary_ref.parent_directory_ref.type,binary_ref.parent_directory_ref.path,binary_ref.parent_directory_ref.id,binary_ref.id,...,parent_ref.binary_ref.parent_directory_ref.path,parent_ref.binary_ref.parent_directory_ref.id,parent_ref.binary_ref.id,parent_ref.pid,parent_ref.command_line,parent_ref.id,command_line_entropy,command_line_len,suspicion_score,opened_connection_refs
1,process,wscript.exe,process--f95e7bef-92db-4400-abd4-f7eb0cc1152d_3,,file,wscript.exe,directory,C:\Windows\System32,directory--f95e7bef-92db-4400-abd4-f7eb0cc1152d_5,file--f95e7bef-92db-4400-abd4-f7eb0cc1152d_4,...,,,,,,,4.610897,71,1,
2,process,powershell.exe,process--caf4bad5-c678-4e57-807f-d40ef55745c5_3,,file,powershell.exe,directory,C:\Windows\System32\WindowsPowerShell\v1.0,directory--caf4bad5-c678-4e57-807f-d40ef55745c5_5,file--caf4bad5-c678-4e57-807f-d40ef55745c5_4,...,,,,,,,4.385642,5458,2,
3,process,powershell.exe,process--59b122e0-a2b7-4576-ae9f-431405aaa02e_3,,file,powershell.exe,directory,C:\Windows\System32\WindowsPowerShell\v1.0,directory--59b122e0-a2b7-4576-ae9f-431405aaa02e_5,file--59b122e0-a2b7-4576-ae9f-431405aaa02e_4,...,,,,,,,4.404711,5477,2,


In [11]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient





In [12]:
credentials = {
    "wml_apikey": "Yr2VndMYlKxQYwg3kMSEOQW7wL_v-G9s2slpL1NCu3Mu",
    "wml_instance_id": "5946f309-2adc-48d7-bf08-7bb022dcdfc2",
    "wml_url": "https://eu-gb.ml.cloud.ibm.com",
    "wml_deployment_id": "04f68009-094f-4101-831f-8eeaa845e75f"
}

In [13]:
wml_credential = {
        "apikey": credentials['wml_apikey'],
        "instance_id": credentials['wml_instance_id'],
        "url": credentials['wml_url']
}
client = WatsonMachineLearningAPIClient(wml_credential)
pred = client.deployments.score(credentials['wml_deployment_id'],
                                {"input_data": [{"fields": list(df.columns),
                                                 "values": df.values.tolist()}]})

In [14]:
df['prediction'] = [d[0] for d in pred['predictions'][0]['values']]

In [15]:
pred

{'predictions': [{'fields': ['prediction', 'probability'],
   'values': [[0, [1.0, 0.0, 0.0]],
    [0, [1.0, 0.0, 0.0]],
    [0, [1.0, 0.0, 0.0]],
    [0, [1.0, 0.0, 0.0]],
    [0, [1.0, 0.0, 0.0]],
    [0, [1.0, 0.0, 0.0]]]}]}

In [47]:
# wml_credential = {
#         "apikey": credentials['wml_apikey'],
#         "instance_id": credentials['wml_instance_id'],
#         "url": credentials['wml_url']
# }
# client = WatsonMachineLearningAPIClient(wml_credential)
very_smalle_pred = client.deployments.score(credentials['wml_deployment_id'],
                                {"input_data": [{"fields": list(very_smalle.columns),
                                                 "values": very_smalle.values.tolist()}]})

very_smalle['prediction'] = [d[0] for d in very_smalle_pred['predictions'][0]['values']]

very_smalle_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


{'predictions': [{'fields': ['prediction', 'probability'],
   'values': [[2, [0.0, 0.0, 1.0]],
    [2, [0.0, 0.0, 1.0]],
    [2, [0.0, 0.0, 1.0]]]}]}