# CERTUTIL hunt

This notebook helps to collect all cmd (cmd.exe) and (certutil.exe) process executions in order to find suspicious activity.

This example demonstrates how to find suspicious executions that are downloaded by using certutil.exe, and then using certutil.exe to attack.

In [1]:
from pyclient.stix_shifter_dataframe import StixShifterDataFrame
from dateutil import parser
import re
import pandas as pd
from collections import Counter
import math

ImportError: cannot import name 'json_normalize'

In [2]:
carbon_black_stix_bundle_1 = 'https://raw.githubusercontent.com/opencybersecurityalliance/stix-shifter/master/data/cybox/carbon_black/carbon_black_observable.json'
sb_config_1 = {
    'translation_module': 'stix_bundle',
    'transmission_module': 'stix_bundle',
    'connection': {
        "host": carbon_black_stix_bundle_1,
        "port": 443
    },
    'configuration': {
        "auth": {
            "username": None,
            "password": None
        }
    },
    'data_source': '{"type": "identity", "id": "identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3", "name": "stix_boundle", "identity_class": "events"}'
}

In [3]:
carbon_black_stix_bundle_2 = 'https://raw.githubusercontent.com/opencybersecurityalliance/stix-shifter/develop/data/cybox/carbon_black/cb_observed_156.json'
sb_config_2 = {
    'translation_module': 'stix_bundle',
    'transmission_module': 'stix_bundle',
    'connection': {
        "host": carbon_black_stix_bundle_2,
        "port": 443
    },
    'configuration': {
        "auth": {
            "username": None,
            "password": None
        }
    },
    'data_source': '{"type": "identity", "id": "identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3", "name": "stix_boundle", "identity_class": "events"}'
}

In [4]:
def get_duration(duration):
    days, seconds = duration.days, duration.seconds
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    return f"{days}d {hours}h {minutes}m {seconds}.{duration.microseconds//1000}s"

In [5]:
def defang(url):
    return re.sub('http', 'hxxp', url)

# Fetch process data that are spawn by cmd

In [6]:
ssdf = StixShifterDataFrame()
ssdf.add_config('cb_stix_bundle_1', sb_config_1)
ssdf.add_config('cb_stix_bundle_2', sb_config_2)


# stix-shifter uses STIX patterning as its query language
# See http://docs.oasis-open.org/cti/stix/v2.0/cs01/part5-stix-patterning/stix-v2.0-cs01-part5-stix-patterning.html
cmd_query = "[process:name = 'cmd.exe']"
df = ssdf.search_df(query=cmd_query, config_names=['cb_stix_bundle_1', 'cb_stix_bundle_2'])

In [7]:
df['first_observed'] = pd.to_datetime(df['first_observed'], infer_datetime_format=True, utc=True)
df['last_observed'] = pd.to_datetime(df['last_observed'], infer_datetime_format=True, utc=True)
df['duration'] = df['last_observed'] - df['first_observed']
df['duration'] = df['duration'].map(lambda dur: get_duration(dur))

In [8]:
df['first_observed'] = pd.to_datetime(df['first_observed'], infer_datetime_format=True, utc=True)
df['last_observed'] = pd.to_datetime(df['last_observed'], infer_datetime_format=True, utc=True)
df['duration'] = df['last_observed'] - df['first_observed']
df['duration'] = df['duration'].map(lambda dur: get_duration(dur))

In [9]:
df.head()

Unnamed: 0,created,created_by_ref,first_observed,id,last_observed,modified,number_observed,type,domain-name:type,domain-name:value,...,process:binary_ref.type,process:parent_ref.name,process:parent_ref.pid,process:parent_ref.type,process:parent_ref.binary_ref.name,process:parent_ref.binary_ref.type,user-account:type,user-account:user_id,data_source,duration
0,2019-10-03T14:53:35Z,identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3,2019-10-03 14:53:35+00:00,observed-data--b7767318-6cca-4cf8-9280-826ed5f...,2019-10-03 14:53:35+00:00,2019-10-03T14:53:35Z,1,observed-data,domain-name,workstation,...,exe,cmd.exe,1668,process,cmd.exe,file,user-account,SYSTEM,cb_stix_bundle_1,0d 0h 0m 0.0s
0,2019-10-03T14:53:35Z,identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3,2019-10-03 14:53:35+00:00,observed-data--b7767318-6cca-4cf8-9280-826ed5f...,2019-10-03 14:53:35+00:00,2019-10-03T14:53:35Z,1,observed-data,domain-name,workstation,...,exe,cmd.exe,1668,process,cmd.exe,file,user-account,SYSTEM,cb_stix_bundle_1,0d 0h 0m 0.0s
0,2019-10-03T14:53:35Z,identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3,2019-10-03 14:53:35+00:00,observed-data--b7767318-6cca-4cf8-9280-826ed5f...,2019-10-03 14:53:35+00:00,2019-10-03T14:53:35Z,1,observed-data,domain-name,workstation,...,exe,cmd.exe,1668,process,cmd.exe,file,user-account,SYSTEM,cb_stix_bundle_1,0d 0h 0m 0.0s
0,2019-10-03T14:51:35Z,identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3,2019-10-03 14:53:35+00:00,observed-data--b7767318-6cca-4cf8-9280-826ed5f...,2019-10-03 14:53:35+00:00,2019-10-03T14:53:35Z,1,observed-data,domain-name,workstation,...,exe,cmd.exe,1668,process,cmd.exe,file,user-account,SYSTEM,cb_stix_bundle_1,0d 0h 0m 0.0s
0,2019-10-01T19:29:35Z,identity--3532c56d-ea72-48be-a2ad-1a53f4c9c6d3,2019-10-01 19:29:35+00:00,observed-data--b7767318-6cca-4cf8-9280-826ed5f...,2019-10-01 19:29:35+00:00,2019-10-01T19:29:35Z,1,observed-data,domain-name,workstation,...,,cmd.exe,1668,process,cmd.exe,file,user-account,SYSTEM,cb_stix_bundle_1,0d 0h 0m 0.0s


In [10]:
list(df.columns.values)

['created',
 'created_by_ref',
 'first_observed',
 'id',
 'last_observed',
 'modified',
 'number_observed',
 'type',
 'domain-name:type',
 'domain-name:value',
 'network-traffic:type',
 'network-traffic:dst_ref.type',
 'network-traffic:dst_ref.value',
 'network-traffic:src_ref.type',
 'network-traffic:src_ref.value',
 'process:command_line',
 'process:created',
 'process:name',
 'process:pid',
 'process:type',
 'process:binary_ref.name',
 'process:binary_ref.type',
 'process:parent_ref.name',
 'process:parent_ref.pid',
 'process:parent_ref.type',
 'process:parent_ref.binary_ref.name',
 'process:parent_ref.binary_ref.type',
 'user-account:type',
 'user-account:user_id',
 'data_source',
 'duration']

In [11]:


def entropy(s):
    """Compute the Shannon entropy of string s"""
    counter = Counter(s)
    nchars = float(len(s))
    return -sum(count / nchars * math.log(count / nchars, 2) for count in counter.values())



df['binary_ref.name'] = df['process:binary_ref.name']
df['binary_ref.type'] = df['process:binary_ref.type']
df['command_line'] = df['process:command_line']
df['created'] = df['process:created']
df['name'] = df['process:name']
df['parent_ref.binary_ref.name'] = df['process:parent_ref.binary_ref.name']
df['parent_ref.binary_ref.type'] = df['process:parent_ref.binary_ref.type']
df['parent_ref.name'] = df['process:parent_ref.name']
df['parent_ref.pid'] = df['process:parent_ref.pid']
df['parent_ref.type'] = df['process:parent_ref.type']
df['pid'] = df['process:pid']

df = df.drop(columns=['process:binary_ref.name', 
                 'process:binary_ref.type',
                 'process:command_line',
                 'process:created',
                 'process:name',
                 'process:pid',
                 'process:parent_ref.binary_ref.name',
                 'process:parent_ref.binary_ref.type',
                 'process:parent_ref.name',
                 'process:parent_ref.pid',
                 'process:parent_ref.type',
                 'domain-name:type',
                 'domain-name:value',
                 'network-traffic:dst_ref.type',
                 'network-traffic:dst_ref.value',
                 'network-traffic:src_ref.type',
                 'network-traffic:src_ref.value',
                 'network-traffic:type',
                 'user-account:type',
                 'user-account:user_id',
                 'data_source',
                 'created_by_ref', 
                 'duration', 
                 'modified',
                 'process:type'
                ])

df['binary_ref.hashes.MD5'] = ''
df['binary_ref.id'] = ''
df['binary_ref.parent_directory_ref.id'] = ''
df['binary_ref.parent_directory_ref.path'] = ''
df['binary_ref.parent_directory_ref.type'] = ''
df['creator_user_ref.id'] = ''
df['creator_user_ref.type'] = ''
df['creator_user_ref.user_id'] = ''
df['opened_connection_refs'] = ''
df['opened_connection_refs_count'] = ''
df['parent_ref.binary_ref.id'] = ''
df['parent_ref.binary_ref.parent_directory_ref.id'] = ''
df['parent_ref.binary_ref.parent_directory_ref.path'] = ''
df['parent_ref.binary_ref.parent_directory_ref.type'] = ''
df['parent_ref.command_line'] = ''
df['parent_ref.id'] = ''
df['x_contained_by_ref'] = ''

df['suspicion_score'] = 0
df['command_line_entropy'] = df.apply(lambda x: entropy(x['command_line']), axis=1)
df['command_line_len'] = df.apply(lambda x: len(x['command_line']), axis=1)

In [12]:
TRAIN_COL = ['type', 'name', 'id', 'created', 'binary_ref.type', 'binary_ref.name',
   'binary_ref.parent_directory_ref.type',
   'binary_ref.parent_directory_ref.path',
   'binary_ref.parent_directory_ref.id', 'binary_ref.id', 'pid',
   'opened_connection_refs_count', 'creator_user_ref.type',
   'creator_user_ref.user_id', 'creator_user_ref.id', 'x_contained_by_ref',
   'first_observed', 'last_observed', 'number_observed',
   'binary_ref.hashes.MD5', 'command_line', 'parent_ref.type',
   'parent_ref.name', 'parent_ref.binary_ref.type',
   'parent_ref.binary_ref.name',
   'parent_ref.binary_ref.parent_directory_ref.type',
   'parent_ref.binary_ref.parent_directory_ref.path',
   'parent_ref.binary_ref.parent_directory_ref.id',
   'parent_ref.binary_ref.id', 'parent_ref.pid', 'parent_ref.command_line',
   'parent_ref.id', 'command_line_entropy', 'command_line_len',
   'suspicion_score', 'opened_connection_refs'
]



In [21]:
from watson_machine_learning_client import WatsonMachineLearningAPIClient



ImportError: cannot import name 'joblib' from 'sklearn.externals' (/srv/conda/envs/notebook/lib/python3.7/site-packages/sklearn/externals/__init__.py)

In [None]:
wml_credential = {
        "apikey": credentials['wml_apikey'],
        "instance_id": credentials['wml_instance_id'],
        "url": credentials['wml_url']
}
client = WatsonMachineLearningAPIClient(wml_credential)
pred = client.deployments.score(credentials['wml_deployment_id'],
                                {"input_data": [{"fields": list(df.columns),
                                                 "values": df.values.tolist()}]})

#df['prediction'] = [d[0] for d in pred['predictions'][0]['values']]

In [None]:
df['prediction'] = [d[0] for d in pred['predictions'][0]['values']]

In [13]:
len(TRAIN_COL)

36

In [18]:
a - b

{'created_by_ref', 'duration', 'modified', 'process:pid', 'process:type'}

# Find suspicious command line

In [10]:
# Use a regex to find suspicious certutil usage
susp = df[df['process:command_line'].str.contains(r'certutil.*[0-9a-zA-Z_-]*\.(exe|dat)')]

# Look at the matches (defang any URLs in there since jupyter makes them clickable!)
list(map(defang, susp['process:command_line'].head().to_list()))

# Attack steps

In [11]:
fields = ['first_observed', 'last_observed', 'duration',
    'process:name', 'process:pid', 
    'process:binary_ref.name', 'process:parent_ref.name', 
    'network-traffic:dst_ref.value', 'network-traffic:src_ref.value', 
    'process:command_line', 'user-account:user_id'
]

df[fields].sort_values(by=['first_observed'])

In this notebook, we finally found that this is a APT attack , ```c64.exe f64.data "9839D7F1A0 -m" ```
Ref: https://www.fireeye.com/blog/threat-research/2019/08/game-over-detecting-and-stopping-an-apt41-operation.html