Skip to content
Switch branches/tags
Go to file
Cannot retrieve contributors at this time
executable file 744 lines (658 sloc) 26.3 KB
import json
import base64
import io
import logging
import posixpath
import stat
import tarfile
import zipfile
from pymisp import MISPEvent, MISPObject, MISPAttribute
from import make_binary_objects
from collections import OrderedDict
log = logging.getLogger(__name__)
misperrors = {'error': 'Error'}
moduleinfo = {
'version': '1.1',
'author': 'Pierre-Jean Grenier',
'description': "Import a Cuckoo archive (zipfile or bzip2 tarball), "
"either downloaded manually or exported from the "
"API (/tasks/report/{task_id}/all).",
'module-type': ['import'],
moduleconfig = []
mispattributes = {
'inputSource': ['file'],
'output': ['MISP objects', 'malware-sample'],
'format': 'misp_standard',
# Attributes for which we can set the "Artifacts dropped"
# category if we want to
# Same for the category "Payload delivery"
class PrettyDict(OrderedDict):
This class is just intended for a pretty print
of its keys and values.
def __str__(self):
tmp = []
for k, v in self.items():
v = str(v)
if len(v) > self.MAX_SIZE:
k += ',cut'
v = v[:self.MAX_SIZE]
v.replace('\n', ' ')
tmp.append((k, v))
return "; ".join(f"({k}) {v}" for k, v in tmp)
def search_objects(event, name, attributes=[]):
Search for objects in event, which name is `name` and
contain at least the attributes given.
Return a generator.
@ param attributes: a list of (object_relation, value)
match = filter(
lambda obj: all( == name
and (obj_relation, str(attr_value)) in map(
lambda attr: (attr.object_relation, str(attr.value)),
for obj_relation, attr_value in attributes
), event.objects
return match
def find_process_by_pid(event, pid):
Find a 'process' MISPObject by its PID. If multiple objects are found,
only return the first one.
@ param pid: integer or str
generator = search_objects(
(('pid', pid),)
return next(generator, None)
class CuckooParser():
# This dict is used to generate the userConfig and link the different
# options to the corresponding method of the parser. This way, we avoid
# redundancy and make future changes easier (instead of for instance
# defining all the options in userConfig directly, and then making a
# switch when running the parser).
# Careful about the order here, as we create references between
# MISPObjects/MISPAttributes at the same time we generate them.
# Hence when we create object B, which we want to reference to
# object A, we should already have created object A.
# TODO create references only after all parsing is done
options = {
"Sandbox info": {
"method": lambda self: self.add_sandbox_info(),
"userConfig": {
'type': 'Boolean',
'message': "Add info related to the sandbox",
'checked': 'true',
"Upload sample": {
"method": lambda self: self.add_sample(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the sample",
'checked': 'true',
"Processes": {
"method": lambda self: self.add_process_tree(),
"userConfig": {
'type': 'Boolean',
'message': "Add info related to the processes",
'checked': 'true',
"DNS": {
"method": lambda self: self.add_dns(),
"userConfig": {
'type': 'Boolean',
'message': "Add DNS queries/answers",
'checked': 'true',
"TCP": {
"method": lambda self: self.add_network("tcp"),
"userConfig": {
'type': 'Boolean',
'message': "Add TCP connections",
'checked': 'true',
"UDP": {
"method": lambda self: self.add_network("udp"),
"userConfig": {
'type': 'Boolean',
'message': "Add UDP connections",
'checked': 'true',
"HTTP": {
"method": lambda self: self.add_http(),
"userConfig": {
'type': 'Boolean',
'message': "Add HTTP requests",
'checked': 'true',
"Signatures": {
"method": lambda self: self.add_signatures(),
"userConfig": {
'type': 'Boolean',
'message': "Add Cuckoo's triggered signatures",
'checked': 'true',
"Screenshots": {
"method": lambda self: self.add_screenshots(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the screenshots",
'checked': 'true',
"Dropped files": {
"method": lambda self: self.add_dropped_files(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the dropped files",
'checked': 'true',
"Dropped buffers": {
"method": lambda self: self.add_dropped_buffers(),
"userConfig": {
'type': 'Boolean',
'message': "Upload the dropped buffers",
'checked': 'true',
def __init__(self, config):
self.event = MISPEvent()
self.files = None
self.malware_binary = None = None
self.config = {
# if an option is missing (we receive None as a value),
# fall back to the default specified in the options
key: int(
on if on is not None
else self.options[key]["userConfig"]["checked"] == 'true'
for key, on in config.items()
def get_file(self, relative_filepath):
"""Return an io.BufferedIOBase for the corresponding relative_filepath
in the Cuckoo archive. If not found, return an empty io.BufferedReader
to avoid fatal errors."""
blackhole = io.BufferedReader(open('/dev/null', 'rb'))
res = self.files.get(relative_filepath, blackhole)
if res == blackhole:
log.debug(f"Did not find file {relative_filepath}, "
f"returned an empty file instead")
return res
def read_archive(self, archive_encoded):
"""Read the archive exported from Cuckoo and initialize the class"""
# archive_encoded is base 64 encoded content
# we extract the info about each file but do not retrieve
# it automatically, as it may take too much space in memory
buf_io = io.BytesIO(base64.b64decode(archive_encoded))
if zipfile.is_zipfile(buf_io):
# the archive was probably downloaded from the WebUI # don't forget this not to read an empty buffer
z = zipfile.ZipFile(buf_io, 'r')
self.files = {
for info in z.filelist
# only extract the regular files and dirs, we don't
# want any symbolic link
if stat.S_ISREG(info.external_attr >> 16)
or stat.S_ISDIR(info.external_attr >> 16)
# the archive was probably downloaded from the API # don't forget this not to read an empty buffer
f =, mode='r:bz2')
self.files = { f.extractfile(info)
for info in f.getmembers()
# only extract the regular files and dirs, we don't
# want any symbolic link
if info.isreg() or info.isdir()
# We want to keep the order of the keys of sub-dicts in the report,
# eg. the signatures have marks with unknown keys such as
# {'marks': [
# {"suspicious_features": "Connection to IP address",
# "suspicious_request": "OPTIONS"}
# ]}
# To render those marks properly, we can only hope the developpers
# thought about the order in which they put the keys, and keep this
# order so that the signature makes sense to the reader.
# We use PrettyDict, a customization of OrderedDict to do so.
# It will be instanced iteratively when parsing the json (ie. subdicts
# will also be instanced as PrettyDict) = json.load(
def read_malware(self):
self.malware_binary = self.get_file("binary").read()
if not self.malware_binary:
log.warn("No malware binary found")
def add_sandbox_info(self):
info ="info", {})
if not info:
log.warning("The 'info' field was not found "
"in the report, skipping")
return False
o = MISPObject(name='sandbox-report')
o.add_attribute('score', info['score'])
o.add_attribute('sandbox-type', 'on-premise')
o.add_attribute('on-premise-sandbox', 'cuckoo')
f'started on:{info["machine"]["started_on"]} '
f'duration:{info["duration"]}s '
def add_sample(self):
"""Add the sample/target of the analysis"""
target ="target", {})
category = target.get("category", "")
if not category:
log.warning("Could not find info about the sample "
"in the report, skipping")
return False
if category == "file":
log.debug("Sample is a file, uploading it")
file_o, bin_type_o, bin_section_li = make_binary_objects(
file_o.comment = "Submitted sample"
# fix categories
for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)):
for attr in obj.attributes:
if attr.type in PAYLOAD_DELIVERY:
attr.category = "Payload delivery"
elif category == "url":
log.debug("Sample is a URL")
o = MISPObject(name='url')
o.add_attribute('url', target['url'])
o.add_attribute('text', "Submitted URL")
def add_http(self):
"""Add the HTTP requests"""
network ="network", [])
http = network.get("http", [])
if not http:"No HTTP connection found in the report, skipping")
return False
for request in http:
o = MISPObject(name='http-request')
o.add_attribute('host', request['host'])
o.add_attribute('method', request['method'])
o.add_attribute('uri', request['uri'])
o.add_attribute('user-agent', request['user-agent'])
o.add_attribute('text', f"count:{request['count']} "
def add_network(self, proto=None):
Add UDP/TCP traffic
proto must be one of "tcp", "udp"
network ="network", [])
li_conn = network.get(proto, [])
if not li_conn:"No {proto} connection found in the report, skipping")
return False
from_to = []
# sort by time to get the "first packet seen" right
li_conn.sort(key=lambda x: x["time"])
for conn in li_conn:
src = conn['src']
dst = conn['dst']
sport = conn['sport']
dport = conn['dport']
if (src, sport, dst, dport) in from_to:
from_to.append((src, sport, dst, dport))
o = MISPObject(name='network-connection')
o.add_attribute('ip-src', src)
o.add_attribute('ip-dst', dst)
o.add_attribute('src-port', sport)
o.add_attribute('dst-port', dport)
o.add_attribute('layer3-protocol', "IP")
o.add_attribute('layer4-protocol', proto.upper())
o.add_attribute('first-packet-seen', conn['time'])
def add_dns(self):
"""Add DNS records"""
network ="network", [])
dns = network.get("dns", [])
if not dns:"No DNS connection found in the report, skipping")
return False
for record in dns:
o = MISPObject(name='dns-record')
o.add_attribute('text', f"request type:{record['type']}")
o.add_attribute('queried-domain', record['request'])
for answer in record.get("answers", []):
if answer["type"] in ("A", "AAAA"):
o.add_attribute('a-record', answer['data'])
# TODO implement MX/NS
def _get_marks_str(self, marks):
marks_strings = []
for m in marks:
m_type = m.pop("type") # temporarily remove the type
if m_type == "generic":
elif m_type == "ioc":
elif m_type == "call":
call = m["call"]
arguments = call.get("arguments", {})
flags = call.get("flags", {})
info = ""
for details in (arguments, flags):
info += f" {details}"
marks_strings.append(f"Call API '{call['api']}'%s" % info)
logging.debug(f"Unknown mark type '{m_type}', skipping")
m["type"] = m_type # restore key 'type'
# TODO implemented marks 'config' and 'volatility'
return marks_strings
def _add_ttp(self, attribute, ttp_short, ttp_num):
Internal wrapper to add the TTP tag from the MITRE galaxy.
@ params
- attribute: MISPAttribute
- ttp_short: short description of the TTP
(eg. "Credential Dumping")
- ttp_num: formatted as "T"+int
(eg. T1003)
f'"{ttp_short} - {ttp_num}"')
def add_signatures(self):
"""Add the Cuckoo signatures, with as many details as possible
regarding the marks"""
signatures ="signatures", [])
if not signatures:"No signature found in the report")
return False
o = MISPObject(name='sb-signature')
o.add_attribute('software', "Cuckoo")
for sign in signatures:
marks = sign["marks"]
marks_strings = self._get_marks_str(marks)
summary = sign['description']
if marks_strings:
summary += "\n---\n"
marks_strings = set(marks_strings)
description = summary + "\n".join(marks_strings)
a = MISPAttribute()
a.from_dict(type='text', value=description)
for ttp_num, desc in sign.get("ttp", {}).items():
ttp_short = desc["short"]
self._add_ttp(a, ttp_short, ttp_num)
# this signature was triggered by the processes with the following
# PIDs, we can create references
triggered_by_pids = filter(
(m.get("pid", None) for m in marks)
# remove redundancy
triggered_by_pids = set(triggered_by_pids)
for pid in triggered_by_pids:
process_o = find_process_by_pid(self.event, pid)
if process_o:
process_o.add_reference(a, "triggers")
o.add_attribute('signature', **a)
def _handle_process(self, proc, accu):
This is an internal recursive function to handle one process
from a process tree and then iterate on its children.
List the objects to be added, based on the tree, into the `accu` list.
The `accu` list uses a DFS-like order.
o = MISPObject(name='process')
o.add_attribute('pid', proc['pid'])
o.add_attribute('command-line', proc['command_line'])
o.add_attribute('name', proc['process_name'])
o.add_attribute('parent-pid', proc['ppid'])
for child in proc.get('children', []):
pos_child = len(accu)
o.add_attribute('child-pid', child['pid'])
self._handle_process(child, accu)
child_obj = accu[pos_child]
child_obj.add_reference(o, 'child-of')
return o
def add_process_tree(self):
"""Add process tree from the report, as separated process objects"""
behavior ="behavior", {})
tree = behavior.get("processtree", [])
if not tree:
log.warning("No process tree found in the report, skipping")
return False
for proc in tree:
objs = []
self._handle_process(proc, objs)
for o in objs:
def get_relpath(self, path):
Transform an absolute or relative path into a path relative to the
correct cuckoo analysis directory, without knowing the cuckoo
working directory.
Return an empty string if the path given does not refer to a
file from the analysis directory.
head, tail = posixpath.split(path)
if not tail:
return ""
prev = self.get_relpath(head)
longer = posixpath.join(prev, tail)
if longer in self.files:
return longer
elif tail in self.files:
return tail
return ""
def add_screenshots(self):
"""Add the screenshots taken by Cuckoo in a sandbox-report object"""
screenshots ='screenshots', [])
if not screenshots:"No screenshot found in the report, skipping")
return False
o = MISPObject(name='sandbox-report')
o.add_attribute('sandbox-type', 'on-premise')
o.add_attribute('on-premise-sandbox', "cuckoo")
for shot in screenshots:
# The path given by Cuckoo is an absolute path, but we need a path
# relative to the analysis folder.
path = self.get_relpath(shot['path'])
img = self.get_file(path)
# .decode('utf-8') in order to avoid the b'' format
img_data = base64.b64encode('utf-8')
filename = posixpath.basename(path)
"sandbox-file", value=filename,
data=img_data, type='attachment',
category="External analysis",
def _get_dropped_objs(self, path, filename=None, comment=None):
Internal wrapper to get dropped files/buffers as file objects
@ params
- path: relative to the cuckoo analysis directory
- filename: if not specified, deduced from the path
if not filename:
filename = posixpath.basename(path)
dropped_file = self.get_file(path)
dropped_binary = io.BytesIO(
# create ad hoc objects
file_o, bin_type_o, bin_section_li = make_binary_objects(
pseudofile=dropped_binary, filename=filename,
if comment:
file_o.comment = comment
# fix categories
for obj in filter(None, (file_o, bin_type_o, *bin_section_li,)):
for attr in obj.attributes:
if attr.type in ARTIFACTS_DROPPED:
attr.category = "Artifacts dropped"
return file_o, bin_type_o, bin_section_li
def _add_yara(self, obj, yara_dict):
"""Internal wrapper to add Yara matches to an MISPObject"""
for yara in yara_dict:
description = yara.get("meta", {}).get("description", "")
name = yara.get("name", "")
f"Yara match\n(name) {name}\n(description) {description}",
comment="Yara match"
def add_dropped_files(self):
"""Upload the dropped files as file objects"""
dropped ="dropped", [])
if not dropped:"No dropped file found, skipping")
return False
for d in dropped:
# Cuckoo logs three things that are of interest for us:
# - 'filename' which is not the original name of the file
# but is formatted as follow:
# 8 first bytes of SHA265 + _ + original name in lower case
# - 'filepath' which is the original filepath on the VM,
# where the file was dropped
# - 'path' which is the local path of the stored file,
# in the cuckoo archive
filename = d.get("name", "")
original_path = d.get("filepath", "")
sha256 = d.get("sha256", "")
if original_path and sha256:
log.debug(f"Will now try to restore original filename from "
f"path {original_path}")
s = filename.split("_")
if not s:
raise Exception("unexpected filename read "
"in the report")
sha256_first_8_bytes = s[0]
original_name = s[1]
# check our assumptions are valid, if so we can safely
# restore the filename, if not the format may have changed
# so we'll keep the filename of the report
if sha256.startswith(sha256_first_8_bytes) and \
original_path.lower().endswith(original_name) and \
filename not in original_path.lower():
# we can restore the original case of the filename
position = original_path.lower().rindex(original_name)
filename = original_path[position:]
log.debug(f"Successfully restored original filename: "
raise Exception("our assumptions were wrong, "
"filename format may have changed")
except Exception as e:
log.debug(f"Cannot restore filename: {e}")
if not filename:
log.warning(f'No filename found for dropped file! '
f'Will use "{filename}"')
file_o, bin_type_o, bin_section_o = self._get_dropped_objs(
comment="Dropped file"
self._add_yara(file_o, d.get("yara", []))
file_o.add_attribute("fullpath", original_path,
category="Artifacts dropped")
# why is this a list? for when various programs drop the same file?
for pid in d.get("pids", []):
# if we have an object for the process that dropped the file,
# we can link the two (we just take the first result from
# the search)
process_o = find_process_by_pid(self.event, pid)
if process_o:
file_o.add_reference(process_o, "dropped-by")
def add_dropped_buffers(self):
""""Upload the dropped buffers as file objects"""
buffer ="buffer", [])
if not buffer:"No dropped buffer found, skipping")
return False
for i, buf in enumerate(buffer):
file_o, bin_type_o, bin_section_o = self._get_dropped_objs(
filename=f"buffer {i}",
comment="Dropped buffer"
self._add_yara(file_o, buf.get("yara", []))
def parse(self):
"""Run the parsing"""
for name, active in self.config.items():
if active:
def get_misp_event(self):
log.debug("Running MISP expansions")
return self.event
def handler(q=False):
# In case there's no data
if q is False:
return False
q = json.loads(q)
data = q['data']
parser = CuckooParser(q['config'])
event = parser.get_misp_event()
event = json.loads(event.to_json())
results = {
key: event[key]
for key in ('Attribute', 'Object')
if (key in event and event[key])
return {'results': results}
def introspection():
userConfig = {
key: o["userConfig"]
for key, o in CuckooParser.options.items()
mispattributes['userConfig'] = userConfig
return mispattributes
def version():
moduleinfo['config'] = moduleconfig
return moduleinfo