Skip to content

Commit

Permalink
Added process tree text renderer to process_tree_utils.py (#637)
Browse files Browse the repository at this point in the history
Found bug in process ordering due to sorting of short string numerics - fixed in proc_tree_builder.py
- related item in vtfile_behavior.py
Fixed query files with ".yml" extensions not being detected/loaded
Added parameter to init_notebook to suppress/control auto-detection of environments.
  • Loading branch information
ianhelle committed Mar 16, 2023
1 parent 25bd5dd commit 5688432
Show file tree
Hide file tree
Showing 8 changed files with 244 additions and 17 deletions.
44 changes: 44 additions & 0 deletions docs/source/visualization/ProcessTree.rst
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,50 @@ True returns the source process with the result set.
[5 rows x 35 columns]


:py:func:`~msticpy.transform.process_tree_utils.tree_to_text`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Return text rendering of the process tree.

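The rendering is an indented, plain-text tree: each process is shown as a
node followed by its details (process name, PID, time, command line and
account), and child processes are indented beneath their parent.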

.. code:: ipython3

    from msticpy.transform.proc_tree_schema import WIN_EVENT_SCHEMA
    print(process_tree.tree_to_text(p_tree_win, schema=WIN_EVENT_SCHEMA))

.. parsed-literal::
+-- Process: C:\Program Files\Microsoft Monitoring Agent\Agent\MonitoringHost.exe
PID: 0x888
Time: 1970-01-01 00:00:00+00:00
Cmdline: nan
Account: nan LoginID: 0x3e7
+-- Process: C:\Windows\System32\cscript.exe PID: 0x364
Time: 2019-01-15 04:15:26+00:00
Cmdline: "C:\Windows\system32\cscript.exe" /nologo
"MonitorKnowledgeDiscovery.vbs"
Account: WORKGROUP\MSTICAlertsWin1$ LoginID: 0x3e7
+-- Process: C:\Program Files\Microsoft Monitoring Agent\Agent\Health Service
State\CT_602681692\NativeDSC\DesiredStateConfiguration\ASMHost.exe PID:
0x1c4
Time: 2019-01-15 04:16:24.007000+00:00
Cmdline: "C:\Program Files\Microsoft Monitoring Agent\Agent\Health
Service
State\CT_602681692\NativeDSC\DesiredStateConfiguration\ASMHost.exe"
GetInventory "C:\Program Files\Microsoft Monitoring
Agent\Agent\Health Service
State\CT_602681692\work\ServiceState\ServiceState.mof" "C:\Program
Files\Microsoft Monitoring Agent\Agent\Health Service
State\CT_602681692\work\ServiceState"
Account: WORKGROUP\MSTICAlertsWin1$ LoginID: 0x3e7
+-- Process: C:\Windows\System32\conhost.exe PID: 0x99c
Time: 2019-01-15 04:16:24.027000+00:00
Cmdline: \??\C:\Windows\system32\conhost.exe 0xffffffff -ForceV1
Account: WORKGROUP\MSTICAlertsWin1$ LoginID: 0x3e7
Create a network from a Tree using Networkx
-------------------------------------------

Expand Down
1 change: 1 addition & 0 deletions msticpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
>>> mp.init_notebook()
This module performs several steps to initialize MSTICPy:
- imports a number of standard packages (e.g. pandas) into the notebook
- imports a number of modules and functions from msticpy
- checks the version of MSTICPy
Expand Down
14 changes: 8 additions & 6 deletions msticpy/context/vtlookupv3/vtfile_behavior.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""VirusTotal File Behavior functions."""
import re
from copy import deepcopy
from datetime import datetime
from datetime import datetime, timezone
from pathlib import Path
from pprint import pformat
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -418,16 +418,18 @@ def _try_match_commandlines(
def _fill_missing_proc_tree_values(process_df: pd.DataFrame) -> pd.DataFrame:
# Define a schema to map Df names on to internal ProcSchema
process_df["path"] = np.nan
process_df.loc[process_df.IsRoot, "path"] = process_df[
process_df.IsRoot
].index.astype("str")
process_df.loc[process_df.IsRoot, "path"] = pd.Series(
process_df[process_df.IsRoot].index.astype("str")
).apply(lambda x: x.zfill(5))

# Fill in some required fields with placeholder data
process_df["time_stamp"] = datetime.utcnow()
process_df["time_stamp"] = datetime.now(tz=timezone.utc)
process_df["host"] = "sandbox"
process_df["logon_id"] = "na"
process_df["event_id"] = "na"
process_df["source_index"] = process_df.index.astype("str")
process_df["source_index"] = pd.Series(process_df.index.astype("str")).apply(
lambda x: x.zfill(5)
)

proc_tree = process_df.set_index("proc_key")

Expand Down
10 changes: 7 additions & 3 deletions msticpy/data/core/data_query_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# license information.
# --------------------------------------------------------------------------
"""Data query definition reader."""
from itertools import chain
from pathlib import Path
from typing import Any, Dict, Iterable, Tuple

Expand Down Expand Up @@ -31,11 +32,14 @@ def find_yaml_files(source_path: str, recursive: bool = True) -> Iterable[Path]:
Returns
-------
Iterable[Path]
File paths of yanl files found.
File paths of yaml files found.
"""
recurse_pfx = "**/" if recursive else ""
file_glob = Path(source_path).glob(f"{recurse_pfx}*.yaml")
file_glob = chain(
Path(source_path).glob(f"{recurse_pfx}*.yaml"),
Path(source_path).glob(f"{recurse_pfx}*.yml"),
)
for file_path in file_glob:
if not file_path.is_file():
continue
Expand All @@ -49,7 +53,7 @@ def read_query_def_file(query_file: str) -> Tuple[Dict, Dict, Dict]:
Parameters
----------
query_file : str
Path to yaml query defintion file
Path to yaml query definition file
Returns
-------
Expand Down
25 changes: 21 additions & 4 deletions msticpy/init/nbinit.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from contextlib import redirect_stdout
from functools import wraps
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple

import ipywidgets as widgets
import pandas as pd
Expand Down Expand Up @@ -328,13 +328,22 @@ def init_notebook(
0 = No output
1 or False = Brief output (default)
2 or True = Detailed output
verbosity : int, optional
alias for `verbose`
config : Optional[str]
Use this path to load a msticpyconfig.yaml.
Defaults are MSTICPYCONFIG env variable, home folder (~/.msticpy),
current working directory.
no_config_check : bool, optional
Skip the check for valid configuration. Default is False.
verbosity : int, optional
detect_env : Union[bool, List[str]], optional
By default init_notebook tries to detect environments and makes
additional configuration changes if it finds one of the supported
notebook environments.
Passing `False` for this parameter disables environment detection.
Alternatively, you can pass a list of environments that you
want to detect. Current supported environments are 'aml' (Azure
Machine Learning) and 'synapse' (Azure Synapse).
Raises
------
Expand Down Expand Up @@ -415,7 +424,7 @@ def init_notebook(
_pr_output("<hr><h4>Starting Notebook initialization...</h4>")
logger.info("Starting Notebook initialization")
# Check Azure ML environment
if is_in_aml():
if _detect_env("aml", **kwargs) and is_in_aml():
check_versions_aml(*_get_aml_globals(namespace))
else:
# If not in AML check and print version status
Expand All @@ -426,7 +435,7 @@ def init_notebook(
_pr_output(output)
logger.info(output)

if is_in_synapse():
if _detect_env("synapse", **kwargs) and is_in_synapse():
synapse_params = {
key: val for key, val in kwargs.items() if key in _SYNAPSE_KWARGS
}
Expand Down Expand Up @@ -522,6 +531,14 @@ def _set_verbosity(**kwargs):
_VERBOSITY(verbosity)


def _detect_env(env_name: Literal["aml", "synapse"], **kwargs):
"""Return true if an environment should be detected."""
detect_opt = kwargs.get("detect_env", True)
if isinstance(detect_opt, bool):
return detect_opt
return env_name in detect_opt if isinstance(detect_opt, list) else True


def list_default_imports():
"""List the default imports for `init_notebook`."""
for imp_group in (_NB_IMPORTS, _MP_IMPORTS):
Expand Down
15 changes: 12 additions & 3 deletions msticpy/transform/proc_tree_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,17 @@ def _add_tree_properties(proc_tree):
proc_tree.loc[~has_child, "IsLeaf"] = True
proc_tree.loc[~is_root & has_child, "IsBranch"] = True

# Save the current numeric index as "source_index" converting to string
proc_tree[Col.source_index] = proc_tree.index.astype(str)
# ensure index is numeric/unique
if (
not proc_tree.index.is_unique
and not proc_tree.index.is_monotonic_increasing
and not proc_tree.index.is_integer()
):
proc_tree = proc_tree.reset_index()
# Save the numeric index as "source_index" converting to string
proc_tree[Col.source_index] = pd.Series(proc_tree.index.astype(str)).apply(
lambda x: x.zfill(5)
)
# Set the index of the output frame to be the proc_key
proc_tree = proc_tree.set_index(Col.proc_key)

Expand All @@ -163,7 +172,7 @@ def build_proc_tree(input_tree: pd.DataFrame, max_depth: int = -1) -> pd.DataFra
DataFrame with ordered paths for each process.
"""
# set default path == current process ID
# set default path == current process row index
input_tree["path"] = input_tree[Col.source_index]
# input_tree["parent_index"] = np.nan

Expand Down
128 changes: 127 additions & 1 deletion msticpy/transform/process_tree_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@
# license information.
# --------------------------------------------------------------------------
"""Process Tree Visualization."""
from typing import Any, Dict, Optional, Union
import textwrap
from collections import Counter
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union

import pandas as pd

from .._version import VERSION
from .proc_tree_schema import ColNames as Col
from .proc_tree_schema import ProcSchema

__version__ = VERSION
__author__ = "Ian Hellen"
Expand Down Expand Up @@ -365,3 +368,126 @@ def get_summary_info(procs: pd.DataFrame) -> Dict[str, int]:
summary["IsolatedProcesses"] = len(procs[(procs["IsRoot"]) & (procs["IsLeaf"])])
summary["LargestTreeDepth"] = procs["path"].str.count("/").max() + 1
return summary


class TemplateLine(NamedTuple):
    """
    Template definition for a line in text process tree.

    Attributes
    ----------
    items : List[Tuple[str, str]]
        The (<display_name>, <column_name>) pairs rendered on this line.
    wrap : int
        Line width at which the rendered text is wrapped, by default 80.
        `tree_to_text` uses this unless its `wrap_column` parameter is
        non-zero.

    Notes
    -----
    The items attribute must be a list of tuples, where each
    tuple is (<display_name>, <column_name>).

    """

    # NOTE: the `[]` default is a single list object shared by every
    # instance created without `items` - treat it as read-only and always
    # pass a fresh list rather than appending to the default.
    items: List[Tuple[str, str]] = []
    # Passed as `width` to textwrap.wrap when the line is rendered.
    wrap: int = 80


def tree_to_text(
    procs: pd.DataFrame,
    schema: Optional[Union[ProcSchema, Dict[str, str]]] = None,
    template: Optional[List[TemplateLine]] = None,
    sort_column: str = "path",
    wrap_column: int = 0,
) -> str:
    """
    Return text rendering of process tree.

    Parameters
    ----------
    procs : pd.DataFrame
        The process tree DataFrame. Must contain a "path" column - the
        number of "/" separators in it gives the depth of each node.
    schema : Optional[Union[ProcSchema, Dict[str, str]]], optional
        The schema to use for mapping the DataFrame column
        names, by default None
    template : Optional[List[TemplateLine]], optional
        A manually created template to use to create the node
        formatting, by default None. The first entry is rendered
        next to the node marker, the remaining entries on the
        following lines.
    sort_column : str, optional
        The column to sort the DataFrame by, by default "path"
    wrap_column : int, optional
        Override any template-specified wrap limit, by default 0
        (use the limit from each template line).

    Returns
    -------
    str
        The formatted process tree string.

    Raises
    ------
    ValueError
        If neither `schema` nor `template` is supplied.

    """
    if not schema and not template:
        # Single message string - a stray comma previously turned this
        # into a two-argument exception that rendered as a tuple.
        raise ValueError(
            "One of 'schema' and 'template' must be supplied as parameters."
        )
    template = template or _create_proctree_template(schema)  # type: ignore
    # The template is guaranteed non-empty here: split header line from the rest.
    first_line, *detail_lines = template

    output: List[str] = []
    for _, row in procs.sort_values(sort_column).iterrows():
        depth_count = row["path"].count("/")
        header = _node_header(depth_count)

        # First template line shares the output line with the node header,
        # so the header is prepended after wrapping and continuation lines
        # are indented to align with the text that follows it.
        indent = " " * len(header) + " "
        out_line = "\n".join(
            textwrap.wrap(
                _format_template_items(row, first_line.items),
                width=wrap_column or first_line.wrap,
                subsequent_indent=indent,
            )
        )
        output.append(f"{header} {out_line}\n")

        # Remaining template lines are indented beneath the header text.
        for tmplt_line in detail_lines:
            out_line = "\n".join(
                textwrap.wrap(
                    _format_template_items(row, tmplt_line.items),
                    width=wrap_column or tmplt_line.wrap,
                    initial_indent=indent,
                    subsequent_indent=indent + " ",
                )
            )
            output.extend([out_line, "\n"])

    return "".join(output)


def _format_template_items(row: pd.Series, items: List[Tuple[str, str]]) -> str:
    """Return the '<name>: <value>' pairs of one template line as a string."""
    # An empty display name renders the bare value with no "name:" prefix.
    return " ".join(
        f"{name}: {row[col]}" if name else f"{row[col]}" for name, col in items
    )


def _create_proctree_template(
    schema: Union[ProcSchema, Dict[str, str]]
) -> List[TemplateLine]:
    """
    Create a default text-tree template from the schema.

    Parameters
    ----------
    schema : Union[ProcSchema, Dict[str, str]]
        Process schema (or a dictionary of ProcSchema attributes) that
        supplies the DataFrame column names for the template.

    Returns
    -------
    List[TemplateLine]
        Template lines in display order: process name and PID, time
        stamp, then - if the schema defines the columns - command line
        and account/logon ID.

    """
    if isinstance(schema, dict):
        schema = ProcSchema(**schema)
    template_lines: List[TemplateLine] = [
        TemplateLine(
            items=[("Process", schema.process_name), ("PID", schema.process_id)]
        ),
        TemplateLine(items=[("Time", schema.time_stamp)]),
    ]
    if schema.cmd_line:
        template_lines.append(TemplateLine(items=[("Cmdline", schema.cmd_line)]))

    # Account details share one line; include only columns the schema defines.
    acct_items: List[Tuple[str, str]] = []
    if schema.user_id:
        acct_items.append(("Account", schema.user_id))
    if schema.logon_id:
        # Labelled "LoginID" (not a second "Account") so that the two
        # values on this line can be told apart.
        acct_items.append(("LoginID", schema.logon_id))
    if acct_items:
        template_lines.append(TemplateLine(items=acct_items))
    return template_lines


def _node_header(depth_count):
"""Return text tree node header given tree depth."""
return "+ " if depth_count == 0 else " " * depth_count + "+-- "
24 changes: 24 additions & 0 deletions tests/transform/test_process_tree_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,30 @@ def test_build_mde_win_tree_dict_schema():
}


def test_text_process_tree():
    """Render the Windows test process tree as text and check its length."""
    schema = {
        "time_stamp": "TimeGenerated",
        "process_name": "NewProcessName",
        "process_id": "NewProcessId",
        "parent_name": "ParentProcessName",
        "parent_id": "ProcessId",
        "logon_id": "SubjectLogonId",
        "target_logon_id": "TargetLogonId",
        "cmd_line": "CommandLine",
        "user_name": "SubjectUserName",
        "path_separator": "\\",
        "user_id": "SubjectUserSid",
        "event_id_column": "EventID",
        "event_id_identifier": 4688,
        "host_name_column": "Computer",
    }
    # Build the tree from the Windows test data using the dict schema.
    p_tree = pt_build.build_process_tree(
        testdf_win, schema=schema, show_summary=True, debug=True
    )
    tree_txt = pt_util.tree_to_text(p_tree, schema=schema)
    rendered_lines = tree_txt.split("\n")
    assert len(rendered_lines) == 5028


_NB_FOLDER = "docs/notebooks"
_NB_NAME = "ProcessTree.ipynb"
_MP_CONFIG_PATH = get_test_data_path().parent.joinpath("msticpyconfig-test.yaml")
Expand Down

0 comments on commit 5688432

Please sign in to comment.