Added process tree text renderer to process_tree_utils.py (#637)

Found a bug in process ordering caused by sorting short numeric strings; fixed in proc_tree_builder.py, with a related change in vtfile_behavior.py. Fixed query files with ".yml" extensions not being detected/loaded. Added a parameter to init_notebook to suppress/control auto-detection of environments.
microsoft · Mar 16, 2023 · 5688432 · 5688432
1 parent 25bd5dd
commit 5688432
Show file tree

Hide file tree

Showing 8 changed files with 244 additions and 17 deletions.
diff --git a/docs/source/visualization/ProcessTree.rst b/docs/source/visualization/ProcessTree.rst
@@ -674,6 +674,50 @@ True returns the source process with the result set.
          [5 rows x 35 columns]
 
 
+:py:func:`~msticpy.transform.process_tree_utils.tree_to_text`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Return text rendering of the process tree.
+
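+Each process is rendered as an indented block of labelled lines
+(process name and PID, time, command line, account and logon ID),
+with child processes nested beneath their parent.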
+
+.. code:: ipython3
+
+   from msticpy.transform.proc_tree_schema import WIN_EVENT_SCHEMA
+   print(process_tree.tree_to_text(p_tree_win, schema=WIN_EVENT_SCHEMA))
+
+
+.. parsed-literal::
+
+   +--  Process: C:\Program Files\Microsoft Monitoring Agent\Agent\MonitoringHost.exe
+      PID: 0x888
+      Time: 1970-01-01 00:00:00+00:00
+      Cmdline: nan
+      Account: nan  LoginID: 0x3e7
+      +--  Process: C:\Windows\System32\cscript.exe  PID: 0x364
+         Time: 2019-01-15 04:15:26+00:00
+         Cmdline: "C:\Windows\system32\cscript.exe" /nologo
+            "MonitorKnowledgeDiscovery.vbs"
+         Account: WORKGROUP\MSTICAlertsWin1$  LoginID: 0x3e7
+      +--  Process: C:\Program Files\Microsoft Monitoring Agent\Agent\Health Service
+         State\CT_602681692\NativeDSC\DesiredStateConfiguration\ASMHost.exe  PID:
+         0x1c4
+         Time: 2019-01-15 04:16:24.007000+00:00
+         Cmdline: "C:\Program Files\Microsoft Monitoring Agent\Agent\Health
+            Service
+            State\CT_602681692\NativeDSC\DesiredStateConfiguration\ASMHost.exe"
+            GetInventory "C:\Program Files\Microsoft Monitoring
+            Agent\Agent\Health Service
+            State\CT_602681692\work\ServiceState\ServiceState.mof" "C:\Program
+            Files\Microsoft Monitoring Agent\Agent\Health Service
+            State\CT_602681692\work\ServiceState"
+         Account: WORKGROUP\MSTICAlertsWin1$  LoginID: 0x3e7
+         +--  Process: C:\Windows\System32\conhost.exe  PID: 0x99c
+            Time: 2019-01-15 04:16:24.027000+00:00
+            Cmdline: \??\C:\Windows\system32\conhost.exe 0xffffffff -ForceV1
+            Account: WORKGROUP\MSTICAlertsWin1$  LoginID: 0x3e7
+
+
 Create a network from a Tree using Networkx
 -------------------------------------------
 

diff --git a/msticpy/__init__.py b/msticpy/__init__.py
@@ -99,6 +99,7 @@
 >>> mp.init_notebook()
 
 This module performs several steps to initialize MSTICPy:
+
 - imports a number of standard packages (e.g. pandas) into the notebook
 - imports a number of modules and functions from msticpy
 - checks the version of MSTICPy

diff --git a/msticpy/context/vtlookupv3/vtfile_behavior.py b/msticpy/context/vtlookupv3/vtfile_behavior.py
@@ -6,7 +6,7 @@
 """VirusTotal File Behavior functions."""
 import re
 from copy import deepcopy
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from pprint import pformat
 from typing import Any, Dict, List, Optional, Union
@@ -418,16 +418,18 @@ def _try_match_commandlines(
 def _fill_missing_proc_tree_values(process_df: pd.DataFrame) -> pd.DataFrame:
     # Define a schema to map Df names on to internal ProcSchema
     process_df["path"] = np.nan
-    process_df.loc[process_df.IsRoot, "path"] = process_df[
-        process_df.IsRoot
-    ].index.astype("str")
+    process_df.loc[process_df.IsRoot, "path"] = pd.Series(
+        process_df[process_df.IsRoot].index.astype("str")
+    ).apply(lambda x: x.zfill(5))
 
     # Fill in some required fields with placeholder data
-    process_df["time_stamp"] = datetime.utcnow()
+    process_df["time_stamp"] = datetime.now(tz=timezone.utc)
     process_df["host"] = "sandbox"
     process_df["logon_id"] = "na"
     process_df["event_id"] = "na"
-    process_df["source_index"] = process_df.index.astype("str")
+    process_df["source_index"] = pd.Series(process_df.index.astype("str")).apply(
+        lambda x: x.zfill(5)
+    )
 
     proc_tree = process_df.set_index("proc_key")
 

diff --git a/msticpy/data/core/data_query_reader.py b/msticpy/data/core/data_query_reader.py
@@ -4,6 +4,7 @@
 # license information.
 # --------------------------------------------------------------------------
 """Data query definition reader."""
+from itertools import chain
 from pathlib import Path
 from typing import Any, Dict, Iterable, Tuple
 
@@ -31,11 +32,14 @@ def find_yaml_files(source_path: str, recursive: bool = True) -> Iterable[Path]:
     Returns
     -------
     Iterable[str]
-        File paths of yanl files found.
+        File paths of yaml files found.
 
     """
     recurse_pfx = "**/" if recursive else ""
-    file_glob = Path(source_path).glob(f"{recurse_pfx}*.yaml")
+    file_glob = chain(
+        Path(source_path).glob(f"{recurse_pfx}*.yaml"),
+        Path(source_path).glob(f"{recurse_pfx}*.yml"),
+    )
     for file_path in file_glob:
         if not file_path.is_file():
             continue
@@ -49,7 +53,7 @@ def read_query_def_file(query_file: str) -> Tuple[Dict, Dict, Dict]:
     Parameters
     ----------
     query_file : str
-        Path to yaml query defintion file
+        Path to yaml query definition file
 
     Returns
     -------

diff --git a/msticpy/init/nbinit.py b/msticpy/init/nbinit.py
@@ -52,7 +52,7 @@
 from contextlib import redirect_stdout
 from functools import wraps
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
 
 import ipywidgets as widgets
 import pandas as pd
@@ -328,13 +328,22 @@ def init_notebook(
         0 = No output
         1 or False = Brief output (default)
         2 or True = Detailed output
+    verbosity : int, optional
+        alias for `verbose`
     config : Optional[str]
         Use this path to load a msticpyconfig.yaml.
         Defaults are MSTICPYCONFIG env variable, home folder (~/.msticpy),
         current working directory.
     no_config_check : bool, optional
         Skip the check for valid configuration. Default is False.
-    verbosity : int, optional
+    detect_env : Union[bool, List[str]], optional
+        By default init_notebook tries to detect environments and makes
+        additional configuration changes if it finds one of the supported
+        notebook environments.
+        Passing `False` for this parameter disables environment detection.
+        Alternatively, you can pass a list of environments that you
+        want to detect. Current supported environments are 'aml' (Azure
+        Machine Learning) and 'synapse' (Azure Synapse).
 
     Raises
     ------
@@ -415,7 +424,7 @@ def init_notebook(
     _pr_output("<hr><h4>Starting Notebook initialization...</h4>")
     logger.info("Starting Notebook initialization")
     # Check Azure ML environment
-    if is_in_aml():
+    if _detect_env("aml", **kwargs) and is_in_aml():
         check_versions_aml(*_get_aml_globals(namespace))
     else:
         # If not in AML check and print version status
@@ -426,7 +435,7 @@ def init_notebook(
             _pr_output(output)
             logger.info(output)
 
-    if is_in_synapse():
+    if _detect_env("synapse", **kwargs) and is_in_synapse():
         synapse_params = {
             key: val for key, val in kwargs.items() if key in _SYNAPSE_KWARGS
         }
@@ -522,6 +531,14 @@ def _set_verbosity(**kwargs):
     _VERBOSITY(verbosity)
 
 
+def _detect_env(env_name: Literal["aml", "synapse"], **kwargs) -> bool:
+    """
+    Return True if detection of an environment should be attempted.
+
+    Parameters
+    ----------
+    env_name : Literal["aml", "synapse"]
+        Name of the environment that is about to be probed.
+    **kwargs
+        Keyword arguments of the caller. Only `detect_env` is read.
+        If it is missing or True, every environment is detected.
+        If it is False, no environment is detected.
+        If it is a string, only the environment with that name is detected.
+        If it is a list, tuple, set or frozenset, only the environments
+        named in it are detected.
+        Any other value is ignored and every environment is detected.
+
+    Returns
+    -------
+    bool
+        True if the environment should be detected.
+
+    """
+    detect_opt = kwargs.get("detect_env", True)
+    if isinstance(detect_opt, bool):
+        return detect_opt
+    if isinstance(detect_opt, str):
+        # A bare name is handled as a one-item collection; testing it with
+        # "in" would do a substring match instead.
+        return env_name == detect_opt
+    if isinstance(detect_opt, (list, tuple, set, frozenset)):
+        return env_name in detect_opt
+    # Unrecognized option types fall back to the default of detecting.
+    return True
+
+
 def list_default_imports():
     """List the default imports for `init_notebook`."""
     for imp_group in (_NB_IMPORTS, _MP_IMPORTS):

diff --git a/msticpy/transform/proc_tree_builder.py b/msticpy/transform/proc_tree_builder.py
@@ -136,8 +136,17 @@ def _add_tree_properties(proc_tree):
     proc_tree.loc[~has_child, "IsLeaf"] = True
     proc_tree.loc[~is_root & has_child, "IsBranch"] = True
 
-    # Save the current numeric index as "source_index" converting to string
-    proc_tree[Col.source_index] = proc_tree.index.astype(str)
+    # ensure index is numeric/unique
+    if (
+        not proc_tree.index.is_unique
+        and not proc_tree.index.is_monotonic_increasing
+        and not proc_tree.index.is_integer()
+    ):
+        proc_tree = proc_tree.reset_index()
+    # Save the numeric index as "source_index" converting to string
+    proc_tree[Col.source_index] = pd.Series(proc_tree.index.astype(str)).apply(
+        lambda x: x.zfill(5)
+    )
     # Set the index of the output frame to be the proc_key
     proc_tree = proc_tree.set_index(Col.proc_key)
 
@@ -163,7 +172,7 @@ def build_proc_tree(input_tree: pd.DataFrame, max_depth: int = -1) -> pd.DataFra
         DataFrame with ordered paths for each process.
 
     """
-    # set default path == current process ID
+    # set default path == current process row index
     input_tree["path"] = input_tree[Col.source_index]
     # input_tree["parent_index"] = np.nan
 

diff --git a/msticpy/transform/process_tree_utils.py b/msticpy/transform/process_tree_utils.py
@@ -4,12 +4,15 @@
 # license information.
 # --------------------------------------------------------------------------
 """Process Tree Visualization."""
-from typing import Any, Dict, Optional, Union
+import textwrap
+from collections import Counter
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
 
 import pandas as pd
 
 from .._version import VERSION
 from .proc_tree_schema import ColNames as Col
+from .proc_tree_schema import ProcSchema
 
 __version__ = VERSION
 __author__ = "Ian Hellen"
@@ -365,3 +368,126 @@ def get_summary_info(procs: pd.DataFrame) -> Dict[str, int]:
     summary["IsolatedProcesses"] = len(procs[(procs["IsRoot"]) & (procs["IsLeaf"])])
     summary["LargestTreeDepth"] = procs["path"].str.count("/").max() + 1
     return summary
+
+
+class TemplateLine(NamedTuple):
+    """
+    Template definition for a line in text process tree.
+
+    Attributes
+    ----------
+    items : List[Tuple[str, str]]
+        The (<display_name>, <column_name>) pairs rendered on this line.
+    wrap : int
+        Wrap width for this line, by default 80.
+
+    Notes
+    -----
+    The items attribute must be a list of tuples, where each
+    tuple is (<display_name>, <column_name>).
+
+    """
+
+    # NOTE(review): this default is one list object shared by every
+    # instance created without `items`; do not mutate it in place.
+    items: List[Tuple[str, str]] = []
+    # tree_to_text passes this as the textwrap width unless its
+    # `wrap_column` argument is non-zero.
+    wrap: int = 80
+
+
+def tree_to_text(
+    procs: pd.DataFrame,
+    schema: Optional[Union[ProcSchema, Dict[str, str]]] = None,
+    template: Optional[List[TemplateLine]] = None,
+    sort_column: str = "path",
+    wrap_column: int = 0,
+) -> str:
+    """
+    Return text rendering of process tree.
+
+    Parameters
+    ----------
+    procs : pd.DataFrame
+        The process tree DataFrame.
+    schema : Optional[Union[ProcSchema, Dict[str, str]]], optional
+        The schema to use for mapping the DataFrame column
+        names, by default None
+    template : Optional[List[TemplateLine]], optional
+        A manually created template to use to create the node
+        formatting, by default None. If supplied it takes precedence
+        over `schema`.
+    sort_column : str, optional
+        The column to sort the DataFrame by, by default "path"
+    wrap_column : int, optional
+        Override any template-specified wrap limit, by default 0
+        (use the wrap limit of each template line).
+
+    Returns
+    -------
+    str
+        The formatted process tree string.
+
+    Raises
+    ------
+    ValueError
+        If neither `schema` nor `template` is supplied.
+
+    """
+    if not schema and not template:
+        raise ValueError(
+            "One of 'schema' and 'template' must be supplied as parameters."
+        )
+    template = template or _create_proctree_template(schema)  # type: ignore
+    output: List[str] = []
+    for _, row in procs.sort_values(sort_column).iterrows():
+        # tree depth is the number of "/" separators in the row's path
+        depth_count = Counter(row.path).get("/", 0)
+        header = _node_header(depth_count)
+        # following lines line up with the text after "<header> "
+        indent = " " * (len(header) + 1)
+
+        # handle first template line separately since it needs a header
+        tmplt_line = template[0]
+        out_line = "\n".join(
+            textwrap.wrap(
+                _format_template_items(row, tmplt_line),
+                width=wrap_column or tmplt_line.wrap,
+                subsequent_indent=indent,
+            )
+        )
+        output.append(f"{header} {out_line}\n")
+
+        # process subsequent template lines - wrapped text gets extra indent
+        for tmplt_line in template[1:]:
+            out_line = "\n".join(
+                textwrap.wrap(
+                    _format_template_items(row, tmplt_line),
+                    width=wrap_column or tmplt_line.wrap,
+                    initial_indent=indent,
+                    subsequent_indent=indent + "   ",
+                )
+            )
+            output.extend([out_line, "\n"])
+
+    return "".join(output)
+
+
+def _format_template_items(row: pd.Series, tmplt_line: TemplateLine) -> str:
+    """Return unwrapped text for the items of a template line."""
+    # items with an empty display name are rendered as the bare value
+    return "  ".join(
+        f"{name}: {row[col]}" if name else f"{row[col]}"
+        for name, col in tmplt_line.items
+    )
+
+
+def _create_proctree_template(
+    schema: Union[ProcSchema, Dict[str, str]]
+) -> List[TemplateLine]:
+    """
+    Create a default text tree template from the schema.
+
+    Parameters
+    ----------
+    schema : Union[ProcSchema, Dict[str, str]]
+        The schema (or a dictionary of ProcSchema attributes) that
+        supplies the DataFrame column names.
+
+    Returns
+    -------
+    List[TemplateLine]
+        Template lines for: process name and PID; time stamp; command
+        line (if the schema has a cmd_line column); account and logon ID
+        (for whichever of user_id and logon_id the schema defines).
+
+    """
+    if isinstance(schema, dict):
+        schema = ProcSchema(**schema)
+    template_lines: List[TemplateLine] = [
+        TemplateLine(
+            items=[("Process", schema.process_name), ("PID", schema.process_id)]
+        ),
+        TemplateLine(items=[("Time", schema.time_stamp)]),
+    ]
+    if schema.cmd_line:
+        template_lines.append(TemplateLine(items=[("Cmdline", schema.cmd_line)]))
+    # Account and logon ID share a line; each needs its own label so
+    # that the two values can be told apart in the output.
+    acct_items = [
+        (label, column)
+        for label, column in (
+            ("Account", schema.user_id),
+            ("LoginID", schema.logon_id),
+        )
+        if column
+    ]
+    if acct_items:
+        template_lines.append(TemplateLine(items=acct_items))
+    return template_lines
+
+
+def _node_header(depth_count: int) -> str:
+    """
+    Return text tree node header given tree depth.
+
+    Parameters
+    ----------
+    depth_count : int
+        Depth of the node in the tree (0 for a root node).
+
+    Returns
+    -------
+    str
+        "+ " for depth 0, otherwise three spaces per level of depth
+        followed by "+-- ".
+
+    """
+    if depth_count == 0:
+        return "+ "
+    padding = "   " * depth_count
+    return f"{padding}+-- "
diff --git a/tests/transform/test_process_tree_utils.py b/tests/transform/test_process_tree_utils.py
@@ -426,6 +426,30 @@ def test_build_mde_win_tree_dict_schema():
     }
 
 
+def test_text_process_tree():
+    """Test text rendering of the Windows process tree."""
+    win_schema = {
+        "time_stamp": "TimeGenerated",
+        "process_name": "NewProcessName",
+        "process_id": "NewProcessId",
+        "parent_name": "ParentProcessName",
+        "parent_id": "ProcessId",
+        "logon_id": "SubjectLogonId",
+        "target_logon_id": "TargetLogonId",
+        "cmd_line": "CommandLine",
+        "user_name": "SubjectUserName",
+        "path_separator": "\\",
+        "user_id": "SubjectUserSid",
+        "event_id_column": "EventID",
+        "event_id_identifier": 4688,
+        "host_name_column": "Computer",
+    }
+    proc_tree = pt_build.build_process_tree(
+        testdf_win, schema=win_schema, show_summary=True, debug=True
+    )
+    rendered = pt_util.tree_to_text(proc_tree, schema=win_schema)
+    # the number of newline-separated segments in the rendered text
+    line_count = len(rendered.split("\n"))
+    assert line_count == 5028
+
+
 _NB_FOLDER = "docs/notebooks"
 _NB_NAME = "ProcessTree.ipynb"
 _MP_CONFIG_PATH = get_test_data_path().parent.joinpath("msticpyconfig-test.yaml")