# Process Leaf Node Features over Time
Starting with process paths (process -> all parents), filter to only processes with no children in the current time window.
For each of those, flag whether the process was started, active, or terminated ("term") in that window, using the `process_started` and `process_term` fields.

In [1]:
# Import packages used in notebooks
from pathlib import Path

import duckdb
import networkx as nx
import wintapgraph as wg
%load_ext magic_duckdb

In [2]:
# Initialize an in-memory db. Save reference in a variable and then set magic-duckdb environment. Result is ability to use the same DB instance from python code and %dql/%%dql magics.
# Also create views for every top-level type found in the current dataset.
con = duckdb.connect()
%dql -co con
# Display the list of tables/views
%dql show tables
# Only uses process table
%dql create view process as from '/data/ACME-Redo/stdview-20231105-20231120/process.parquet'

Unnamed: 0,Count


# Process Leaf Nodes in 5-minute windows

In [None]:
# Preview the bucket grid: summarize the 5-minute bucket start timestamps generated
# by range() between 2023-11-05 and 2023-11-20.
%dql summarize SELECT * FROM range(DATE '2023-11-05', DATE '2023-11-20', INTERVAL '5' MINUTES)

In [None]:
def get_sql(filename):
    """Build the DuckDB COPY statement that exports one time bucket of leaf processes.

    The statement contains two positional ``?`` placeholders that must be bound at
    execution time, in this order: bucket start, bucket end. The bucket is treated
    as the half-open interval ``[start, end)`` so that, when consecutive buckets
    share a boundary, an event lying exactly on that boundary is attributed to one
    bucket only.

    Rows come from ``process`` entries that never appear as ``parent_pid_hash`` of
    another ``process`` row. Each row carries three 0/1 flags:

    * ``active``  - the process lifetime overlaps the bucket,
    * ``started`` - ``process_started`` falls inside the bucket,
    * ``term``    - ``process_term`` falls inside the bucket.

    Rows where all three flags are 0 are dropped.

    NOTE(review): the "no children" test looks at the whole ``process`` table, not
    only at children alive in the bucket - confirm this is the intended leaf
    definition.
    NOTE(review): a NULL ``process_term`` makes the ``active`` and ``term``
    comparisons NULL, so both flags become 0 - confirm how still-running processes
    should be handled.

    Args:
        filename: Destination path of the Parquet file. Single quotes are escaped
            before the path is embedded in the SQL text.

    Returns:
        str: The SQL text of the COPY statement.
    """
    # Double any single quote so the path cannot terminate the SQL string literal early.
    quoted_filename = str(filename).replace("'", "''")
    return f"""
copy (
SELECT
  ? time_bucket_start,
  ? time_bucket_end,
  p.hostname, p.pid_hash, p.process_name, p.process_started, p.process_term,
  if(p.process_started < time_bucket_end and p.process_term >= time_bucket_start,1,0) active,
  if(p.process_started >= time_bucket_start and p.process_started < time_bucket_end,1,0) started,
  if(p.process_term >= time_bucket_start and p.process_term < time_bucket_end,1,0) term
from process p
left outer join process c on p.pid_hash=c.parent_pid_hash
where c.parent_pid_hash is null
and (active=1 or started=1 or term=1)
)
to '{quoted_filename}' (FORMAT PARQUET)
"""


# Export one Parquet file per 5-minute bucket. get_sql() builds the COPY statement and
# the bucket bounds are bound to its two `?` placeholders.
RAW_DIR = Path("data/raw")
# The COPY target directory has to exist before the first export.
RAW_DIR.mkdir(parents=True, exist_ok=True)

# 288 five-minute buckets per day: print one progress line per day instead of
# flooding the cell output with a line per bucket.
PROGRESS_EVERY = 288

buckets = con.sql(" SELECT range, range + interval '5 minutes' range_end FROM range(DATE '2023-11-05', DATE '2023-11-20', INTERVAL '5' MINUTES)")
cols = buckets.columns
# Column positions do not change between rows, so look them up once.
start_idx = cols.index("range")
end_idx = cols.index("range_end")
bucket_rows = buckets.fetchall()
for bucket_no, row in enumerate(bucket_rows):
    start = row[start_idx]
    end = row[end_idx]
    # Epoch seconds of the bucket start, used to give every bucket its own file name.
    # timestamp() replaces strftime("%s"), which is an undocumented, platform-specific
    # directive. Like "%s", it interprets the naive timestamp in local time.
    epoch = int(start.timestamp())
    filename = f"data/raw/process_leaf_node_{epoch}.parquet"
    if bucket_no % PROGRESS_EVERY == 0:
        print(f"[{bucket_no}/{len(bucket_rows)}] Start: {start}  End: {end}  -> {filename}")
    con.execute(get_sql(filename), [start, end])

In [None]:
%dql create view process_leaf_nodes as from 'data/raw/*.parquet'
%dql summarize process_leaf_nodes

In [None]:
# Count the rows for each observed combination of the started/active/term flags
# (`group by all` groups by every non-aggregated column of the select list).
df=%dql select started,active,term,count(*) from process_leaf_nodes group by all
df

In [None]:
# Write all rows of the process_leaf_nodes view into one consolidated Parquet file.
%dql copy (select * from process_leaf_nodes) to 'data/process_leaf_nodes.parquet' (format parquet)