# A quick look at exploring Wintap data

This notebook is a simple example of looking at Wintap data in Jupyter.

It shows how to use:
* SQL via DuckDB
* Basic queries and data review
* Charting data with Altair

In [1]:
# Install packages we'll need
!pip install duckdb==0.10.1 magic_duckdb altair
%reload_ext magic_duckdb



In [2]:
IN_COLAB = True
try:
  import google.colab
except:
  IN_COLAB = False

if IN_COLAB:
  # Download the sample data file. Its actually in https://tinyurl.com/wintapdata/ACME-workshop-20231109-20231111.db, but this is a direct reference that works better.
  !gdown 1ELBBx3p2ngVSIyf7Neu0F50vd-qf9XDO

Downloading...
From (original): https://drive.google.com/uc?id=1ELBBx3p2ngVSIyf7Neu0F50vd-qf9XDO
From (redirected): https://drive.google.com/uc?id=1ELBBx3p2ngVSIyf7Neu0F50vd-qf9XDO&confirm=t&uuid=f583a5c6-2d6f-4125-b4bf-601aa702ba90
To: /content/ACME-workshop-20231109-20231111.db
100% 613M/613M [00:08<00:00, 73.3MB/s]


In [3]:
# Connect to the database instance
# This cell is creating a python variable database connection and then supplying that to the "magic_duckdb" Jupyter extension
# which allows the single-line SQL (%dql) and multiline SQL (%%dql) "magics". https://github.com/iqmo-org/magic_duckdb
import duckdb

conn = duckdb.connect('ACME-workshop-20231109-20231111.db')
%dql -co conn

# Display Data

A few immediate questions are: what tables are here? What columns are in those tables and how do I see some of the data?

In [4]:
# Lets see all those tables
%dql show all tables

Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,ACME-workshop-20231109-20231111,main,all_files,"[filename, num_hosts, process_num_rows, dll_nu...","[VARCHAR, BIGINT, DOUBLE, DOUBLE, DOUBLE]",False
1,ACME-workshop-20231109-20231111,main,binary_summary,"[filenames, hostnames, id]","[VARCHAR, VARCHAR, VARCHAR]",False
2,ACME-workshop-20231109-20231111,main,files,"[file_id, hostname, filename, process_num_rows...","[VARCHAR, VARCHAR, VARCHAR, DOUBLE, DOUBLE, DO...",False
3,ACME-workshop-20231109-20231111,main,host,"[Hostname, agent_ids, os_family, first_seen, l...","[VARCHAR, VARCHAR[], VARCHAR, TIMESTAMP WITH T...",False
4,ACME-workshop-20231109-20231111,main,host_ip,"[agent_id, HostName, os_family, private_gatewa...","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR, ...",False
5,ACME-workshop-20231109-20231111,main,labels_graph_net_conn,"[conn_id, label_source, label_num_sources, lab...","[VARCHAR, VARCHAR, BIGINT, BIGINT, BIGINT]",False
6,ACME-workshop-20231109-20231111,main,labels_graph_nodes,"[filename, node_type, id, annotation, label]","[VARCHAR, VARCHAR, VARCHAR, VARCHAR, VARCHAR]",False
7,ACME-workshop-20231109-20231111,main,labels_graph_process_summary,"[pid_hash, label_source, label_num_sources, la...","[VARCHAR, VARCHAR, BIGINT, BIGINT, BIGINT]",False
8,ACME-workshop-20231109-20231111,main,labels_networkx,"[directed, is_multigraph, nodes, links, filename]","[BOOLEAN, BOOLEAN, STRUCT(""type"" VARCHAR, id V...",False
9,ACME-workshop-20231109-20231111,main,lolbas,"[filename, description, author, date, command,...","[VARCHAR, VARCHAR, VARCHAR, DATE, VARCHAR, VAR...",False


In [5]:
# Duckdb's summarize command display the schema of a table along with some useful metrics. And its fast!
%dql summarize process

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,pid_hash,VARCHAR,0000A24042EF42E1604E18292EEFC274,FFFF4AC84C0F2154C03795696C66C276,127151,,,,,,125510,0.0
1,os_family,VARCHAR,windows,windows,1,,,,,,125510,0.0
2,agent_id,VARCHAR,,,0,,,,,,125510,100.0
3,num_agent_id,BIGINT,0,0,1,0.0,0.0,0.0,0.0,0.0,125510,0.0
4,hostname,VARCHAR,ACME-DC1,ACME-HH-HGC,4,,,,,,125510,0.0
5,os_pid,INTEGER,0,12284,2724,6043.85222691419,2956.7704404886485,3720.0,6241.0,8161.0,125510,0.0
6,process_name,VARCHAR,amazon-ssm-agent.exe,wudfhost.exe,149,,,,,,125510,2.54
7,num_process_name,BIGINT,0,2,3,0.9745996334953392,0.1573890727994212,1.0,1.0,1.0,125510,0.0
8,args,VARCHAR,"""-c"" ""from multiprocessing.spawn import spawn_...",{f91752d8-5e0b-44ab-83c7-238153110e7c},52238,,,,,,125510,3.59
9,num_args,BIGINT,0,1,2,0.9698032029320371,0.1711291437607439,1.0,1.0,1.0,125510,0.0


In [6]:
# Lets see all rows in a table
%dql select * from host

Unnamed: 0,Hostname,agent_ids,os_family,first_seen,last_seen,os,num_os,os_version,num_os_version,arch,...,num_ad_domain,domain_role,num_domain_role,last_boot,num_last_boot,wintap_version,num_wintap_version,etl_version,num_etl_version,num_rows
0,ACME-DC1,[None],windows,2023-11-09 00:00:00+00:00,2023-11-11 23:00:00+00:00,Windows Server 2022 Datacenter,1,Microsoft Windows NT 6.2.9200.0,1,64-bit,...,1,Primary Domain Controller,1,1698868276,1,6.0.10.4,1,1.0.8.3,1,69
1,ACME-HH-AKA,[None],windows,2023-11-09 00:00:00+00:00,2023-11-11 23:00:00+00:00,Windows Server 2022 Datacenter,1,Microsoft Windows NT 6.2.9200.0,1,64-bit,...,1,Member Server,1,1699288066,1,6.0.10.4,1,1.0.8.4,1,70
2,ACME-HH-ATV,[None],windows,2023-11-09 00:00:00+00:00,2023-11-11 23:00:00+00:00,Windows Server 2022 Datacenter,1,Microsoft Windows NT 6.2.9200.0,1,64-bit,...,1,Member Server,1,1699285413,1,6.0.10.4,1,1.0.8.3,1,71
3,ACME-HH-HGC,[None],windows,2023-11-09 00:00:00+00:00,2023-11-11 23:00:00+00:00,Windows Server 2022 Datacenter,1,Microsoft Windows NT 6.2.9200.0,1,64-bit,...,1,Member Server,1,1699288067,1,6.0.10.2,1,1.0.8.3,1,68


In [7]:
# Lets just see a few to get an idea of whats in there
%dql select * from process limit 20

Unnamed: 0,pid_hash,os_family,agent_id,num_agent_id,hostname,os_pid,process_name,num_process_name,args,num_args,...,commit_charge,commit_peak,read_operation_count,write_operation_count,read_transfer_kilobytes,write_transfer_kilobytes,hard_fault_count,token_elevation_type,exit_code,num_process_stop
0,364C908FD1010B68D4E5C30862ECD170,windows,,0,ACME-DC1,472,mergehelper.exe,1,c:\programdata\wintap\parquet\default_sensor\e...,1,...,13770752.0,15908864.0,943.0,25.0,127.0,11.0,0.0,1.0,0.0,1.0
1,77A8EB5824906950B54F8CCCF88B7553,windows,,0,ACME-DC1,6736,mergehelper.exe,1,c:\programdata\wintap\parquet\merged 133441368...,1,...,8306688.0,9719808.0,37.0,10.0,87.0,0.0,0.0,1.0,0.0,1.0
2,0C7AE97BED740CD9A2D541254C905F57,windows,,0,ACME-DC1,6896,wmic.exe,1,computersystem get dnshostname /value,1,...,1966080.0,2764800.0,4.0,6.0,5.0,0.0,0.0,1.0,0.0,1.0
3,B3239FDEBB55509AFC463EBDA2EEA481,windows,,0,ACME-DC1,2364,conhost.exe,1,0xffffffff -forcev1,1,...,6713344.0,6721536.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,B070D3FAA7B82A72D40703F7DCF79A3C,windows,,0,ACME-DC1,11960,mergehelper.exe,1,c:\programdata\wintap\parquet\udppacket_sensor...,1,...,15523840.0,25481216.0,11061.0,29.0,176.0,26.0,0.0,1.0,0.0,1.0
5,7177D8912DE77AC317563FDCE12766A7,windows,,0,ACME-DC1,8668,mergehelper.exe,1,c:\programdata\wintap\parquet\default_sensor\w...,1,...,14184448.0,18362368.0,3980.0,24.0,115.0,6.0,0.0,1.0,0.0,1.0
6,CF9A5BFC7265DD149A6C2E47408A7BA8,windows,,0,ACME-DC1,7484,mergehelper.exe,1,c:\programdata\wintap\parquet\default_sensor\k...,1,...,14290944.0,17113088.0,2792.0,24.0,109.0,7.0,0.0,1.0,0.0,1.0
7,0D01EF63E8F1EC3D947F02B58B056427,windows,,0,ACME-DC1,9100,conhost.exe,1,0xffffffff -forcev1,1,...,6709248.0,6713344.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
8,A78CEEEF951F0E3A50ADC4631C1A1AD0,windows,,0,ACME-DC1,12064,taskhostw.exe,1,system,1,...,3133440.0,3563520.0,9.0,7.0,3.0,0.0,0.0,1.0,0.0,1.0
9,BA45607AB0151170163BA3E9DD3FE083,windows,,0,ACME-DC1,11932,conhost.exe,1,0xffffffff -forcev1,1,...,6729728.0,6733824.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


# Charting data with Altair

Altair is a sophisticated, declarative Python package that supports a large range of chart types that are easily customizable and interactive.

The first thing we need to do to leverage it is convert the data from its SQL source to a Pandas dataframe. Pandas dataframes are the de facto Python data model for just about all data analytics packages.

The Pandas conversion is easily done from DuckDB with the "-o" output to variable flag.


In [8]:
import altair as alt

%dql -o procDF select process_name, count(*) num_rows from process group by all

Unnamed: 0,process_name,num_rows
0,,3189
1,mousocoreworker.exe,399
2,rundll32.exe,68
3,unregmp2.exe,4
4,dllhost.exe,68
...,...,...
145,codesetup-stable-2b35e1e6d88f1ce073683991d1eff...,1
146,sppextcomobj.exe,1
147,fsquirt.exe,1
148,sethc.exe,3


In [9]:
# Bar chart of the number of rows per process name.
# The y axis is log-scaled so that rarely seen processes remain visible next to the very frequent ones.
(
    alt.Chart(procDF)
    .mark_bar()
    .encode(
        x=alt.X('process_name'),
        y=alt.Y('num_rows', scale=alt.Scale(type="log")),
        color=alt.Color('process_name'),
        tooltip=['process_name:N', 'num_rows:Q'],
    )
    .properties(
        title="Distribution of process executions by name",
        width=1400,
        height=600,
    )
    .interactive()
)


# A more complex example
In this example, we're going to generate a dataframe of per-process network utilization for each time interval and chart it. For example, in a given 5 minute window, what is the relative percentage of bytes attributed to each process that did network transfers in that period?

Note that many features are calculated for each interval. Try charting different features to see if there is anything interesting beyond the bytes transferred.

In [10]:
# Parameters for the next query and chart.
# `interval` and `hostname` are substituted into the {{...}} placeholders of the SQL in the following cell.
hostname = 'ACME-HH-AKA'
interval = '5 minutes'

# Remove Altair's limit on the number of rows a chart is allowed to embed
alt.data_transformers.disable_max_rows()

In [11]:
%%dql -j -o netuse_df
SELECT
  -- Every column produced by the inline view below
  *,
  -- Window functions: percent of bytes and rank of each process within its (host, time chunk)
  ROUND((tot_bytes / (SUM(tot_bytes) OVER (PARTITION BY hostname, time_chunk))) * 100, 2) AS tc_process_pct,
  RANK() OVER (PARTITION BY hostname, time_chunk ORDER BY tot_bytes DESC) AS tc_rank_pos,
  -- Percent of bytes and rank of each row relative to the host's overall activity
  ROUND((tot_bytes / SUM(tot_bytes) OVER (PARTITION BY hostname)) * 100, 2) AS process_pct,
  RANK() OVER (PARTITION BY hostname ORDER BY tot_bytes DESC) AS rank_pos
FROM (
  -- Group detail rows into time buckets (one row per host, process name and bucket),
  -- which are then used to generate a histogram.
  SELECT
    p.hostname,
    p.process_name,
    time_bucket(INTERVAL {{interval}}, pnc.first_seen) AS time_chunk,
    COUNT(DISTINCT p.pid_hash) AS num_pid_hash,
    COUNT(DISTINCT conn_id) AS num_conn_id,
    COUNT(DISTINCT remote_ip_addr) AS num_remote_ip,
    COUNT(DISTINCT remote_port) AS num_remote_port,
    SUM(pnc.total_events) AS total_events,
    SUM(pnc.total_size) AS tot_bytes
  FROM process_net_conn AS pnc
  JOIN process AS p ON p.pid_hash = pnc.pid_hash
  WHERE
    -- The filter that would ignore localhost network activity is currently disabled:
    -- pnc.local_ip_addr<>pnc.remote_ip_addr
    p.hostname = '{{hostname}}'
    AND pnc.hostname = '{{hostname}}'
  GROUP BY ALL
) AS pnc_inner
ORDER BY hostname, time_chunk, tot_bytes DESC

Unnamed: 0,hostname,process_name,time_chunk,num_pid_hash,num_conn_id,num_remote_ip,num_remote_port,total_events,tot_bytes,tc_process_pct,tc_rank_pos,process_pct,rank_pos
0,ACME-HH-AKA,wintap.exe,2023-11-08 23:55:00+00:00,1,9,2,2,254.0,810298.0,38.54,1,0.09,31
1,ACME-HH-AKA,ntoskrnl.exe,2023-11-08 23:55:00+00:00,1,4,3,3,2614.0,657055.0,31.25,2,0.07,38
2,ACME-HH-AKA,svchost.exe,2023-11-08 23:55:00+00:00,3,5,6,5,4332.0,401580.0,19.10,3,0.04,56
3,ACME-HH-AKA,ssm-agent-worker.exe,2023-11-08 23:55:00+00:00,1,21,6,2,455.0,194458.0,9.25,4,0.02,889
4,ACME-HH-AKA,msedge.exe,2023-11-08 23:55:00+00:00,2,10,5,4,372.0,39055.0,1.86,5,0.00,1766
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3541,ACME-HH-AKA,msedge.exe,2023-11-11 23:50:00+00:00,2,4,1,1,16.0,2800.0,0.64,3,0.00,2828
3542,ACME-HH-AKA,ssm-agent-worker.exe,2023-11-11 23:55:00+00:00,1,6,3,1,122.0,60557.0,51.10,1,0.01,1732
3543,ACME-HH-AKA,msedge.exe,2023-11-11 23:55:00+00:00,4,14,4,3,86.0,46234.0,39.01,2,0.01,1749
3544,ACME-HH-AKA,ntoskrnl.exe,2023-11-11 23:55:00+00:00,1,1,1,1,18.0,6014.0,5.07,3,0.00,2322


In [12]:
# Stacked bar chart of bytes transferred per time chunk, coloured by process.
# The tooltip must reference columns that exist in netuse_df: the byte total is named
# 'tot_bytes' (there is no 'tot_length' column, so that tooltip entry was always empty).
alt.Chart(netuse_df).mark_bar().encode(
    x='time_chunk',
    y=alt.Y('tot_bytes'), #,scale=alt.Scale(type="symlog")),
    color='process_name',
    tooltip=['process_name:N','tot_bytes:Q','tc_process_pct:Q','tc_rank_pos:Q','time_chunk:T']
).properties(
    title='Network Activity by Process',
    width=1200,
    height=400
).interactive()