# Getting started using parquet via DuckDB with Wintap

## Workflow:
Parquet files -> DuckDB Tables/Views -> SQL EDA/Extraction -> Pandas -> Resume typical workflow

The motivation for introducing DuckDB to the workflow for initial EDA and extraction of subsets is to allow for working with datasets larger than memory. Once the subset of interest is identified using SQL, the result can be extracted easily into pandas.

### Map parquet into DuckDB
* Initialize an in-memory database with views for all event types at an aggregation level.
    * Note that views are basically pointers to the parquet files and use no memory.
* Present a summary of current dataset
    * Tabular view with row counts and parquet file sizes

In [15]:
# Import packages used in notebooks
from wintap.datautils import stdviewutil as sv
from wintap.datautils import rawutil as ru
from wintap.datautils import stdview_duckdb as svd
#from wintap.notebookutils.datasetchooser import dataset_chooser
from wintap.notebookutils import dataset_chooser
import os
import altair as alt

In [2]:
# Define imports, functions
# This dataset_chooser() uses a .env file in the top level of this project. It needs to define DATAPATH as the top level of where your data sets are.
# You can optionally define a DEFAULT_PATH pointing to a specific dataset. This provides the convenience of not having to select the dataset when restarting the notebook.
# See .env-default for an example.
# If there is no .env or the paths are invalid, dataset_chooser() defaults to the user's home directory.

# To enable logging output to jupyter, uncomment the following 3 lines:
#import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
#from wintap.notebookutils.datasetchooser import dataset_chooser
#%run notebookutil.py

w_datasets=dataset_chooser()
display(w_datasets)

Defaulting to your home dir. Check .env file.


FileChooser(path='/home/merl1', filename='', title='<b>Select Wintap Dataset Path</b>', show_hidden=False, sel…

In [3]:
# Initialize an in-memory db. Save a reference in a variable and then set the magic-duckdb environment. The result is the ability to use the same DB instance from python code and %dql/%%dql magics.
# Also create views for every top-level type found in the current dataset.
con=ru.init_db(w_datasets.selected) # ,agg_level='rolling')
%dql -co con
# Display the list of tables/views
%dql show tables

Unnamed: 0,name
0,all_files
1,files
2,host
3,host_ip
4,process
5,process_conn_incr
6,process_exe_file_summary
7,process_file
8,process_image_load
9,process_net_conn


In [6]:
# Datasets may have annotations in the form of discrete values that identify interesting or sample data within them.
# Load any that exist for the current dataset.
# To Do: move this to notebookutil.py once it's stable.
if os.path.exists(w_datasets.selected+'/annotations.py'):
    %run $w_datasets.selected/annotations.py
    %whos
    display(SIMPLE)
else:
    print('No annotations defined for this dataset.')

Variable           Type                  Data/Info
--------------------------------------------------
MAX_DAYPK          int                   20230415
MIN_DAYPK          int                   20230410
SIMPLE             dict                  n=2
SUMMARY_INTERVAL   str                   12 hours
con                DuckDBPyConnection    <duckdb.DuckDBPyConnectio<...>object at 0x7f003c5df7f0>
dataset_chooser    function              <function dataset_chooser at 0x7eff700c3740>
os                 module                <module 'os' (frozen)>
ru                 module                <module 'wintap.datautils<...>ap/datautils/rawutil.py'>
sv                 module                <module 'wintap.datautils<...>atautils/stdviewutil.py'>
svd                module                <module 'wintap.datautils<...>utils/stdview_duckdb.py'>
w_datasets         FileChooser           FileChooser(path='/home/m<...>end=False, dir_icon='📁 ')


{'PID_HASH': '9F1289AEDA731899821372EBE2F99120', 'DAYPK': 20230413}

### Summarize event data and display in chart to help understand event distribution over time

In [7]:
# Tabular summary
display(svd.table_summary(con,w_datasets.selected))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Table_Name,Min_DayPK,Max_DayPK,Num_Rows,Size,Files
0,all_files,20230302,20230605,12387868,334.23 MB,96.0
1,files,20230302,20230605,13182081,963.93 MB,96.0
2,host,20230302,20230605,755,440.92 KB,96.0
3,host_ip,20230302,20230605,761,186.58 KB,96.0
4,process,20230302,20230605,76985197,7.83 GB,96.0
5,process_conn_incr,20230302,20230605,11191307,806.17 MB,96.0
6,process_exe_file_summary,20230302,20230605,107219,6.47 MB,96.0
7,process_file,20230302,20230605,20793681,2 GB,96.0
8,process_image_load,20230302,20230605,175123419,8.02 GB,96.0
9,process_net_conn,20230302,20230605,8475364,601.2 MB,96.0


In [8]:
# Events over time: initialize the summary with SUMMARY_INTERVAL, fetch the summary data, and chart it.
# NOTE(review): SUMMARY_INTERVAL is not assigned in any cell shown here; it appears in the %whos
# listing printed after annotations.py is run, so it is presumably defined there — confirm.
# If a dataset has no annotations.py, this cell would likely fail with a NameError.
# To do: Dynamically adjust the bucket size based on the dataset duration for the best resolution/performance.
svd.init_db(con,SUMMARY_INTERVAL)
eventdf=svd.fetch_summary_data(con)
svd.display_event_chart(eventdf)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

### EDA
* Summarize: display table schema and some statistics about its contents
* Head: list a small set of rows
* Group By: aggregate on 1-N columns
* Time partitions: Filter or Group By Days using DayPK
* Joining tables
    * Within a single day: All systems go...
    * Over multiple days: PROCESS and HOST both need to be deduped
* Specific events: highlight events of interest (puttyx/notepad++/etc)

In [9]:
# Summarize process to get a high level view of the columns and values
# Create a file with sample values per dataset.
%dql -j summarize SELECT * FROM process where daypk BETWEEN {{MIN_DAYPK}} AND {{MAX_DAYPK}}

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,column_name,column_type,min,max,approx_unique,avg,std,q25,q50,q75,count,null_percentage
0,pid_hash,VARCHAR,000001335BF9C9A37281E4B6124348B2,FFFFF405F3F09B936A39DDB4DF039B4E,1594401,,,,,,6684217,0.0%
1,os_family,VARCHAR,windows,windows,1,,,,,,6684217,0.0%
2,hostname,VARCHAR,ACME-DC1,EC2AMAZ-VM9P35D,26,,,,,,6684217,0.0%
3,os_pid,INTEGER,0,16380,3821,4451.375117833547,2383.7813506372104,2658.0,4280.0,6007.0,6684217,0.0%
4,process_name,VARCHAR,111.0.5563.110_111.0.5563.65_chrome_updater.exe,xmonitor.exe,362,,,,,,6684217,0.0%
5,num_process_name,BIGINT,1,1,1,1.0,0.0,1.0,1.0,1.0,6684217,0.0%
6,args,VARCHAR,"""--multiprocessing-fork"" ""parent_pid=11160"" ""pipe_handle=1080""",{fff99e5c-de31-435c-8a97-15a2a1a85b1a},697307,,,,,,6684217,1.91%
7,num_args,BIGINT,0,1,2,0.9873807208832388,0.1116245258687905,1.0,1.0,1.0,6684217,0.0%
8,user_name,VARCHAR,-,user9,95,,,,,,6684217,10.18%
9,num_user_name,BIGINT,1,1,1,1.0,0.0,1.0,1.0,1.0,6684217,0.0%


In [10]:
# Get all columns for the first 10 rows
%dql select * from process limit 10

Unnamed: 0,pid_hash,os_family,hostname,os_pid,process_name,num_process_name,args,num_args,user_name,num_user_name,parent_pid_hash,num_parent_pid_hash,parent_os_pid,num_parent_os_pid,process_path,num_process_path,filename,file_id,file_md5,num_file_md5,file_sha2,num_file_sha2,process_started_seconds,process_started,first_seen,last_seen,num_start_events,process_term_seconds,process_term,cpu_cycle_count,cpu_utilization,commit_charge,commit_peak,read_operation_count,write_operation_count,read_transfer_kilobytes,write_transfer_kilobytes,hard_fault_count,token_elevation_type,exit_code,num_process_stop,dayPK
0,7EBA69FC4E09224347D6582A537C5E4A,windows,ACME-WS-NUQ,3032,conhost.exe,1,,0,,1,C922EDD67A5DFD7DC78475356A25CC98,1,5380,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,2b7178e7770815a49679f356f58a998a,583949DAE7928EB33D7886EABA403BC9,1,BD9E848DFF441D60D72B7F17A12448D1A886C4A34CA7A641F527BE3007293CDD,1,1677609000.0,2023-02-28 18:23:17.715248,2023-02-28 18:23:17.715248,2023-02-28 18:23:17.715248,4,NaT,NaT,,,,,,,,,,,,,20230302
1,AC5643023D4990B8B410C162D1E09D29,windows,ACME-WS-NUQ,5680,conhost.exe,1,,0,NT AUTHORITY\SYSTEM,1,BD8C46E7995F19F285A3F54C4C78EC51,1,5088,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,2b7178e7770815a49679f356f58a998a,583949DAE7928EB33D7886EABA403BC9,1,BD9E848DFF441D60D72B7F17A12448D1A886C4A34CA7A641F527BE3007293CDD,1,1677614000.0,2023-02-28 19:52:37.843294,2023-02-28 19:52:37.843294,2023-02-28 19:52:37.843294,4,NaT,NaT,,,,,,,,,,,,,20230302
2,DCD30B685DF8251AED41BBEA59278FF0,windows,ACME-WS-NUQ,4272,mergehelper.exe,1,,0,,1,4586AB8ADCD5D8A22C2FFB92401089DD,1,3612,1,c:\program files\wintap\mergertool\mergehelper.exe,1,c:\program files\wintap\mergertool\mergehelper.exe,4a46246ed12be8725e9d5794e6111650,0CA8863EB670408751CB8DA45FA2D12A,1,F8C14CB2052B7752DCCD61F7FF0147446B9578614C41097F21C090285E28DA4E,1,1677616000.0,2023-02-28 20:34:31.567108,2023-02-28 20:34:31.567108,2023-02-28 20:34:31.567108,4,NaT,NaT,,,,,,,,,,,,,20230302
3,112699E6423CADAD08A1884120B1DFAF,windows,ACME-WS-NUQ,3224,conhost.exe,1,,0,,1,EDFDE264FD2701A750A832CBA74AB05D,1,4552,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,2b7178e7770815a49679f356f58a998a,583949DAE7928EB33D7886EABA403BC9,1,BD9E848DFF441D60D72B7F17A12448D1A886C4A34CA7A641F527BE3007293CDD,1,1677616000.0,2023-02-28 20:34:38.140861,2023-02-28 20:34:38.140861,2023-02-28 20:34:38.140861,4,NaT,NaT,,,,,,,,,,,,,20230302
4,726E490AADD4BD6A7AB601A7A320D77F,windows,ACME-WS-NUQ,5728,conhost.exe,1,,0,NT AUTHORITY\SYSTEM,1,6205CB492443A9063E4A4B1D8388564F,1,5680,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,2b7178e7770815a49679f356f58a998a,583949DAE7928EB33D7886EABA403BC9,1,BD9E848DFF441D60D72B7F17A12448D1A886C4A34CA7A641F527BE3007293CDD,1,1677618000.0,2023-02-28 21:00:48.437261,2023-02-28 21:00:48.437261,2023-02-28 21:00:48.437261,4,NaT,NaT,,,,,,,,,,,,,20230302
5,89FABBC700C51F9102AC3B200698A81A,windows,ACME-WS-NUQ,5900,mergehelper.exe,1,,0,NT AUTHORITY\SYSTEM,1,4586AB8ADCD5D8A22C2FFB92401089DD,1,3612,1,c:\program files\wintap\mergertool\mergehelper.exe,1,c:\program files\wintap\mergertool\mergehelper.exe,4a46246ed12be8725e9d5794e6111650,0CA8863EB670408751CB8DA45FA2D12A,1,F8C14CB2052B7752DCCD61F7FF0147446B9578614C41097F21C090285E28DA4E,1,1677621000.0,2023-02-28 21:42:55.864648,2023-02-28 21:42:55.864648,2023-02-28 21:42:55.864648,4,NaT,NaT,,,,,,,,,,,,,20230302
6,C5C90BE655525EF790F5538AFE6BFE3E,windows,ACME-WS-NUQ,4376,mergehelper.exe,1,,0,NT AUTHORITY\SYSTEM,1,4586AB8ADCD5D8A22C2FFB92401089DD,1,3612,1,c:\program files\wintap\mergertool\mergehelper.exe,1,c:\program files\wintap\mergertool\mergehelper.exe,4a46246ed12be8725e9d5794e6111650,0CA8863EB670408751CB8DA45FA2D12A,1,F8C14CB2052B7752DCCD61F7FF0147446B9578614C41097F21C090285E28DA4E,1,1677621000.0,2023-02-28 21:53:22.620681,2023-02-28 21:53:22.620681,2023-02-28 21:53:22.620681,4,NaT,NaT,,,,,,,,,,,,,20230302
7,649650555AAF35A64B4BED9CCA44CE55,windows,ACME-WS-NUQ,1500,conhost.exe,1,,0,,1,61F29612287513D89985C66AFB4CA9C3,1,5660,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,2b7178e7770815a49679f356f58a998a,583949DAE7928EB33D7886EABA403BC9,1,BD9E848DFF441D60D72B7F17A12448D1A886C4A34CA7A641F527BE3007293CDD,1,1677649000.0,2023-03-01 05:30:09.917818,2023-03-01 05:30:09.917818,2023-03-01 05:30:09.917818,4,NaT,NaT,,,,,,,,,,,,,20230302
8,AA1F2624CA0739978AEA9EFB43C135CD,windows,ACME-WS-NUQ,5564,mergehelper.exe,1,,0,,1,4586AB8ADCD5D8A22C2FFB92401089DD,1,3612,1,c:\program files\wintap\mergertool\mergehelper.exe,1,c:\program files\wintap\mergertool\mergehelper.exe,4a46246ed12be8725e9d5794e6111650,0CA8863EB670408751CB8DA45FA2D12A,1,F8C14CB2052B7752DCCD61F7FF0147446B9578614C41097F21C090285E28DA4E,1,1677632000.0,2023-03-01 00:46:38.077550,2023-03-01 00:46:38.077550,2023-03-01 00:46:38.077550,4,NaT,NaT,,,,,,,,,,,,,20230302
9,8A4D61607FF42B0A517516652725E561,windows,ACME-WS-NUQ,5872,conhost.exe,1,,0,,1,E8B9F1DD52FFA9935D80DEDECFBFD1FE,1,4212,1,c:\windows\system32\conhost.exe,1,c:\windows\system32\conhost.exe,2b7178e7770815a49679f356f58a998a,583949DAE7928EB33D7886EABA403BC9,1,BD9E848DFF441D60D72B7F17A12448D1A886C4A34CA7A641F527BE3007293CDD,1,1677633000.0,2023-03-01 01:13:01.387886,2023-03-01 01:13:01.387886,2023-03-01 01:13:01.387886,4,NaT,NaT,,,,,,,,,,,,,20230302


In [11]:
# Select all executions of a specific process by name
%dql select pid_hash, first(process_name), first(daypk) daypk, count(*) from process where process_name = 'putty.exe' group by pid_hash order by daypk

Unnamed: 0,pid_hash,first(process_name),daypk,count_star()
0,A0287DA358D26711F638778F40552AAE,putty.exe,20230308,11
1,782EC923FA5881100BF50611FB0DF075,putty.exe,20230315,11
2,38AE3AC27FC58A5B7D7EEC7D2F8E74E2,putty.exe,20230412,2
3,A79C1C1BDBBB3396631B47C70A5E5959,putty.exe,20230412,2
4,15673D55D3DD9A4A17BCCF13477B8FA5,putty.exe,20230413,8
...,...,...,...,...
58,A92CA4123F51F6185B66C28ED76CA47B,putty.exe,20230420,4
59,0D6177F5BD2EB8AF642BC1FC729FF9E8,putty.exe,20230420,8
60,C7064895760E5BDC5B580CD092FE264F,putty.exe,20230420,8
61,A8A87E5511F10605FA4C828144093E6A,putty.exe,20230420,8


In [12]:
%%dql -j
-- Use GROUP BY to find the most and least common process_name. Jupyter helps out by displaying the first and last sets of rows.
-- Also calculate counts for some common fields.
-- To keep it fast for demos, limit to a subset of DayPKs. Try commenting out the WHERE clause to see results over all the data.
-- Note: the cell magic (%%dql) treats the entire cell as SQL, so python (#) comments do not work 
SELECT process_name, count(distinct hostname) num_hostname, count(distinct file_md5) num_file_md5, count(distinct user_name) num_user_name, count(distinct pid_hash), count(*) num_rows
FROM process
WHERE daypk BETWEEN {{MIN_DAYPK}} AND {{MAX_DAYPK}}
GROUP BY ALL
ORDER BY num_rows

Unnamed: 0,process_name,num_hostname,num_file_md5,num_user_name,count(DISTINCT pid_hash),num_rows
0,parquetviewer (1).exe,1,1,1,1,1
1,miniconda3-py310_23.1.0-1-windows-x86_64.exe,1,1,1,1,1
2,codesetup-stable-704ed70d4fd1c6bd6342c436f1ede30d1cff4710.tmp.exe,1,1,0,1,1
3,microsoftedge_x64_112.0.1722.48_112.0.1722.46.exe,1,1,1,1,1
4,javaw.exe,1,1,1,1,2
...,...,...,...,...,...,...
356,microsoftedgeupdate.exe,21,2,7,7594,32260
357,svchost.exe,21,1,34,12869,50862
358,wintapsvcmgr.exe,16,3,10,70758,301530
359,mergehelper.exe,16,1,6,704403,2996419


In [16]:
# Simple count of processes per day, with the result assigned to a pandas DataFrame
# Convert dayPK to a timestamp so that Altair displays it nicely.
processes_per_day = %dql select strptime(dayPK,'%Y%m%d') dayPK, count(*) num_rows from process group by all order by daypk
# Chart the result using Altair: a line chart of num_rows per dayPK.
# The ':T' suffix marks dayPK as a temporal field in Altair's encoding shorthand.
chart = alt.Chart(processes_per_day).mark_line().encode(
        x='dayPK:T',
        y='num_rows',
        tooltip=['dayPK:T','num_rows']
    ).properties(
        width=1200,
        height=400,
        title='Processes Per Day'
    ).interactive()  # enable pan/zoom on the rendered chart
display(chart)

In [17]:
# Display a single process and its network connections
# Adding the daypk filter reduces the search space to just the single day rather than ~180 that are in the set.
proc = %dql -j select * from process where pid_hash='{{SIMPLE.PID_HASH}}' and daypk={{SIMPLE.DAYPK}}
net = %dql -j select * from process_net_conn where pid_hash='{{SIMPLE.PID_HASH}}' and daypk={{SIMPLE.DAYPK}} order by first_seen
display(proc)
display(net)

Unnamed: 0,pid_hash,os_family,hostname,os_pid,process_name,num_process_name,args,num_args,user_name,num_user_name,parent_pid_hash,num_parent_pid_hash,parent_os_pid,num_parent_os_pid,process_path,num_process_path,filename,file_id,file_md5,num_file_md5,file_sha2,num_file_sha2,process_started_seconds,process_started,first_seen,last_seen,num_start_events,process_term_seconds,process_term,cpu_cycle_count,cpu_utilization,commit_charge,commit_peak,read_operation_count,write_operation_count,read_transfer_kilobytes,write_transfer_kilobytes,hard_fault_count,token_elevation_type,exit_code,num_process_stop,dayPK
0,9F1289AEDA731899821372EBE2F99120,windows,ACME-WS-PRV,8204,puttyx.exe,1,,0,ACME\user5,1,F06F28A2DD0ACFD0FCB07885B75B9FB2,1,5072,1,c:\users\user5\downloads\puttyx.exe,1,c:\users\user5\downloads\puttyx.exe,a466f6029e9eb7ff2366488ed8b6865e,43166057928A32F7B7083F59692837D1,1,855C14B0FBCF72EC3EA1656860D004F39DDB36C0EF277C2D9B6111C4CF7D9F39,1,1681338000.0,2023-04-12 22:21:18.484466,2023-04-12 22:21:18.484466,2023-04-12 22:21:18.484466,21,2023-04-13 15:09:21.634042,2023-04-13 15:09:21.634042,182645160448,0,50294784,66973696,81,4095,736,588,288,3,0,1,20230413


Unnamed: 0,os_family,hostname,pid_hash,conn_id,protocol,local_ip_addr,local_port,local_pg,remote_ip_addr,remote_port,remote_pg,total_events,total_size,sq_size,num_raw_rows,tcp_accept_count,tcp_connect_count,tcp_disconnect_count,tcp_reconnect_count,tcp_recv_count,tcp_recv_size,sq_tcp_recv_size,tcp_retransmit_count,tcp_send_count,tcp_send_size,sq_tcp_send_size,tcp_tcpcopy_count,tcp_tcpcopy_size,udp_recv_count,udp_recv_size,sq_udp_recv_size,udp_send_count,udp_send_size,sq_udp_send_size,first_seen,last_seen,dayPK
0,windows,ACME-WS-PRV,9F1289AEDA731899821372EBE2F99120,DE0BD2B5E073AEDCC90CDD5A3D6F05DB,TCP,172.31.10.59,64770,,172.31.13.168,4444,,1821.0,262144.0,38233600.0,1821.0,,,1.0,,910.0,116464.0,14905600.0,,910.0,145680.0,23328000.0,,,,,,,,,2023-04-12 23:58:47.389465,2023-04-13 15:09:27.264160,20230413


### Extraction

In [18]:
# Assign the query result to a pandas DataFrame
# This demonstrates using the single-line magic, so we'll keep the SQL short to be readable. Get all process_names for 1 day that used the network.
%dql -j -o net_sum_df select p.process_name, count(distinct pnc.conn_id) num_conn_ids, count(*) num_rows from process p join process_net_conn pnc on pnc.pid_hash=p.pid_hash where p.dayPK={{SIMPLE.DAYPK}} and pnc.dayPK={{SIMPLE.DAYPK}} group by all order by all
net_sum_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   process_name  60 non-null     object
 1   num_conn_ids  60 non-null     int64 
 2   num_rows      60 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ KB


In [19]:
%%dql -j -o net_sum2_df
-- Assign query result to a panda when using cell magic. 
-- With the multi-line form, SQL can be formatted to be more readable. Get all process_names for 1 day that used the network with additional features.
select 
  p.process_name,
  count(distinct p.hostname) num_hosts,
  count(distinct p.user_name) num_users,
  count(distinct pnc.conn_id) num_conn_ids,
  count(distinct pnc.remote_port) num_remote_ports,
  sum(tcp_recv_size) tcp_recv_size,
  sum(tcp_send_size) tcp_send_size,
  sum(udp_recv_size) udp_recv_size,
  sum(udp_send_size) udp_send_size,
  count(*) num_rows 
from process p 
join process_net_conn pnc on pnc.pid_hash=p.pid_hash 
-- Note: filtering both tables by dayPK dramatically increases speed at the cost of reducing the data scope.
where p.dayPK={{SIMPLE.DAYPK}} and pnc.dayPK={{SIMPLE.DAYPK}}
group by all 
order by all

Unnamed: 0,process_name,num_hosts,num_users,num_conn_ids,num_remote_ports,tcp_recv_size,tcp_send_size,udp_recv_size,udp_send_size,num_rows
0,_conda.exe,2,2,5,4,17372.0,11345.0,,,5
1,amazon-ssm-agent.exe,3,1,9,1,2313.0,2136.0,,,9
2,autochk.exe,1,1,4,3,9310.0,9035.0,,,4
3,backgroundtaskhost.exe,4,4,64,5,168990.0,110238.0,,,64
4,bash.exe,1,0,6,3,13363.0,15343.0,,,6
5,chrome.exe,1,1,2590,8,2144589000.0,2289393.0,58080576.0,2388194.0,2614
6,cmd.exe,2,0,13,4,62134.0,21605.0,,,13
7,code.exe,5,6,4669,37,292676000.0,11678460.0,,,4671
8,conda.exe,1,1,1,1,8046.0,2140.0,,,1
9,conhost.exe,10,1,99,5,354782.0,188583.0,,,99


In [20]:
# Create a file-based database ('rolling.db') with views to the current parquet data. Useful for opening directly as a DuckDB database from other tools.
rollingdb=ru.init_db(w_datasets.selected,database='rolling.db')
# Close the connection right away; presumably this releases the database file so other tools can open it — confirm.
rollingdb.close()

In [21]:
# Print the SQL that maps every event type into a view, without executing any of it.
# The output is meant to be run in another context, such as the CLI or DBeaver.
globs = ru.get_glob_paths_for_dataset(w_datasets.selected, 'rolling')
stmts = ru.generate_view_sql(globs)
for sql in stmts:
    # Trim surrounding whitespace and terminate each statement with a semicolon.
    print(f"{sql.strip()};")

create or replace view all_files as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/acme_herd/rolling/all_files/*/*.parquet',hive_partitioning=1);
create or replace view files as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/acme_herd/rolling/files/*/*.parquet',hive_partitioning=1);
create or replace view host as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/acme_herd/rolling/host/*/*.parquet',hive_partitioning=1);
create or replace view host_ip as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/acme_herd/rolling/host_ip/*/*.parquet',hive_partitioning=1);
create or replace view process as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/acme_herd/rolling/process/*/*.parquet',hive_partitioning=1);
create or replace view process_conn_incr as
        select * from parquet_scan('/scratch/fusioncuisine/wintapv6/acme_herd/rolling/process_conn_incr/*/*.parquet',hive_partitioning=1);
create o